#include <nsCharSetProber.h>
Public Member Functions
virtual ~nsCharSetProber ()
virtual const char * GetCharSetName ()=0
virtual nsProbingState HandleData (const char *aBuf, PRUint32 aLen)=0
virtual nsProbingState GetState (void)=0
virtual void Reset (void)=0
virtual float GetConfidence (void)=0
virtual void SetOpion ()=0
Static Public Member Functions
static PRBool FilterWithoutEnglishLetters (const char *aBuf, PRUint32 aLen, char **newBuf, PRUint32 &newLen)
static PRBool FilterWithEnglishLetters (const char *aBuf, PRUint32 aLen, char **newBuf, PRUint32 &newLen)
Definition at line 53 of file nsCharSetProber.h.
virtual nsCharSetProber::~nsCharSetProber () [inline, virtual]
Definition at line 55 of file nsCharSetProber.h.
PRBool nsCharSetProber::FilterWithEnglishLetters (const char *aBuf, PRUint32 aLen, char **newBuf, PRUint32 &newLen) [static]
Definition at line 83 of file nsCharSetProber.cpp.
References PR_FALSE, PR_Malloc, and PR_TRUE.
Referenced by nsLatin1Prober::HandleData().
00084 {
00085   //do filtering to reduce load to probers
00086   char *newptr;
00087   char *prevPtr, *curPtr;
00088   PRBool isInTag = PR_FALSE;
00089
00090   newptr = *newBuf = (char*)PR_Malloc(aLen);
00091   if (!newptr)
00092     return PR_FALSE;
00093
00094   for (curPtr = prevPtr = (char*)aBuf; curPtr < aBuf+aLen; curPtr++)
00095   {
00096     if (*curPtr == '>')
00097       isInTag = PR_FALSE;
00098     else if (*curPtr == '<')
00099       isInTag = PR_TRUE;
00100
00101     if (!(*curPtr & 0x80) &&
00102         (*curPtr < 'A' || (*curPtr > 'Z' && *curPtr < 'a') || *curPtr > 'z') )
00103     {
00104       if (curPtr > prevPtr && !isInTag) // Current segment contains more than just a symbol
00105                                         // and it is not inside a tag, keep it.
00106       {
00107         while (prevPtr < curPtr) *newptr++ = *prevPtr++;
00108         prevPtr++;
00109         *newptr++ = ' ';
00110       }
00111       else
00112         prevPtr = curPtr+1;
00113     }
00114   }
00115
00116   // If the current segment contains more than just a symbol
00117   // and it is not inside a tag then keep it.
00118   if (!isInTag)
00119     while (prevPtr < curPtr)
00120       *newptr++ = *prevPtr++;
00121
00122   newLen = newptr - *newBuf;
00123
00124   return PR_TRUE;
00125 }
PRBool nsCharSetProber::FilterWithoutEnglishLetters (const char *aBuf, PRUint32 aLen, char **newBuf, PRUint32 &newLen) [static]
Definition at line 43 of file nsCharSetProber.cpp.
References PR_FALSE, PR_Malloc, and PR_TRUE.
Referenced by nsSBCSGroupProber::HandleData().
00044 {
00045   char *newptr;
00046   char *prevPtr, *curPtr;
00047
00048   PRBool meetMSB = PR_FALSE;
00049   newptr = *newBuf = (char*)PR_Malloc(aLen);
00050   if (!newptr)
00051     return PR_FALSE;
00052
00053   for (curPtr = prevPtr = (char*)aBuf; curPtr < aBuf+aLen; curPtr++)
00054   {
00055     if (*curPtr & 0x80)
00056     {
00057       meetMSB = PR_TRUE;
00058     }
00059     else if (*curPtr < 'A' || (*curPtr > 'Z' && *curPtr < 'a') || *curPtr > 'z')
00060     {
00061       //current char is a symbol, most likely a punctuation. we treat it as segment delimiter
00062       if (meetMSB && curPtr > prevPtr)
00063       //this segment contains more than single symbol, and it has upper ASCII, we need to keep it
00064       {
00065         while (prevPtr < curPtr) *newptr++ = *prevPtr++;
00066         prevPtr++;
00067         *newptr++ = ' ';
00068         meetMSB = PR_FALSE;
00069       }
00070       else //ignore current segment. (either because it is just a symbol or just an English word)
00071         prevPtr = curPtr+1;
00072     }
00073   }
00074   if (meetMSB && curPtr > prevPtr)
00075     while (prevPtr < curPtr) *newptr++ = *prevPtr++;
00076
00077   newLen = newptr - *newBuf;
00078
00079   return PR_TRUE;
00080 }
virtual const char* nsCharSetProber::GetCharSetName () [pure virtual]
Implemented in nsBig5Prober, nsEscCharSetProber, nsEUCJPProber, nsEUCKRProber, nsEUCTWProber, nsGB18030Prober, nsHebrewProber, nsLatin1Prober, nsMBCSGroupProber, nsSingleByteCharSetProber, nsSBCSGroupProber, nsSJISProber, and nsUTF8Prober.
Referenced by nsSBCSGroupProber::GetCharSetName(), nsSingleByteCharSetProber::GetCharSetName(), nsMBCSGroupProber::GetCharSetName(), and nsUniversalDetector::HandleData().
virtual float nsCharSetProber::GetConfidence (void) [pure virtual]
Implemented in nsBig5Prober, nsEscCharSetProber, nsEUCJPProber, nsEUCKRProber, nsEUCTWProber, nsGB18030Prober, nsHebrewProber, nsLatin1Prober, nsMBCSGroupProber, nsSingleByteCharSetProber, nsSBCSGroupProber, nsSJISProber, and nsUTF8Prober.
Referenced by nsUniversalDetector::DataEnd(), nsHebrewProber::GetCharSetName(), nsSBCSGroupProber::GetConfidence(), and nsMBCSGroupProber::GetConfidence().
virtual nsProbingState nsCharSetProber::GetState (void) [pure virtual]
Implemented in nsBig5Prober, nsEscCharSetProber, nsEUCJPProber, nsEUCKRProber, nsEUCTWProber, nsGB18030Prober, nsHebrewProber, nsLatin1Prober, nsMBCSGroupProber, nsSingleByteCharSetProber, nsSBCSGroupProber, nsSJISProber, and nsUTF8Prober.
Referenced by nsHebrewProber::GetState().
virtual nsProbingState nsCharSetProber::HandleData (const char *aBuf, PRUint32 aLen) [pure virtual]
Implemented in nsBig5Prober, nsEscCharSetProber, nsEUCJPProber, nsEUCKRProber, nsEUCTWProber, nsGB18030Prober, nsHebrewProber, nsLatin1Prober, nsMBCSGroupProber, nsSingleByteCharSetProber, nsSBCSGroupProber, nsSJISProber, and nsUTF8Prober.
Referenced by nsUniversalDetector::HandleData(), nsSBCSGroupProber::HandleData(), and nsMBCSGroupProber::HandleData().
virtual void nsCharSetProber::Reset (void) [pure virtual]
Implemented in nsBig5Prober, nsEscCharSetProber, nsEUCJPProber, nsEUCKRProber, nsEUCTWProber, nsGB18030Prober, nsHebrewProber, nsLatin1Prober, nsMBCSGroupProber, nsSingleByteCharSetProber, nsSBCSGroupProber, nsSJISProber, and nsUTF8Prober.
Referenced by nsUniversalDetector::Reset(), nsSBCSGroupProber::Reset(), and nsMBCSGroupProber::Reset().
virtual void nsCharSetProber::SetOpion () [pure virtual]