#include <nsUniversalDetector.h>


| Public Member Functions | |
| nsUniversalDetector () | |
| virtual | ~nsUniversalDetector () | 
| virtual nsresult | HandleData (const char *aBuf, PRUint32 aLen) | 
| virtual void | DataEnd (void) | 
| Protected Member Functions | |
| virtual void | Report (const char *aCharset)=0 | 
| virtual void | Reset () | 
| Protected Attributes | |
| nsInputState | mInputState | 
| PRBool | mDone | 
| PRBool | mInTag | 
| PRBool | mStart | 
| PRBool | mGotData | 
| char | mLastChar | 
| const char * | mDetectedCharset | 
| PRInt32 | mBestGuess | 
| nsCharSetProber * | mCharSetProbers [NUM_OF_CHARSET_PROBERS] | 
| nsCharSetProber * | mEscCharSetProber | 
Definition at line 53 of file nsUniversalDetector.h.
| nsUniversalDetector::nsUniversalDetector | ( | ) | 
Definition at line 48 of file nsUniversalDetector.cpp.
References ePureAscii, mBestGuess, mCharSetProbers, mDetectedCharset, mDone, mEscCharSetProber, mGotData, mInputState, mInTag, mLastChar, mStart, nsnull, NUM_OF_CHARSET_PROBERS, PR_FALSE, and PR_TRUE.
00049 { 00050 mDone = PR_FALSE; 00051 mBestGuess = -1; //illegal value as signal 00052 mInTag = PR_FALSE; 00053 mEscCharSetProber = nsnull; 00054 00055 mStart = PR_TRUE; 00056 mDetectedCharset = nsnull; 00057 mGotData = PR_FALSE; 00058 mInputState = ePureAscii; 00059 mLastChar = '\0'; 00060 00061 PRUint32 i; 00062 for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++) 00063 mCharSetProbers[i] = nsnull; 00064 }
| nsUniversalDetector::~nsUniversalDetector | ( | ) |  [virtual] | 
Definition at line 66 of file nsUniversalDetector.cpp.
References mCharSetProbers, mEscCharSetProber, and NUM_OF_CHARSET_PROBERS.
00067 { 00068 for (PRInt32 i = 0; i < NUM_OF_CHARSET_PROBERS; i++) 00069 if (mCharSetProbers[i]) 00070 delete mCharSetProbers[i]; 00071 if (mEscCharSetProber) 00072 delete mEscCharSetProber; 00073 }
| void nsUniversalDetector::DataEnd | ( | void | ) |  [virtual] | 
Definition at line 236 of file nsUniversalDetector.cpp.
References eEscAscii, eHighbyte, nsCharSetProber::GetConfidence(), mCharSetProbers, mDetectedCharset, mDone, mGotData, MINIMUM_THRESHOLD, mInputState, NUM_OF_CHARSET_PROBERS, PR_TRUE, and Report().
00237 { 00238 if (!mGotData) 00239 { 00240 // we haven't got any data yet, return immediately 00241 // caller program sometimes call DataEnd before anything has been sent to detector 00242 return; 00243 } 00244 00245 if (mDetectedCharset) 00246 { 00247 mDone = PR_TRUE; 00248 Report(mDetectedCharset); 00249 return; 00250 } 00251 00252 switch (mInputState) 00253 { 00254 case eHighbyte: 00255 { 00256 float proberConfidence; 00257 float maxProberConfidence = (float)0.0; 00258 PRInt32 maxProber = 0; 00259 00260 for (PRInt32 i = 0; i < NUM_OF_CHARSET_PROBERS; i++) 00261 { 00262 proberConfidence = mCharSetProbers[i]->GetConfidence(); 00263 if (proberConfidence > maxProberConfidence) 00264 { 00265 maxProberConfidence = proberConfidence; 00266 maxProber = i; 00267 } 00268 } 00269 //do not report anything because we are not confident of it, that's in fact a negative answer 00270 if (maxProberConfidence > MINIMUM_THRESHOLD) 00271 Report(mCharSetProbers[maxProber]->GetCharSetName()); 00272 } 00273 break; 00274 case eEscAscii: 00275 break; 00276 default: 00277 ; 00278 } 00279 return; 00280 }

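| nsresult nsUniversalDetector::HandleData | ( | const char * | aBuf, PRUint32 aLen | ) |  [virtual] | 
Definition at line 101 of file nsUniversalDetector.cpp.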
References eEscAscii, eFoundIt, eHighbyte, ePureAscii, nsCharSetProber::GetCharSetName(), nsCharSetProber::HandleData(), mCharSetProbers, mDetectedCharset, mDone, mEscCharSetProber, mGotData, mInputState, mLastChar, mStart, NS_ERROR_OUT_OF_MEMORY, NS_OK, nsnull, NUM_OF_CHARSET_PROBERS, PR_FALSE, and PR_TRUE.
00102 { 00103 if(mDone) 00104 return NS_OK; 00105 00106 if (aLen > 0) 00107 mGotData = PR_TRUE; 00108 00109 //If the data starts with BOM, we know it is UTF 00110 if (mStart) 00111 { 00112 mStart = PR_FALSE; 00113 if (aLen > 3) 00114 switch (aBuf[0]) 00115 { 00116 case '\xEF': 00117 if (('\xBB' == aBuf[1]) && ('\xBF' == aBuf[2])) 00118 // EF BB BF UTF-8 encoded BOM 00119 mDetectedCharset = "UTF-8"; 00120 break; 00121 case '\xFE': 00122 if (('\xFF' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3])) 00123 // FE FF 00 00 UCS-4, unusual octet order BOM (3412) 00124 mDetectedCharset = "X-ISO-10646-UCS-4-3412"; 00125 else if ('\xFF' == aBuf[1]) 00126 // FE FF UTF-16, big endian BOM 00127 mDetectedCharset = "UTF-16BE"; 00128 break; 00129 case '\x00': 00130 if (('\x00' == aBuf[1]) && ('\xFE' == aBuf[2]) && ('\xFF' == aBuf[3])) 00131 // 00 00 FE FF UTF-32, big-endian BOM 00132 mDetectedCharset = "UTF-32BE"; 00133 else if (('\x00' == aBuf[1]) && ('\xFF' == aBuf[2]) && ('\xFE' == aBuf[3])) 00134 // 00 00 FF FE UCS-4, unusual octet order BOM (2143) 00135 mDetectedCharset = "X-ISO-10646-UCS-4-2143"; 00136 break; 00137 case '\xFF': 00138 if (('\xFE' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3])) 00139 // FF FE 00 00 UTF-32, little-endian BOM 00140 mDetectedCharset = "UTF-32LE"; 00141 else if ('\xFE' == aBuf[1]) 00142 // FF FE UTF-16, little endian BOM 00143 mDetectedCharset = "UTF-16LE"; 00144 break; 00145 } // switch 00146 00147 if (mDetectedCharset) 00148 { 00149 mDone = PR_TRUE; 00150 return NS_OK; 00151 } 00152 } 00153 00154 PRUint32 i; 00155 for (i = 0; i < aLen; i++) 00156 { 00157 //other than 0xa0, if every othe character is ascii, the page is ascii 00158 if (aBuf[i] & '\x80' && aBuf[i] != '\xA0') //Since many Ascii only page contains NBSP 00159 { 00160 //we got a non-ascii byte (high-byte) 00161 if (mInputState != eHighbyte) 00162 { 00163 //adjust state 00164 mInputState = eHighbyte; 00165 00166 //kill mEscCharSetProber if it is active 00167 if (mEscCharSetProber) { 00168 delete mEscCharSetProber; 00169 mEscCharSetProber = nsnull; 00170 } 00171 00172 //start multibyte and singlebyte charset prober 00173 if (nsnull == mCharSetProbers[0]) 00174 mCharSetProbers[0] = new nsMBCSGroupProber; 00175 if (nsnull == mCharSetProbers[1]) 00176 mCharSetProbers[1] = new nsSBCSGroupProber; 00177 if (nsnull == mCharSetProbers[2]) 00178 mCharSetProbers[2] = new nsLatin1Prober; 00179 00180 if ((nsnull == mCharSetProbers[0]) || 00181 (nsnull == mCharSetProbers[1]) || 00182 (nsnull == mCharSetProbers[2])) 00183 return NS_ERROR_OUT_OF_MEMORY; 00184 } 00185 } 00186 else 00187 { 00188 //ok, just pure ascii so far 00189 if ( ePureAscii == mInputState && 00190 (aBuf[i] == '\033' || (aBuf[i] == '{' && mLastChar == '~')) ) 00191 { 00192 //found escape character or HZ "~{" 00193 mInputState = eEscAscii; 00194 } 00195 mLastChar = aBuf[i]; 00196 } 00197 } 00198 00199 nsProbingState st; 00200 switch (mInputState) 00201 { 00202 case eEscAscii: 00203 if (nsnull == mEscCharSetProber) { 00204 mEscCharSetProber = new nsEscCharSetProber; 00205 if (nsnull == mEscCharSetProber) 00206 return NS_ERROR_OUT_OF_MEMORY; 00207 } 00208 st = mEscCharSetProber->HandleData(aBuf, aLen); 00209 if (st == eFoundIt) 00210 { 00211 mDone = PR_TRUE; 00212 mDetectedCharset = mEscCharSetProber->GetCharSetName(); 00213 } 00214 break; 00215 case eHighbyte: 00216 for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++) 00217 { 00218 st = mCharSetProbers[i]->HandleData(aBuf, aLen); 00219 if (st == eFoundIt) 00220 { 00221 mDone = PR_TRUE; 00222 mDetectedCharset = mCharSetProbers[i]->GetCharSetName(); 00223 return NS_OK; 00224 } 00225 } 00226 break; 00227 00228 default: //pure ascii 00229 ;//do nothing here 00230 } 00231 return NS_OK; 00232 }

| virtual void nsUniversalDetector::Report | ( | const char * | aCharset | ) |  [protected, pure virtual] | 
Implemented in nsUniversalXPCOMDetector, nsUniversalXPCOMStringDetector, and nsUniversalDetectorImpl.
Referenced by DataEnd().

| void nsUniversalDetector::Reset | ( | void | ) |  [protected, virtual] | 
Definition at line 76 of file nsUniversalDetector.cpp.
References ePureAscii, mBestGuess, mCharSetProbers, mDetectedCharset, mDone, mEscCharSetProber, mGotData, mInputState, mInTag, mLastChar, mStart, nsnull, NUM_OF_CHARSET_PROBERS, PR_FALSE, PR_TRUE, and nsCharSetProber::Reset().
00077 { 00078 mDone = PR_FALSE; 00079 mBestGuess = -1; //illegal value as signal 00080 mInTag = PR_FALSE; 00081 00082 mStart = PR_TRUE; 00083 mDetectedCharset = nsnull; 00084 mGotData = PR_FALSE; 00085 mInputState = ePureAscii; 00086 mLastChar = '\0'; 00087 00088 if (mEscCharSetProber) 00089 mEscCharSetProber->Reset(); 00090 00091 PRUint32 i; 00092 for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++) 00093 if (mCharSetProbers[i]) 00094 mCharSetProbers[i]->Reset(); 00095 }

| PRInt32 nsUniversalDetector::mBestGuess  [protected] | 
Definition at line 70 of file nsUniversalDetector.h.
Referenced by nsUniversalDetector() and Reset().
| nsCharSetProber* nsUniversalDetector::mCharSetProbers[NUM_OF_CHARSET_PROBERS]  [protected] | 
Definition at line 72 of file nsUniversalDetector.h.
Referenced by DataEnd(), HandleData(), nsUniversalDetector(), Reset(), and ~nsUniversalDetector().
| const char* nsUniversalDetector::mDetectedCharset  [protected] | 
Definition at line 69 of file nsUniversalDetector.h.
Referenced by DataEnd(), HandleData(), nsUniversalDetector(), and Reset().
| PRBool nsUniversalDetector::mDone  [protected] | 
Definition at line 64 of file nsUniversalDetector.h.
Referenced by DataEnd(), HandleData(), nsUniversalDetector(), and Reset().
| nsCharSetProber* nsUniversalDetector::mEscCharSetProber  [protected] | 
Definition at line 73 of file nsUniversalDetector.h.
Referenced by HandleData(), nsUniversalDetector(), Reset(), and ~nsUniversalDetector().
| PRBool nsUniversalDetector::mGotData  [protected] | 
Definition at line 67 of file nsUniversalDetector.h.
Referenced by DataEnd(), HandleData(), nsUniversalDetector(), and Reset().
| nsInputState nsUniversalDetector::mInputState  [protected] | 
Definition at line 63 of file nsUniversalDetector.h.
Referenced by DataEnd(), HandleData(), nsUniversalDetector(), and Reset().
| PRBool nsUniversalDetector::mInTag  [protected] | 
Definition at line 65 of file nsUniversalDetector.h.
Referenced by nsUniversalDetector() and Reset().
| char nsUniversalDetector::mLastChar  [protected] | 
Definition at line 68 of file nsUniversalDetector.h.
Referenced by HandleData(), nsUniversalDetector(), and Reset().
| PRBool nsUniversalDetector::mStart  [protected] | 
Definition at line 66 of file nsUniversalDetector.h.
Referenced by HandleData(), nsUniversalDetector(), and Reset().
Generated by Doxygen 1.6.2-20100208