#include <nsUniversalDetector.h>
Public Member Functions | |
nsUniversalDetector () | |
virtual | ~nsUniversalDetector () |
virtual nsresult | HandleData (const char *aBuf, PRUint32 aLen) |
virtual void | DataEnd (void) |
Protected Member Functions | |
virtual void | Report (const char *aCharset)=0 |
virtual void | Reset () |
Protected Attributes | |
nsInputState | mInputState |
PRBool | mDone |
PRBool | mInTag |
PRBool | mStart |
PRBool | mGotData |
char | mLastChar |
const char * | mDetectedCharset |
PRInt32 | mBestGuess |
nsCharSetProber * | mCharSetProbers [NUM_OF_CHARSET_PROBERS] |
nsCharSetProber * | mEscCharSetProber |
Definition at line 53 of file nsUniversalDetector.h.
nsUniversalDetector::nsUniversalDetector | ( | ) |
Definition at line 48 of file nsUniversalDetector.cpp.
References ePureAscii, mBestGuess, mCharSetProbers, mDetectedCharset, mDone, mEscCharSetProber, mGotData, mInputState, mInTag, mLastChar, mStart, nsnull, NUM_OF_CHARSET_PROBERS, PR_FALSE, and PR_TRUE.
00049 { 00050 mDone = PR_FALSE; 00051 mBestGuess = -1; //illegal value as signal 00052 mInTag = PR_FALSE; 00053 mEscCharSetProber = nsnull; 00054 00055 mStart = PR_TRUE; 00056 mDetectedCharset = nsnull; 00057 mGotData = PR_FALSE; 00058 mInputState = ePureAscii; 00059 mLastChar = '\0'; 00060 00061 PRUint32 i; 00062 for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++) 00063 mCharSetProbers[i] = nsnull; 00064 }
nsUniversalDetector::~nsUniversalDetector | ( | ) | [virtual] |
Definition at line 66 of file nsUniversalDetector.cpp.
References mCharSetProbers, mEscCharSetProber, and NUM_OF_CHARSET_PROBERS.
00067 { 00068 for (PRInt32 i = 0; i < NUM_OF_CHARSET_PROBERS; i++) 00069 if (mCharSetProbers[i]) 00070 delete mCharSetProbers[i]; 00071 if (mEscCharSetProber) 00072 delete mEscCharSetProber; 00073 }
void nsUniversalDetector::DataEnd | ( | void | ) | [virtual] |
Definition at line 236 of file nsUniversalDetector.cpp.
References eEscAscii, eHighbyte, nsCharSetProber::GetConfidence(), mCharSetProbers, mDetectedCharset, mDone, mGotData, MINIMUM_THRESHOLD, mInputState, NUM_OF_CHARSET_PROBERS, PR_TRUE, and Report().
00237 { 00238 if (!mGotData) 00239 { 00240 // we haven't got any data yet, return immediately 00241 // caller program sometimes call DataEnd before anything has been sent to detector 00242 return; 00243 } 00244 00245 if (mDetectedCharset) 00246 { 00247 mDone = PR_TRUE; 00248 Report(mDetectedCharset); 00249 return; 00250 } 00251 00252 switch (mInputState) 00253 { 00254 case eHighbyte: 00255 { 00256 float proberConfidence; 00257 float maxProberConfidence = (float)0.0; 00258 PRInt32 maxProber = 0; 00259 00260 for (PRInt32 i = 0; i < NUM_OF_CHARSET_PROBERS; i++) 00261 { 00262 proberConfidence = mCharSetProbers[i]->GetConfidence(); 00263 if (proberConfidence > maxProberConfidence) 00264 { 00265 maxProberConfidence = proberConfidence; 00266 maxProber = i; 00267 } 00268 } 00269 //do not report anything because we are not confident of it, that's in fact a negative answer 00270 if (maxProberConfidence > MINIMUM_THRESHOLD) 00271 Report(mCharSetProbers[maxProber]->GetCharSetName()); 00272 } 00273 break; 00274 case eEscAscii: 00275 break; 00276 default: 00277 ; 00278 } 00279 return; 00280 }
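nsresult nsUniversalDetector::HandleData | ( | const char * | aBuf, PRUint32 | aLen | ) | [virtual] |
Definition at line 101 of file nsUniversalDetector.cpp.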
References eEscAscii, eFoundIt, eHighbyte, ePureAscii, nsCharSetProber::GetCharSetName(), nsCharSetProber::HandleData(), mCharSetProbers, mDetectedCharset, mDone, mEscCharSetProber, mGotData, mInputState, mLastChar, mStart, NS_ERROR_OUT_OF_MEMORY, NS_OK, nsnull, NUM_OF_CHARSET_PROBERS, PR_FALSE, and PR_TRUE.
00102 { 00103 if(mDone) 00104 return NS_OK; 00105 00106 if (aLen > 0) 00107 mGotData = PR_TRUE; 00108 00109 //If the data starts with BOM, we know it is UTF 00110 if (mStart) 00111 { 00112 mStart = PR_FALSE; 00113 if (aLen > 3) 00114 switch (aBuf[0]) 00115 { 00116 case '\xEF': 00117 if (('\xBB' == aBuf[1]) && ('\xBF' == aBuf[2])) 00118 // EF BB BF UTF-8 encoded BOM 00119 mDetectedCharset = "UTF-8"; 00120 break; 00121 case '\xFE': 00122 if (('\xFF' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3])) 00123 // FE FF 00 00 UCS-4, unusual octet order BOM (3412) 00124 mDetectedCharset = "X-ISO-10646-UCS-4-3412"; 00125 else if ('\xFF' == aBuf[1]) 00126 // FE FF UTF-16, big endian BOM 00127 mDetectedCharset = "UTF-16BE"; 00128 break; 00129 case '\x00': 00130 if (('\x00' == aBuf[1]) && ('\xFE' == aBuf[2]) && ('\xFF' == aBuf[3])) 00131 // 00 00 FE FF UTF-32, big-endian BOM 00132 mDetectedCharset = "UTF-32BE"; 00133 else if (('\x00' == aBuf[1]) && ('\xFF' == aBuf[2]) && ('\xFE' == aBuf[3])) 00134 // 00 00 FF FE UCS-4, unusual octet order BOM (2143) 00135 mDetectedCharset = "X-ISO-10646-UCS-4-2143"; 00136 break; 00137 case '\xFF': 00138 if (('\xFE' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3])) 00139 // FF FE 00 00 UTF-32, little-endian BOM 00140 mDetectedCharset = "UTF-32LE"; 00141 else if ('\xFE' == aBuf[1]) 00142 // FF FE UTF-16, little endian BOM 00143 mDetectedCharset = "UTF-16LE"; 00144 break; 00145 } // switch 00146 00147 if (mDetectedCharset) 00148 { 00149 mDone = PR_TRUE; 00150 return NS_OK; 00151 } 00152 } 00153 00154 PRUint32 i; 00155 for (i = 0; i < aLen; i++) 00156 { 00157 //other than 0xa0, if every othe character is ascii, the page is ascii 00158 if (aBuf[i] & '\x80' && aBuf[i] != '\xA0') //Since many Ascii only page contains NBSP 00159 { 00160 //we got a non-ascii byte (high-byte) 00161 if (mInputState != eHighbyte) 00162 { 00163 //adjust state 00164 mInputState = eHighbyte; 00165 00166 //kill mEscCharSetProber if it is active 00167 if 
(mEscCharSetProber) { 00168 delete mEscCharSetProber; 00169 mEscCharSetProber = nsnull; 00170 } 00171 00172 //start multibyte and singlebyte charset prober 00173 if (nsnull == mCharSetProbers[0]) 00174 mCharSetProbers[0] = new nsMBCSGroupProber; 00175 if (nsnull == mCharSetProbers[1]) 00176 mCharSetProbers[1] = new nsSBCSGroupProber; 00177 if (nsnull == mCharSetProbers[2]) 00178 mCharSetProbers[2] = new nsLatin1Prober; 00179 00180 if ((nsnull == mCharSetProbers[0]) || 00181 (nsnull == mCharSetProbers[1]) || 00182 (nsnull == mCharSetProbers[2])) 00183 return NS_ERROR_OUT_OF_MEMORY; 00184 } 00185 } 00186 else 00187 { 00188 //ok, just pure ascii so far 00189 if ( ePureAscii == mInputState && 00190 (aBuf[i] == '\033' || (aBuf[i] == '{' && mLastChar == '~')) ) 00191 { 00192 //found escape character or HZ "~{" 00193 mInputState = eEscAscii; 00194 } 00195 mLastChar = aBuf[i]; 00196 } 00197 } 00198 00199 nsProbingState st; 00200 switch (mInputState) 00201 { 00202 case eEscAscii: 00203 if (nsnull == mEscCharSetProber) { 00204 mEscCharSetProber = new nsEscCharSetProber; 00205 if (nsnull == mEscCharSetProber) 00206 return NS_ERROR_OUT_OF_MEMORY; 00207 } 00208 st = mEscCharSetProber->HandleData(aBuf, aLen); 00209 if (st == eFoundIt) 00210 { 00211 mDone = PR_TRUE; 00212 mDetectedCharset = mEscCharSetProber->GetCharSetName(); 00213 } 00214 break; 00215 case eHighbyte: 00216 for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++) 00217 { 00218 st = mCharSetProbers[i]->HandleData(aBuf, aLen); 00219 if (st == eFoundIt) 00220 { 00221 mDone = PR_TRUE; 00222 mDetectedCharset = mCharSetProbers[i]->GetCharSetName(); 00223 return NS_OK; 00224 } 00225 } 00226 break; 00227 00228 default: //pure ascii 00229 ;//do nothing here 00230 } 00231 return NS_OK; 00232 }
virtual void nsUniversalDetector::Report | ( | const char * | aCharset | ) | [protected, pure virtual] |
Implemented in nsUniversalXPCOMDetector, nsUniversalXPCOMStringDetector, and nsUniversalDetectorImpl.
Referenced by DataEnd().
void nsUniversalDetector::Reset | ( | void | ) | [protected, virtual] |
Definition at line 76 of file nsUniversalDetector.cpp.
References ePureAscii, mBestGuess, mCharSetProbers, mDetectedCharset, mDone, mEscCharSetProber, mGotData, mInputState, mInTag, mLastChar, mStart, nsnull, NUM_OF_CHARSET_PROBERS, PR_FALSE, PR_TRUE, and nsCharSetProber::Reset().
00077 { 00078 mDone = PR_FALSE; 00079 mBestGuess = -1; //illegal value as signal 00080 mInTag = PR_FALSE; 00081 00082 mStart = PR_TRUE; 00083 mDetectedCharset = nsnull; 00084 mGotData = PR_FALSE; 00085 mInputState = ePureAscii; 00086 mLastChar = '\0'; 00087 00088 if (mEscCharSetProber) 00089 mEscCharSetProber->Reset(); 00090 00091 PRUint32 i; 00092 for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++) 00093 if (mCharSetProbers[i]) 00094 mCharSetProbers[i]->Reset(); 00095 }
PRInt32 nsUniversalDetector::mBestGuess [protected] |
Definition at line 70 of file nsUniversalDetector.h.
Referenced by nsUniversalDetector(), and Reset().
nsCharSetProber* nsUniversalDetector::mCharSetProbers[NUM_OF_CHARSET_PROBERS] [protected] |
Definition at line 72 of file nsUniversalDetector.h.
Referenced by DataEnd(), HandleData(), nsUniversalDetector(), Reset(), and ~nsUniversalDetector().
const char* nsUniversalDetector::mDetectedCharset [protected] |
Definition at line 69 of file nsUniversalDetector.h.
Referenced by DataEnd(), HandleData(), nsUniversalDetector(), and Reset().
PRBool nsUniversalDetector::mDone [protected] |
Definition at line 64 of file nsUniversalDetector.h.
Referenced by DataEnd(), HandleData(), nsUniversalDetector(), and Reset().
nsCharSetProber* nsUniversalDetector::mEscCharSetProber [protected] |
Definition at line 73 of file nsUniversalDetector.h.
Referenced by HandleData(), nsUniversalDetector(), Reset(), and ~nsUniversalDetector().
PRBool nsUniversalDetector::mGotData [protected] |
Definition at line 67 of file nsUniversalDetector.h.
Referenced by DataEnd(), HandleData(), nsUniversalDetector(), and Reset().
nsInputState nsUniversalDetector::mInputState [protected] |
Definition at line 63 of file nsUniversalDetector.h.
Referenced by DataEnd(), HandleData(), nsUniversalDetector(), and Reset().
PRBool nsUniversalDetector::mInTag [protected] |
Definition at line 65 of file nsUniversalDetector.h.
Referenced by nsUniversalDetector(), and Reset().
char nsUniversalDetector::mLastChar [protected] |
Definition at line 68 of file nsUniversalDetector.h.
Referenced by HandleData(), nsUniversalDetector(), and Reset().
PRBool nsUniversalDetector::mStart [protected] |
Definition at line 66 of file nsUniversalDetector.h.
Referenced by HandleData(), nsUniversalDetector(), and Reset().