00001 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 00002 /* ***** BEGIN LICENSE BLOCK ***** 00003 * Version: MPL 1.1/GPL 2.0/LGPL 2.1 00004 * 00005 * The contents of this file are subject to the Mozilla Public License Version 00006 * 1.1 (the "License"); you may not use this file except in compliance with 00007 * the License. You may obtain a copy of the License at 00008 * http://www.mozilla.org/MPL/ 00009 * 00010 * Software distributed under the License is distributed on an "AS IS" basis, 00011 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 00012 * for the specific language governing rights and limitations under the 00013 * License. 00014 * 00015 * The Original Code is Mozilla Universal charset detector code. 00016 * 00017 * The Initial Developer of the Original Code is 00018 * Netscape Communications Corporation. 00019 * Portions created by the Initial Developer are Copyright (C) 2001 00020 * the Initial Developer. All Rights Reserved. 00021 * 00022 * Contributor(s): 00023 * Shy Shalom <shooshX@gmail.com> 00024 * 00025 * Alternatively, the contents of this file may be used under the terms of 00026 * either the GNU General Public License Version 2 or later (the "GPL"), or 00027 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 00028 * in which case the provisions of the GPL or the LGPL are applicable instead 00029 * of those above. If you wish to allow use of your version of this file only 00030 * under the terms of either the GPL or the LGPL, and not to allow others to 00031 * use your version of this file under the terms of the MPL, indicate your 00032 * decision by deleting the provisions above and replace them with the notice 00033 * and other provisions required by the GPL or the LGPL. If you do not delete 00034 * the provisions above, a recipient may use your version of this file under 00035 * the terms of any one of the MPL, the GPL or the LGPL. 00036 * 00037 * ***** END LICENSE BLOCK ***** */ 00038 00039 #include "nscore.h" 00040 00041 #include "nsUniversalDetector.h" 00042 00043 #include "nsMBCSGroupProber.h" 00044 #include "nsSBCSGroupProber.h" 00045 #include "nsEscCharsetProber.h" 00046 #include "nsLatin1Prober.h" 00047 00048 nsUniversalDetector::nsUniversalDetector() 00049 { 00050 mDone = PR_FALSE; 00051 mBestGuess = -1; //illegal value as signal 00052 mInTag = PR_FALSE; 00053 mEscCharSetProber = nsnull; 00054 00055 mStart = PR_TRUE; 00056 mDetectedCharset = nsnull; 00057 mGotData = PR_FALSE; 00058 mInputState = ePureAscii; 00059 mLastChar = '\0'; 00060 00061 PRUint32 i; 00062 for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++) 00063 mCharSetProbers[i] = nsnull; 00064 } 00065 00066 nsUniversalDetector::~nsUniversalDetector() 00067 { 00068 for (PRInt32 i = 0; i < NUM_OF_CHARSET_PROBERS; i++) 00069 if (mCharSetProbers[i]) 00070 delete mCharSetProbers[i]; 00071 if (mEscCharSetProber) 00072 delete mEscCharSetProber; 00073 } 00074 00075 void 00076 nsUniversalDetector::Reset() 00077 { 00078 mDone = PR_FALSE; 00079 mBestGuess = -1; //illegal value as signal 00080 mInTag = PR_FALSE; 00081 00082 mStart = PR_TRUE; 00083 mDetectedCharset = nsnull; 00084 mGotData = PR_FALSE; 00085 mInputState = ePureAscii; 00086 mLastChar = '\0'; 00087 00088 if (mEscCharSetProber) 00089 mEscCharSetProber->Reset(); 00090 00091 PRUint32 i; 00092 for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++) 00093 if (mCharSetProbers[i]) 00094 mCharSetProbers[i]->Reset(); 00095 } 00096 00097 //--------------------------------------------------------------------- 00098 #define SHORTCUT_THRESHOLD (float)0.95 00099 #define MINIMUM_THRESHOLD (float)0.20 00100 00101 nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen) 00102 { 00103 if(mDone) 00104 return NS_OK; 00105 00106 if (aLen > 0) 00107 mGotData = PR_TRUE; 00108 00109 //If the data starts with BOM, we know it is UTF 00110 if (mStart) 00111 { 00112 mStart = PR_FALSE; 00113 if (aLen > 3) 00114 switch (aBuf[0]) 00115 { 00116 case '\xEF': 00117 if (('\xBB' == aBuf[1]) && ('\xBF' == aBuf[2])) 00118 // EF BB BF UTF-8 encoded BOM 00119 mDetectedCharset = "UTF-8"; 00120 break; 00121 case '\xFE': 00122 if (('\xFF' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3])) 00123 // FE FF 00 00 UCS-4, unusual octet order BOM (3412) 00124 mDetectedCharset = "X-ISO-10646-UCS-4-3412"; 00125 else if ('\xFF' == aBuf[1]) 00126 // FE FF UTF-16, big endian BOM 00127 mDetectedCharset = "UTF-16BE"; 00128 break; 00129 case '\x00': 00130 if (('\x00' == aBuf[1]) && ('\xFE' == aBuf[2]) && ('\xFF' == aBuf[3])) 00131 // 00 00 FE FF UTF-32, big-endian BOM 00132 mDetectedCharset = "UTF-32BE"; 00133 else if (('\x00' == aBuf[1]) && ('\xFF' == aBuf[2]) && ('\xFE' == aBuf[3])) 00134 // 00 00 FF FE UCS-4, unusual octet order BOM (2143) 00135 mDetectedCharset = "X-ISO-10646-UCS-4-2143"; 00136 break; 00137 case '\xFF': 00138 if (('\xFE' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3])) 00139 // FF FE 00 00 UTF-32, little-endian BOM 00140 mDetectedCharset = "UTF-32LE"; 00141 else if ('\xFE' == aBuf[1]) 00142 // FF FE UTF-16, little endian BOM 00143 mDetectedCharset = "UTF-16LE"; 00144 break; 00145 } // switch 00146 00147 if (mDetectedCharset) 00148 { 00149 mDone = PR_TRUE; 00150 return NS_OK; 00151 } 00152 } 00153 00154 PRUint32 i; 00155 for (i = 0; i < aLen; i++) 00156 { 00157 //other than 0xa0, if every othe character is ascii, the page is ascii 00158 if (aBuf[i] & '\x80' && aBuf[i] != '\xA0') //Since many Ascii only page contains NBSP 00159 { 00160 //we got a non-ascii byte (high-byte) 00161 if (mInputState != eHighbyte) 00162 { 00163 //adjust state 00164 mInputState = eHighbyte; 00165 00166 //kill mEscCharSetProber if it is active 00167 if (mEscCharSetProber) { 00168 delete mEscCharSetProber; 00169 mEscCharSetProber = nsnull; 00170 } 00171 00172 //start multibyte and singlebyte charset prober 00173 if (nsnull == mCharSetProbers[0]) 00174 mCharSetProbers[0] = new nsMBCSGroupProber; 00175 if (nsnull == mCharSetProbers[1]) 00176 mCharSetProbers[1] = new nsSBCSGroupProber; 00177 if (nsnull == mCharSetProbers[2]) 00178 mCharSetProbers[2] = new nsLatin1Prober; 00179 00180 if ((nsnull == mCharSetProbers[0]) || 00181 (nsnull == mCharSetProbers[1]) || 00182 (nsnull == mCharSetProbers[2])) 00183 return NS_ERROR_OUT_OF_MEMORY; 00184 } 00185 } 00186 else 00187 { 00188 //ok, just pure ascii so far 00189 if ( ePureAscii == mInputState && 00190 (aBuf[i] == '\033' || (aBuf[i] == '{' && mLastChar == '~')) ) 00191 { 00192 //found escape character or HZ "~{" 00193 mInputState = eEscAscii; 00194 } 00195 mLastChar = aBuf[i]; 00196 } 00197 } 00198 00199 nsProbingState st; 00200 switch (mInputState) 00201 { 00202 case eEscAscii: 00203 if (nsnull == mEscCharSetProber) { 00204 mEscCharSetProber = new nsEscCharSetProber; 00205 if (nsnull == mEscCharSetProber) 00206 return NS_ERROR_OUT_OF_MEMORY; 00207 } 00208 st = mEscCharSetProber->HandleData(aBuf, aLen); 00209 if (st == eFoundIt) 00210 { 00211 mDone = PR_TRUE; 00212 mDetectedCharset = mEscCharSetProber->GetCharSetName(); 00213 } 00214 break; 00215 case eHighbyte: 00216 for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++) 00217 { 00218 st = mCharSetProbers[i]->HandleData(aBuf, aLen); 00219 if (st == eFoundIt) 00220 { 00221 mDone = PR_TRUE; 00222 mDetectedCharset = mCharSetProbers[i]->GetCharSetName(); 00223 return NS_OK; 00224 } 00225 } 00226 break; 00227 00228 default: //pure ascii 00229 ;//do nothing here 00230 } 00231 return NS_OK; 00232 } 00233 00234 00235 //--------------------------------------------------------------------- 00236 void nsUniversalDetector::DataEnd() 00237 { 00238 if (!mGotData) 00239 { 00240 // we haven't got any data yet, return immediately 00241 // caller program sometimes call DataEnd before anything has been sent to detector 00242 return; 00243 } 00244 00245 if (mDetectedCharset) 00246 { 00247 mDone = PR_TRUE; 00248 Report(mDetectedCharset); 00249 return; 00250 } 00251 00252 switch (mInputState) 00253 { 00254 case eHighbyte: 00255 { 00256 float proberConfidence; 00257 float maxProberConfidence = (float)0.0; 00258 PRInt32 maxProber = 0; 00259 00260 for (PRInt32 i = 0; i < NUM_OF_CHARSET_PROBERS; i++) 00261 { 00262 proberConfidence = mCharSetProbers[i]->GetConfidence(); 00263 if (proberConfidence > maxProberConfidence) 00264 { 00265 maxProberConfidence = proberConfidence; 00266 maxProber = i; 00267 } 00268 } 00269 //do not report anything because we are not confident of it, that's in fact a negative answer 00270 if (maxProberConfidence > MINIMUM_THRESHOLD) 00271 Report(mCharSetProbers[maxProber]->GetCharSetName()); 00272 } 00273 break; 00274 case eEscAscii: 00275 break; 00276 default: 00277 ; 00278 } 00279 return; 00280 }