nsUniversalDetector.cpp

Go to the documentation of this file.
00001 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
00002 /* ***** BEGIN LICENSE BLOCK *****
00003  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
00004  *
00005  * The contents of this file are subject to the Mozilla Public License Version
00006  * 1.1 (the "License"); you may not use this file except in compliance with
00007  * the License. You may obtain a copy of the License at
00008  * http://www.mozilla.org/MPL/
00009  *
00010  * Software distributed under the License is distributed on an "AS IS" basis,
00011  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
00012  * for the specific language governing rights and limitations under the
00013  * License.
00014  *
00015  * The Original Code is Mozilla Universal charset detector code.
00016  *
00017  * The Initial Developer of the Original Code is
00018  * Netscape Communications Corporation.
00019  * Portions created by the Initial Developer are Copyright (C) 2001
00020  * the Initial Developer. All Rights Reserved.
00021  *
00022  * Contributor(s):
00023  *          Shy Shalom <shooshX@gmail.com>
00024  *
00025  * Alternatively, the contents of this file may be used under the terms of
00026  * either the GNU General Public License Version 2 or later (the "GPL"), or
00027  * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
00028  * in which case the provisions of the GPL or the LGPL are applicable instead
00029  * of those above. If you wish to allow use of your version of this file only
00030  * under the terms of either the GPL or the LGPL, and not to allow others to
00031  * use your version of this file under the terms of the MPL, indicate your
00032  * decision by deleting the provisions above and replace them with the notice
00033  * and other provisions required by the GPL or the LGPL. If you do not delete
00034  * the provisions above, a recipient may use your version of this file under
00035  * the terms of any one of the MPL, the GPL or the LGPL.
00036  *
00037  * ***** END LICENSE BLOCK ***** */
00038 
00039 #include "nscore.h"
00040 
00041 #include "nsUniversalDetector.h"
00042 
00043 #include "nsMBCSGroupProber.h"
00044 #include "nsSBCSGroupProber.h"
00045 #include "nsEscCharsetProber.h"
00046 #include "nsLatin1Prober.h"
00047 
00048 nsUniversalDetector::nsUniversalDetector()
00049 {
00050   mDone = PR_FALSE;
00051   mBestGuess = -1;   //illegal value as signal
00052   mInTag = PR_FALSE;
00053   mEscCharSetProber = nsnull;
00054 
00055   mStart = PR_TRUE;
00056   mDetectedCharset = nsnull;
00057   mGotData = PR_FALSE;
00058   mInputState = ePureAscii;
00059   mLastChar = '\0';
00060 
00061   PRUint32 i;
00062   for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
00063     mCharSetProbers[i] = nsnull;
00064 }
00065 
00066 nsUniversalDetector::~nsUniversalDetector() 
00067 {
00068   for (PRInt32 i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
00069     if (mCharSetProbers[i])      
00070       delete mCharSetProbers[i];
00071   if (mEscCharSetProber)
00072     delete mEscCharSetProber;
00073 }
00074 
00075 void 
00076 nsUniversalDetector::Reset()
00077 {
00078   mDone = PR_FALSE;
00079   mBestGuess = -1;   //illegal value as signal
00080   mInTag = PR_FALSE;
00081 
00082   mStart = PR_TRUE;
00083   mDetectedCharset = nsnull;
00084   mGotData = PR_FALSE;
00085   mInputState = ePureAscii;
00086   mLastChar = '\0';
00087 
00088   if (mEscCharSetProber)
00089     mEscCharSetProber->Reset();
00090 
00091   PRUint32 i;
00092   for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
00093     if (mCharSetProbers[i])
00094       mCharSetProbers[i]->Reset();
00095 }
00096 
00097 //---------------------------------------------------------------------
00098 #define SHORTCUT_THRESHOLD      (float)0.95
00099 #define MINIMUM_THRESHOLD      (float)0.20
00100 
00101 nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
00102 {
00103   if(mDone) 
00104     return NS_OK;
00105 
00106   if (aLen > 0)
00107     mGotData = PR_TRUE;
00108 
00109   //If the data starts with BOM, we know it is UTF
00110   if (mStart)
00111   {
00112     mStart = PR_FALSE;
00113     if (aLen > 3)
00114       switch (aBuf[0])
00115         {
00116         case '\xEF':
00117           if (('\xBB' == aBuf[1]) && ('\xBF' == aBuf[2]))
00118             // EF BB BF  UTF-8 encoded BOM
00119             mDetectedCharset = "UTF-8";
00120         break;
00121         case '\xFE':
00122           if (('\xFF' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3]))
00123             // FE FF 00 00  UCS-4, unusual octet order BOM (3412)
00124             mDetectedCharset = "X-ISO-10646-UCS-4-3412";
00125           else if ('\xFF' == aBuf[1])
00126             // FE FF  UTF-16, big endian BOM
00127             mDetectedCharset = "UTF-16BE";
00128         break;
00129         case '\x00':
00130           if (('\x00' == aBuf[1]) && ('\xFE' == aBuf[2]) && ('\xFF' == aBuf[3]))
00131             // 00 00 FE FF  UTF-32, big-endian BOM
00132             mDetectedCharset = "UTF-32BE";
00133           else if (('\x00' == aBuf[1]) && ('\xFF' == aBuf[2]) && ('\xFE' == aBuf[3]))
00134             // 00 00 FF FE  UCS-4, unusual octet order BOM (2143)
00135             mDetectedCharset = "X-ISO-10646-UCS-4-2143";
00136         break;
00137         case '\xFF':
00138           if (('\xFE' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3]))
00139             // FF FE 00 00  UTF-32, little-endian BOM
00140             mDetectedCharset = "UTF-32LE";
00141           else if ('\xFE' == aBuf[1])
00142             // FF FE  UTF-16, little endian BOM
00143             mDetectedCharset = "UTF-16LE";
00144         break;
00145       }  // switch
00146 
00147       if (mDetectedCharset)
00148       {
00149         mDone = PR_TRUE;
00150         return NS_OK;
00151       }
00152   }
00153   
00154   PRUint32 i;
00155   for (i = 0; i < aLen; i++)
00156   {
00157     //other than 0xa0, if every othe character is ascii, the page is ascii
00158     if (aBuf[i] & '\x80' && aBuf[i] != '\xA0')  //Since many Ascii only page contains NBSP 
00159     {
00160       //we got a non-ascii byte (high-byte)
00161       if (mInputState != eHighbyte)
00162       {
00163         //adjust state
00164         mInputState = eHighbyte;
00165 
00166         //kill mEscCharSetProber if it is active
00167         if (mEscCharSetProber) {
00168           delete mEscCharSetProber;
00169           mEscCharSetProber = nsnull;
00170         }
00171 
00172         //start multibyte and singlebyte charset prober
00173         if (nsnull == mCharSetProbers[0])
00174           mCharSetProbers[0] = new nsMBCSGroupProber;
00175         if (nsnull == mCharSetProbers[1])
00176           mCharSetProbers[1] = new nsSBCSGroupProber;
00177         if (nsnull == mCharSetProbers[2])
00178           mCharSetProbers[2] = new nsLatin1Prober; 
00179 
00180         if ((nsnull == mCharSetProbers[0]) ||
00181             (nsnull == mCharSetProbers[1]) ||
00182             (nsnull == mCharSetProbers[2]))
00183             return NS_ERROR_OUT_OF_MEMORY;
00184       }
00185     }
00186     else
00187     {
00188       //ok, just pure ascii so far
00189       if ( ePureAscii == mInputState &&
00190         (aBuf[i] == '\033' || (aBuf[i] == '{' && mLastChar == '~')) )
00191       {
00192         //found escape character or HZ "~{"
00193         mInputState = eEscAscii;
00194       }
00195       mLastChar = aBuf[i];
00196     }
00197   }
00198 
00199   nsProbingState st;
00200   switch (mInputState)
00201   {
00202   case eEscAscii:
00203     if (nsnull == mEscCharSetProber) {
00204       mEscCharSetProber = new nsEscCharSetProber;
00205       if (nsnull == mEscCharSetProber)
00206         return NS_ERROR_OUT_OF_MEMORY;
00207     }
00208     st = mEscCharSetProber->HandleData(aBuf, aLen);
00209     if (st == eFoundIt)
00210     {
00211       mDone = PR_TRUE;
00212       mDetectedCharset = mEscCharSetProber->GetCharSetName();
00213     }
00214     break;
00215   case eHighbyte:
00216     for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
00217     {
00218       st = mCharSetProbers[i]->HandleData(aBuf, aLen);
00219       if (st == eFoundIt) 
00220       {
00221         mDone = PR_TRUE;
00222         mDetectedCharset = mCharSetProbers[i]->GetCharSetName();
00223         return NS_OK;
00224       } 
00225     }
00226     break;
00227 
00228   default:  //pure ascii
00229     ;//do nothing here
00230   }
00231   return NS_OK;
00232 }
00233 
00234 
00235 //---------------------------------------------------------------------
00236 void nsUniversalDetector::DataEnd()
00237 {
00238   if (!mGotData)
00239   {
00240     // we haven't got any data yet, return immediately 
00241     // caller program sometimes call DataEnd before anything has been sent to detector
00242     return;
00243   }
00244 
00245   if (mDetectedCharset)
00246   {
00247     mDone = PR_TRUE;
00248     Report(mDetectedCharset);
00249     return;
00250   }
00251   
00252   switch (mInputState)
00253   {
00254   case eHighbyte:
00255     {
00256       float proberConfidence;
00257       float maxProberConfidence = (float)0.0;
00258       PRInt32 maxProber = 0;
00259 
00260       for (PRInt32 i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
00261       {
00262         proberConfidence = mCharSetProbers[i]->GetConfidence();
00263         if (proberConfidence > maxProberConfidence)
00264         {
00265           maxProberConfidence = proberConfidence;
00266           maxProber = i;
00267         }
00268       }
00269       //do not report anything because we are not confident of it, that's in fact a negative answer
00270       if (maxProberConfidence > MINIMUM_THRESHOLD)
00271         Report(mCharSetProbers[maxProber]->GetCharSetName());
00272     }
00273     break;
00274   case eEscAscii:
00275     break;
00276   default:
00277     ;
00278   }
00279   return;
00280 }