nsUniversalDetector Class Reference

#include <nsUniversalDetector.h>

Inheritance diagram for nsUniversalDetector:
Inheritance graph
[legend]
Collaboration diagram for nsUniversalDetector:
Collaboration graph
[legend]

Public Member Functions

 nsUniversalDetector ()
virtual ~nsUniversalDetector ()
virtual nsresult HandleData (const char *aBuf, PRUint32 aLen)
virtual void DataEnd (void)

Protected Member Functions

virtual void Report (const char *aCharset)=0
virtual void Reset ()

Protected Attributes

nsInputState mInputState
PRBool mDone
PRBool mInTag
PRBool mStart
PRBool mGotData
char mLastChar
const char * mDetectedCharset
PRInt32 mBestGuess
nsCharSetProber * mCharSetProbers [NUM_OF_CHARSET_PROBERS]
nsCharSetProber * mEscCharSetProber

Detailed Description

Definition at line 53 of file nsUniversalDetector.h.


Constructor & Destructor Documentation

nsUniversalDetector::nsUniversalDetector (  ) 

Definition at line 48 of file nsUniversalDetector.cpp.

References ePureAscii, mBestGuess, mCharSetProbers, mDetectedCharset, mDone, mEscCharSetProber, mGotData, mInputState, mInTag, mLastChar, mStart, nsnull, NUM_OF_CHARSET_PROBERS, PR_FALSE, and PR_TRUE.

00049 {  // Constructor: puts every member into its initial "no data seen yet" state.
00050   mDone = PR_FALSE;
00051   mBestGuess = -1;   //illegal value as signal
00052   mInTag = PR_FALSE;
00053   mEscCharSetProber = nsnull;  // not allocated here; HandleData creates it on first use
00054 
00055   mStart = PR_TRUE;  // HandleData clears this on its first call, when it checks for a BOM
00056   mDetectedCharset = nsnull;  // null means "no charset decided yet"
00057   mGotData = PR_FALSE;
00058   mInputState = ePureAscii;
00059   mLastChar = '\0';
00060 
00061   PRUint32 i;
00062   for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
00063     mCharSetProbers[i] = nsnull;  // every prober slot starts empty
00064 }

nsUniversalDetector::~nsUniversalDetector (  )  [virtual]

Definition at line 66 of file nsUniversalDetector.cpp.

References mCharSetProbers, mEscCharSetProber, and NUM_OF_CHARSET_PROBERS.

00067 {  // Destructor: frees whichever probers were allocated.
00068   for (PRInt32 i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
00069     if (mCharSetProbers[i])      // a slot is null when its prober was never created
00070       delete mCharSetProbers[i];
00071   if (mEscCharSetProber)  // null when never created, or after HandleData deleted it
00072     delete mEscCharSetProber;
00073 }


Member Function Documentation

void nsUniversalDetector::DataEnd ( void   )  [virtual]

Definition at line 236 of file nsUniversalDetector.cpp.

References eEscAscii, eHighbyte, nsCharSetProber::GetConfidence(), mCharSetProbers, mDetectedCharset, mDone, mGotData, MINIMUM_THRESHOLD, mInputState, NUM_OF_CHARSET_PROBERS, PR_TRUE, and Report().

00237 {  // End of input: report the charset already decided, or else the most confident prober's guess.
00238   if (!mGotData)
00239   {
00240     // no data has been received yet, so return immediately;
00241     // callers sometimes call DataEnd before anything has been sent to the detector
00242     return;
00243   }
00244 
00245   if (mDetectedCharset)  // already decided by HandleData (BOM match or a prober returning eFoundIt)
00246   {
00247     mDone = PR_TRUE;
00248     Report(mDetectedCharset);
00249     return;
00250   }
00251   
00252   switch (mInputState)
00253   {
00254   case eHighbyte:  // non-ASCII bytes were seen: choose the prober with the highest confidence
00255     {
00256       float proberConfidence;
00257       float maxProberConfidence = (float)0.0;
00258       PRInt32 maxProber = 0;
00259 
00260       for (PRInt32 i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
00261       {
00262         proberConfidence = mCharSetProbers[i]->GetConfidence();  // NOTE(review): no null check; relies on HandleData having created every prober
00263         if (proberConfidence > maxProberConfidence)
00264         {
00265           maxProberConfidence = proberConfidence;
00266           maxProber = i;
00267         }
00268       }
00269       //report only when the best confidence exceeds MINIMUM_THRESHOLD; otherwise stay silent, which is in effect a negative answer
00270       if (maxProberConfidence > MINIMUM_THRESHOLD)
00271         Report(mCharSetProbers[maxProber]->GetCharSetName());
00272     }
00273     break;
00274   case eEscAscii:  // nothing is reported from here for the escape-sequence state
00275     break;
00276   default:  // remaining states: nothing to report
00277     ;
00278   }
00279   return;
00280 }

Here is the call graph for this function:

nsresult nsUniversalDetector::HandleData ( const char *  aBuf,
PRUint32  aLen 
) [virtual]

Definition at line 101 of file nsUniversalDetector.cpp.

References eEscAscii, eFoundIt, eHighbyte, ePureAscii, nsCharSetProber::GetCharSetName(), nsCharSetProber::HandleData(), mCharSetProbers, mDetectedCharset, mDone, mEscCharSetProber, mGotData, mInputState, mLastChar, mStart, NS_ERROR_OUT_OF_MEMORY, NS_OK, nsnull, NUM_OF_CHARSET_PROBERS, PR_FALSE, and PR_TRUE.

00102 {  // Feeds aLen bytes from aBuf to the detector. Returns NS_OK, or NS_ERROR_OUT_OF_MEMORY if a prober cannot be allocated.
00103   if(mDone) // a charset has already been decided; ignore any further input
00104     return NS_OK;
00105 
00106   if (aLen > 0)
00107     mGotData = PR_TRUE;
00108 
00109   //If the data starts with a BOM, we know which Unicode encoding it is
00110   if (mStart)
00111   {
00112     mStart = PR_FALSE;  // the BOM check runs only on the first call
00113     if (aLen > 3)  // NOTE(review): needs at least 4 bytes, so a first buffer of exactly 3 bytes (e.g. only the UTF-8 BOM) skips the BOM check
00114       switch (aBuf[0])
00115         {
00116         case '\xEF':
00117           if (('\xBB' == aBuf[1]) && ('\xBF' == aBuf[2]))
00118             // EF BB BF  UTF-8 encoded BOM
00119             mDetectedCharset = "UTF-8";
00120         break;
00121         case '\xFE':
00122           if (('\xFF' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3]))
00123             // FE FF 00 00  UCS-4, unusual octet order BOM (3412)
00124             mDetectedCharset = "X-ISO-10646-UCS-4-3412";
00125           else if ('\xFF' == aBuf[1])
00126             // FE FF  UTF-16, big endian BOM
00127             mDetectedCharset = "UTF-16BE";
00128         break;
00129         case '\x00':
00130           if (('\x00' == aBuf[1]) && ('\xFE' == aBuf[2]) && ('\xFF' == aBuf[3]))
00131             // 00 00 FE FF  UTF-32, big-endian BOM
00132             mDetectedCharset = "UTF-32BE";
00133           else if (('\x00' == aBuf[1]) && ('\xFF' == aBuf[2]) && ('\xFE' == aBuf[3]))
00134             // 00 00 FF FE  UCS-4, unusual octet order BOM (2143)
00135             mDetectedCharset = "X-ISO-10646-UCS-4-2143";
00136         break;
00137         case '\xFF':
00138           if (('\xFE' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3]))
00139             // FF FE 00 00  UTF-32, little-endian BOM
00140             mDetectedCharset = "UTF-32LE";
00141           else if ('\xFE' == aBuf[1])
00142             // FF FE  UTF-16, little endian BOM
00143             mDetectedCharset = "UTF-16LE";
00144         break;
00145       }  // switch
00146 
00147       if (mDetectedCharset)  // a BOM was recognized: finish without running any prober
00148       {
00149         mDone = PR_TRUE;
00150         return NS_OK;
00151       }
00152   }
00153   
00154   PRUint32 i;
00155   for (i = 0; i < aLen; i++)
00156   {
00157     //other than 0xa0, if every other character is ascii, the page is ascii
00158     if (aBuf[i] & '\x80' && aBuf[i] != '\xA0')  //since many ASCII-only pages contain NBSP
00159     {
00160       //we got a non-ascii byte (high-byte)
00161       if (mInputState != eHighbyte)
00162       {
00163         //adjust state
00164         mInputState = eHighbyte;
00165 
00166         //kill mEscCharSetProber if it is active
00167         if (mEscCharSetProber) {
00168           delete mEscCharSetProber;
00169           mEscCharSetProber = nsnull;
00170         }
00171 
00172         //start the multi-byte, single-byte and Latin-1 charset probers (only those not already allocated)
00173         if (nsnull == mCharSetProbers[0])
00174           mCharSetProbers[0] = new nsMBCSGroupProber;
00175         if (nsnull == mCharSetProbers[1])
00176           mCharSetProbers[1] = new nsSBCSGroupProber;
00177         if (nsnull == mCharSetProbers[2])
00178           mCharSetProbers[2] = new nsLatin1Prober; 
00179 
00180         if ((nsnull == mCharSetProbers[0]) ||
00181             (nsnull == mCharSetProbers[1]) ||
00182             (nsnull == mCharSetProbers[2]))
00183             return NS_ERROR_OUT_OF_MEMORY;  // NOTE(review): mInputState is already eHighbyte here, so a slot can remain null on this path
00184       }
00185     }
00186     else
00187     {
00188       //ok, just pure ascii so far
00189       if ( ePureAscii == mInputState &&
00190         (aBuf[i] == '\033' || (aBuf[i] == '{' && mLastChar == '~')) )
00191       {
00192         //found escape character or HZ "~{"
00193         mInputState = eEscAscii;
00194       }
00195       mLastChar = aBuf[i];  // kept in a member so a "~{" pair can be matched against the previous byte
00196     }
00197   }
00198 
00199   nsProbingState st;
00200   switch (mInputState)
00201   {
00202   case eEscAscii:
00203     if (nsnull == mEscCharSetProber) {  // create the escape-sequence prober on first use
00204       mEscCharSetProber = new nsEscCharSetProber;
00205       if (nsnull == mEscCharSetProber)
00206         return NS_ERROR_OUT_OF_MEMORY;
00207     }
00208     st = mEscCharSetProber->HandleData(aBuf, aLen);
00209     if (st == eFoundIt)
00210     {
00211       mDone = PR_TRUE;
00212       mDetectedCharset = mEscCharSetProber->GetCharSetName();  // only recorded here; DataEnd calls Report() with it
00213     }
00214     break;
00215   case eHighbyte:
00216     for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)  // offer the whole buffer to each prober in turn
00217     {
00218       st = mCharSetProbers[i]->HandleData(aBuf, aLen);
00219       if (st == eFoundIt) 
00220       {
00221         mDone = PR_TRUE;
00222         mDetectedCharset = mCharSetProbers[i]->GetCharSetName();  // first prober to return eFoundIt wins
00223         return NS_OK;
00224       } 
00225     }
00226     break;
00227 
00228   default:  //pure ascii
00229     ;//do nothing here
00230   }
00231   return NS_OK;
00232 }

Here is the call graph for this function:

virtual void nsUniversalDetector::Report ( const char *  aCharset  )  [protected, pure virtual]

Implemented in nsUniversalXPCOMDetector, nsUniversalXPCOMStringDetector, and nsUniversalDetectorImpl.

Referenced by DataEnd().

Here is the caller graph for this function:

void nsUniversalDetector::Reset ( void   )  [protected, virtual]

Definition at line 76 of file nsUniversalDetector.cpp.

References ePureAscii, mBestGuess, mCharSetProbers, mDetectedCharset, mDone, mEscCharSetProber, mGotData, mInputState, mInTag, mLastChar, mStart, nsnull, NUM_OF_CHARSET_PROBERS, PR_FALSE, PR_TRUE, and nsCharSetProber::Reset().

00077 {  // Restores the constructor's initial member values; allocated probers are kept and have their own Reset() called.
00078   mDone = PR_FALSE;
00079   mBestGuess = -1;   //illegal value as signal
00080   mInTag = PR_FALSE;
00081 
00082   mStart = PR_TRUE;  // the next HandleData call will check for a BOM again
00083   mDetectedCharset = nsnull;
00084   mGotData = PR_FALSE;
00085   mInputState = ePureAscii;
00086   mLastChar = '\0';
00087 
00088   if (mEscCharSetProber)  // the pointer is left as-is; only the prober's Reset() is invoked
00089     mEscCharSetProber->Reset();
00090 
00091   PRUint32 i;
00092   for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
00093     if (mCharSetProbers[i])  // skip slots whose prober was never created
00094       mCharSetProbers[i]->Reset();
00095 }

Here is the call graph for this function:


Field Documentation

PRInt32 nsUniversalDetector::mBestGuess [protected]

Definition at line 70 of file nsUniversalDetector.h.

Referenced by nsUniversalDetector(), and Reset().

nsCharSetProber* nsUniversalDetector::mCharSetProbers[NUM_OF_CHARSET_PROBERS] [protected]
const char* nsUniversalDetector::mDetectedCharset [protected]

Definition at line 69 of file nsUniversalDetector.h.

Referenced by DataEnd(), HandleData(), nsUniversalDetector(), and Reset().

PRBool nsUniversalDetector::mDone [protected]

Definition at line 64 of file nsUniversalDetector.h.

Referenced by DataEnd(), HandleData(), nsUniversalDetector(), and Reset().

nsCharSetProber* nsUniversalDetector::mEscCharSetProber [protected]

Definition at line 73 of file nsUniversalDetector.h.

Referenced by HandleData(), nsUniversalDetector(), Reset(), and ~nsUniversalDetector().

PRBool nsUniversalDetector::mGotData [protected]

Definition at line 67 of file nsUniversalDetector.h.

Referenced by DataEnd(), HandleData(), nsUniversalDetector(), and Reset().

nsInputState nsUniversalDetector::mInputState [protected]

Definition at line 63 of file nsUniversalDetector.h.

Referenced by DataEnd(), HandleData(), nsUniversalDetector(), and Reset().

PRBool nsUniversalDetector::mInTag [protected]

Definition at line 65 of file nsUniversalDetector.h.

Referenced by nsUniversalDetector(), and Reset().

char nsUniversalDetector::mLastChar [protected]

Definition at line 68 of file nsUniversalDetector.h.

Referenced by HandleData(), nsUniversalDetector(), and Reset().

PRBool nsUniversalDetector::mStart [protected]

Definition at line 66 of file nsUniversalDetector.h.

Referenced by HandleData(), nsUniversalDetector(), and Reset().


The documentation for this class was generated from the following files:

nsUniversalDetector.h
nsUniversalDetector.cpp
Generated by  doxygen 1.6.2-20100208