nsMBCSGroupProber.cpp

Go to the documentation of this file.
00001 /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
00002 /* ***** BEGIN LICENSE BLOCK *****
00003  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
00004  *
00005  * The contents of this file are subject to the Mozilla Public License Version
00006  * 1.1 (the "License"); you may not use this file except in compliance with
00007  * the License. You may obtain a copy of the License at
00008  * http://www.mozilla.org/MPL/
00009  *
00010  * Software distributed under the License is distributed on an "AS IS" basis,
00011  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
00012  * for the specific language governing rights and limitations under the
00013  * License.
00014  *
00015  * The Original Code is Mozilla Universal charset detector code.
00016  *
00017  * The Initial Developer of the Original Code is
00018  * Netscape Communications Corporation.
00019  * Portions created by the Initial Developer are Copyright (C) 2001
00020  * the Initial Developer. All Rights Reserved.
00021  *
00022  * Contributor(s):
00023  *          Shy Shalom <shooshX@gmail.com>
00024  *
00025  * Alternatively, the contents of this file may be used under the terms of
00026  * either the GNU General Public License Version 2 or later (the "GPL"), or
00027  * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
00028  * in which case the provisions of the GPL or the LGPL are applicable instead
00029  * of those above. If you wish to allow use of your version of this file only
00030  * under the terms of either the GPL or the LGPL, and not to allow others to
00031  * use your version of this file under the terms of the MPL, indicate your
00032  * decision by deleting the provisions above and replace them with the notice
00033  * and other provisions required by the GPL or the LGPL. If you do not delete
00034  * the provisions above, a recipient may use your version of this file under
00035  * the terms of any one of the MPL, the GPL or the LGPL.
00036  *
00037  * ***** END LICENSE BLOCK ***** */
00038 #include <stdio.h>
00039 #include "prmem.h"
00040 
00041 #include "nsMBCSGroupProber.h"
00042 
00043 #ifdef DEBUG_chardet
// Human-readable labels used only by the DEBUG_chardet status dump.
// Entry i labels mProbers[i]; keep this list in the same order as the slot
// assignments in the nsMBCSGroupProber constructor.
// The elements are pointers to const char because they refer to string
// literals, which must never be written through (and binding a literal to a
// plain `char *` is ill-formed since C++11).
const char *ProberName[] =
{
  "UTF8",
  "SJIS",
  "EUCJP",
  "GB18030",
  "EUCKR",
  "Big5",
  "EUCTW",
};
00054 
00055 #endif
00056 
00057 nsMBCSGroupProber::nsMBCSGroupProber()
00058 {
00059   mProbers[0] = new nsUTF8Prober();
00060   mProbers[1] = new nsSJISProber();
00061   mProbers[2] = new nsEUCJPProber();
00062   mProbers[3] = new nsGB18030Prober();
00063   mProbers[4] = new nsEUCKRProber();
00064   mProbers[5] = new nsBig5Prober();
00065   mProbers[6] = new nsEUCTWProber();
00066   Reset();
00067 }
00068 
00069 nsMBCSGroupProber::~nsMBCSGroupProber()
00070 {
00071   for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++)
00072   {
00073     delete mProbers[i];
00074   }
00075 }
00076 
00077 const char* nsMBCSGroupProber::GetCharSetName()
00078 {
00079   if (mBestGuess == -1)
00080   {
00081     GetConfidence();
00082     if (mBestGuess == -1)
00083       mBestGuess = 0;
00084   }
00085   return mProbers[mBestGuess]->GetCharSetName();
00086 }
00087 
00088 void  nsMBCSGroupProber::Reset(void)
00089 {
00090   mActiveNum = 0;
00091   for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++)
00092   {
00093     if (mProbers[i])
00094     {
00095       mProbers[i]->Reset();
00096       mIsActive[i] = PR_TRUE;
00097       ++mActiveNum;
00098     }
00099     else
00100       mIsActive[i] = PR_FALSE;
00101   }
00102   mBestGuess = -1;
00103   mState = eDetecting;
00104 }
00105 
00106 nsProbingState nsMBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen)
00107 {
00108   nsProbingState st;
00109   PRUint32 i;
00110 
00111   //do filtering to reduce load to probers
00112   char *highbyteBuf;
00113   char *hptr;
00114   PRBool keepNext = PR_TRUE;   //assume previous is not ascii, it will do no harm except add some noise
00115   hptr = highbyteBuf = (char*)PR_Malloc(aLen);
00116   if (!hptr)
00117       return mState;
00118   for (i = 0; i < aLen; i++)
00119   {
00120     if (aBuf[i] & 0x80)
00121     {
00122       *hptr++ = aBuf[i];
00123       keepNext = PR_TRUE;
00124     }
00125     else
00126     {
00127       //if previous is highbyte, keep this even it is a ASCII
00128       if (keepNext)
00129       {
00130           *hptr++ = aBuf[i];
00131           keepNext = PR_FALSE;
00132       }
00133     }
00134   }
00135 
00136   for (i = 0; i < NUM_OF_PROBERS; i++)
00137   {
00138      if (!mIsActive[i])
00139        continue;
00140      st = mProbers[i]->HandleData(highbyteBuf, hptr - highbyteBuf);
00141      if (st == eFoundIt)
00142      {
00143        mBestGuess = i;
00144        mState = eFoundIt;
00145        break;
00146      }
00147      else if (st == eNotMe)
00148      {
00149        mIsActive[i] = PR_FALSE;
00150        mActiveNum--;
00151        if (mActiveNum <= 0)
00152        {
00153          mState = eNotMe;
00154          break;
00155        }
00156      }
00157   }
00158 
00159   PR_FREEIF(highbyteBuf);
00160 
00161   return mState;
00162 }
00163 
00164 float nsMBCSGroupProber::GetConfidence(void)
00165 {
00166   PRUint32 i;
00167   float bestConf = 0.0, cf;
00168 
00169   switch (mState)
00170   {
00171   case eFoundIt:
00172     return (float)0.99;
00173   case eNotMe:
00174     return (float)0.01;
00175   default:
00176     for (i = 0; i < NUM_OF_PROBERS; i++)
00177     {
00178       if (!mIsActive[i])
00179         continue;
00180       cf = mProbers[i]->GetConfidence();
00181       if (bestConf < cf)
00182       {
00183         bestConf = cf;
00184         mBestGuess = i;
00185       }
00186     }
00187   }
00188   return bestConf;
00189 }
00190 
00191 #ifdef DEBUG_chardet
00192 void nsMBCSGroupProber::DumpStatus()
00193 {
00194   PRUint32 i;
00195   float cf;
00196   
00197   GetConfidence();
00198   for (i = 0; i < NUM_OF_PROBERS; i++)
00199   {
00200     if (!mIsActive[i])
00201       printf("  MBCS inactive: [%s] (confidence is too low).\r\n", ProberName[i]);
00202     else
00203     {
00204       cf = mProbers[i]->GetConfidence();
00205       printf("  MBCS %1.3f: [%s]\r\n", cf, ProberName[i]);
00206     }
00207   }
00208 }
00209 #endif
Generated by  doxygen 1.6.2-20100208