nsLatin1Prober.cpp

Go to the documentation of this file.
00001 /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
00002 /* ***** BEGIN LICENSE BLOCK *****
00003  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
00004  *
00005  * The contents of this file are subject to the Mozilla Public License Version
00006  * 1.1 (the "License"); you may not use this file except in compliance with
00007  * the License. You may obtain a copy of the License at
00008  * http://www.mozilla.org/MPL/
00009  *
00010  * Software distributed under the License is distributed on an "AS IS" basis,
00011  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
00012  * for the specific language governing rights and limitations under the
00013  * License.
00014  *
00015  * The Original Code is Mozilla Universal charset detector code.
00016  *
00017  * The Initial Developer of the Original Code is
00018  * Netscape Communications Corporation.
00019  * Portions created by the Initial Developer are Copyright (C) 2001
00020  * the Initial Developer. All Rights Reserved.
00021  *
00022  * Contributor(s):
00023  *          Shy Shalom <shooshX@gmail.com>
00024  *
00025  * Alternatively, the contents of this file may be used under the terms of
00026  * either the GNU General Public License Version 2 or later (the "GPL"), or
00027  * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
00028  * in which case the provisions of the GPL or the LGPL are applicable instead
00029  * of those above. If you wish to allow use of your version of this file only
00030  * under the terms of either the GPL or the LGPL, and not to allow others to
00031  * use your version of this file under the terms of the MPL, indicate your
00032  * decision by deleting the provisions above and replace them with the notice
00033  * and other provisions required by the GPL or the LGPL. If you do not delete
00034  * the provisions above, a recipient may use your version of this file under
00035  * the terms of any one of the MPL, the GPL or the LGPL.
00036  *
00037  * ***** END LICENSE BLOCK ***** */
00038 
00039 #include "nsLatin1Prober.h"
00040 #include "prmem.h"
00041 #include <stdio.h>
00042 
// Character classes assigned to each byte value by Latin1_CharToClass.
// They are also the row/column indices of the class-transition model,
// so the numeric values must stay dense and start at 0.
#define UDF    0        // undefined byte value
#define OTH    1        // other (anything that is not a letter)
#define ASC    2        // ascii capital letter
#define ASS    3        // ascii small letter
#define ACV    4        // accent capital vowel
#define ACO    5        // accent capital other
#define ASV    6        // accent small vowel
#define ASO    7        // accent small other
#define CLASS_NUM   8    // total number of classes

// Maps every possible byte value (0x00 - 0xFF) to one of the classes above.
// The table is indexed with an unsigned char, so it must keep exactly 256
// entries. It is read-only data and therefore declared const.
static const unsigned char Latin1_CharToClass[] =
{
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 00 - 07
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 08 - 0F
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 10 - 17
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 18 - 1F
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 20 - 27
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 28 - 2F
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 30 - 37
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 38 - 3F
  OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC,   // 40 - 47
  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,   // 48 - 4F
  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,   // 50 - 57
  ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH,   // 58 - 5F
  OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS,   // 60 - 67
  ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS,   // 68 - 6F
  ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS,   // 70 - 77
  ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH,   // 78 - 7F
  OTH, UDF, OTH, ASO, OTH, OTH, OTH, OTH,   // 80 - 87
  OTH, OTH, ACO, OTH, ACO, UDF, ACO, UDF,   // 88 - 8F
  UDF, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 90 - 97
  OTH, OTH, ASO, OTH, ASO, UDF, ASO, ACO,   // 98 - 9F
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // A0 - A7
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // A8 - AF
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // B0 - B7
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // B8 - BF
  ACV, ACV, ACV, ACV, ACV, ACV, ACO, ACO,   // C0 - C7
  ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV,   // C8 - CF
  ACO, ACO, ACV, ACV, ACV, ACV, ACV, OTH,   // D0 - D7
  ACV, ACV, ACV, ACV, ACV, ACO, ACO, ACO,   // D8 - DF
  ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASO,   // E0 - E7
  ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV,   // E8 - EF
  ASO, ASO, ASV, ASV, ASV, ASV, ASV, OTH,   // F0 - F7
  ASV, ASV, ASV, ASV, ASV, ASO, ASO, ASO,   // F8 - FF
};
00088 
00089 
/* Likelihood of one character class following another.
   The table is a flattened 8 x 8 matrix: HandleData() reads it as
   Latin1ClassModel[previousClass * CLASS_NUM + currentClass], so each row
   is the class of the previous byte and each column the class of the
   current byte. It is read-only data and therefore declared const.

   Cell values:
   0 : illegal
   1 : very unlikely
   2 : normal
   3 : very likely
*/
static const unsigned char Latin1ClassModel[] =
{
/*      UDF OTH ASC ASS ACV ACO ASV ASO  */
/*UDF*/  0,  0,  0,  0,  0,  0,  0,  0,
/*OTH*/  0,  3,  3,  3,  3,  3,  3,  3,
/*ASC*/  0,  3,  3,  3,  3,  3,  3,  3,
/*ASS*/  0,  3,  3,  3,  1,  1,  3,  3,
/*ACV*/  0,  3,  3,  3,  1,  2,  1,  2,
/*ACO*/  0,  3,  3,  3,  3,  3,  3,  3,
/*ASV*/  0,  3,  1,  3,  1,  1,  1,  3,
/*ASO*/  0,  3,  1,  3,  1,  1,  3,  3,
};
00107 
00108 void  nsLatin1Prober::Reset(void)
00109 {
00110   mState = eDetecting;
00111   mLastCharClass = OTH;
00112   for (int i = 0; i < FREQ_CAT_NUM; i++)
00113     mFreqCounter[i] = 0;
00114 }
00115 
00116 
00117 nsProbingState nsLatin1Prober::HandleData(const char* aBuf, PRUint32 aLen)
00118 {
00119   char *newBuf1 = 0;
00120   PRUint32 newLen1 = 0;
00121 
00122   if (!FilterWithEnglishLetters(aBuf, aLen, &newBuf1, newLen1)) {
00123     newBuf1 = (char*)aBuf;
00124     newLen1 = aLen;
00125   }
00126   
00127   unsigned char charClass;
00128   unsigned char freq;
00129   for (PRUint32 i = 0; i < newLen1; i++)
00130   {
00131     charClass = Latin1_CharToClass[(unsigned char)newBuf1[i]];
00132     freq = Latin1ClassModel[mLastCharClass*CLASS_NUM + charClass];
00133     if (freq == 0) {
00134       mState = eNotMe;
00135       break;
00136     }
00137     mFreqCounter[freq]++;
00138     mLastCharClass = charClass;
00139   }
00140 
00141   if (newBuf1 != aBuf)
00142     PR_FREEIF(newBuf1);
00143 
00144   return mState;
00145 }
00146 
00147 float nsLatin1Prober::GetConfidence(void)
00148 {
00149   if (mState == eNotMe)
00150     return 0.01f;
00151   
00152   float confidence;
00153   PRUint32 total = 0;
00154   for (PRInt32 i = 0; i < FREQ_CAT_NUM; i++)
00155     total += mFreqCounter[i];
00156 
00157   if(!total)
00158     confidence = 0.0f;
00159   else
00160   {
00161     confidence = mFreqCounter[3]*1.0f / total;
00162     confidence -= mFreqCounter[1]*20.0f/total;
00163   }
00164 
00165   if (confidence < 0.0f)
00166     confidence = 0.0f;
00167   
00168   // lower the confidence of latin1 so that other more accurate detector 
00169   // can take priority.
00170   confidence *= 0.50f;
00171 
00172   return confidence;
00173 }
00174 
#ifdef DEBUG_chardet
// Debug-only helper: prints the current confidence and the charset name
// to stdout on one line.
void  nsLatin1Prober::DumpStatus()
{
  printf(" Latin1Prober: %1.3f [%s]\r\n",
         GetConfidence(),
         GetCharSetName());
}
#endif
00181 
00182 
Generated by  doxygen 1.6.2-20100208