00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039 #include "nsLatin1Prober.h"
00040 #include "prmem.h"
00041 #include <stdio.h>
00042
// Character classes. Every input byte is reduced to one of these before the
// class-pair likelihood is looked up in Latin1ClassModel.
#define UDF 0        // undefined
#define OTH 1        // other
#define ASC 2        // ascii capital letter
#define ASS 3        // ascii small letter
#define ACV 4        // accent capital vowel
#define ACO 5        // accent capital other
#define ASV 6        // accent small vowel
#define ASO 7        // accent small other
#define CLASS_NUM 8  // total classes

// Maps each of the 256 possible byte values to its character class.
// The table is only ever read, so it is declared const.
// NOTE(review): the 0x80 - 0x9F assignments look like the windows-1252
// layout rather than ISO-8859-1 control codes -- confirm against the spec.
static const unsigned char Latin1_CharToClass[] =
{
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 00 - 07
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 08 - 0F
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 10 - 17
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 18 - 1F
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 20 - 27
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 28 - 2F
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 30 - 37
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 38 - 3F
  OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC,   // 40 - 47
  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,   // 48 - 4F
  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,   // 50 - 57
  ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH,   // 58 - 5F
  OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS,   // 60 - 67
  ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS,   // 68 - 6F
  ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS,   // 70 - 77
  ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH,   // 78 - 7F
  OTH, UDF, OTH, ASO, OTH, OTH, OTH, OTH,   // 80 - 87
  OTH, OTH, ACO, OTH, ACO, UDF, ACO, UDF,   // 88 - 8F
  UDF, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 90 - 97
  OTH, OTH, ASO, OTH, ASO, UDF, ASO, ACO,   // 98 - 9F
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // A0 - A7
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // A8 - AF
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // B0 - B7
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // B8 - BF
  ACV, ACV, ACV, ACV, ACV, ACV, ACO, ACO,   // C0 - C7
  ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV,   // C8 - CF
  ACO, ACO, ACV, ACV, ACV, ACV, ACV, OTH,   // D0 - D7
  ACV, ACV, ACV, ACV, ACV, ACO, ACO, ACO,   // D8 - DF
  ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASO,   // E0 - E7
  ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV,   // E8 - EF
  ASO, ASO, ASV, ASV, ASV, ASV, ASV, OTH,   // F0 - F7
  ASV, ASV, ASV, ASV, ASV, ASO, ASO, ASO,   // F8 - FF
};

// Compile-time guard: the table is indexed with an arbitrary unsigned char,
// so it must cover all 256 byte values. A wrong entry count makes the array
// bound below negative and breaks the build instead of reading out of range.
typedef char Latin1_CharToClass_must_have_256_entries
  [(sizeof(Latin1_CharToClass) == 256) ? 1 : -1];
00088
00089
00090
00091
00092
00093
00094
// Likelihood category for each (previous class, current class) pair.
// HandleData() indexes it as  previousClass * CLASS_NUM + currentClass,  so
// each row below is the previous character's class and each column is the
// current character's class.
//
// Meaning of the stored values, as used by the prober:
//   0     - the pair rules this charset out (HandleData() sets eNotMe)
//   1..3  - index of the frequency counter incremented for the pair;
//           GetConfidence() rewards category 3 and heavily penalises
//           category 1, while category 2 only adds to the total.
//
// The table is only ever read, so it is declared const.
static const unsigned char Latin1ClassModel[] =
{
/*        UDF OTH ASC ASS ACV ACO ASV ASO */
/*UDF*/    0,  0,  0,  0,  0,  0,  0,  0,
/*OTH*/    0,  3,  3,  3,  3,  3,  3,  3,
/*ASC*/    0,  3,  3,  3,  3,  3,  3,  3,
/*ASS*/    0,  3,  3,  3,  1,  1,  3,  3,
/*ACV*/    0,  3,  3,  3,  1,  2,  1,  2,
/*ACO*/    0,  3,  3,  3,  3,  3,  3,  3,
/*ASV*/    0,  3,  1,  3,  1,  1,  1,  3,
/*ASO*/    0,  3,  1,  3,  1,  1,  3,  3,
};
00107
00108 void nsLatin1Prober::Reset(void)
00109 {
00110 mState = eDetecting;
00111 mLastCharClass = OTH;
00112 for (int i = 0; i < FREQ_CAT_NUM; i++)
00113 mFreqCounter[i] = 0;
00114 }
00115
00116
00117 nsProbingState nsLatin1Prober::HandleData(const char* aBuf, PRUint32 aLen)
00118 {
00119 char *newBuf1 = 0;
00120 PRUint32 newLen1 = 0;
00121
00122 if (!FilterWithEnglishLetters(aBuf, aLen, &newBuf1, newLen1)) {
00123 newBuf1 = (char*)aBuf;
00124 newLen1 = aLen;
00125 }
00126
00127 unsigned char charClass;
00128 unsigned char freq;
00129 for (PRUint32 i = 0; i < newLen1; i++)
00130 {
00131 charClass = Latin1_CharToClass[(unsigned char)newBuf1[i]];
00132 freq = Latin1ClassModel[mLastCharClass*CLASS_NUM + charClass];
00133 if (freq == 0) {
00134 mState = eNotMe;
00135 break;
00136 }
00137 mFreqCounter[freq]++;
00138 mLastCharClass = charClass;
00139 }
00140
00141 if (newBuf1 != aBuf)
00142 PR_FREEIF(newBuf1);
00143
00144 return mState;
00145 }
00146
00147 float nsLatin1Prober::GetConfidence(void)
00148 {
00149 if (mState == eNotMe)
00150 return 0.01f;
00151
00152 float confidence;
00153 PRUint32 total = 0;
00154 for (PRInt32 i = 0; i < FREQ_CAT_NUM; i++)
00155 total += mFreqCounter[i];
00156
00157 if(!total)
00158 confidence = 0.0f;
00159 else
00160 {
00161 confidence = mFreqCounter[3]*1.0f / total;
00162 confidence -= mFreqCounter[1]*20.0f/total;
00163 }
00164
00165 if (confidence < 0.0f)
00166 confidence = 0.0f;
00167
00168
00169
00170 confidence *= 0.50f;
00171
00172 return confidence;
00173 }
00174
#ifdef DEBUG_chardet
// Debug-only helper (compiled when DEBUG_chardet is defined): prints the
// current confidence and the charset name on one line of standard output.
void nsLatin1Prober::DumpStatus()
{
  const float confidence = GetConfidence();
  printf(" Latin1Prober: %1.3f [%s]\r\n", confidence, GetCharSetName());
}
#endif
00181
00182