/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* ***** BEGIN LICENSE BLOCK *****
 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
 *
 * The contents of this file are subject to the Mozilla Public License Version
 * 1.1 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
 * for the specific language governing rights and limitations under the
 * License.
 *
 * The Original Code is Mozilla Universal charset detector code.
 *
 * The Initial Developer of the Original Code is
 * Netscape Communications Corporation.
 * Portions created by the Initial Developer are Copyright (C) 2001
 * the Initial Developer. All Rights Reserved.
 *
 * Contributor(s):
 *   Shy Shalom <shooshX@gmail.com>
 *
 * Alternatively, the contents of this file may be used under the terms of
 * either the GNU General Public License Version 2 or later (the "GPL"), or
 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
 * in which case the provisions of the GPL or the LGPL are applicable instead
 * of those above. If you wish to allow use of your version of this file only
 * under the terms of either the GPL or the LGPL, and not to allow others to
 * use your version of this file under the terms of the MPL, indicate your
 * decision by deleting the provisions above and replace them with the notice
 * and other provisions required by the GPL or the LGPL. If you do not delete
 * the provisions above, a recipient may use your version of this file under
 * the terms of any one of the MPL, the GPL or the LGPL.
00036 * 00037 * ***** END LICENSE BLOCK ***** */ 00038 00039 #include "nsCharSetProber.h" 00040 #include "prmem.h" 00041 00042 //This filter applies to all scripts which do not use English characters 00043 PRBool nsCharSetProber::FilterWithoutEnglishLetters(const char* aBuf, PRUint32 aLen, char** newBuf, PRUint32& newLen) 00044 { 00045 char *newptr; 00046 char *prevPtr, *curPtr; 00047 00048 PRBool meetMSB = PR_FALSE; 00049 newptr = *newBuf = (char*)PR_Malloc(aLen); 00050 if (!newptr) 00051 return PR_FALSE; 00052 00053 for (curPtr = prevPtr = (char*)aBuf; curPtr < aBuf+aLen; curPtr++) 00054 { 00055 if (*curPtr & 0x80) 00056 { 00057 meetMSB = PR_TRUE; 00058 } 00059 else if (*curPtr < 'A' || (*curPtr > 'Z' && *curPtr < 'a') || *curPtr > 'z') 00060 { 00061 //current char is a symbol, most likely a punctuation. we treat it as segment delimiter 00062 if (meetMSB && curPtr > prevPtr) 00063 //this segment contains more than single symbol, and it has upper ASCII, we need to keep it 00064 { 00065 while (prevPtr < curPtr) *newptr++ = *prevPtr++; 00066 prevPtr++; 00067 *newptr++ = ' '; 00068 meetMSB = PR_FALSE; 00069 } 00070 else //ignore current segment. (either because it is just a symbol or just an English word) 00071 prevPtr = curPtr+1; 00072 } 00073 } 00074 if (meetMSB && curPtr > prevPtr) 00075 while (prevPtr < curPtr) *newptr++ = *prevPtr++; 00076 00077 newLen = newptr - *newBuf; 00078 00079 return PR_TRUE; 00080 } 00081 00082 //This filter applies to all scripts which contain both English characters and upper ASCII characters. 
00083 PRBool nsCharSetProber::FilterWithEnglishLetters(const char* aBuf, PRUint32 aLen, char** newBuf, PRUint32& newLen) 00084 { 00085 //do filtering to reduce load to probers 00086 char *newptr; 00087 char *prevPtr, *curPtr; 00088 PRBool isInTag = PR_FALSE; 00089 00090 newptr = *newBuf = (char*)PR_Malloc(aLen); 00091 if (!newptr) 00092 return PR_FALSE; 00093 00094 for (curPtr = prevPtr = (char*)aBuf; curPtr < aBuf+aLen; curPtr++) 00095 { 00096 if (*curPtr == '>') 00097 isInTag = PR_FALSE; 00098 else if (*curPtr == '<') 00099 isInTag = PR_TRUE; 00100 00101 if (!(*curPtr & 0x80) && 00102 (*curPtr < 'A' || (*curPtr > 'Z' && *curPtr < 'a') || *curPtr > 'z') ) 00103 { 00104 if (curPtr > prevPtr && !isInTag) // Current segment contains more than just a symbol 00105 // and it is not inside a tag, keep it. 00106 { 00107 while (prevPtr < curPtr) *newptr++ = *prevPtr++; 00108 prevPtr++; 00109 *newptr++ = ' '; 00110 } 00111 else 00112 prevPtr = curPtr+1; 00113 } 00114 } 00115 00116 // If the current segment contains more than just a symbol 00117 // and it is not inside a tag then keep it. 00118 if (!isInTag) 00119 while (prevPtr < curPtr) 00120 *newptr++ = *prevPtr++; 00121 00122 newLen = newptr - *newBuf; 00123 00124 return PR_TRUE; 00125 }