00001 
00002 
00003 
00004 
00005 
00006 
00007 
00008 
00009 
00010 
00011 
00012 
00013 
00014 
00015 
00016 
00017 
00018 
00019 
00020 
00021 
00022 
00023 
00024 
00025 
00026 
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <glib.h>
#include "text_model.h"
#include "utils.h"
#include "log.h"
#include "nsUniversalDetectorImpl.h"
00034 
00035 namespace text
00036 {
00037 
00038 
00039 
/// Size in bytes of one chunk read from the document file.
static const int BLOCK_SIZE = 4096;
/// Number of leading bytes of the file handed to the charset detector.
static const int SAMPLE_LEN = 4 * 1024;
/// Codeset name the document encoding is compared against; also the iconv target.
static const std::string TARGET_CODESET("utf-8");
00043 
00044 TextModel::TextModel()
00045 : file_p(0)
00046 , encoding("")
00047 , path()
00048 , b_open(false)
00049 , aborting_search_task_id(0)
00050 , incomplete_line(false)
00051 {
00052 }
00053 
00054 TextModel::~TextModel()
00055 {
00056     if (b_open)
00057     {
00058         close();
00059     }
00060 }
00061 
00062 PluginStatus TextModel::open(const std::string& doc_path)
00063 {
00064     return open(doc_path, "");
00065 }
00066 
00067 PluginStatus TextModel::open(const std::string& doc_path,
00068                              const std::string& encoding)
00069 {
00070     PluginStatus result = PLUGIN_FAIL;
00071 
00072     
00073     if (b_open)
00074     {
00075         close();
00076     }
00077 
00078     
00079     file_p = fopen(doc_path.c_str(), "r");
00080     if (file_p == NULL)
00081     {
00082         return PLUGIN_FAIL;
00083     }
00084 
00085     
00086     if (encoding.empty())
00087     {
00088         detect_encoding();
00089     }
00090 
00091     
00092     path = doc_path;
00093     b_open = true;
00094 
00095     
00096     result = read_text();
00097     if (result != PLUGIN_OK)
00098     {
00099         close();
00100         return result;
00101     }
00102 
00103     if (doc.empty())
00104     {
00105         doc.push_back(Paragraph(0, new std::string(" ")));
00106     }
00107 
00108     return result;
00109 }
00110 
00111 void TextModel::close()
00112 {
00113     clear();
00114     fclose(file_p);
00115     file_p = NULL;
00116     b_open = false;
00117     encoding.clear();
00118 }
00119 
00120 
00121 void TextModel::detect_encoding()
00122 {
00123     char buf[SAMPLE_LEN];
00124     size_t bytes_read = fread(buf, 1, SAMPLE_LEN, file_p);
00125 
00126     nsUniversalDetectorImpl charset_detector;
00127     charset_detector.HandleData(buf, static_cast<unsigned int>(bytes_read));
00128     charset_detector.DataEnd();
00129     encoding = charset_detector.get_enc();
00130 
00131     if (encoding.empty())
00132     {
00133         
00134         ERRORPRINTF("Can't detect encodings, fall back to ISO8859-1.");
00135         encoding = DEFAULT_ENCODING;
00136     }
00137 }
00138 
00139 void TextModel::clear()
00140 {
00141     
00142     for (TextDocumentIter it = doc.begin(); it != doc.end(); ++it)
00143     {
00144         delete (*it).text;
00145     }
00146     doc.clear();
00147 }
00148 
00149 PluginStatus TextModel::read_text()
00150 {
00151     assert(b_open);
00152     
00153     clear();    
00154     if (encoding == TARGET_CODESET)
00155     {
00156         
00157         return read_utf8_text();
00158     }
00159     else
00160     {
00161         
00162         return read_non_utf8_text();
00163     }
00164 }
00165 
00166 PluginStatus TextModel::read_utf8_text()
00167 {
00168     char buf[BLOCK_SIZE];
00169     size_t bytes_read = 0;
00170 
00171     fseek(file_p, 0, SEEK_SET);
00172     while (!feof(file_p))
00173     {
00174         bytes_read = fread(buf, 1, BLOCK_SIZE, file_p);
00175         save_block_with_paragraphs(buf, bytes_read);
00176     }
00177 
00178     return PLUGIN_OK;
00179 }
00180 
00181 void TextModel::convert(iconv_t cd, char **in_buf, size_t *in_bytes_left, char **out_buf, size_t *out_bytes_left)
00182 {
00183     while (*in_bytes_left > 3)
00184     {
00185         size_t bytes_to_be_converted = *in_bytes_left;
00186 #ifdef WIN32
00187         iconv(cd, const_cast<const char **>(in_buf), in_bytes_left, out_buf, out_bytes_left);
00188 #else
00189         iconv(cd, in_buf, in_bytes_left, out_buf, out_bytes_left);
00190 #endif
00191 
00192         if (*in_bytes_left == bytes_to_be_converted)
00193         {
00194             
00195             
00196             ++(*in_buf);
00197             --(*in_bytes_left);
00198             *(*out_buf)++ = ' ';
00199             --(*out_bytes_left);
00200         }
00201     }
00202 }
00203 
00204 PluginStatus TextModel::read_non_utf8_text()
00205 {
00206     
00207     char in_buf[BLOCK_SIZE];
00208     char out_buf[3*BLOCK_SIZE];
00209 
00210     size_t partial_chars = 0;
00211     iconv_t conv = iconv_open(TARGET_CODESET.c_str(), encoding.c_str());
00212     if (conv == (iconv_t)(-1))
00213     {
00214         return PLUGIN_UNSUPPORTED_ENCODING;
00215     }
00216 
00217     fseek(file_p, 0, SEEK_SET);
00218     while (!feof(file_p))
00219     {
00220         
00221         size_t bytes_read = fread(in_buf+partial_chars, 1, BLOCK_SIZE-partial_chars, file_p);
00222 
00223         char *in_p  = in_buf;
00224         char *out_p = out_buf;
00225 
00226         
00227         
00228         size_t in_bytes_left  = bytes_read + partial_chars;
00229         size_t out_bytes_left = sizeof(out_buf);
00230 
00231         
00232         convert(conv, &in_p, &in_bytes_left, &out_p, &out_bytes_left);
00233 
00234         
00235         save_block_with_paragraphs(out_buf, sizeof(out_buf)-out_bytes_left);
00236 
00237         
00238         partial_chars = in_bytes_left;
00239         if (partial_chars > 0)
00240         {
00241             memcpy(in_buf, in_buf+BLOCK_SIZE-partial_chars, partial_chars);
00242         }
00243     }
00244 
00245     iconv_close(conv);
00246     return PLUGIN_OK;
00247 }
00248 
00249 
00250 
00251 
00252 
00253 
00254 
00255 
00256 
00257 
00258 
00259 
00260 
00261 
00262 
00263 
00264 
00265 
00266 
00267 
00268 void TextModel::save_block_with_paragraphs(const char *blk, size_t blk_size)
00269 {
00270     if (doc.empty())
00271     {
00272         
00273         incomplete_line = false;
00274     }
00275 
00276     const char *end_p = blk + blk_size; 
00277     const char *p     = blk;
00278 
00279     while (p < end_p)
00280     {
00281         gchar* find_p = g_utf8_strchr(p, static_cast<gssize>(end_p - p), '\n');
00282         if (find_p != NULL)
00283         {
00284             
00285             if (incomplete_line)
00286             {
00287                 
00288                 
00289                 doc.back().text->append(p, find_p - p + 1);
00290             }
00291             else
00292             {
00293                 
00294                 
00295                 size_t start_file_pos = 0;
00296                 if (doc.size() > 0)
00297                 {
00298                     start_file_pos = doc.back().start_file_pos + doc.back().text->size();
00299                 }
00300                 doc.push_back(Paragraph(start_file_pos, new std::string(p, find_p - p + 1)));
00301             }
00302             
00303             p = find_p + 1;
00304             incomplete_line = false;
00305         }
00306         else
00307         {
00308             
00309             size_t start_file_pos = 0;
00310             if (doc.size() > 0)
00311             {
00312                 start_file_pos = doc.back().start_file_pos + doc.back().text->size();
00313             }
00314             doc.push_back(Paragraph(start_file_pos, new std::string(p, end_p - p)));
00315 
00316             
00317             incomplete_line = true;
00318             break;
00319         }
00320     }
00321 }
00322 
00323 bool TextModel::is_new_search(SearchContext * sc)
00324 {
00325     bool is_new = true;
00326 
00327     if (sc)
00328     {
00329         if (   sc->search_type      == last_sc.search_type
00330             && sc->pattern          == last_sc.pattern
00331             && sc->case_sensitive   == last_sc.case_sensitive
00332             && sc->forward          == last_sc.forward
00333             && sc->match_whole_word == last_sc.match_whole_word
00334             && sc->from             == last_search_result.end)
00335         {
00336             
00337             
00338             
00339             is_new = false;
00340         }
00341     }
00342     return is_new;
00343 }
00344 
00345 bool TextModel::search(std::vector<Range>& result_ranges, SearchContext* sc)
00346 {
00347     
00348     SearchType search_type      = sc->search_type;
00349     Position   &from            = sc->from;
00350     const char *pattern         = sc->pattern.c_str();
00351     bool       case_sensitive   = sc->case_sensitive;
00352     bool       forward          = sc->forward;
00353     bool       match_whole_word = sc->match_whole_word;
00354 
00355     size_t pattern_len = strlen(pattern);
00356     const char *paragraph_head = doc[from.paragraph].text->c_str();
00357 
00358     bool new_search = is_new_search(sc);
00359     
00360     
00361     last_sc = *sc;
00362 
00363     
00364     Position s(0, 0), e(0,0);
00365     last_search_result.start = s;
00366     last_search_result.end = e;
00367 
00368     if (forward)
00369     {
00370         const char *p = paragraph_head + from.offset;
00371         if (p && !new_search)
00372         {
00373             
00374             
00375             
00376             
00377             
00378             
00379             
00380             
00381             
00382             p++;
00383         }
00384         while (true)
00385         {
00386             const char* find = utf8_strstr(p, pattern, case_sensitive);
00387             if (find)
00388             {
00389                 
00390                 if (!match_whole_word || is_whole_word(paragraph_head, find, pattern_len))
00391                 {
00392                     
00393                     Position start(from.paragraph, static_cast<unsigned int>(find - paragraph_head));
00394                     const char* last_char = g_utf8_prev_char(find + pattern_len);
00395                     Position end(from.paragraph, static_cast<unsigned int>(last_char - paragraph_head));
00396                     result_ranges.push_back(Range(start, end));
00397                     
00398                     if (search_type == SEARCH_NEXT)
00399                     {
00400                         
00401                         last_search_result.start = start;
00402                         last_search_result.end = end;
00403 
00404                          
00405                         return true;
00406                     }
00407 
00408                     
00409                 }
00410 
00411                 p = find + pattern_len;
00412             }
00413             else
00414             {
00415                 
00416                 from.offset = 0;
00417                 return ++(from.paragraph) == doc.size();
00418             }
00419         }
00420     }
00421     else
00422     {
00423         
00424         int len = static_cast<int>(from.offset);
00425         while (true)
00426         {
00427             const char *find = utf8_strrstr(paragraph_head, len, pattern, case_sensitive);
00428             if (find)
00429             {
00430                 
00431                 if (!match_whole_word || is_whole_word(paragraph_head, find, pattern_len))
00432                 {
00433                     
00434                     Position start(from.paragraph, static_cast<unsigned int>(find - paragraph_head));
00435                     const char* last_char = g_utf8_prev_char(find + pattern_len);
00436                     Position end(from.paragraph, static_cast<unsigned int>(last_char - paragraph_head));
00437                     result_ranges.push_back(Range(start, end));
00438 
00439                     
00440                     last_search_result.start = start;
00441                     last_search_result.end = end;
00442 
00443                     return true;
00444                 }
00445 
00446                 len = static_cast<int>(find - paragraph_head);
00447             }
00448             else
00449             {
00450                 
00451                 if (from.paragraph == 0)
00452                 {
00453                     return true;
00454                 }
00455                 else
00456                 {
00457                     from.paragraph--;
00458                     from.offset = static_cast<unsigned int>(doc[from.paragraph].text->size());
00459                     return false;
00460                 }
00461             }
00462         }
00463     }
00464 }
00465 
00466 bool TextModel::has_anchor(const Position &pos)
00467 {
00468     
00469     if (pos.paragraph >= doc.size() || pos.offset >= doc[pos.paragraph].text->size())
00470     {
00471         return false;
00472     }
00473 
00474     return true;
00475 }
00476 
00477 bool TextModel::get_file_pos_from_anchor(size_t& file_pos, const Position &pos)
00478 {
00479     
00480     if (pos.paragraph >= doc.size() || pos.offset >= doc[pos.paragraph].text->size())
00481     {
00482         return false;
00483     }
00484 
00485     file_pos = doc[pos.paragraph].start_file_pos + pos.offset;
00486     return true;
00487 }
00488 
00489 bool TextModel::is_seperator(const char* p)
00490 {
00491     gunichar ch = g_utf8_get_char(p);
00492 
00493     if (g_unichar_isspace(ch))
00494     {
00495         return true; 
00496     }
00497 
00498     if (g_unichar_ispunct(ch))
00499     {
00500         
00501         if (*p != '\'' && *p != '\"')
00502         {
00503             return true;
00504         }
00505     }
00506 
00507     return false;
00508 }
00509 
00510 bool TextModel::get_word_from_anchor(const Position& pos,
00511                                      Position& word_start_pos,
00512                                      Position& word_end_pos)
00513 {
00514     
00515     const char* paragraph = doc[pos.paragraph].text->c_str();
00516     word_start_pos.paragraph = word_end_pos.paragraph = pos.paragraph;
00517 
00518     const char* p = paragraph + pos.offset;
00519 
00520     
00521     if (is_seperator(p))
00522     {
00523         
00524         word_start_pos.offset = word_end_pos.offset = pos.offset;
00525         return false;
00526     }
00527 
00528     
00529     for (; p > paragraph; p = g_utf8_prev_char(p))
00530     {
00531         if (is_seperator(p))
00532         {
00533             p = g_utf8_next_char(p);
00534             break;
00535         }
00536     }
00537     word_start_pos.offset = static_cast<int>(p - paragraph);
00538 
00539     
00540     for (p = paragraph + pos.offset; *p != 0; p = g_utf8_next_char(p))
00541     {
00542         if (is_seperator(p))
00543         {
00544             p = g_utf8_prev_char(p);
00545             break;
00546         }
00547     }
00548     word_end_pos.offset = static_cast<int>(p - paragraph);
00549 
00550     return true;
00551 }
00552 
00553 bool TextModel::get_words_from_range(const Position& range_start,
00554                                      const Position& range_end,
00555                                      Position& words_start,
00556                                      Position& words_end)
00557 {
00558     if (range_end < range_start)
00559     {
00560         ERRORPRINTF("Invalid range, range_start = %s, range_end = %s",
00561             range_start.to_string().c_str(),
00562             range_end.to_string().c_str());
00563         return false;
00564     }
00565 
00566     Position tmp;
00567 
00568     
00569     get_word_from_anchor(range_start, words_start, tmp);
00570 
00571     
00572     get_word_from_anchor(range_end, tmp, words_end);
00573 
00574     
00575     const char* start_paragraph = 0;
00576     const char* p = 0;
00577     while (true)
00578     {
00579         start_paragraph = doc[words_start.paragraph].text->c_str();
00580         for (p = start_paragraph + words_start.offset; *p != 0; p = g_utf8_next_char(p))
00581         {
00582             if (!is_seperator(p))
00583             {
00584                 break;
00585             }
00586         }
00587 
00588         if (*p == 0)
00589         {
00590             words_start.paragraph++;
00591             words_start.offset = 0;
00592         }
00593         else
00594         {
00595             break;
00596         }
00597     }
00598     words_start.offset = static_cast<int>(p - start_paragraph);
00599 
00600     
00601     const char* end_paragraph = doc[words_end.paragraph].text->c_str();
00602     for (p = end_paragraph + words_end.offset; p > end_paragraph; p = g_utf8_prev_char(p))
00603     {
00604         if (!is_seperator(p))
00605         {
00606             break;
00607         }
00608     }
00609     words_end.offset = static_cast<int>(p - end_paragraph);
00610 
00611     return words_end >= words_start;
00612 }
00613 
00614 bool TextModel::get_text_from_range(std::string& result,
00615                                     const Position& start_pos,
00616                                     const Position& end_pos)
00617 {
00618     unsigned int start_paragraph = start_pos.paragraph;
00619     unsigned int end_paragraph = end_pos.paragraph;
00620 
00621     for (unsigned int i = start_paragraph; 
00622             (i <= end_paragraph) && (i < doc.size()); 
00623             i++)
00624     {
00625         if (doc[i].text)
00626         {
00627             const char* start_p = doc[i].text->c_str();
00628             if (i == start_paragraph)
00629             {
00630                 start_p += start_pos.offset;
00631             }
00632 
00633             size_t len = doc[i].text->length();
00634             if (i == end_paragraph)
00635             {
00636                 const char* p = doc[i].text->c_str() + end_pos.offset;
00637                 len = g_utf8_next_char(p) - start_p;
00638             }
00639 
00640             result.append(start_p, len);
00641         }
00642     }
00643 
00644     return true;
00645 }
00646 
00647 void TextModel::dump()
00648 {
00649     
00650     std::string dump_path = path + ".converted";
00651     FILE* fp = fopen(dump_path.c_str(), "w");
00652     
00653     if (fp != NULL)
00654     {
00655         for (unsigned int i=0; i<doc.size(); i++)
00656         {
00657             fputs(doc[i].text->c_str(), fp);
00658         }
00659 
00660         fclose(fp);
00661     }
00662 }
00663 
00664 }