text_model.cpp

Go to the documentation of this file.
00001 /*
00002  * File Name: text_model.cpp
00003  */
00004 
00005 /*
00006  * This file is part of uds-plugin-plaintext.
00007  *
00008  * uds-plugin-plaintext is free software: you can redistribute it and/or modify
00009  * it under the terms of the GNU General Public License as published by
00010  * the Free Software Foundation, either version 2 of the License, or
00011  * (at your option) any later version.
00012  *
00013  * uds-plugin-plaintext is distributed in the hope that it will be useful,
00014  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00015  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
00016  * GNU General Public License for more details.
00017  *
00018  * You should have received a copy of the GNU General Public License
00019  * along with this program. If not, see <http://www.gnu.org/licenses/>.
00020  */
00021 
00022 /**
00023  * Copyright (C) 2008 iRex Technologies B.V.
00024  * All rights reserved.
00025  */
00026 
00027 #include <stdio.h>
00028 #include <string.h>
00029 #include <glib.h>
00030 #include "text_model.h"
00031 #include "utils.h"
00032 #include "log.h"
00033 #include "nsUniversalDetectorImpl.h"
00034 
00035 namespace text
00036 {
00037 
// File-local tuning constants.
namespace
{

/// Number of bytes sampled from the file and handed to the charset
/// detector. Too short a sample may leave the encoding undetected.
const int SAMPLE_LEN = 4 * 1024;

/// Size, in bytes, of the buffer used for each read from the file.
const int BLOCK_SIZE = 4096;

/// Codeset all paragraphs are stored in; files in any other codeset are
/// converted while they are read.
const std::string TARGET_CODESET("utf-8");

}  // anonymous namespace
00043 
00044 TextModel::TextModel()
00045 : file_p(0)
00046 , encoding("")
00047 , path()
00048 , b_open(false)
00049 , aborting_search_task_id(0)
00050 , incomplete_line(false)
00051 {
00052 }
00053 
00054 TextModel::~TextModel()
00055 {
00056     if (b_open)
00057     {
00058         close();
00059     }
00060 }
00061 
00062 PluginStatus TextModel::open(const std::string& doc_path)
00063 {
00064     return open(doc_path, "");
00065 }
00066 
00067 PluginStatus TextModel::open(const std::string& doc_path,
00068                              const std::string& encoding)
00069 {
00070     PluginStatus result = PLUGIN_FAIL;
00071 
00072     // The document was opened already, close it first.
00073     if (b_open)
00074     {
00075         close();
00076     }
00077 
00078     // Try to open specified file.
00079     file_p = fopen(doc_path.c_str(), "r");
00080     if (file_p == NULL)
00081     {
00082         return PLUGIN_FAIL;
00083     }
00084 
00085     // Detect encodings if necessary.
00086     if (encoding.empty())
00087     {
00088         detect_encoding();
00089     }
00090 
00091     // Update document information.
00092     path = doc_path;
00093     b_open = true;
00094 
00095     // Build up paragraphs.
00096     result = read_text();
00097     if (result != PLUGIN_OK)
00098     {
00099         close();
00100         return result;
00101     }
00102 
00103     if (doc.empty())
00104     {
00105         doc.push_back(Paragraph(0, new std::string(" ")));
00106     }
00107 
00108     return result;
00109 }
00110 
00111 void TextModel::close()
00112 {
00113     clear();
00114     fclose(file_p);
00115     file_p = NULL;
00116     b_open = false;
00117     encoding.clear();
00118 }
00119 
00120 /// Before using this function, make sure the file is already opened.
00121 void TextModel::detect_encoding()
00122 {
00123     char buf[SAMPLE_LEN];
00124     size_t bytes_read = fread(buf, 1, SAMPLE_LEN, file_p);
00125 
00126     nsUniversalDetectorImpl charset_detector;
00127     charset_detector.HandleData(buf, static_cast<unsigned int>(bytes_read));
00128     charset_detector.DataEnd();
00129     encoding = charset_detector.get_enc();
00130 
00131     if (encoding.empty())
00132     {
00133         // Fall back to ISO8859-1.
00134         ERRORPRINTF("Can't detect encodings, fall back to ISO8859-1.");
00135         encoding = DEFAULT_ENCODING;
00136     }
00137 }
00138 
00139 void TextModel::clear()
00140 {
00141     // Clear current paragraphs
00142     for (TextDocumentIter it = doc.begin(); it != doc.end(); ++it)
00143     {
00144         delete (*it).text;
00145     }
00146     doc.clear();
00147 }
00148 
00149 PluginStatus TextModel::read_text()
00150 {
00151     assert(b_open);
00152     
00153     clear();    
00154     if (encoding == TARGET_CODESET)
00155     {
00156         // Already UTF-8 encoded
00157         return read_utf8_text();
00158     }
00159     else
00160     {
00161         // Need to convert
00162         return read_non_utf8_text();
00163     }
00164 }
00165 
00166 PluginStatus TextModel::read_utf8_text()
00167 {
00168     char buf[BLOCK_SIZE];
00169     size_t bytes_read = 0;
00170 
00171     fseek(file_p, 0, SEEK_SET);
00172     while (!feof(file_p))
00173     {
00174         bytes_read = fread(buf, 1, BLOCK_SIZE, file_p);
00175         save_block_with_paragraphs(buf, bytes_read);
00176     }
00177 
00178     return PLUGIN_OK;
00179 }
00180 
00181 void TextModel::convert(iconv_t cd, char **in_buf, size_t *in_bytes_left, char **out_buf, size_t *out_bytes_left)
00182 {
00183     while (*in_bytes_left > 3)
00184     {
00185         size_t bytes_to_be_converted = *in_bytes_left;
00186 #ifdef WIN32
00187         iconv(cd, const_cast<const char **>(in_buf), in_bytes_left, out_buf, out_bytes_left);
00188 #else
00189         iconv(cd, in_buf, in_bytes_left, out_buf, out_bytes_left);
00190 #endif
00191 
00192         if (*in_bytes_left == bytes_to_be_converted)
00193         {
00194             // 0 bytes converted, maybe we have incorrect characters at start
00195             // Replace it with blank.
00196             ++(*in_buf);
00197             --(*in_bytes_left);
00198             *(*out_buf)++ = ' ';
00199             --(*out_bytes_left);
00200         }
00201     }
00202 }
00203 
00204 PluginStatus TextModel::read_non_utf8_text()
00205 {
00206     // Read content from disk file
00207     char in_buf[BLOCK_SIZE];
00208     char out_buf[3*BLOCK_SIZE];
00209 
00210     size_t partial_chars = 0;
00211     iconv_t conv = iconv_open(TARGET_CODESET.c_str(), encoding.c_str());
00212     if (conv == (iconv_t)(-1))
00213     {
00214         return PLUGIN_UNSUPPORTED_ENCODING;
00215     }
00216 
00217     fseek(file_p, 0, SEEK_SET);
00218     while (!feof(file_p))
00219     {
00220         // Read 1 block from disk file
00221         size_t bytes_read = fread(in_buf+partial_chars, 1, BLOCK_SIZE-partial_chars, file_p);
00222 
00223         char *in_p  = in_buf;
00224         char *out_p = out_buf;
00225 
00226         // The number of bytes to be converted equals to the bytes read plus
00227         // the bytes unconverted since last conversion.
00228         size_t in_bytes_left  = bytes_read + partial_chars;
00229         size_t out_bytes_left = sizeof(out_buf);
00230 
00231         // Do conversion, use wrapper instead of using iconv directly.
00232         convert(conv, &in_p, &in_bytes_left, &out_p, &out_bytes_left);
00233 
00234         // Put converted string to paragraph list
00235         save_block_with_paragraphs(out_buf, sizeof(out_buf)-out_bytes_left);
00236 
00237         // Check if we have partial chars unconverted
00238         partial_chars = in_bytes_left;
00239         if (partial_chars > 0)
00240         {
00241             memcpy(in_buf, in_buf+BLOCK_SIZE-partial_chars, partial_chars);
00242         }
00243     }
00244 
00245     iconv_close(conv);
00246     return PLUGIN_OK;
00247 }
00248 
00249 /*
00250 void TextModel::read_non_utf8_text()
00251 {
00252     // Read content from disk file
00253     char in_buf[BLOCK_SIZE];
00254 
00255     IConvEncodingConverter conv(enc);
00256     fseek(file_p, 0, SEEK_SET);
00257     while (!feof(file_p))
00258     {
00259         // Read 1 block from disk file
00260         size_t bytes_read = fread(in_buf, 1, BLOCK_SIZE, file_p);
00261         std::string str = "";
00262         conv.convert(str, in_buf, in_buf + bytes_read);
00263         save_block_with_paragraphs(str.c_str(), str.size());
00264     }
00265 }
00266 */
00267 
00268 void TextModel::save_block_with_paragraphs(const char *blk, size_t blk_size)
00269 {
00270     if (doc.empty())
00271     {
00272         // We are saving the first block
00273         incomplete_line = false;
00274     }
00275 
00276     const char *end_p = blk + blk_size; // end_p points to '\0'
00277     const char *p     = blk;
00278 
00279     while (p < end_p)
00280     {
00281         gchar* find_p = g_utf8_strchr(p, static_cast<gssize>(end_p - p), '\n');
00282         if (find_p != NULL)
00283         {
00284             // We find a new paragraph, append it to the paragraph list
00285             if (incomplete_line)
00286             {
00287                 // We have a incomplete line since last read, so this time we
00288                 // must append the string to the last paragraph
00289                 doc.back().text->append(p, find_p - p + 1);
00290             }
00291             else
00292             {
00293                 // Just create a new string and append it to the end of the
00294                 // paragraph list
00295                 size_t start_file_pos = 0;
00296                 if (doc.size() > 0)
00297                 {
00298                     start_file_pos = doc.back().start_file_pos + doc.back().text->size();
00299                 }
00300                 doc.push_back(Paragraph(start_file_pos, new std::string(p, find_p - p + 1)));
00301             }
00302             
00303             p = find_p + 1;
00304             incomplete_line = false;
00305         }
00306         else
00307         {
00308             // Can't find paragraph
00309             size_t start_file_pos = 0;
00310             if (doc.size() > 0)
00311             {
00312                 start_file_pos = doc.back().start_file_pos + doc.back().text->size();
00313             }
00314             doc.push_back(Paragraph(start_file_pos, new std::string(p, end_p - p)));
00315 
00316             // We have an incomplete paragraph, mark it
00317             incomplete_line = true;
00318             break;
00319         }
00320     }
00321 }
00322 
00323 bool TextModel::is_new_search(SearchContext * sc)
00324 {
00325     bool is_new = true;
00326 
00327     if (sc)
00328     {
00329         if (   sc->search_type      == last_sc.search_type
00330             && sc->pattern          == last_sc.pattern
00331             && sc->case_sensitive   == last_sc.case_sensitive
00332             && sc->forward          == last_sc.forward
00333             && sc->match_whole_word == last_sc.match_whole_word
00334             && sc->from             == last_search_result.end)
00335         {
00336             // If the end anchor of last search result is the same as the start position 
00337             // of searching and the other search options are also the same that means 
00338             // it's 'search next again' after finding the text.
00339             is_new = false;
00340         }
00341     }
00342     return is_new;
00343 }
00344 
00345 bool TextModel::search(std::vector<Range>& result_ranges, SearchContext* sc)
00346 {
00347     // Exact search type and search criteria from search context.
00348     SearchType search_type      = sc->search_type;
00349     Position   &from            = sc->from;
00350     const char *pattern         = sc->pattern.c_str();
00351     bool       case_sensitive   = sc->case_sensitive;
00352     bool       forward          = sc->forward;
00353     bool       match_whole_word = sc->match_whole_word;
00354 
00355     size_t pattern_len = strlen(pattern);
00356     const char *paragraph_head = doc[from.paragraph].text->c_str();
00357 
00358     bool new_search = is_new_search(sc);
00359     
00360     // Remember last search context.
00361     last_sc = *sc;
00362 
00363     // Clear the last search result range.
00364     Position s(0, 0), e(0,0);
00365     last_search_result.start = s;
00366     last_search_result.end = e;
00367 
00368     if (forward)
00369     {
00370         const char *p = paragraph_head + from.offset;
00371         if (p && !new_search)
00372         {
00373             // It's searching next again, move the start position of searching one character.
00374             // Otherwize, searching one letter results in deadlock.
00375             //
00376             // See more in  0003861: [UDS TXT plugin] TXT Search next 
00377             // for single character word does not work.
00378             // 
00379             // The solution is very dirty. But txt plugin has low priority 
00380             // which is not specified for dr1000 and dr800. 
00381             // And redefinition what is range takes much more time and has more effects.
00382             p++;
00383         }
00384         while (true)
00385         {
00386             const char* find = utf8_strstr(p, pattern, case_sensitive);
00387             if (find)
00388             {
00389                 // See if matching whole word.
00390                 if (!match_whole_word || is_whole_word(paragraph_head, find, pattern_len))
00391                 {
00392                     // Pattern found.
00393                     Position start(from.paragraph, static_cast<unsigned int>(find - paragraph_head));
00394                     const char* last_char = g_utf8_prev_char(find + pattern_len);
00395                     Position end(from.paragraph, static_cast<unsigned int>(last_char - paragraph_head));
00396                     result_ranges.push_back(Range(start, end));
00397                     
00398                     if (search_type == SEARCH_NEXT)
00399                     {
00400                         // Remember last search result.
00401                         last_search_result.start = start;
00402                         last_search_result.end = end;
00403 
00404                          // Search complete.
00405                         return true;
00406                     }
00407 
00408                     // If SEARCH_ALL we must continue with current paragraph.
00409                 }
00410 
00411                 p = find + pattern_len;
00412             }
00413             else
00414             {
00415                 // Can't find any match in current paragraph.
00416                 from.offset = 0;
00417                 return ++(from.paragraph) == doc.size();
00418             }
00419         }
00420     }
00421     else
00422     {
00423         // Backward search.
00424         int len = static_cast<int>(from.offset);
00425         while (true)
00426         {
00427             const char *find = utf8_strrstr(paragraph_head, len, pattern, case_sensitive);
00428             if (find)
00429             {
00430                 // See if matching whole word.
00431                 if (!match_whole_word || is_whole_word(paragraph_head, find, pattern_len))
00432                 {
00433                     // Pattern found.
00434                     Position start(from.paragraph, static_cast<unsigned int>(find - paragraph_head));
00435                     const char* last_char = g_utf8_prev_char(find + pattern_len);
00436                     Position end(from.paragraph, static_cast<unsigned int>(last_char - paragraph_head));
00437                     result_ranges.push_back(Range(start, end));
00438 
00439                     // Remember last search result.
00440                     last_search_result.start = start;
00441                     last_search_result.end = end;
00442 
00443                     return true;
00444                 }
00445 
00446                 len = static_cast<int>(find - paragraph_head);
00447             }
00448             else
00449             {
00450                 // Can't find any match in current paragraph.
00451                 if (from.paragraph == 0)
00452                 {
00453                     return true;
00454                 }
00455                 else
00456                 {
00457                     from.paragraph--;
00458                     from.offset = static_cast<unsigned int>(doc[from.paragraph].text->size());
00459                     return false;
00460                 }
00461             }
00462         }
00463     }
00464 }
00465 
00466 bool TextModel::has_anchor(const Position &pos)
00467 {
00468     // Sanity check.
00469     if (pos.paragraph >= doc.size() || pos.offset >= doc[pos.paragraph].text->size())
00470     {
00471         return false;
00472     }
00473 
00474     return true;
00475 }
00476 
00477 bool TextModel::get_file_pos_from_anchor(size_t& file_pos, const Position &pos)
00478 {
00479     // Sanity check.
00480     if (pos.paragraph >= doc.size() || pos.offset >= doc[pos.paragraph].text->size())
00481     {
00482         return false;
00483     }
00484 
00485     file_pos = doc[pos.paragraph].start_file_pos + pos.offset;
00486     return true;
00487 }
00488 
00489 bool TextModel::is_seperator(const char* p)
00490 {
00491     gunichar ch = g_utf8_get_char(p);
00492 
00493     if (g_unichar_isspace(ch))
00494     {
00495         return true; 
00496     }
00497 
00498     if (g_unichar_ispunct(ch))
00499     {
00500         // Punctuation.
00501         if (*p != '\'' && *p != '\"')
00502         {
00503             return true;
00504         }
00505     }
00506 
00507     return false;
00508 }
00509 
00510 bool TextModel::get_word_from_anchor(const Position& pos,
00511                                      Position& word_start_pos,
00512                                      Position& word_end_pos)
00513 {
00514     
00515     const char* paragraph = doc[pos.paragraph].text->c_str();
00516     word_start_pos.paragraph = word_end_pos.paragraph = pos.paragraph;
00517 
00518     const char* p = paragraph + pos.offset;
00519 
00520     // Check if the character at pos is a seperator.
00521     if (is_seperator(p))
00522     {
00523         // Then there is no word at pos.
00524         word_start_pos.offset = word_end_pos.offset = pos.offset;
00525         return false;
00526     }
00527 
00528     // Find the first space before pos.
00529     for (; p > paragraph; p = g_utf8_prev_char(p))
00530     {
00531         if (is_seperator(p))
00532         {
00533             p = g_utf8_next_char(p);
00534             break;
00535         }
00536     }
00537     word_start_pos.offset = static_cast<int>(p - paragraph);
00538 
00539     // Find the first space after pos.
00540     for (p = paragraph + pos.offset; *p != 0; p = g_utf8_next_char(p))
00541     {
00542         if (is_seperator(p))
00543         {
00544             p = g_utf8_prev_char(p);
00545             break;
00546         }
00547     }
00548     word_end_pos.offset = static_cast<int>(p - paragraph);
00549 
00550     return true;
00551 }
00552 
00553 bool TextModel::get_words_from_range(const Position& range_start,
00554                                      const Position& range_end,
00555                                      Position& words_start,
00556                                      Position& words_end)
00557 {
00558     if (range_end < range_start)
00559     {
00560         ERRORPRINTF("Invalid range, range_start = %s, range_end = %s",
00561             range_start.to_string().c_str(),
00562             range_end.to_string().c_str());
00563         return false;
00564     }
00565 
00566     Position tmp;
00567 
00568     // Get the object range the range_start anchor points to.
00569     get_word_from_anchor(range_start, words_start, tmp);
00570 
00571     // Get the object range the range_end anchor points to.
00572     get_word_from_anchor(range_end, tmp, words_end);
00573 
00574     // Strip any leading seperators.
00575     const char* start_paragraph = 0;
00576     const char* p = 0;
00577     while (true)
00578     {
00579         start_paragraph = doc[words_start.paragraph].text->c_str();
00580         for (p = start_paragraph + words_start.offset; *p != 0; p = g_utf8_next_char(p))
00581         {
00582             if (!is_seperator(p))
00583             {
00584                 break;
00585             }
00586         }
00587 
00588         if (*p == 0)
00589         {
00590             words_start.paragraph++;
00591             words_start.offset = 0;
00592         }
00593         else
00594         {
00595             break;
00596         }
00597     }
00598     words_start.offset = static_cast<int>(p - start_paragraph);
00599 
00600     // Strip any trailing seperators.
00601     const char* end_paragraph = doc[words_end.paragraph].text->c_str();
00602     for (p = end_paragraph + words_end.offset; p > end_paragraph; p = g_utf8_prev_char(p))
00603     {
00604         if (!is_seperator(p))
00605         {
00606             break;
00607         }
00608     }
00609     words_end.offset = static_cast<int>(p - end_paragraph);
00610 
00611     return words_end >= words_start;
00612 }
00613 
00614 bool TextModel::get_text_from_range(std::string& result,
00615                                     const Position& start_pos,
00616                                     const Position& end_pos)
00617 {
00618     unsigned int start_paragraph = start_pos.paragraph;
00619     unsigned int end_paragraph = end_pos.paragraph;
00620 
00621     for (unsigned int i = start_paragraph; 
00622             (i <= end_paragraph) && (i < doc.size()); 
00623             i++)
00624     {
00625         if (doc[i].text)
00626         {
00627             const char* start_p = doc[i].text->c_str();
00628             if (i == start_paragraph)
00629             {
00630                 start_p += start_pos.offset;
00631             }
00632 
00633             size_t len = doc[i].text->length();
00634             if (i == end_paragraph)
00635             {
00636                 const char* p = doc[i].text->c_str() + end_pos.offset;
00637                 len = g_utf8_next_char(p) - start_p;
00638             }
00639 
00640             result.append(start_p, len);
00641         }
00642     }
00643 
00644     return true;
00645 }
00646 
00647 void TextModel::dump()
00648 {
00649     // Generate the dump file.
00650     std::string dump_path = path + ".converted";
00651     FILE* fp = fopen(dump_path.c_str(), "w");
00652     
00653     if (fp != NULL)
00654     {
00655         for (unsigned int i=0; i<doc.size(); i++)
00656         {
00657             fputs(doc[i].text->c_str(), fp);
00658         }
00659 
00660         fclose(fp);
00661     }
00662 }
00663 
00664 }