00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027 #include <stdio.h>
00028 #include <string.h>
00029 #include <glib.h>
00030 #include "text_model.h"
00031 #include "utils.h"
00032 #include "log.h"
00033 #include "nsUniversalDetectorImpl.h"
00034
00035 namespace text
00036 {
00037
00038
00039
// Size in bytes of the sample buffer read by detect_encoding().
static const int SAMPLE_LEN = 4 * 1024;

// Size in bytes of the raw read buffer used by the read_*_text() loops.
static const int BLOCK_SIZE = 4096;

// Codeset the document text is kept in; also the iconv conversion target.
static const std::string TARGET_CODESET("utf-8");
00043
00044 TextModel::TextModel()
00045 : file_p(0)
00046 , encoding("")
00047 , path()
00048 , b_open(false)
00049 , aborting_search_task_id(0)
00050 , incomplete_line(false)
00051 {
00052 }
00053
00054 TextModel::~TextModel()
00055 {
00056 if (b_open)
00057 {
00058 close();
00059 }
00060 }
00061
00062 PluginStatus TextModel::open(const std::string& doc_path)
00063 {
00064 return open(doc_path, "");
00065 }
00066
00067 PluginStatus TextModel::open(const std::string& doc_path,
00068 const std::string& encoding)
00069 {
00070 PluginStatus result = PLUGIN_FAIL;
00071
00072
00073 if (b_open)
00074 {
00075 close();
00076 }
00077
00078
00079 file_p = fopen(doc_path.c_str(), "r");
00080 if (file_p == NULL)
00081 {
00082 return PLUGIN_FAIL;
00083 }
00084
00085
00086 if (encoding.empty())
00087 {
00088 detect_encoding();
00089 }
00090
00091
00092 path = doc_path;
00093 b_open = true;
00094
00095
00096 result = read_text();
00097 if (result != PLUGIN_OK)
00098 {
00099 close();
00100 return result;
00101 }
00102
00103 if (doc.empty())
00104 {
00105 doc.push_back(Paragraph(0, new std::string(" ")));
00106 }
00107
00108 return result;
00109 }
00110
00111 void TextModel::close()
00112 {
00113 clear();
00114 fclose(file_p);
00115 file_p = NULL;
00116 b_open = false;
00117 encoding.clear();
00118 }
00119
00120
00121 void TextModel::detect_encoding()
00122 {
00123 char buf[SAMPLE_LEN];
00124 size_t bytes_read = fread(buf, 1, SAMPLE_LEN, file_p);
00125
00126 nsUniversalDetectorImpl charset_detector;
00127 charset_detector.HandleData(buf, static_cast<unsigned int>(bytes_read));
00128 charset_detector.DataEnd();
00129 encoding = charset_detector.get_enc();
00130
00131 if (encoding.empty())
00132 {
00133
00134 ERRORPRINTF("Can't detect encodings, fall back to ISO8859-1.");
00135 encoding = DEFAULT_ENCODING;
00136 }
00137 }
00138
00139 void TextModel::clear()
00140 {
00141
00142 for (TextDocumentIter it = doc.begin(); it != doc.end(); ++it)
00143 {
00144 delete (*it).text;
00145 }
00146 doc.clear();
00147 }
00148
00149 PluginStatus TextModel::read_text()
00150 {
00151 assert(b_open);
00152
00153 clear();
00154 if (encoding == TARGET_CODESET)
00155 {
00156
00157 return read_utf8_text();
00158 }
00159 else
00160 {
00161
00162 return read_non_utf8_text();
00163 }
00164 }
00165
00166 PluginStatus TextModel::read_utf8_text()
00167 {
00168 char buf[BLOCK_SIZE];
00169 size_t bytes_read = 0;
00170
00171 fseek(file_p, 0, SEEK_SET);
00172 while (!feof(file_p))
00173 {
00174 bytes_read = fread(buf, 1, BLOCK_SIZE, file_p);
00175 save_block_with_paragraphs(buf, bytes_read);
00176 }
00177
00178 return PLUGIN_OK;
00179 }
00180
00181 void TextModel::convert(iconv_t cd, char **in_buf, size_t *in_bytes_left, char **out_buf, size_t *out_bytes_left)
00182 {
00183 while (*in_bytes_left > 3)
00184 {
00185 size_t bytes_to_be_converted = *in_bytes_left;
00186 #ifdef WIN32
00187 iconv(cd, const_cast<const char **>(in_buf), in_bytes_left, out_buf, out_bytes_left);
00188 #else
00189 iconv(cd, in_buf, in_bytes_left, out_buf, out_bytes_left);
00190 #endif
00191
00192 if (*in_bytes_left == bytes_to_be_converted)
00193 {
00194
00195
00196 ++(*in_buf);
00197 --(*in_bytes_left);
00198 *(*out_buf)++ = ' ';
00199 --(*out_bytes_left);
00200 }
00201 }
00202 }
00203
00204 PluginStatus TextModel::read_non_utf8_text()
00205 {
00206
00207 char in_buf[BLOCK_SIZE];
00208 char out_buf[3*BLOCK_SIZE];
00209
00210 size_t partial_chars = 0;
00211 iconv_t conv = iconv_open(TARGET_CODESET.c_str(), encoding.c_str());
00212 if (conv == (iconv_t)(-1))
00213 {
00214 return PLUGIN_UNSUPPORTED_ENCODING;
00215 }
00216
00217 fseek(file_p, 0, SEEK_SET);
00218 while (!feof(file_p))
00219 {
00220
00221 size_t bytes_read = fread(in_buf+partial_chars, 1, BLOCK_SIZE-partial_chars, file_p);
00222
00223 char *in_p = in_buf;
00224 char *out_p = out_buf;
00225
00226
00227
00228 size_t in_bytes_left = bytes_read + partial_chars;
00229 size_t out_bytes_left = sizeof(out_buf);
00230
00231
00232 convert(conv, &in_p, &in_bytes_left, &out_p, &out_bytes_left);
00233
00234
00235 save_block_with_paragraphs(out_buf, sizeof(out_buf)-out_bytes_left);
00236
00237
00238 partial_chars = in_bytes_left;
00239 if (partial_chars > 0)
00240 {
00241 memcpy(in_buf, in_buf+BLOCK_SIZE-partial_chars, partial_chars);
00242 }
00243 }
00244
00245 iconv_close(conv);
00246 return PLUGIN_OK;
00247 }
00248
00249
00250
00251
00252
00253
00254
00255
00256
00257
00258
00259
00260
00261
00262
00263
00264
00265
00266
00267
00268 void TextModel::save_block_with_paragraphs(const char *blk, size_t blk_size)
00269 {
00270 if (doc.empty())
00271 {
00272
00273 incomplete_line = false;
00274 }
00275
00276 const char *end_p = blk + blk_size;
00277 const char *p = blk;
00278
00279 while (p < end_p)
00280 {
00281 gchar* find_p = g_utf8_strchr(p, static_cast<gssize>(end_p - p), '\n');
00282 if (find_p != NULL)
00283 {
00284
00285 if (incomplete_line)
00286 {
00287
00288
00289 doc.back().text->append(p, find_p - p + 1);
00290 }
00291 else
00292 {
00293
00294
00295 size_t start_file_pos = 0;
00296 if (doc.size() > 0)
00297 {
00298 start_file_pos = doc.back().start_file_pos + doc.back().text->size();
00299 }
00300 doc.push_back(Paragraph(start_file_pos, new std::string(p, find_p - p + 1)));
00301 }
00302
00303 p = find_p + 1;
00304 incomplete_line = false;
00305 }
00306 else
00307 {
00308
00309 size_t start_file_pos = 0;
00310 if (doc.size() > 0)
00311 {
00312 start_file_pos = doc.back().start_file_pos + doc.back().text->size();
00313 }
00314 doc.push_back(Paragraph(start_file_pos, new std::string(p, end_p - p)));
00315
00316
00317 incomplete_line = true;
00318 break;
00319 }
00320 }
00321 }
00322
00323 bool TextModel::is_new_search(SearchContext * sc)
00324 {
00325 bool is_new = true;
00326
00327 if (sc)
00328 {
00329 if ( sc->search_type == last_sc.search_type
00330 && sc->pattern == last_sc.pattern
00331 && sc->case_sensitive == last_sc.case_sensitive
00332 && sc->forward == last_sc.forward
00333 && sc->match_whole_word == last_sc.match_whole_word
00334 && sc->from == last_search_result.end)
00335 {
00336
00337
00338
00339 is_new = false;
00340 }
00341 }
00342 return is_new;
00343 }
00344
00345 bool TextModel::search(std::vector<Range>& result_ranges, SearchContext* sc)
00346 {
00347
00348 SearchType search_type = sc->search_type;
00349 Position &from = sc->from;
00350 const char *pattern = sc->pattern.c_str();
00351 bool case_sensitive = sc->case_sensitive;
00352 bool forward = sc->forward;
00353 bool match_whole_word = sc->match_whole_word;
00354
00355 size_t pattern_len = strlen(pattern);
00356 const char *paragraph_head = doc[from.paragraph].text->c_str();
00357
00358 bool new_search = is_new_search(sc);
00359
00360
00361 last_sc = *sc;
00362
00363
00364 Position s(0, 0), e(0,0);
00365 last_search_result.start = s;
00366 last_search_result.end = e;
00367
00368 if (forward)
00369 {
00370 const char *p = paragraph_head + from.offset;
00371 if (p && !new_search)
00372 {
00373
00374
00375
00376
00377
00378
00379
00380
00381
00382 p++;
00383 }
00384 while (true)
00385 {
00386 const char* find = utf8_strstr(p, pattern, case_sensitive);
00387 if (find)
00388 {
00389
00390 if (!match_whole_word || is_whole_word(paragraph_head, find, pattern_len))
00391 {
00392
00393 Position start(from.paragraph, static_cast<unsigned int>(find - paragraph_head));
00394 const char* last_char = g_utf8_prev_char(find + pattern_len);
00395 Position end(from.paragraph, static_cast<unsigned int>(last_char - paragraph_head));
00396 result_ranges.push_back(Range(start, end));
00397
00398 if (search_type == SEARCH_NEXT)
00399 {
00400
00401 last_search_result.start = start;
00402 last_search_result.end = end;
00403
00404
00405 return true;
00406 }
00407
00408
00409 }
00410
00411 p = find + pattern_len;
00412 }
00413 else
00414 {
00415
00416 from.offset = 0;
00417 return ++(from.paragraph) == doc.size();
00418 }
00419 }
00420 }
00421 else
00422 {
00423
00424 int len = static_cast<int>(from.offset);
00425 while (true)
00426 {
00427 const char *find = utf8_strrstr(paragraph_head, len, pattern, case_sensitive);
00428 if (find)
00429 {
00430
00431 if (!match_whole_word || is_whole_word(paragraph_head, find, pattern_len))
00432 {
00433
00434 Position start(from.paragraph, static_cast<unsigned int>(find - paragraph_head));
00435 const char* last_char = g_utf8_prev_char(find + pattern_len);
00436 Position end(from.paragraph, static_cast<unsigned int>(last_char - paragraph_head));
00437 result_ranges.push_back(Range(start, end));
00438
00439
00440 last_search_result.start = start;
00441 last_search_result.end = end;
00442
00443 return true;
00444 }
00445
00446 len = static_cast<int>(find - paragraph_head);
00447 }
00448 else
00449 {
00450
00451 if (from.paragraph == 0)
00452 {
00453 return true;
00454 }
00455 else
00456 {
00457 from.paragraph--;
00458 from.offset = static_cast<unsigned int>(doc[from.paragraph].text->size());
00459 return false;
00460 }
00461 }
00462 }
00463 }
00464 }
00465
00466 bool TextModel::has_anchor(const Position &pos)
00467 {
00468
00469 if (pos.paragraph >= doc.size() || pos.offset >= doc[pos.paragraph].text->size())
00470 {
00471 return false;
00472 }
00473
00474 return true;
00475 }
00476
00477 bool TextModel::get_file_pos_from_anchor(size_t& file_pos, const Position &pos)
00478 {
00479
00480 if (pos.paragraph >= doc.size() || pos.offset >= doc[pos.paragraph].text->size())
00481 {
00482 return false;
00483 }
00484
00485 file_pos = doc[pos.paragraph].start_file_pos + pos.offset;
00486 return true;
00487 }
00488
00489 bool TextModel::is_seperator(const char* p)
00490 {
00491 gunichar ch = g_utf8_get_char(p);
00492
00493 if (g_unichar_isspace(ch))
00494 {
00495 return true;
00496 }
00497
00498 if (g_unichar_ispunct(ch))
00499 {
00500
00501 if (*p != '\'' && *p != '\"')
00502 {
00503 return true;
00504 }
00505 }
00506
00507 return false;
00508 }
00509
00510 bool TextModel::get_word_from_anchor(const Position& pos,
00511 Position& word_start_pos,
00512 Position& word_end_pos)
00513 {
00514
00515 const char* paragraph = doc[pos.paragraph].text->c_str();
00516 word_start_pos.paragraph = word_end_pos.paragraph = pos.paragraph;
00517
00518 const char* p = paragraph + pos.offset;
00519
00520
00521 if (is_seperator(p))
00522 {
00523
00524 word_start_pos.offset = word_end_pos.offset = pos.offset;
00525 return false;
00526 }
00527
00528
00529 for (; p > paragraph; p = g_utf8_prev_char(p))
00530 {
00531 if (is_seperator(p))
00532 {
00533 p = g_utf8_next_char(p);
00534 break;
00535 }
00536 }
00537 word_start_pos.offset = static_cast<int>(p - paragraph);
00538
00539
00540 for (p = paragraph + pos.offset; *p != 0; p = g_utf8_next_char(p))
00541 {
00542 if (is_seperator(p))
00543 {
00544 p = g_utf8_prev_char(p);
00545 break;
00546 }
00547 }
00548 word_end_pos.offset = static_cast<int>(p - paragraph);
00549
00550 return true;
00551 }
00552
00553 bool TextModel::get_words_from_range(const Position& range_start,
00554 const Position& range_end,
00555 Position& words_start,
00556 Position& words_end)
00557 {
00558 if (range_end < range_start)
00559 {
00560 ERRORPRINTF("Invalid range, range_start = %s, range_end = %s",
00561 range_start.to_string().c_str(),
00562 range_end.to_string().c_str());
00563 return false;
00564 }
00565
00566 Position tmp;
00567
00568
00569 get_word_from_anchor(range_start, words_start, tmp);
00570
00571
00572 get_word_from_anchor(range_end, tmp, words_end);
00573
00574
00575 const char* start_paragraph = 0;
00576 const char* p = 0;
00577 while (true)
00578 {
00579 start_paragraph = doc[words_start.paragraph].text->c_str();
00580 for (p = start_paragraph + words_start.offset; *p != 0; p = g_utf8_next_char(p))
00581 {
00582 if (!is_seperator(p))
00583 {
00584 break;
00585 }
00586 }
00587
00588 if (*p == 0)
00589 {
00590 words_start.paragraph++;
00591 words_start.offset = 0;
00592 }
00593 else
00594 {
00595 break;
00596 }
00597 }
00598 words_start.offset = static_cast<int>(p - start_paragraph);
00599
00600
00601 const char* end_paragraph = doc[words_end.paragraph].text->c_str();
00602 for (p = end_paragraph + words_end.offset; p > end_paragraph; p = g_utf8_prev_char(p))
00603 {
00604 if (!is_seperator(p))
00605 {
00606 break;
00607 }
00608 }
00609 words_end.offset = static_cast<int>(p - end_paragraph);
00610
00611 return words_end >= words_start;
00612 }
00613
00614 bool TextModel::get_text_from_range(std::string& result,
00615 const Position& start_pos,
00616 const Position& end_pos)
00617 {
00618 unsigned int start_paragraph = start_pos.paragraph;
00619 unsigned int end_paragraph = end_pos.paragraph;
00620
00621 for (unsigned int i = start_paragraph;
00622 (i <= end_paragraph) && (i < doc.size());
00623 i++)
00624 {
00625 if (doc[i].text)
00626 {
00627 const char* start_p = doc[i].text->c_str();
00628 if (i == start_paragraph)
00629 {
00630 start_p += start_pos.offset;
00631 }
00632
00633 size_t len = doc[i].text->length();
00634 if (i == end_paragraph)
00635 {
00636 const char* p = doc[i].text->c_str() + end_pos.offset;
00637 len = g_utf8_next_char(p) - start_p;
00638 }
00639
00640 result.append(start_p, len);
00641 }
00642 }
00643
00644 return true;
00645 }
00646
00647 void TextModel::dump()
00648 {
00649
00650 std::string dump_path = path + ".converted";
00651 FILE* fp = fopen(dump_path.c_str(), "w");
00652
00653 if (fp != NULL)
00654 {
00655 for (unsigned int i=0; i<doc.size(); i++)
00656 {
00657 fputs(doc[i].text->c_str(), fp);
00658 }
00659
00660 fclose(fp);
00661 }
00662 }
00663
00664 }