pdf_searcher.cpp
Go to the documentation of this file.00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027 #include "pdf_doc_controller.h"
00028 #include "pdf_searcher.h"
00029 #include "pdf_anchor.h"
00030 #include "pdf_search_task.h"
00031
00032
00033 namespace pdf
00034 {
00035
00036 using namespace std;
00037
// Reports whether `c` is a character that is skipped between words of a
// search string. Only the plain space character (' ') qualifies.
bool is_ignore_char(const char c)
{
    const char separator = ' ';
    return separator == c;
}
00042
// Reports whether `c` is the NUL terminator that ends a C string.
bool is_end_char(const char c)
{
    const char terminator = '\0';
    return terminator == c;
}
00047
00048 void PDFSearcher::clear_search_ctx()
00049 {
00050 search_ctx.dst_words.clear();
00051 }
00052
00053 bool PDFSearcher::begin_search_next(const PDFSearchCriteria &criteria
00054 , const string &from_anchor)
00055 {
00056 PDFAnchor from_param(from_anchor);
00057 if (from_param.page_num <= 0
00058 || from_param.page_num >
00059 static_cast<int>(doc_controller->page_count()))
00060 {
00061 from_param.page_num = 1;
00062 }
00063
00064 clear_search_ctx();
00065
00066
00067 search_ctx.case_sensitive = criteria.case_sensitive;
00068 search_ctx.match_whole_word = criteria.match_whole_word;
00069 search_ctx.page_num = from_param.page_num;
00070 search_ctx.forward = criteria.forward;
00071 search_ctx.search_all = false;
00072 search_ctx.word_cursor = from_param.word_num;
00073 search_ctx.char_cursor = from_param.char_idx;
00074
00075 if (search_ctx.forward)
00076 {
00077
00078 search_ctx.char_cursor++;
00079 }
00080 else
00081 {
00082
00083
00084
00085 search_ctx.char_cursor--;
00086 }
00087
00088 parse_dst_string(criteria.text, search_ctx.dst_words);
00089
00090 return true;
00091 }
00092
00093 bool PDFSearcher::begin_search_all(const PDFSearchCriteria &criteria)
00094 {
00095 clear_search_ctx();
00096
00097
00098 search_ctx.case_sensitive = criteria.case_sensitive;
00099 search_ctx.match_whole_word = criteria.match_whole_word;
00100
00101
00102 search_ctx.page_num = 1;
00103
00104 search_ctx.forward = true;
00105 search_ctx.search_all = true;
00106
00107
00108 search_ctx.word_cursor = 0;
00109 search_ctx.char_cursor = 0;
00110 parse_dst_string(criteria.text, search_ctx.dst_words);
00111
00112 return true;
00113 }
00114
00115 SearchResult PDFSearcher::search_next(PDFSearchDocument &results
00116 , PDFSearchTask *task)
00117 {
00118
00119 PDFSearchPage *search_page = new PDFSearchPage;
00120 SearchResult res = RES_NOT_FOUND;
00121 while(res != RES_OK
00122 && search_ctx.page_num > 0
00123 && search_ctx.page_num <=
00124 static_cast<int>(doc_controller->page_count()))
00125 {
00126 res = search_current_page(search_ctx, *search_page);
00127
00128 if (res != RES_OK)
00129 {
00130
00131 search_ctx.forward ? search_ctx.page_num++
00132 : search_ctx.page_num--;
00133
00134
00135
00136 search_ctx.word_cursor = -1;
00137 }
00138
00139
00140 if (task->is_aborted())
00141 {
00142 LOGPRINTF("Task Search Next canceled!\n");
00143 res = RES_ABORTED;
00144 break;
00145 }
00146 else if (task->is_paused())
00147 {
00148 LOGPRINTF("Task Search Next paused!\n");
00149 res = RES_PAUSED;
00150 break;
00151 }
00152 }
00153
00154 if (res != RES_OK)
00155 {
00156 delete search_page;
00157 }
00158 else
00159 {
00160 search_page->set_element(search_ctx.page_num);
00161 results.add(search_page);
00162 }
00163
00164 return res;
00165
00166 }
00167
00168 SearchResult PDFSearcher::seach_all(PDFSearchDocument &results
00169 , PDFSearchTask *task)
00170 {
00171
00172 SearchResult res = RES_NOT_FOUND;
00173
00174
00175 SearchResult res_once = res;
00176
00177 PDFSearchPage *search_page = new PDFSearchPage;
00178
00179 while ( search_ctx.page_num > 0
00180 && search_ctx.page_num <=
00181 static_cast<int>(doc_controller->page_count()))
00182 {
00183
00184
00185 res_once = search_current_page(search_ctx, *search_page);
00186
00187 if (res_once == RES_OK)
00188 {
00189 search_page->set_element(search_ctx.page_num);
00190 results.add(search_page);
00191 search_page = new PDFSearchPage;
00192 }
00193
00194
00195 search_ctx.page_num++;
00196
00197
00198 search_ctx.word_cursor = 0;
00199
00200
00201 if (task->is_aborted())
00202 {
00203 LOGPRINTF("Task Search All canceled!\n");
00204 res = RES_ABORTED;
00205 break;
00206 }
00207 else if (task->is_paused())
00208 {
00209 LOGPRINTF("Task Search All paused!\n");
00210 res = RES_PAUSED;
00211 break;
00212 }
00213 }
00214
00215 delete search_page;
00216
00217 if (results.size() > 0)
00218 {
00219 res = RES_OK;
00220 }
00221
00222 return res;
00223 }
00224
00225 bool PDFSearcher::dump_search_process(string &anchor)
00226 {
00227 PDFAnchor process;
00228 process.page_num = search_ctx.page_num;
00229 process.word_num = search_ctx.word_cursor;
00230 process.char_idx = search_ctx.char_cursor;
00231
00232
00233 anchor = process.get_string();
00234 return true;
00235 }
00236
00237 SearchResult PDFSearcher::search_current_page(SearchContext &ctx
00238 , PDFSearchPage &results)
00239 {
00240
00241
00242 PagePtr cur_page = doc_controller->get_page(ctx.page_num);
00243
00244
00245 if (cur_page == 0)
00246 {
00247 cur_page = doc_controller->get_renderer()->gen_page(ctx.page_num);
00248 if (cur_page == 0)
00249 {
00250 return RES_ERROR;
00251 }
00252 }
00253
00254 TextPage* text_page = cur_page->get_text();
00255 bool need_remove_text = false;
00256 if (text_page == 0)
00257 {
00258
00259 cur_page->render_text(doc_controller->get_renderer(), true);
00260 text_page = cur_page->get_text();
00261 if (text_page == 0)
00262 {
00263 return RES_ERROR;
00264 }
00265 need_remove_text = true;
00266 }
00267
00268 SearchResult res = cur_page->search(ctx, results);
00269
00270 if (need_remove_text)
00271 {
00272 cur_page->destroy_text();
00273 }
00274
00275 return res;
00276 }
00277
00278 void PDFSearcher::notify(SearchResult ret_code, PDFSearchDocument &results
00279 , unsigned int search_id)
00280 {
00281
00282 PDFRangeCollection *coll = new PDFRangeCollection;
00283 export_search_doc_to_coll(results, *coll);
00284
00285 doc_controller->sig_search_results_ready.broadcast(ret_code, coll
00286 , search_id);
00287 }
00288
00289 void PDFSearcher::parse_dst_string(const string &dst_str, stringlist &str_list)
00290 {
00291 typedef enum
00292 {
00293 PARSE_WORD = 0,
00294 PARSE_IGNORE_CHAR
00295 }ParseStatus;
00296
00297 const char *pchar = dst_str.c_str();
00298
00299
00300 string word;
00301 ParseStatus status = PARSE_IGNORE_CHAR;
00302 while(!is_end_char(*pchar))
00303 {
00304
00305 if (is_ignore_char(*pchar))
00306 {
00307 if (status == PARSE_WORD)
00308 {
00309 status = PARSE_IGNORE_CHAR;
00310 str_list.push_back(word);
00311 word.clear();
00312 }
00313 }
00314 else
00315 {
00316 if (status == PARSE_IGNORE_CHAR)
00317 {
00318 status = PARSE_WORD;
00319 }
00320 word.push_back(*pchar);
00321 }
00322 pchar++;
00323 }
00324
00325 if (status == PARSE_WORD)
00326 {
00327
00328 str_list.push_back(word);
00329 }
00330 }
00331
00332 void PDFSearcher::export_search_doc_to_coll(PDFSearchDocument &doc
00333 , PDFRangeCollection &collection)
00334 {
00335 for(int i = 0; i < doc.size(); ++i)
00336 {
00337 PDFSearchPage *page = doc.get(i);
00338 for(int k = 0; k < page->size(); ++k)
00339 {
00340 PluginRangeImpl *range = page->get(k);
00341 if (range == 0)
00342 {
00343 break;
00344 }
00345
00346 PluginRangeImpl *save_range = new PluginRangeImpl;
00347 save_range->start_anchor = new StringImpl(
00348 range->start_anchor->get_buffer(range->start_anchor));
00349 save_range->end_anchor = new StringImpl(
00350 range->end_anchor->get_buffer(range->end_anchor));
00351
00352 collection.add(save_range);
00353 }
00354 page->clear();
00355 }
00356 doc.clear();
00357 }
00358
00359 }
00360
00361