pdf_searcher.cpp
Go to the documentation of this file.00001 
00002 
00003 
00004 
00005 
00006 
00007 
00008 
00009 
00010 
00011 
00012 
00013 
00014 
00015 
00016 
00017 
00018 
00019 
00020 
00021 
00022 
00023 
00024 
00025 
00026 
00027 #include "pdf_doc_controller.h"
00028 #include "pdf_searcher.h"
00029 #include "pdf_anchor.h"
00030 #include "pdf_search_task.h"
00031 
00032 
00033 namespace pdf
00034 {
00035 
00036 using namespace std;
00037 
// Tells whether a character of the search text acts as a word separator.
// Only the plain space character (' ') is treated as such.
//
// c: the character to classify.
// Returns true when c is a space, false for every other character.
bool is_ignore_char(const char c)
{
    const char separator = ' ';
    return separator == c;
}
00042 
// Tells whether a character is the NUL terminator of a C string.
//
// c: the character to classify.
// Returns true only when c is '\0'.
bool is_end_char(const char c)
{
    const bool at_terminator = (c == 0);
    return at_terminator;
}
00047 
00048 void PDFSearcher::clear_search_ctx()
00049 {
00050     search_ctx.dst_words.clear();
00051 }
00052 
00053 bool PDFSearcher::begin_search_next(const PDFSearchCriteria &criteria
00054                                     , const string &from_anchor)
00055 {
00056     PDFAnchor from_param(from_anchor);
00057     if (from_param.page_num <= 0 
00058         || from_param.page_num > 
00059         static_cast<int>(doc_controller->page_count()))
00060     {
00061         from_param.page_num = 1;
00062     }
00063 
00064     clear_search_ctx();
00065     
00066     
00067     search_ctx.case_sensitive = criteria.case_sensitive;
00068     search_ctx.match_whole_word = criteria.match_whole_word;
00069     search_ctx.page_num = from_param.page_num;
00070     search_ctx.forward = criteria.forward;
00071     search_ctx.search_all = false;
00072     search_ctx.word_cursor = from_param.word_num;
00073     search_ctx.char_cursor = from_param.char_idx;
00074 
00075     if (search_ctx.forward)
00076     {
00077         
00078         search_ctx.char_cursor++;
00079     }
00080     else
00081     {
00082         
00083         
00084         
00085         search_ctx.char_cursor--;
00086     }
00087 
00088     parse_dst_string(criteria.text, search_ctx.dst_words);
00089 
00090     return true;
00091 }
00092 
00093 bool PDFSearcher::begin_search_all(const PDFSearchCriteria &criteria)
00094 {
00095     clear_search_ctx();
00096 
00097     
00098     search_ctx.case_sensitive = criteria.case_sensitive;
00099     search_ctx.match_whole_word = criteria.match_whole_word;
00100     
00101     
00102     search_ctx.page_num = 1;
00103     
00104     search_ctx.forward = true;
00105     search_ctx.search_all = true;
00106     
00107     
00108     search_ctx.word_cursor = 0;
00109     search_ctx.char_cursor = 0;
00110     parse_dst_string(criteria.text, search_ctx.dst_words);
00111 
00112     return true;
00113 }
00114 
00115 SearchResult PDFSearcher::search_next(PDFSearchDocument &results
00116                                       , PDFSearchTask *task)
00117 {
00118 
00119     PDFSearchPage *search_page = new PDFSearchPage;
00120     SearchResult res = RES_NOT_FOUND;
00121     while(res != RES_OK
00122           && search_ctx.page_num > 0
00123           && search_ctx.page_num <=
00124              static_cast<int>(doc_controller->page_count()))
00125     {
00126         res = search_current_page(search_ctx, *search_page);
00127 
00128         if (res != RES_OK)
00129         {
00130             
00131             search_ctx.forward ? search_ctx.page_num++
00132                 : search_ctx.page_num--;
00133 
00134             
00135             
00136             search_ctx.word_cursor = -1;
00137         }
00138 
00139         
00140         if (task->is_aborted())
00141         {
00142             LOGPRINTF("Task Search Next canceled!\n");
00143             res = RES_ABORTED;
00144             break;
00145         }
00146         else if (task->is_paused())
00147         {
00148             LOGPRINTF("Task Search Next paused!\n");
00149             res = RES_PAUSED;
00150             break;
00151         }
00152     }
00153 
00154     if (res != RES_OK)
00155     {
00156         delete search_page;
00157     }
00158     else
00159     {
00160         search_page->set_element(search_ctx.page_num);
00161         results.add(search_page);
00162     }
00163 
00164     return res;
00165 
00166 }
00167 
00168 SearchResult PDFSearcher::seach_all(PDFSearchDocument &results
00169                                     , PDFSearchTask *task)
00170 {
00171     
00172     SearchResult res = RES_NOT_FOUND;
00173 
00174     
00175     SearchResult res_once = res;
00176 
00177     PDFSearchPage *search_page = new PDFSearchPage;
00178 
00179     while ( search_ctx.page_num > 0
00180             && search_ctx.page_num <=
00181                static_cast<int>(doc_controller->page_count()))
00182     {
00183         
00184         
00185         res_once = search_current_page(search_ctx, *search_page);
00186 
00187         if (res_once == RES_OK)
00188         {
00189             search_page->set_element(search_ctx.page_num);
00190             results.add(search_page);
00191             search_page = new PDFSearchPage;
00192         }
00193         
00194         
00195         search_ctx.page_num++;
00196  
00197         
00198         search_ctx.word_cursor = 0;
00199 
00200          
00201         if (task->is_aborted())
00202         {
00203             LOGPRINTF("Task Search All canceled!\n");
00204             res = RES_ABORTED;
00205             break;
00206         }
00207         else if (task->is_paused())
00208         {
00209             LOGPRINTF("Task Search All paused!\n");
00210             res = RES_PAUSED;
00211             break;
00212         }
00213     }
00214 
00215     delete search_page;
00216 
00217     if (results.size() > 0)
00218     {
00219         res = RES_OK;
00220     }
00221     
00222     return res;
00223 }
00224 
00225 bool PDFSearcher::dump_search_process(string &anchor)
00226 {
00227     PDFAnchor process;
00228     process.page_num = search_ctx.page_num;
00229     process.word_num = search_ctx.word_cursor;
00230     process.char_idx = search_ctx.char_cursor;
00231     
00232 
00233     anchor = process.get_string();
00234     return true;
00235 }
00236 
00237 SearchResult PDFSearcher::search_current_page(SearchContext &ctx
00238                                               , PDFSearchPage &results)
00239 {
00240     
00241     
00242     PagePtr cur_page = doc_controller->get_page(ctx.page_num);
00243 
00244     
00245     if (cur_page == 0)
00246     {
00247         cur_page = doc_controller->get_renderer()->gen_page(ctx.page_num);
00248         if (cur_page == 0)
00249         {
00250             return RES_ERROR;
00251         }
00252     }
00253 
00254     TextPage* text_page = cur_page->get_text();
00255     bool need_remove_text = false;
00256     if (text_page == 0)
00257     {
00258         
00259         cur_page->render_text(doc_controller->get_renderer(), true);
00260         text_page = cur_page->get_text();
00261         if (text_page == 0)
00262         {
00263             return RES_ERROR;
00264         }
00265         need_remove_text = true;
00266     }
00267 
00268     SearchResult res = cur_page->search(ctx, results);
00269 
00270     if (need_remove_text)
00271     {
00272         cur_page->destroy_text();
00273     }
00274 
00275     return res;
00276 }
00277 
00278 void PDFSearcher::notify(SearchResult ret_code, PDFSearchDocument &results
00279                          , unsigned int search_id)
00280 {
00281     
00282     PDFRangeCollection *coll = new PDFRangeCollection;
00283     export_search_doc_to_coll(results, *coll);
00284 
00285     doc_controller->sig_search_results_ready.broadcast(ret_code, coll
00286         , search_id);
00287 }
00288 
00289 void PDFSearcher::parse_dst_string(const string &dst_str, stringlist &str_list)
00290 {
00291     typedef enum
00292     {
00293         PARSE_WORD = 0,
00294         PARSE_IGNORE_CHAR
00295     }ParseStatus;
00296 
00297     const char *pchar = dst_str.c_str();
00298 
00299     
00300     string word;
00301     ParseStatus status = PARSE_IGNORE_CHAR;
00302     while(!is_end_char(*pchar))
00303     {
00304         
00305         if (is_ignore_char(*pchar))
00306         {
00307             if (status == PARSE_WORD)
00308             {
00309                 status = PARSE_IGNORE_CHAR;
00310                 str_list.push_back(word);
00311                 word.clear();
00312             }
00313         }
00314         else
00315         {
00316             if (status == PARSE_IGNORE_CHAR)
00317             {
00318                 status = PARSE_WORD;
00319             }
00320             word.push_back(*pchar);
00321         }
00322         pchar++;
00323     }
00324 
00325     if (status == PARSE_WORD)
00326     {
00327         
00328         str_list.push_back(word);
00329     }
00330 }
00331 
00332 void PDFSearcher::export_search_doc_to_coll(PDFSearchDocument &doc
00333                                             , PDFRangeCollection &collection)
00334 {
00335     for(int i = 0; i < doc.size(); ++i)
00336     {
00337         PDFSearchPage *page = doc.get(i);
00338         for(int k = 0; k < page->size(); ++k)
00339         {
00340             PluginRangeImpl *range = page->get(k);
00341             if (range == 0)
00342             {
00343                 break;
00344             }
00345 
00346             PluginRangeImpl *save_range = new PluginRangeImpl;
00347             save_range->start_anchor = new StringImpl(
00348                 range->start_anchor->get_buffer(range->start_anchor));
00349             save_range->end_anchor = new StringImpl(
00350                 range->end_anchor->get_buffer(range->end_anchor));
00351 
00352             collection.add(save_range);
00353         }
00354         page->clear();
00355     }
00356     doc.clear();
00357 }
00358 
00359 } 
00360 
00361