pdf_searcher.cpp

Go to the documentation of this file.
00001 /*
00002  * File Name: pdf_searcher.cpp
00003  */
00004 
00005 /*
00006  * This file is part of uds-plugin-pdf.
00007  *
00008  * uds-plugin-pdf is free software: you can redistribute it and/or modify
00009  * it under the terms of the GNU General Public License as published by
00010  * the Free Software Foundation, either version 2 of the License, or
00011  * (at your option) any later version.
00012  *
00013  * uds-plugin-pdf is distributed in the hope that it will be useful,
00014  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00015  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
00016  * GNU General Public License for more details.
00017  *
00018  * You should have received a copy of the GNU General Public License
00019  * along with this program. If not, see <http://www.gnu.org/licenses/>.
00020  */
00021 
00022 /**
00023  * Copyright (C) 2008 iRex Technologies B.V.
00024  * All rights reserved.
00025  */
00026 
00027 #include "pdf_doc_controller.h"
00028 #include "pdf_searcher.h"
00029 #include "pdf_anchor.h"
00030 #include "pdf_search_task.h"
00031 
00032 
00033 namespace pdf
00034 {
00035 
00036 using namespace std;
00037 
/**
 * Tell whether a character is a separator that the search-text parser
 * skips.
 *
 * Only the plain space character (' ') is treated as ignorable.
 *
 * @param c The character to examine.
 * @return true when c is a space; false for any other character.
 */
bool is_ignore_char(const char c)
{
    const char separator = ' ';
    return separator == c;
}
00042 
/**
 * Tell whether a character marks the end of a C string.
 *
 * @param c The character to examine.
 * @return true when c is the NUL terminator ('\0'); false otherwise.
 */
bool is_end_char(const char c)
{
    const char terminator = '\0';
    return terminator == c;
}
00047 
00048 void PDFSearcher::clear_search_ctx()
00049 {
00050     search_ctx.dst_words.clear();
00051 }
00052 
00053 bool PDFSearcher::begin_search_next(const PDFSearchCriteria &criteria
00054                                     , const string &from_anchor)
00055 {
00056     PDFAnchor from_param(from_anchor);
00057     if (from_param.page_num <= 0 
00058         || from_param.page_num > 
00059         static_cast<int>(doc_controller->page_count()))
00060     {
00061         from_param.page_num = 1;
00062     }
00063 
00064     clear_search_ctx();
00065     // construct the search context
00066     
00067     search_ctx.case_sensitive = criteria.case_sensitive;
00068     search_ctx.match_whole_word = criteria.match_whole_word;
00069     search_ctx.page_num = from_param.page_num;
00070     search_ctx.forward = criteria.forward;
00071     search_ctx.search_all = false;
00072     search_ctx.word_cursor = from_param.word_num;
00073     search_ctx.char_cursor = from_param.char_idx;
00074 
00075     if (search_ctx.forward)
00076     {
00077         // move forward the start char index
00078         search_ctx.char_cursor++;
00079     }
00080     else
00081     {
00082         // move backward the start char index
00083         // NOTE: We should care about the boundary problem
00084         // the start anchor might be the last word and the last char
00085         search_ctx.char_cursor--;
00086     }
00087 
00088     parse_dst_string(criteria.text, search_ctx.dst_words);
00089 
00090     return true;
00091 }
00092 
00093 bool PDFSearcher::begin_search_all(const PDFSearchCriteria &criteria)
00094 {
00095     clear_search_ctx();
00096 
00097     // construct the search context
00098     search_ctx.case_sensitive = criteria.case_sensitive;
00099     search_ctx.match_whole_word = criteria.match_whole_word;
00100     
00101     // start from the first page
00102     search_ctx.page_num = 1;
00103     //search_ctx.forward = criteria.forward;
00104     search_ctx.forward = true;
00105     search_ctx.search_all = true;
00106     
00107     // start from the first word
00108     search_ctx.word_cursor = 0;
00109     search_ctx.char_cursor = 0;
00110     parse_dst_string(criteria.text, search_ctx.dst_words);
00111 
00112     return true;
00113 }
00114 
00115 SearchResult PDFSearcher::search_next(PDFSearchDocument &results
00116                                       , PDFSearchTask *task)
00117 {
00118 
00119     PDFSearchPage *search_page = new PDFSearchPage;
00120     SearchResult res = RES_NOT_FOUND;
00121     while(res != RES_OK
00122           && search_ctx.page_num > 0
00123           && search_ctx.page_num <=
00124              static_cast<int>(doc_controller->page_count()))
00125     {
00126         res = search_current_page(search_ctx, *search_page);
00127 
00128         if (res != RES_OK)
00129         {
00130             // forward : increase page number; otherwise decrease page number
00131             search_ctx.forward ? search_ctx.page_num++
00132                 : search_ctx.page_num--;
00133 
00134             // if it is not the first page, start from the first word if forward
00135             // else start from the last word
00136             search_ctx.word_cursor = -1;
00137         }
00138 
00139         // abort current task
00140         if (task->is_aborted())
00141         {
00142             LOGPRINTF("Task Search Next canceled!\n");
00143             res = RES_ABORTED;
00144             break;
00145         }
00146         else if (task->is_paused())
00147         {
00148             LOGPRINTF("Task Search Next paused!\n");
00149             res = RES_PAUSED;
00150             break;
00151         }
00152     }
00153 
00154     if (res != RES_OK)
00155     {
00156         delete search_page;
00157     }
00158     else
00159     {
00160         search_page->set_element(search_ctx.page_num);
00161         results.add(search_page);
00162     }
00163 
00164     return res;
00165 
00166 }
00167 
00168 SearchResult PDFSearcher::seach_all(PDFSearchDocument &results
00169                                     , PDFSearchTask *task)
00170 {
00171     // return code of this function
00172     SearchResult res = RES_NOT_FOUND;
00173 
00174     // return code of searching every page
00175     SearchResult res_once = res;
00176 
00177     PDFSearchPage *search_page = new PDFSearchPage;
00178 
00179     while ( search_ctx.page_num > 0
00180             && search_ctx.page_num <=
00181                static_cast<int>(doc_controller->page_count()))
00182     {
00183         
00184         // search the whole page if it is not the current page
00185         res_once = search_current_page(search_ctx, *search_page);
00186 
00187         if (res_once == RES_OK)
00188         {
00189             search_page->set_element(search_ctx.page_num);
00190             results.add(search_page);
00191             search_page = new PDFSearchPage;
00192         }
00193         
00194         // forward : increase page number; otherwise decrease page number
00195         search_ctx.page_num++;
00196  
00197         // reset the index of start word to be 0
00198         search_ctx.word_cursor = 0;
00199 
00200          // abort current task
00201         if (task->is_aborted())
00202         {
00203             LOGPRINTF("Task Search All canceled!\n");
00204             res = RES_ABORTED;
00205             break;
00206         }
00207         else if (task->is_paused())
00208         {
00209             LOGPRINTF("Task Search All paused!\n");
00210             res = RES_PAUSED;
00211             break;
00212         }
00213     }
00214 
00215     delete search_page;
00216 
00217     if (results.size() > 0)
00218     {
00219         res = RES_OK;
00220     }
00221     
00222     return res;
00223 }
00224 
00225 bool PDFSearcher::dump_search_process(string &anchor)
00226 {
00227     PDFAnchor process;
00228     process.page_num = search_ctx.page_num;
00229     process.word_num = search_ctx.word_cursor;
00230     process.char_idx = search_ctx.char_cursor;
00231     //process.file_name = get_doc_ctrl()->name();
00232 
00233     anchor = process.get_string();
00234     return true;
00235 }
00236 
00237 SearchResult PDFSearcher::search_current_page(SearchContext &ctx
00238                                               , PDFSearchPage &results)
00239 {
00240     // get the page directly from cache, we don't have to consider the
00241     // layout
00242     PagePtr cur_page = doc_controller->get_page(ctx.page_num);
00243 
00244     // if the page is cached in the rendering cache, return it
00245     if (cur_page == 0)
00246     {
00247         cur_page = doc_controller->get_renderer()->gen_page(ctx.page_num);
00248         if (cur_page == 0)
00249         {
00250             return RES_ERROR;
00251         }
00252     }
00253 
00254     TextPage* text_page = cur_page->get_text();
00255     bool need_remove_text = false;
00256     if (text_page == 0)
00257     {
00258         // render the text of current page
00259         cur_page->render_text(doc_controller->get_renderer(), true);
00260         text_page = cur_page->get_text();
00261         if (text_page == 0)
00262         {
00263             return RES_ERROR;
00264         }
00265         need_remove_text = true;
00266     }
00267 
00268     SearchResult res = cur_page->search(ctx, results);
00269 
00270     if (need_remove_text)
00271     {
00272         cur_page->destroy_text();
00273     }
00274 
00275     return res;
00276 }
00277 
00278 void PDFSearcher::notify(SearchResult ret_code, PDFSearchDocument &results
00279                          , unsigned int search_id)
00280 {
00281     // export the results from PDFSearchDocument to PDFRangeCollection
00282     PDFRangeCollection *coll = new PDFRangeCollection;
00283     export_search_doc_to_coll(results, *coll);
00284 
00285     doc_controller->sig_search_results_ready.broadcast(ret_code, coll
00286         , search_id);
00287 }
00288 
00289 void PDFSearcher::parse_dst_string(const string &dst_str, stringlist &str_list)
00290 {
00291     typedef enum
00292     {
00293         PARSE_WORD = 0,
00294         PARSE_IGNORE_CHAR
00295     }ParseStatus;
00296 
00297     const char *pchar = dst_str.c_str();
00298 
00299     //Parse the words
00300     string word;
00301     ParseStatus status = PARSE_IGNORE_CHAR;
00302     while(!is_end_char(*pchar))
00303     {
00304         //If the character is one of the ignore ones, ignore it.
00305         if (is_ignore_char(*pchar))
00306         {
00307             if (status == PARSE_WORD)
00308             {
00309                 status = PARSE_IGNORE_CHAR;
00310                 str_list.push_back(word);
00311                 word.clear();
00312             }
00313         }
00314         else
00315         {
00316             if (status == PARSE_IGNORE_CHAR)
00317             {
00318                 status = PARSE_WORD;
00319             }
00320             word.push_back(*pchar);
00321         }
00322         pchar++;
00323     }
00324 
00325     if (status == PARSE_WORD)
00326     {
00327         // add the last word
00328         str_list.push_back(word);
00329     }
00330 }
00331 
00332 void PDFSearcher::export_search_doc_to_coll(PDFSearchDocument &doc
00333                                             , PDFRangeCollection &collection)
00334 {
00335     for(int i = 0; i < doc.size(); ++i)
00336     {
00337         PDFSearchPage *page = doc.get(i);
00338         for(int k = 0; k < page->size(); ++k)
00339         {
00340             PluginRangeImpl *range = page->get(k);
00341             if (range == 0)
00342             {
00343                 break;
00344             }
00345 
00346             PluginRangeImpl *save_range = new PluginRangeImpl;
00347             save_range->start_anchor = new StringImpl(
00348                 range->start_anchor->get_buffer(range->start_anchor));
00349             save_range->end_anchor = new StringImpl(
00350                 range->end_anchor->get_buffer(range->end_anchor));
00351 
00352             collection.add(save_range);
00353         }
00354         page->clear();
00355     }
00356     doc.clear();
00357 }
00358 
00359 } // namespace pdf
00360 
00361