text_model.h

Go to the documentation of this file.
00001 /*
00002  * File Name: text_model.h
00003  */
00004 
00005 /*
00006  * This file is part of uds-plugin-plaintext.
00007  *
00008  * uds-plugin-plaintext is free software: you can redistribute it and/or modify
00009  * it under the terms of the GNU General Public License as published by
00010  * the Free Software Foundation, either version 2 of the License, or
00011  * (at your option) any later version.
00012  *
00013  * uds-plugin-plaintext is distributed in the hope that it will be useful,
00014  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00015  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
00016  * GNU General Public License for more details.
00017  *
00018  * You should have received a copy of the GNU General Public License
00019  * along with this program. If not, see <http://www.gnu.org/licenses/>.
00020  */
00021 
00022 /**
00023  * Copyright (C) 2008 iRex Technologies B.V.
00024  * All rights reserved.
00025  */
00026 
00027 #ifndef TEXT_MODEL_H
00028 #define TEXT_MODEL_H
00029 
00030 #include <string>
00031 #include <vector>
00032 #include <cassert>
00033 #include <glib.h>
00034 #include <iconv.h>
00035 #include "plugin_inc.h"
00036 #include "text_base_types.h"
00037 #include "signal_slot.h"
00038 
00039 namespace text
00040 {
00041 
00042 #define DEFAULT_ENCODING "iso8859-1"
00043 
00044 class TextModel
00045 {
00046 public:
00047     /// @brief Constructors and destructors
00048     TextModel();
00049     ~TextModel();
00050 
00051 public:
00052     /// @brief Open specified document with default encoding.
00053     PluginStatus open(const std::string& path);
00054 
00055     /// @brief Open document with specified encoding
00056     PluginStatus open(const std::string& path, const std::string& encoding);
00057 
00058     /// @brief Check if document is already open
00059     bool is_open() const
00060     {
00061         return b_open;
00062     }
00063 
00064     /// @brief Close document
00065     void close();
00066 
00067     /// @brief Get current encoding
00068     const std::string& get_encoding() const
00069     {
00070         return encoding;
00071     }
00072 
00073     const std::string& get_path() const
00074     {
00075         return path;
00076     }
00077 
00078     /// @brief Get number of paragraphs
00079     unsigned int get_paragraph_count() const
00080     {
00081         return static_cast<unsigned int>(doc.size());
00082     }
00083 
00084     /// @brief Get 1 paragraph
00085     const std::string* get_paragraph(unsigned int index) const
00086     {
00087         assert(index < doc.size());
00088         return doc[index].text;
00089     }
00090 
00091     /// @brief Search specified pattern given by search criteria.
00092     /// @param result_ranges Output range collection.
00093     /// @param search_context Search criteria.
00094     /// @return Return true if search is complete (i.e. we reach the start/end
00095     ///  of the document, or find an occurrence if SearchType is SEARCH_NEXT),
00096     ///  otherwise false is returned, user needs to call this function again to get
00097     ///  the search procedure complete.
00098     bool search(std::vector<Range>& result_ranges, SearchContext* sc);
00099 
00100     /// @brief Check if the document contains the anchor or not.
00101     bool has_anchor(const Position &pos);
00102 
00103     /// @brief Get absolute file position from anchor.
00104     bool get_file_pos_from_anchor(size_t& file_pos, const Position &pos);
00105 
00106     /// @brief Get a word from specified document position.
00107     bool get_word_from_anchor(const Position& pos,
00108                               Position& word_start_pos,
00109                               Position& word_end_pos);
00110 
00111     /// @brief Get the words from range, the range will be extended/shrinked to words boundary.
00112     bool get_words_from_range(const Position& range_start,
00113                               const Position& range_end,
00114                               Position& words_start,
00115                               Position& words_end);
00116 
00117     /// @brief Get the text between start_pos and end_pos.
00118     bool get_text_from_range(std::string& result,
00119                              const Position& start_pos,
00120                              const Position& end_pos);
00121 
00122     /// @brief Dump the content to disk file.
00123     void dump();
00124 
00125     /// @brief Abort specified search task.
00126     void set_aborting_search_task_id(unsigned int id)
00127     {
00128         aborting_search_task_id = id;
00129     }
00130 
00131     unsigned int get_aborting_search_task_id()
00132     {
00133         return aborting_search_task_id;
00134     }
00135 
00136 public:
00137     /// @brief Signals.
00138     utils::Signal<const std::vector<Range>&, const SearchContext*> search_done_signal;
00139 
00140 private:
00141     /// @brief Detect the encoding by reading data from text file.
00142     void detect_encoding();
00143 
00144     /// @brief Clear content read from disk file
00145     void clear();
00146 
00147     /// @breif Read content from disk file to internal vector
00148     PluginStatus read_text();
00149 
00150     /// @brief Read text from UTF-8 encoded text
00151     PluginStatus read_utf8_text();
00152 
00153     /// @brief Read text from UTF-8 encoded text
00154     PluginStatus read_non_utf8_text();
00155 
00156     /// @brief Save block with paragraphs
00157     void save_block_with_paragraphs(const char *blk, size_t blk_size);
00158 
00159     /// @brief A simple wrapper for iconv function
00160     void convert(iconv_t cd, char **in_buf, size_t *in_bytes_left, char **out_buf, size_t *out_bytes_left);
00161 
00162     /// @brief Check the char is a word seperator
00163     bool is_seperator(const char* p);
00164 
00165     /// @brief Judge whether it is a new searhing or searching next again.
00166     bool is_new_search(SearchContext * sc);
00167 
00168 private:
00169     /// @brief File pointer
00170     FILE *file_p;
00171 
00172     /// @brief Current encoding string
00173     std::string encoding;
00174 
00175     /// @brief Document path in file system
00176     std::string path;
00177 
00178     /// @brief Flag indicating that current file is successfully opened
00179     bool b_open;
00180 
00181     /// @brief The id of the search task which needs to be aborted.
00182     unsigned int aborting_search_task_id;
00183   
00184     /// @brief Remember the last search result range. 
00185     Range         last_search_result;
00186 
00187     /// @brief Remember the last search context.
00188     SearchContext last_sc; 
00189 
00190     /// @brief A flag used when we read content from disk file. If the block
00191     ///  contains incomplete paragraph, this variable is set to true, 
00192     ///  otherwise it is true.
00193     bool incomplete_line;
00194 
00195     struct Paragraph
00196     {
00197     public:
00198         size_t       start_file_pos;
00199         std::string* text;
00200 
00201     public:
00202         Paragraph(size_t pos, std::string* _text)
00203         : start_file_pos(pos), text(_text)
00204         {
00205         }
00206     };
00207 
00208     /// @brief text document
00209     typedef std::vector<Paragraph> TextDocument;
00210     typedef TextDocument::iterator TextDocumentIter;
00211 
00212     /// @brief Paragraph array, the paragraph is already in UTF-8 format
00213     TextDocument doc;
00214 };
00215 
00216 };  // text
00217 
00218 #endif // TEXT_MODEL_H
00219 
Generated by  doxygen 1.6.2-20100208