All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
TextExtractor.h
Go to the documentation of this file.
1 //---------------------------------------------------------------------------------------
2 // Copyright (c) 2001-2018 by PDFTron Systems Inc. All Rights Reserved.
3 // Consult legal.txt regarding legal and license information.
4 //---------------------------------------------------------------------------------------
5 #ifndef PDFTRON_H_CPPPDFTextExtractor
6 #define PDFTRON_H_CPPPDFTextExtractor
7 
8 #include <PDF/Page.h>
9 #include <PDF/Rect.h>
10 #include <Common/UString.h>
11 #include <C/PDF/TRN_TextExtractor.h>
12 #include <vector>
13 
14 namespace pdftron {
15  namespace PDF {
16 
102 {
103 public:
104 
// Default constructor and destructor of the extractor.
// NOTE(review): presumably these create and release the underlying
// mp_extractor handle; the definitions are not visible in this header - confirm.
108  TextExtractor();
109  ~TextExtractor();
110 
116  {
117  // Disables expanding of ligatures using a predefined mapping.
118  // Default ligatures are: fi, ff, fl, ffi, ffl, ch, cl, ct, ll,
119  // ss, fs, st, oe, OE.
121 
122  // Disables removing duplicated text that is frequently used to
123  // achieve visual effects of drop shadow and fake bold.
125 
126  // Treat punctuation (e.g. full stop, comma, semicolon, etc.) as
127  // word break characters.
129 
130  // Enables removal of text that is obscured by images or
131  // rectangles. Because this option incurs a small performance
132  // penalty during text extraction, it is not enabled by
133  // default.
135 
136  // Enables removing text that uses rendering mode 3 (i.e. invisible text).
137  // Invisible text is usually used in 'PDF Searchable Images' (i.e. scanned
138  // pages with corresponding OCR text), so invisible text is extracted
139  // by default unless this option is set.
141  };
142 
// Starts processing of the given page.
//   page     - the Page to process (passed by value).
//   clip_ptr - optional clipping rectangle; defaults to a null pointer (0).
//   flags    - option bits; defaults to 0.
//              NOTE(review): presumably a bitwise OR of the processing-flag
//              enumerators declared earlier in the class - confirm.
152  void Begin(Page page, const Rect* clip_ptr = 0, UInt32 flags = 0);
153 
// Word count accessor; the result is an int.
// NOTE(review): presumably only meaningful after Begin() has been called - confirm.
157  int GetWordCount();
158 
// Setter/getter pair for the right-to-left language flag.
// NOTE(review): whether the setter must be called before Begin() to take
// effect is not shown in this header - confirm.
164  void SetRightToLeftLanguage(bool rtl);
169  bool GetRightToLeftLanguage();
// Text output, in two forms:
//   - a value-returning overload that yields a UString, and
//   - an overload writing into the caller-supplied out_str, which is
//     hidden from SWIG-generated bindings by the #ifndef SWIG guard.
// dehyphen defaults to true in both overloads.
// NOTE(review): presumably dehyphen controls re-joining of words split by
// end-of-line hyphens - confirm.
183  UString GetAsText(bool dehyphen = true);
184 
185 #ifndef SWIG
186  void GetAsText(UString& out_str, bool dehyphen = true);
187 #endif
188 
// Text associated with the given annotation, in two forms:
//   - a value-returning overload that yields a UString, and
//   - an overload writing into the caller-supplied out_str, which is
//     hidden from SWIG-generated bindings by the #ifndef SWIG guard.
// NOTE(review): this header does not include an Annot header directly;
// presumably Annot is declared via <PDF/Page.h> - confirm.
194  UString GetTextUnderAnnot(const Annot& annot);
195 
196 #ifndef SWIG
197  void GetTextUnderAnnot(UString& out_str, const Annot& annot);
198 #endif
199 
200 
205  {
206  // Output words as XML elements instead of inline text.
208 
209  // Include bounding box information for each XML element.
210  // The bounding box information will be stored as 'bbox' attribute.
212 
213  // Include font and styling information.
215  };
216 
// XML output, in two forms:
//   - a value-returning overload that yields a UString, and
//   - an overload writing into the caller-supplied out_xml, which is
//     hidden from SWIG-generated bindings by the #ifndef SWIG guard.
// xml_output_flags defaults to 0 in both overloads.
// NOTE(review): presumably a bitwise OR of the XML output flag enumerators
// declared just above (words as elements, bbox, font/style info) - confirm.
259  UString GetAsXML(UInt32 xml_output_flags = 0);
260 
261 #ifndef SWIG
262  void GetAsXML(UString& out_xml, UInt32 xml_output_flags = 0);
263 #endif
264 
// Style: nested class exposing styling attributes of extracted text
// (font, font size, weight, italic/serif flags, color). Only declarations
// appear here.
// NOTE(review): definitions presumably come from Impl/TextExtractor.inl,
// which this header includes near its end - confirm.
270  class Style
271  {
272  public:
273 
     // Font accessor; the result is an SDF::Obj.
280  SDF::Obj GetFont();
281 
286 
     // Font size as a double. NOTE(review): units are not shown here - confirm.
295  double GetFontSize();
296 
     // Font weight as an int. NOTE(review): value range is not shown here - confirm.
305  int GetWeight();
306 
     // Boolean style queries.
311  bool IsItalic();
312 
318  bool IsSerif();
319 
     // Color accessor returning a vector of ints.
     // NOTE(review): presumably three RGB components, mirroring the
     // rgb[3] overload below - confirm.
323  std::vector<int> GetColor();
324 
     // Out-parameter overload writing into a caller-supplied 3-element UInt8
     // array; hidden from SWIG-generated bindings.
325 #ifndef SWIG
326  void GetColor(UInt8 rgb[3]);
327 #endif
328 
     // Comparison operators.
     // NOTE(review): both are declared as non-const member functions, so they
     // cannot be invoked with a const Style on the left-hand side.
329  bool operator== (const Style& s);
330  bool operator!= (const Style& s);
331 
     // Default constructor.
332  Style();
333 
     // Members compiled out when SWIGHIDDEN is defined: copy constructor,
     // construction from a TRN_TextExtractorStyle value, and the mp_style
     // member of that same type.
     // NOTE(review): the single-argument constructor is not 'explicit', so it
     // permits implicit conversion from TRN_TextExtractorStyle.
335  #ifndef SWIGHIDDEN
336  Style(const Style& s);
337  Style(TRN_TextExtractorStyle impl);
338  TRN_TextExtractorStyle mp_style;
339  #endif
340  };
342 
// Word: nested class representing a single extracted word, with accessors
// for glyph count, geometry (bounding box and quads), style, the word's
// characters, and the following word. Only declarations appear here.
// NOTE(review): definitions presumably come from Impl/TextExtractor.inl,
// which this header includes near its end - confirm.
348  class Word
349  {
350  public:
     // Number of glyphs, as an int.
354  int GetNumGlyphs();
355 
     // Bounding box returned as a Rect.
362  Rect GetBBox();
363 
     // Out-parameter overload writing into a caller-supplied 4-element double
     // array; hidden from SWIG-generated bindings.
     // NOTE(review): element order is not shown here - confirm.
364 #ifndef SWIG
365  void GetBBox(double out_bbox[4]);
366 #endif
367 
     // Quad returned as a vector of doubles.
     // NOTE(review): presumably 8 values, mirroring out_quad[8] below - confirm.
372  std::vector<double> GetQuad();
373 
     // Out-parameter overload writing into a caller-supplied 8-element double
     // array; hidden from SWIG-generated bindings.
374 #ifndef SWIG
375  void GetQuad(double out_quad[8]);
376 #endif
377 
     // Quad for the glyph selected by glyph_idx, as a vector of doubles.
     // NOTE(review): presumably glyph_idx must be in [0, GetNumGlyphs()) - confirm.
383  std::vector<double> GetGlyphQuad(int glyph_idx);
384 
     // Out-parameter overload of the per-glyph quad; hidden from SWIG.
385 #ifndef SWIG
386  void GetGlyphQuad(int glyph_idx, double out_quad[8]);
387 #endif
388 
     // Style of the character selected by char_idx, returned by value.
393  Style GetCharStyle(int char_idx);
394 
     // Style of the word, returned by value.
398  Style GetStyle();
399 
     // String length as an int.
     // NOTE(review): presumably the number of Unicode units available via
     // GetString() - confirm.
403  int GetStringLen();
404 
     // String accessor. SWIG builds see a UString return; all other builds
     // get a const Unicode* instead.
     // NOTE(review): lifetime and termination of the pointed-to buffer are
     // not shown here; presumably pair it with GetStringLen() - confirm.
408 #ifdef SWIG
409  UString GetString();
410 #else
411  const Unicode* GetString();
412 #endif
413 
     // Next word, returned by value.
     // NOTE(review): presumably IsValid() on the result reports whether a
     // next word exists - confirm.
417  Word GetNextWord();
418 
     // Current number as an int. NOTE(review): presumably the index of this
     // word; the numbering base and scope are not shown here - confirm.
424  int GetCurrentNum();
425 
     // Validity query.
429  bool IsValid();
430 
     // Comparison operators.
     // NOTE(review): both are declared as non-const member functions, so they
     // cannot be invoked with a const Word on the left-hand side.
431  bool operator== (const Word&);
432  bool operator!= (const Word&);
     // Default constructor.
433  Word();
434 
     // Members compiled out when SWIGHIDDEN is defined: construction from a
     // TRN_TextExtractorWord value and the mp_word member of that same type.
     // NOTE(review): the single-argument constructor is not 'explicit', so it
     // permits implicit conversion from TRN_TextExtractorWord.
436  #ifndef SWIGHIDDEN
437  Word(TRN_TextExtractorWord impl);
438  TRN_TextExtractorWord mp_word;
439  #endif
440  };
442 
// Line: nested class representing a single extracted line of text, with
// accessors for its words, geometry, style, paragraph/flow identifiers and
// the following line. Only declarations appear here.
// NOTE(review): definitions presumably come from Impl/TextExtractor.inl,
// which this header includes near its end - confirm.
448  class Line {
449  public:
450 
     // Number of words, as an int.
454  int GetNumWords();
455 
     // Boolean query. NOTE(review): the criteria for a "simple" line are not
     // shown in this header - confirm.
460  bool IsSimpleLine();
461 
     // Bounding box accessor. SWIG builds see a Rect return; all other builds
     // get a const double* instead.
     // NOTE(review): element count and lifetime of the pointed-to data are
     // not shown here; presumably 4 values as in Word::GetBBox(double[4]) - confirm.
468 #ifdef SWIG
469  Rect GetBBox();
470 #else
471  const double* GetBBox();
472 #endif
473 
     // Quad returned as a vector of doubles.
     // NOTE(review): presumably 8 values, mirroring out_quad[8] below - confirm.
478  std::vector<double> GetQuad();
479 
     // Out-parameter overload writing into a caller-supplied 8-element double
     // array; hidden from SWIG-generated bindings.
480 #ifndef SWIG
481  void GetQuad(double out_quad[8]);
482 #endif
483 
     // First word, returned by value.
488  Word GetFirstWord();
489 
     // Word selected by word_idx, returned by value.
     // NOTE(review): presumably word_idx must be in [0, GetNumWords()) - confirm.
494  Word GetWord(int word_idx);
495 
     // Next line, returned by value.
     // NOTE(review): presumably IsValid() on the result reports whether a
     // next line exists - confirm.
499  Line GetNextLine();
500 
     // Current number as an int. NOTE(review): presumably the index of this
     // line; the numbering base is not shown here - confirm.
504  int GetCurrentNum();
505 
     // Style of the line, returned by value.
509  Style GetStyle();
510 
     // Paragraph and flow identifiers, both ints.
     // NOTE(review): the meaning and uniqueness scope of these IDs are not
     // shown in this header - confirm.
516  int GetParagraphID();
517 
523  int GetFlowID();
524 
     // Boolean query on whether the line ends with a hyphen.
529  bool EndsWithHyphen();
530 
     // Validity query.
534  bool IsValid();
535 
     // Comparison operators.
     // NOTE(review): both are declared as non-const member functions, so they
     // cannot be invoked with a const Line on the left-hand side.
536  bool operator== (const Line&);
537  bool operator!= (const Line&);
     // Default constructor.
538  Line();
539 
     // Members compiled out when SWIGHIDDEN is defined: construction from a
     // TRN_TextExtractorLine value and the mp_line member of that same type.
     // NOTE(review): the single-argument constructor is not 'explicit', so it
     // permits implicit conversion from TRN_TextExtractorLine.
541  #ifndef SWIGHIDDEN
542  Line(TRN_TextExtractorLine impl);
543  TRN_TextExtractorLine mp_line;
544  #endif
545  };
547 
// Line count accessor; the result is an int.
// NOTE(review): presumably only meaningful after Begin() has been called - confirm.
551  int GetNumLines();
552 
553 
// First line, returned by value as a Line.
// NOTE(review): presumably the starting point for iterating with
// Line::GetNextLine() until Line::IsValid() is false - confirm.
560  Line GetFirstLine();
561 
// Explicit teardown entry point.
// NOTE(review): presumably releases the underlying mp_extractor handle ahead
// of the destructor; whether the object is usable afterwards is not shown
// in this header - confirm.
565  void Destroy();
566 
// Private section: the mp_extractor member (type TRN_TextExtractor) plus the
// copy constructor and copy assignment operator, which are declared private
// so that code outside the class cannot copy a TextExtractor.
// NOTE(review): whether these two are also left undefined is not visible
// from this header - confirm.
568 private:
569  TRN_TextExtractor mp_extractor;
570 
571  // TextExtractor should not be copied
572  TextExtractor(const TextExtractor& other);
573  TextExtractor& operator= (const TextExtractor&);
575 };
575 };
576 
577 
578 
579 #include <Impl/TextExtractor.inl>
580 
581  }; // namespace PDF
582 }; // namespace pdftron
583 
584 #endif // PDFTRON_H_CPPPDFTextExtractor
UString GetAsText(bool dehyphen=true)
bool operator==(const Style &s)
bool operator!=(const Style &s)
TRN_UInt8 UInt8
Definition: BasicTypes.h:15
Style GetCharStyle(int char_idx)
TRN_Unicode Unicode
Definition: BasicTypes.h:22
TRN_UInt32 UInt32
Definition: BasicTypes.h:13
UString GetTextUnderAnnot(const Annot &annot)
std::vector< double > GetQuad()
std::vector< double > GetQuad()
void Begin(Page page, const Rect *clip_ptr=0, UInt32 flags=0)
UString GetAsXML(UInt32 xml_output_flags=0)
std::vector< double > GetGlyphQuad(int glyph_idx)
void SetRightToLeftLanguage(bool rtl)

© 2002-2014 PDFTron Systems Inc.