structured-text.h 29 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924
  1. // Copyright (C) 2004-2025 Artifex Software, Inc.
  2. //
  3. // This file is part of MuPDF.
  4. //
  5. // MuPDF is free software: you can redistribute it and/or modify it under the
  6. // terms of the GNU Affero General Public License as published by the Free
  7. // Software Foundation, either version 3 of the License, or (at your option)
  8. // any later version.
  9. //
  10. // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
  11. // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  12. // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
  13. // details.
  14. //
  15. // You should have received a copy of the GNU Affero General Public License
  16. // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
  17. //
  18. // Alternative licensing terms are available from the licensor.
  19. // For commercial licensing, see <https://www.artifex.com/> or contact
  20. // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
  21. // CA 94129, USA, for further information.
  22. #ifndef MUPDF_FITZ_STRUCTURED_TEXT_H
  23. #define MUPDF_FITZ_STRUCTURED_TEXT_H
  24. #include "mupdf/fitz/system.h"
  25. #include "mupdf/fitz/types.h"
  26. #include "mupdf/fitz/context.h"
  27. #include "mupdf/fitz/geometry.h"
  28. #include "mupdf/fitz/font.h"
  29. #include "mupdf/fitz/image.h"
  30. #include "mupdf/fitz/output.h"
  31. #include "mupdf/fitz/device.h"
  32. #include "mupdf/fitz/pool.h"
  33. /**
  34. Simple text layout (for use with annotation editing primarily).
  35. */
  36. typedef struct fz_layout_char /* one character of a simple text layout; chained through 'next' */
  37. {
  38. float x, advance; /* presumably the character's x position and its advance width (cf. the x, w arguments of fz_add_layout_char) -- TODO confirm */
  39. const char *p; /* location in source text of character */
  40. struct fz_layout_char *next; /* link to another fz_layout_char; presumably the following character on the same line */
  41. } fz_layout_char;
  42. typedef struct fz_layout_line /* one line of a simple text layout; chained through 'next' */
  43. {
  44. float x, y, font_size; /* presumably the line position and font size (cf. the x, y, h arguments of fz_add_layout_line) -- TODO confirm */
  45. const char *p; /* location in source text of start of line */
  46. fz_layout_char *text; /* characters belonging to this line (linked via fz_layout_char.next) */
  47. struct fz_layout_line *next; /* link to another fz_layout_line; presumably the following line in the block */
  48. } fz_layout_line;
  49. typedef struct /* a layout block: created by fz_new_layout, released by fz_drop_layout */
  50. {
  51. fz_pool *pool; /* allocation pool; created with the block and freed by fz_drop_layout */
  52. fz_matrix matrix; /* zeroed by fz_new_layout; meaning is set by the caller -- not shown in this header */
  53. fz_matrix inv_matrix; /* presumably the inverse of 'matrix' -- TODO confirm */
  54. fz_layout_line *head, **tailp; /* head: first line; tailp: presumably the link slot where the next line is appended */
  55. fz_layout_char **text_tailp; /* presumably the link slot where the next char of the last line is appended */
  56. } fz_layout_block;
  57. /**
  58. Create a new layout block, with new allocation pool, zero
  59. matrices, and initialise linked pointers.
  60. */
  61. fz_layout_block *fz_new_layout(fz_context *ctx);
  62. /**
  63. Drop layout block. Free the pool, and linked blocks.
  64. Never throws exceptions.
  65. */
  66. void fz_drop_layout(fz_context *ctx, fz_layout_block *block);
  67. /**
  68. Add a new line to the end of the layout block.
  69. */
  70. void fz_add_layout_line(fz_context *ctx, fz_layout_block *block, float x, float y, float h, const char *p);
  71. /**
  72. Add a new char to the line at the end of the layout block.
  73. */
  74. void fz_add_layout_char(fz_context *ctx, fz_layout_block *block, float x, float w, const char *p);
  75. /**
  76. Text extraction device: Used for searching, format conversion etc.
  77. (In development - Subject to change in future versions)
  78. */
  79. typedef struct fz_stext_char fz_stext_char;
  80. typedef struct fz_stext_line fz_stext_line;
  81. typedef struct fz_stext_block fz_stext_block;
  82. typedef struct fz_stext_struct fz_stext_struct;
  83. typedef struct fz_stext_grid_positions fz_stext_grid_positions;
  84. /**
  85. FZ_STEXT_PRESERVE_LIGATURES: If this option is activated
  86. ligatures are passed through to the application in their
  87. original form. If this option is deactivated ligatures are
  88. expanded into their constituent parts, e.g. the ligature ffi is
  89. expanded into three separate characters f, f and i.
  90. FZ_STEXT_PRESERVE_WHITESPACE: If this option is activated
  91. whitespace is passed through to the application in its original
  92. form. If this option is deactivated any type of horizontal
  93. whitespace (including horizontal tabs) will be replaced with
  94. space characters of variable width.
  95. FZ_STEXT_PRESERVE_IMAGES: If this option is set, then images
  96. will be stored in the structured text structure. The default is
  97. to ignore all images.
  98. FZ_STEXT_INHIBIT_SPACES: If this option is set, we will not try
  99. to add missing space characters where there are large gaps
  100. between characters.
  101. FZ_STEXT_DEHYPHENATE: If this option is set, hyphens at the
  102. end of a line will be removed and the lines will be merged.
  103. FZ_STEXT_PRESERVE_SPANS: If this option is set, spans on the same line
  104. will not be merged. Each line will thus be a span of text with the same
  105. font, colour, and size.
  106. FZ_STEXT_CLIP: If this option is set, characters that would be entirely
  107. clipped away by the current clipping path (or, more accurately, the smallest
  108. bbox that contains the current clipping path) will be ignored. The
  109. clip path is guaranteed to be smaller than the page mediabox, hence
  110. this option subsumes an older, now deprecated, FZ_STEXT_MEDIABOX_CLIP
  111. option.
  112. FZ_STEXT_CLIP_RECT: If this option is set, characters that would be entirely
  113. clipped away by the specified 'clip' rectangle in the options struct
  114. will be ignored. This enables content from specific subsections of pages to
  115. be extracted.
  116. FZ_STEXT_COLLECT_STRUCTURE: If this option is set, we will collect
  117. the structure as specified using begin/end_structure calls. This will
  118. change the returned stext structure from being a simple list of blocks
  119. into effectively being a 'tree' that should be walked in depth-first
  120. order.
  121. FZ_STEXT_COLLECT_VECTORS: If this option is set, we will collect
  122. details (currently just the bbox) of vector graphics. This is intended
  123. to be of use in segmentation analysis.
  124. FZ_STEXT_IGNORE_ACTUALTEXT: If this option is set, we will no longer
  125. replace text by the ActualText replacement specified in the document.
  126. FZ_STEXT_SEGMENT: If this option is set, we will attempt to segment
  127. the page into different regions. This will deliberately not do anything
  128. to pages with structure information present.
  129. FZ_STEXT_PARAGRAPH_BREAK: If this option is set, we will break blocks
  130. of text at what appear to be paragraph boundaries. This only works
  131. for left-to-right, top-to-bottom paragraphs. Works best on a segmented
  132. page.
  133. FZ_STEXT_TABLE_HUNT: If this option is set, we will hunt for tables
  134. within the stext. Details of the potential tables found will be
  135. inserted into the stext for the caller to interpret. This will work
  136. best on a segmented page.
  137. FZ_STEXT_USE_CID_FOR_UNKNOWN_UNICODE: If this option is set, then
  138. in the event that we fail to find a unicode value for a given
  139. character, we will instead return its CID in the unicode field. We
  140. will set the FZ_STEXT_UNICODE_IS_CID bit in the char flags word to
  141. indicate that this has happened.
  142. FZ_STEXT_USE_GID_FOR_UNKNOWN_UNICODE: If this option is set, then
  143. in the event that we fail to find a unicode value for a given
  144. character, we will instead return its glyph in the unicode field.
  145. We will set the FZ_STEXT_UNICODE_IS_GID bit in the char flags word
  146. to indicate that this has happened.
  147. Setting both FZ_STEXT_USE_CID_FOR_UNKNOWN_UNICODE and
  148. FZ_STEXT_USE_GID_FOR_UNKNOWN_UNICODE will give undefined behaviour.
  149. */
  150. enum /* stext option flags; every value is a distinct power of two */
  151. {
  152. FZ_STEXT_PRESERVE_LIGATURES = 1,
  153. FZ_STEXT_PRESERVE_WHITESPACE = 2,
  154. FZ_STEXT_PRESERVE_IMAGES = 4,
  155. FZ_STEXT_INHIBIT_SPACES = 8,
  156. FZ_STEXT_DEHYPHENATE = 16,
  157. FZ_STEXT_PRESERVE_SPANS = 32,
  158. FZ_STEXT_CLIP = 64,
  159. FZ_STEXT_USE_CID_FOR_UNKNOWN_UNICODE = 128,
  160. FZ_STEXT_COLLECT_STRUCTURE = 256,
  161. FZ_STEXT_ACCURATE_BBOXES = 512, /* NOTE(review): not described in the option list preceding this enum */
  162. FZ_STEXT_COLLECT_VECTORS = 1024,
  163. FZ_STEXT_IGNORE_ACTUALTEXT = 2048,
  164. FZ_STEXT_SEGMENT = 4096,
  165. FZ_STEXT_PARAGRAPH_BREAK = 8192,
  166. FZ_STEXT_TABLE_HUNT = 16384,
  167. FZ_STEXT_COLLECT_STYLES = 32768, /* NOTE(review): not described in the option list preceding this enum */
  168. FZ_STEXT_USE_GID_FOR_UNKNOWN_UNICODE = 65536,
  169. FZ_STEXT_CLIP_RECT = (1<<17), /* 131072 */
  170. FZ_STEXT_ACCURATE_ASCENDERS = (1<<18), /* 262144; NOTE(review): not described in the option list preceding this enum */
  171. FZ_STEXT_ACCURATE_SIDE_BEARINGS = (1<<19), /* 524288; NOTE(review): not described in the option list preceding this enum */
  172. /* An old, deprecated option. Kept as an alias with the same value as FZ_STEXT_CLIP. */
  173. FZ_STEXT_MEDIABOX_CLIP = FZ_STEXT_CLIP
  174. };
  175. /**
  176. * A note on stext's handling of structure.
  177. *
  178. * A PDF document can contain a structure tree. This gives the
  179. * structure of a document in its entirety as a tree. e.g.
  180. *
  181. * Tree MCID INDEX
  182. * -------------------------------------
  183. * DOC 0 0
  184. * TOC 1 0
  185. * TOC_ITEM 2 0
  186. * TOC_ITEM 3 1
  187. * TOC_ITEM 4 2
  188. * ...
  189. * STORY 100 1
  190. * SECTION 101 0
  191. * HEADING 102 0
  192. * SUBSECTION 103 1
  193. * PARAGRAPH 104 0
  194. * PARAGRAPH 105 1
  195. * PARAGRAPH 106 2
  196. * SUBSECTION 107 2
  197. * PARAGRAPH 108 0
  198. * PARAGRAPH 109 1
  199. * PARAGRAPH 110 2
  200. * ...
  201. * SECTION 200 1
  202. * ...
  203. *
  204. * Each different section of the tree is identified as part of an
  205. * MCID by a number (this is a slight simplification, but makes the
  206. * explanation easier).
  207. *
  208. * The PDF document contains markings that say "Entering MCID 0"
  209. * and "Leaving MCID 0". Any content within that region is therefore
  210. * identified as appearing in that particular structural region.
  211. *
  212. * This means that content can be sent in the document in a different
  213. * order to which it appears 'logically' in the tree.
  214. *
  215. * MuPDF converts this tree form into a nested series of calls to
  216. * begin_structure and end_structure.
  217. *
  218. * For instance, if the document started out with MCID 100, then
  219. * we'd send:
  220. * begin_structure("DOC")
  221. * begin_structure("STORY")
  222. *
  223. * The problem with this is that if we send:
  224. * begin_structure("DOC")
  225. * begin_structure("STORY")
  226. * begin_structure("SECTION")
  227. * begin_structure("SUBSECTION")
  228. *
  229. * or
  230. * begin_structure("DOC")
  231. * begin_structure("STORY")
  232. * begin_structure("SECTION")
  233. * begin_structure("HEADING")
  234. *
  235. * How do I know what order the SECTION and HEADING should appear in?
  236. * Are they even in the same STORY? Or the same DOC?
  237. *
  238. * Accordingly, every begin_structure is accompanied not only with the
  239. * node type, but with an index. The index is the number of this node
  240. * within this level of the tree. Hence:
  241. *
  242. * begin_structure("DOC", 0)
  243. * begin_structure("STORY", 0)
  244. * begin_structure("SECTION", 0)
  245. * begin_structure("HEADING", 0)
  246. * and
  247. * begin_structure("DOC", 0)
  248. * begin_structure("STORY", 0)
  249. * begin_structure("SECTION", 0)
  250. * begin_structure("SUBSECTION", 1)
  251. *
  252. * are now unambiguous in their describing of the tree.
  253. *
  254. * MuPDF automatically sends the minimal end_structure/begin_structure
  255. * pairs to move us between nodes in the tree.
  256. *
  257. * In order to accommodate this information within the structured text
  258. * data structures an additional block type is used. Previously a
  259. * "page" was just a list of blocks, either text or images. e.g.
  260. *
  261. * [BLOCK:TEXT] <-> [BLOCK:IMG] <-> [BLOCK:TEXT] <-> [BLOCK:TEXT] ...
  262. *
  263. * We now introduce a new type of block, STRUCT, that turns this into
  264. * a tree:
  265. *
  266. * [BLOCK:TEXT] <-> [BLOCK:STRUCT(IDX=0)] <-> [BLOCK:TEXT] <-> ...
  267. * /|\
  268. * [STRUCT:TYPE=DOC] <----
  269. * |
  270. * [BLOCK:TEXT] <-> [BLOCK:STRUCT(IDX=0)] <-> [BLOCK:TEXT] <-> ...
  271. * /|\
  272. * [STRUCT:TYPE=STORY] <--
  273. * |
  274. * ...
  275. *
  276. * Rather than doing a simple linear traversal of the list to extract
  277. * the logical data, a caller now has to do a depth-first traversal.
  278. */
  279. typedef struct /* per-source-page details; fz_stext_page.id_list is an array of these */
  280. {
  281. fz_rect mediabox; /* mediabox of the source page, as passed to fz_new_stext_device_for_page */
  282. int chapter; /* chapter number the page came from (chapter_num of fz_new_stext_device_for_page) */
  283. int page; /* page number the page came from (page_num of fz_new_stext_device_for_page) */
  284. } fz_stext_page_details;
  285. /**
  286. A text page is a list of blocks, together with an overall
  287. bounding box.
  288. The name of this structure is now slightly out of date. It
  289. should really be fz_stext_document, cos it can contain
  290. content from multiple pages.
  291. */
  292. typedef struct /* reference counted; see fz_keep_stext_page / fz_drop_stext_page */
  293. {
  294. int refs; /* reference count */
  295. fz_pool *pool; /* presumably the pool that blocks, lines and chars are allocated from -- TODO confirm */
  296. fz_rect mediabox; /* overall bounding box; unioned with each mediabox passed to fz_new_stext_device_for_page */
  297. fz_stext_block *first_block; /* first top-level block, or the root of the tree when structure is collected */
  298. /* The following fields are only of use to the routines that
  299. * build an fz_stext_page. They change during page construction
  300. * and their meaning is subject to change. These values should
  301. * not be used by anything outside of the stext device. */
  302. fz_stext_block *last_block;
  303. fz_stext_struct *last_struct;
  304. /* An array of fz_stext_page_details */
  305. fz_pool_array *id_list;
  306. } fz_stext_page;
  307. /**
  308. Take a new reference to an fz_stext_page.
  309. */
  310. fz_stext_page *fz_keep_stext_page(fz_context *ctx, fz_stext_page *page);
  311. /**
  312. Helper function to retrieve the details for a given id from a block.
  313. */
  314. fz_stext_page_details *fz_stext_page_details_for_block(fz_context *ctx, fz_stext_page *page, fz_stext_block *block);
  315. enum /* block kinds; presumably the values stored in fz_stext_block.type -- TODO confirm */
  316. {
  317. FZ_STEXT_BLOCK_TEXT = 0,
  318. FZ_STEXT_BLOCK_IMAGE = 1,
  319. FZ_STEXT_BLOCK_STRUCT = 2, /* structure node; see the note on stext's handling of structure */
  320. FZ_STEXT_BLOCK_VECTOR = 3,
  321. FZ_STEXT_BLOCK_GRID = 4
  322. };
  323. enum /* text justification values. NOTE(review): this header does not say which field holds them -- possibly fz_stext_block u.t.flags; confirm */
  324. {
  325. FZ_STEXT_TEXT_JUSTIFY_UNKNOWN = 0,
  326. FZ_STEXT_TEXT_JUSTIFY_LEFT = 1,
  327. FZ_STEXT_TEXT_JUSTIFY_CENTRE = 2,
  328. FZ_STEXT_TEXT_JUSTIFY_RIGHT = 3,
  329. FZ_STEXT_TEXT_JUSTIFY_FULL = 4,
  330. };
  331. enum /* vector flags, distinct powers of two; presumably stored in fz_stext_block u.v.flags -- TODO confirm */
  332. {
  333. /* Indicates that this vector came from a stroked
  334. * path. */
  335. FZ_STEXT_VECTOR_IS_STROKED = 1,
  336. /* Indicates that this vector came from a rectangular
  337. * (axis-aligned) path (or path segment). */
  338. FZ_STEXT_VECTOR_IS_RECTANGLE = 2,
  339. /* Indicates that this vector came from a path
  340. * segment, and more segments from this same path are
  341. * still to come. */
  342. FZ_STEXT_VECTOR_CONTINUES = 4
  343. };
  344. /**
  345. A text block is a list of lines of text (typically a paragraph),
  346. or an image.
  347. */
  348. struct fz_stext_block
  349. {
  350. int type; /* presumably one of the FZ_STEXT_BLOCK_* values; selects the live member of 'u' -- TODO confirm */
  351. int id; /* presumably the id looked up by fz_stext_page_details_for_block -- TODO confirm */
  352. fz_rect bbox;
  353. union {
  354. struct { fz_stext_line *first_line, *last_line; int flags;} t; /* presumably for FZ_STEXT_BLOCK_TEXT: the list of lines */
  355. struct { fz_matrix transform; fz_image *image; } i; /* presumably for FZ_STEXT_BLOCK_IMAGE */
  356. struct { fz_stext_struct *down; int index; } s; /* for FZ_STEXT_BLOCK_STRUCT: 'down' leads to the child fz_stext_struct; may be NULL if not yet known */
  357. struct { uint32_t flags; uint32_t argb; } v; /* presumably for FZ_STEXT_BLOCK_VECTOR; flags presumably FZ_STEXT_VECTOR_* bits */
  358. struct { fz_stext_grid_positions *xs; fz_stext_grid_positions *ys; } b; /* presumably for FZ_STEXT_BLOCK_GRID -- TODO confirm */
  359. } u;
  360. fz_stext_block *prev, *next; /* neighbouring blocks at the same level (doubly linked list) */
  361. };
  362. /**
  363. A text line is a list of characters that share a common baseline.
  364. */
  365. struct fz_stext_line
  366. {
  367. int wmode; /* 0 for horizontal, 1 for vertical */
  368. fz_point dir; /* normalized direction of baseline */
  369. fz_rect bbox; /* presumably the bounding box of the line -- TODO confirm coordinate space */
  370. fz_stext_char *first_char, *last_char; /* ends of this line's char list (chars are linked via fz_stext_char.next) */
  371. fz_stext_line *prev, *next; /* neighbouring lines (doubly linked list) */
  372. };
  373. /**
  374. A text char is a unicode character, the style in which it
  375. appears, and the point at which it is positioned.
  376. */
  377. struct fz_stext_char
  378. {
  379. int c; /* unicode character value (may instead hold a CID or GID; see FZ_STEXT_UNICODE_IS_CID / FZ_STEXT_UNICODE_IS_GID in flags) */
  380. uint16_t bidi; /* even for LTR, odd for RTL - probably only needs 8 bits? */
  381. uint16_t flags; /* char flags word; holds FZ_STEXT_UNICODE_IS_CID/GID and presumably the other char flag bits declared after this struct */
  382. uint32_t argb; /* sRGB hex color (alpha in top 8 bits, then r, then g, then b in low bits) */
  383. fz_point origin; /* the point at which the character is positioned */
  384. fz_quad quad; /* presumably the quad bounding the glyph -- TODO confirm */
  385. float size; /* presumably the font size -- TODO confirm units */
  386. fz_font *font;
  387. fz_stext_char *next; /* following char in the list (singly linked) */
  388. };
  389. enum /* bits for the fz_stext_char flags word; all values fit in its uint16_t */
  390. {
  391. FZ_STEXT_STRIKEOUT = 1,
  392. FZ_STEXT_UNDERLINE = 2,
  393. FZ_STEXT_SYNTHETIC = 4,
  394. FZ_STEXT_BOLD = 8, /* Either real or 'fake' bold */
  395. FZ_STEXT_FILLED = 16,
  396. FZ_STEXT_STROKED = 32,
  397. FZ_STEXT_CLIPPED = 64,
  398. FZ_STEXT_UNICODE_IS_CID = 128, /* set when FZ_STEXT_USE_CID_FOR_UNKNOWN_UNICODE put the CID in the unicode field */
  399. FZ_STEXT_UNICODE_IS_GID = 256, /* set when FZ_STEXT_USE_GID_FOR_UNKNOWN_UNICODE put the glyph in the unicode field */
  400. };
  401. /**
  402. When we are collecting the structure information from
  403. PDF structure trees/tags, we end up with a tree of
  404. nodes. The structure should be walked in depth-first
  405. traversal order to extract the content.
  406. An fz_stext_struct pointer can be NULL to indicate that
  407. we know there is a child there within the complete tree,
  408. but we don't know what it is yet.
  409. */
  410. struct fz_stext_struct
  411. {
  412. /* up points to the block that contains this fz_stext_struct. */
  413. fz_stext_block *up;
  414. /* parent points to the struct that has up as one of its children.
  415. * parent is useful for doing depth first traversal without having
  416. * to store the entire chain of structs in the iterator. */
  417. fz_stext_struct *parent;
  418. /* first_block points to the first child of this node (or NULL
  419. * if there are none). */
  420. fz_stext_block *first_block;
  421. /* last_block points to the last child of this node (or NULL
  422. * if there are none). */
  423. fz_stext_block *last_block;
  424. /* We have a set of 'standard' structure types. Every structure
  425. * element should correspond to one of these. */
  426. fz_structure standard;
  427. /* Documents can use their own non-standard structure types, which
  428. * are held as 'raw' strings. */
  429. char raw[FZ_FLEXIBLE_ARRAY]; /* trailing variable-length storage, so it must stay the last member; presumably NUL-terminated -- TODO confirm */
  430. };
  431. /* An example to show how fz_stext_blocks and fz_stext_structs interact:
  432. *
  433. * [fz_stext_page]
  434. * |
  435. * first_block|
  436. * |
  437. * \|/
  438. * [fz_stext_block:TEXT]<->[fz_stext_block:STRUCT]<->[fz_stext_block:IMG]
  439. * u.s.down| /|\
  440. * | |
  441. * \|/ |up
  442. * [fz_stext_struct]<---------.
  443. * | | |
  444. * first_block| |last_block |
  445. * _______________________| | |
  446. * | | |
  447. * | | |
  448. * \|/ \|/ |
  449. * [fz_stext_block:...]<->...<->[fz_stext_block:STRUCT] |
  450. * | /|\ |
  451. * u.s.down| |up |
  452. * \|/ | parent|
  453. * [fz_stext_struct]--------'
  454. * | |
  455. * first_block| |last_block
  456. * : :
  457. */
  458. struct fz_stext_grid_positions /* referenced as 'xs' and 'ys' from fz_stext_block u.b */
  459. {
  460. int len; /* presumably the number of entries in 'list' -- TODO confirm */
  461. int max_uncertainty; /* presumably the largest 'uncertainty' among the entries -- TODO confirm */
  462. struct {
  463. int reinforcement; /* NOTE(review): meaning not documented in this header */
  464. float pos; /* presumably the grid line position, with min/max giving its permitted range -- TODO confirm */
  465. float min;
  466. float max;
  467. int uncertainty; /* NOTE(review): meaning not documented in this header */
  468. } list[FZ_FLEXIBLE_ARRAY]; /* trailing variable-length array, so it must stay the last member */
  469. };
  470. FZ_DATA extern const char *fz_stext_options_usage;
  471. /**
  472. Create an empty text page.
  473. The text page is filled out by the text device to contain the
  474. blocks and lines of text on the page.
  475. mediabox: optional mediabox information.
  476. */
  477. fz_stext_page *fz_new_stext_page(fz_context *ctx, fz_rect mediabox);
  478. void fz_drop_stext_page(fz_context *ctx, fz_stext_page *page);
  479. /**
  480. Output structured text to a file in HTML (visual) format.
  481. */
  482. void fz_print_stext_page_as_html(fz_context *ctx, fz_output *out, fz_stext_page *page, int id);
  483. void fz_print_stext_header_as_html(fz_context *ctx, fz_output *out);
  484. void fz_print_stext_trailer_as_html(fz_context *ctx, fz_output *out);
  485. /**
  486. Output structured text to a file in XHTML (semantic) format.
  487. */
  488. void fz_print_stext_page_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_page *page, int id);
  489. void fz_print_stext_header_as_xhtml(fz_context *ctx, fz_output *out);
  490. void fz_print_stext_trailer_as_xhtml(fz_context *ctx, fz_output *out);
  491. /**
  492. Output structured text to a file in XML format.
  493. */
  494. void fz_print_stext_page_as_xml(fz_context *ctx, fz_output *out, fz_stext_page *page, int id);
  495. /**
  496. Output structured text to a file in JSON format.
  497. */
  498. void fz_print_stext_page_as_json(fz_context *ctx, fz_output *out, fz_stext_page *page, float scale);
  499. /**
  500. Output structured text to a file in plain-text UTF-8 format.
  501. */
  502. void fz_print_stext_page_as_text(fz_context *ctx, fz_output *out, fz_stext_page *page);
  503. /**
  504. Search for occurrence of 'needle' in text page.
  505. Return the number of quads and store hit quads in the passed in
  506. array.
  507. NOTE: This is an experimental interface and subject to change
  508. without notice.
  509. */
  510. int fz_search_stext_page(fz_context *ctx, fz_stext_page *text, const char *needle, int *hit_mark, fz_quad *hit_bbox, int hit_max);
  511. /**
  512. Callback function for use in searching.
  513. Called with the list of quads that correspond to a single hit.
  514. The callback should return with 0 to continue the search, or 1 to abort it.
  515. All other values are reserved at this point.
  516. */
  517. typedef int (fz_search_callback_fn)(fz_context *ctx, void *opaque, int num_quads, fz_quad *hit_bbox);
  518. /**
  519. Search for occurrence of 'needle' in text page.
  520. Call callback once for each hit. This callback will receive
  521. (potentially) multiple quads for each hit.
  522. Returns the number of hits - note that this is potentially
  523. different from (i.e. is not greater than) the number of quads
  524. as returned by the non callback API.
  525. NOTE: This is an experimental interface and subject to change
  526. without notice.
  527. */
  528. int fz_search_stext_page_cb(fz_context *ctx, fz_stext_page *text, const char *needle, fz_search_callback_fn *cb, void *opaque);
  529. /**
  530. Return a list of quads to highlight lines inside the selection
  531. points.
  532. */
  533. int fz_highlight_selection(fz_context *ctx, fz_stext_page *page, fz_point a, fz_point b, fz_quad *quads, int max_quads);
  534. enum /* selection granularity; presumably the 'mode' argument of fz_snap_selection -- TODO confirm */
  535. {
  536. FZ_SELECT_CHARS, /* 0 */
  537. FZ_SELECT_WORDS, /* 1 */
  538. FZ_SELECT_LINES, /* 2 */
  539. };
  540. fz_quad fz_snap_selection(fz_context *ctx, fz_stext_page *page, fz_point *ap, fz_point *bp, int mode);
  541. /**
  542. Return a newly allocated UTF-8 string with the text for a given
  543. selection.
  544. crlf: If true, write "\r\n" style line endings (otherwise "\n"
  545. only).
  546. */
  547. char *fz_copy_selection(fz_context *ctx, fz_stext_page *page, fz_point a, fz_point b, int crlf);
  548. /**
  549. Return a newly allocated UTF-8 string with the text for a given
  550. selection rectangle.
  551. crlf: If true, write "\r\n" style line endings (otherwise "\n"
  552. only).
  553. */
  554. char *fz_copy_rectangle(fz_context *ctx, fz_stext_page *page, fz_rect area, int crlf);
  555. /**
  556. Options for creating structured text.
  557. */
  558. typedef struct /* can be filled in from an option string by fz_parse_stext_options */
  559. {
  560. int flags; /* presumably a combination of the FZ_STEXT_* option flags -- TODO confirm */
  561. float scale; /* NOTE(review): meaning not documented in this header */
  562. fz_rect clip; /* the 'clip' rectangle used when FZ_STEXT_CLIP_RECT is set */
  563. } fz_stext_options;
  564. /**
  565. Parse stext device options from a comma separated key-value
  566. string.
  567. */
  568. fz_stext_options *fz_parse_stext_options(fz_context *ctx, fz_stext_options *opts, const char *string);
  569. /**
  570. Perform segmentation analysis on an (unstructured) page to look for
  571. recursive subdivisions.
  572. Essentially this code attempts to split the page horizontally and/or
  573. vertically repeatedly into smaller and smaller "segments" (divisions).
  574. This minimises the reordering of the content, but some reordering
  575. may be unavoidable.
  576. Returns 0 if no changes were made to the document.
  577. This is experimental code, and may change (or be removed) in future
  578. versions!
  579. */
  580. int fz_segment_stext_page(fz_context *ctx, fz_stext_page *page);
  581. /**
  582. Perform segmentation analysis on a rectangle of a given
  583. stext page.
  584. Like fz_segment_stext_page, this attempts to split the given page
  585. region horizontally and/or vertically repeatedly into smaller and
  586. smaller "segments".
  587. This works for pages with structure too, but splitting with
  588. rectangles that cut across structure blocks may not behave as
  589. expected.
  590. This minimises the reordering of the content (as viewed from the
  591. perspective of a depth first traversal), but some reordering may
  592. be unavoidable.
  593. This function accepts smaller gaps for segmentation than the full
  594. page segmentation does.
  595. Returns 0 if no changes were made to the document.
  596. This is experimental code, and may change (or be removed) in future
  597. versions!
  598. */
  599. int fz_segment_stext_rect(fz_context *ctx, fz_stext_page *page, fz_rect rect);
  600. /**
  601. Attempt to break paragraphs at plausible places.
  602. */
  603. void fz_paragraph_break(fz_context *ctx, fz_stext_page *page);
  604. /**
  605. Hunt for possible tables on a page, and update the stext with
  606. information.
  607. */
  608. void fz_table_hunt(fz_context *ctx, fz_stext_page *page);
  609. /**
  610. Hunt for possible tables within a specific rect on a page, and
  611. update the stext with information.
  612. */
  613. void fz_table_hunt_within_bounds(fz_context *ctx, fz_stext_page *page, fz_rect bounds);
  614. /**
  615. Interpret the bounded contents of a given stext page as
  616. a table.
  617. The page contents will be rewritten to contain a Table
  618. structure with the identified content in it.
  619. This uses the same logic as for fz_table_hunt, without the
  620. actual hunting. fz_table_hunt hunts to find possible bounds
  621. for multiple tables on the page; this routine just finds a
  622. single table contained within the given rectangle.
  623. Returns the stext_block list that contains the content of
  624. the table.
  625. */
  626. fz_stext_block *
  627. fz_find_table_within_bounds(fz_context *ctx, fz_stext_page *page, fz_rect bounds);
  628. /**
  629. Create a device to extract the text on a page.
  630. Gather the text on a page into blocks and lines.
  631. The reading order is taken from the order the text is drawn in
  632. the source file, so may not be accurate.
  633. page: The text page to which content should be added. This will
  634. usually be a newly created (empty) text page, but it can be one
  635. containing data already (for example when merging multiple
  636. pages, or watermarking).
  637. options: Options to configure the stext device.
  638. */
  639. fz_device *fz_new_stext_device(fz_context *ctx, fz_stext_page *page, const fz_stext_options *options);
  640. /**
  641. Create a device to extract the text on a page into an existing
  642. fz_stext_page structure.
  643. Gather the text on a page into blocks and lines.
  644. The reading order is taken from the order the text is drawn in
  645. the source file, so may not be accurate.
  646. stext_page: The text page to which content should be added. This will
  647. usually be a newly created (empty) text page, but it can be one
  648. containing data already (for example when merging multiple
  649. pages, or watermarking).
  650. options: Options to configure the stext device.
  651. The next 2 parameters are copied into the fz_stext_page structure's
  652. ids section, so only have to be valid if you expect to interrogate
  653. that section later.
  654. chapter_num: The chapter number that this page came from.
  655. page_num: The page number that this page came from.
  656. The final parameter is copied into the fz_stext_page structure's
  657. ids section. The mediabox for the entire fz_stext_page is unioned
  658. with this, so pass fz_empty_rect if you don't care about getting
  659. a valid value back from the ids section, but you don't want to
  660. upset the value in the page->mediabox field.
  661. mediabox: The mediabox for this page.
  662. */
  663. fz_device *
  664. fz_new_stext_device_for_page(fz_context *ctx, fz_stext_page *stext_page, const fz_stext_options *opts, int chapter_num, int page_num, fz_rect mediabox);
  665. /**
  666. Create a device to OCR the text on the page.
  667. Renders the page internally to a bitmap that is then OCRd. Text
  668. is then forwarded onto the target device.
  669. target: The target device to receive the OCRd text.
  670. ctm: The transform to apply to the mediabox to get the size for
  671. the rendered page image. Also used to calculate the resolution
  672. for the page image. In general, this will be the same as the CTM
  673. that you pass to fz_run_page (or fz_run_display_list) to feed
  674. this device.
  675. mediabox: The mediabox (in points). Combined with the CTM to get
  676. the bounds of the pixmap used internally for the rendered page
  677. image.
  678. with_list: If with_list is false, then all non-text operations
  679. are forwarded immediately to the target device. This results in
  680. the target device seeing all the non-text operations, followed by
  681. all the text operations (derived from OCR).
  682. If with_list is true, then all the marking operations are
  683. collated into a display list which is then replayed to the
  684. target device at the end.
  685. language: NULL (for "eng"), or a pointer to a string describing
  686. the languages/scripts that should be used for OCR (e.g.
  687. "eng,ara").
  688. datadir: NULL (for ""), or a pointer to a path string that would
  689. otherwise be provided to Tesseract in the TESSDATA_PREFIX environment variable.
  690. progress: NULL, or a function to be called periodically to indicate
  691. progress. Return 0 to continue, or 1 to cancel. progress_arg is
  692. passed to it as the void * argument. The int is a value between
  693. 0 and 100 to indicate progress.
  694. progress_arg: A void * value to be passed back unchanged to the
  695. progress function.
  696. */
  697. fz_device *fz_new_ocr_device(fz_context *ctx, fz_device *target, fz_matrix ctm, fz_rect mediabox, int with_list, const char *language,
  698. const char *datadir, int (*progress)(fz_context *, void *, int), void *progress_arg);
  /*
  NOTE(review): this declaration carries no documentation here. From the
  signature it takes an underlying document and a set of stext options
  and returns an fz_document *; presumably a reflowed view of 'underdoc'
  -- confirm the exact behaviour and the ownership of 'underdoc' against
  the implementation.
  */
  699. fz_document *fz_open_reflowed_document(fz_context *ctx, fz_document *underdoc, const fz_stext_options *opts);
  700. /*
  701. Allocator function to make a new STRUCT stext block to be used in
  702. a given page (and its 'down' structure, initially empty). The new
  703. block is not linked in to the overall page structure yet.
  704. */
  705. fz_stext_block *fz_new_stext_struct(fz_context *ctx, fz_stext_page *page, fz_structure standard, const char *raw, int index);
  706. /* Iterators for walking over stext pages */
  707. /*
  708. Iterator definition. The contents of this struct are subject to change.
  709. */
  710. typedef struct
  711. {
  712. fz_stext_page *page; /* NOTE(review): presumably the page the iterator was begun on -- confirm. */
  713. fz_stext_block *pos; /* Current block; NULL when there is no block to point at. */
  714. fz_stext_struct *parent; /* NOTE(review): presumably the structure enclosing pos -- confirm. */
  715. } fz_stext_page_block_iterator;
  716. /*
  717. Create a new iterator (returned by value), initialised to point at the first block on the page.
  718. */
  719. fz_stext_page_block_iterator fz_stext_page_block_iterator_begin(fz_stext_page *page);
  720. /*
  721. Move to the next block (never moving upwards).
  722. If there is no next block, the returned iterator has its pos field set to NULL.
  723. */
  724. fz_stext_page_block_iterator fz_stext_page_block_iterator_next(fz_stext_page_block_iterator pos);
  725. /*
  726. On a structure block, this moves the iterator down to the first child of
  727. that structure block.
  728. On any other kind of block, this does nothing.
  729. */
  730. fz_stext_page_block_iterator fz_stext_page_block_iterator_down(fz_stext_page_block_iterator pos);
  731. /*
  732. Move up to the parent of the current block.
  733. If there is no parent, the returned iterator has its pos field set to NULL.
  734. */
  735. fz_stext_page_block_iterator fz_stext_page_block_iterator_up(fz_stext_page_block_iterator pos);
  736. /*
  737. Move to the next block, in the style of a depth-first traversal.
  738. The iterator never stops on struct blocks; instead it steps into them.
  739. At the end of a set of child blocks, it will move back to the parent and
  740. continue from there.
  741. */
  742. fz_stext_page_block_iterator fz_stext_page_block_iterator_next_dfs(fz_stext_page_block_iterator pos);
  743. /*
  744. Return true if the iterator is at the end of a list of blocks.
  745. (No attempt is made to account for whether there is more data after a
  746. parent block; compare fz_stext_page_block_iterator_eod_dfs.)
  747. */
  748. int fz_stext_page_block_iterator_eod(fz_stext_page_block_iterator pos);
  749. /*
  750. Return true if the iterator is at the end of a depth-first traversal
  751. of the stext page (see fz_stext_page_block_iterator_next_dfs).
  752. */
  753. int fz_stext_page_block_iterator_eod_dfs(fz_stext_page_block_iterator pos);
  754. /*
  755. Update a given stext page so that the contents within it that fall
  756. within the given rectangle are contained within a structure tag of the
  757. given classification.
  758. The code tries not to change the ordering of content, as seen from
  759. a depth-first traversal, as it does this.
  760. This is an experimental interface. It may be updated or removed in
  761. the future with no warning!
  762. */
  763. void
  764. fz_classify_stext_rect(fz_context *ctx, fz_stext_page *page, fz_structure classification, fz_rect rect);
  765. /*
  766. Remove any prefix of large white rectangular vectors that (almost)
  767. fill the page from the stext. NOTE(review): the meaning of the int return value is not documented here -- confirm.
  768. This is an experimental interface. It may be updated or removed in
  769. the future with no warning!
  770. */
  771. int
  772. fz_stext_remove_page_fill(fz_context *ctx, fz_stext_page *page);
  773. #endif