xml.h 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435
  1. // Copyright (C) 2004-2024 Artifex Software, Inc.
  2. //
  3. // This file is part of MuPDF.
  4. //
  5. // MuPDF is free software: you can redistribute it and/or modify it under the
  6. // terms of the GNU Affero General Public License as published by the Free
  7. // Software Foundation, either version 3 of the License, or (at your option)
  8. // any later version.
  9. //
  10. // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
  11. // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  12. // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
  13. // details.
  14. //
  15. // You should have received a copy of the GNU Affero General Public License
  16. // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
  17. //
  18. // Alternative licensing terms are available from the licensor.
  19. // For commercial licensing, see <https://www.artifex.com/> or contact
  20. // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
  21. // CA 94129, USA, for further information.
  22. #ifndef MUPDF_FITZ_XML_H
  23. #define MUPDF_FITZ_XML_H
  24. #include "mupdf/fitz/system.h"
  25. #include "mupdf/fitz/context.h"
  26. #include "mupdf/fitz/buffer.h"
  27. #include "mupdf/fitz/pool.h"
  28. #include "mupdf/fitz/archive.h"
  29. /**
  30. XML document model
  31. */
  32. typedef struct fz_xml fz_xml;
  33. /* For backwards compatibility */
  34. typedef fz_xml fz_xml_doc;
  35. /**
  36. Parse the contents of buffer into a tree of xml nodes.
  37. preserve_white: whether to keep or delete all-whitespace nodes.
  38. */
  39. fz_xml *fz_parse_xml(fz_context *ctx, fz_buffer *buf, int preserve_white);
  40. /**
  41. Parse the contents of stream into a tree of xml nodes.
  42. preserve_white: whether to keep or delete all-whitespace nodes.
  43. */
  44. fz_xml *fz_parse_xml_stream(fz_context *ctx, fz_stream *stream, int preserve_white);
  45. /**
  46. Parse the contents of an archive entry into a tree of xml nodes.
  47. preserve_white: whether to keep or delete all-whitespace nodes.
  48. */
  49. fz_xml *fz_parse_xml_archive_entry(fz_context *ctx, fz_archive *dir, const char *filename, int preserve_white);
  50. /**
  51. Try to parse the contents of an archive entry into a tree of xml nodes.
  52. preserve_white: whether to keep or delete all-whitespace nodes.
  53. Will return NULL if the archive entry can't be found. Otherwise behaves
  54. the same as fz_parse_xml_archive_entry. May throw exceptions.
  55. */
  56. fz_xml *fz_try_parse_xml_archive_entry(fz_context *ctx, fz_archive *dir, const char *filename, int preserve_white);
  57. /**
  58. Parse the contents of a buffer into a tree of XML nodes,
  59. using the HTML5 parsing algorithm.
  60. */
  61. fz_xml *fz_parse_xml_from_html5(fz_context *ctx, fz_buffer *buf);
  62. /**
  63. Add a reference to the XML.
  64. */
  65. fz_xml *fz_keep_xml(fz_context *ctx, fz_xml *xml);
  66. /**
  67. Drop a reference to the XML. When the last reference is
  68. dropped, the node and all its children and siblings will
  69. be freed.
  70. */
  71. void fz_drop_xml(fz_context *ctx, fz_xml *xml);
  72. /**
  73. Detach a node from the tree, unlinking it from its parent,
  74. and setting the document root to the node.
  75. */
  76. void fz_detach_xml(fz_context *ctx, fz_xml *node);
  77. /**
  78. Return the topmost XML node of a document.
  79. */
  80. fz_xml *fz_xml_root(fz_xml_doc *xml);
  81. /**
  82. Return previous sibling of XML node.
  83. */
  84. fz_xml *fz_xml_prev(fz_xml *item);
  85. /**
  86. Return next sibling of XML node.
  87. */
  88. fz_xml *fz_xml_next(fz_xml *item);
  89. /**
  90. Return parent of XML node.
  91. */
  92. fz_xml *fz_xml_up(fz_xml *item);
  93. /**
  94. Return first child of XML node.
  95. */
  96. fz_xml *fz_xml_down(fz_xml *item);
  97. /**
  98. Return true if the tag name matches.
  99. */
  100. int fz_xml_is_tag(fz_xml *item, const char *name);
  101. /**
  102. Return tag of XML node. Return NULL for text nodes.
  103. */
  104. char *fz_xml_tag(fz_xml *item);
  105. /**
  106. Return the value of an attribute of an XML node.
  107. NULL if the attribute doesn't exist.
  108. */
  109. char *fz_xml_att(fz_xml *item, const char *att);
  110. /**
  111. Return the value of an attribute of an XML node.
  112. If the first attribute doesn't exist, try the second.
  113. NULL if neither attribute exists.
  114. */
  115. char *fz_xml_att_alt(fz_xml *item, const char *one, const char *two);
  116. /**
  117. Check for a matching attribute on an XML node.
  118. If the node has the requested attribute (name), and the value
  119. matches (match) then return 1. Otherwise, 0.
  120. */
  121. int fz_xml_att_eq(fz_xml *item, const char *name, const char *match);
  122. /**
  123. Add an attribute to an XML node.
  124. */
  125. void fz_xml_add_att(fz_context *ctx, fz_pool *pool, fz_xml *node, const char *key, const char *val);
  126. /**
  127. Return the text content of an XML node.
  128. Return NULL if the node is a tag.
  129. */
  130. char *fz_xml_text(fz_xml *item);
  131. /**
  132. Pretty-print an XML tree to given output.
  133. */
  134. void fz_output_xml(fz_context *ctx, fz_output *out, fz_xml *item, int level);
  135. /**
  136. Pretty-print an XML tree to stdout. (Deprecated, use
  137. fz_output_xml in preference).
  138. */
  139. void fz_debug_xml(fz_xml *item, int level);
  140. /**
  141. Search the siblings of XML nodes starting with item looking for
  142. the first with the given tag.
  143. Return NULL if none found.
  144. */
  145. fz_xml *fz_xml_find(fz_xml *item, const char *tag);
  146. /**
  147. Search the siblings of XML nodes starting with the next sibling
  148. of item looking for the first with the given tag.
  149. Return NULL if none found.
  150. */
  151. fz_xml *fz_xml_find_next(fz_xml *item, const char *tag);
  152. /**
  153. Search the siblings of XML nodes starting with the first child
  154. of item looking for the first with the given tag.
  155. Return NULL if none found.
  156. */
  157. fz_xml *fz_xml_find_down(fz_xml *item, const char *tag);
  158. /**
  159. Search the siblings of XML nodes starting with item looking for
  160. the first with the given tag (or any tag if tag is NULL), and
  161. with a matching attribute.
  162. Return NULL if none found.
  163. */
  164. fz_xml *fz_xml_find_match(fz_xml *item, const char *tag, const char *att, const char *match);
  165. /**
  166. Search the siblings of XML nodes starting with the next sibling
  167. of item looking for the first with the given tag (or any tag if tag
  168. is NULL), and with a matching attribute.
  169. Return NULL if none found.
  170. */
  171. fz_xml *fz_xml_find_next_match(fz_xml *item, const char *tag, const char *att, const char *match);
  172. /**
  173. Search the siblings of XML nodes starting with the first child
  174. of item looking for the first with the given tag (or any tag if
  175. tag is NULL), and with a matching attribute.
  176. Return NULL if none found.
  177. */
  178. fz_xml *fz_xml_find_down_match(fz_xml *item, const char *tag, const char *att, const char *match);
  179. /**
  180. Perform a depth first search from item, returning the first
  181. child that matches the given tag (or any tag if tag is NULL),
  182. with the given attribute (if att is non NULL), that matches
  183. match (if match is non NULL).
  184. */
  185. fz_xml *fz_xml_find_dfs(fz_xml *item, const char *tag, const char *att, const char *match);
  186. /**
  187. Perform a depth first search from item, returning the first
  188. child that matches the given tag (or any tag if tag is NULL),
  189. with the given attribute (if att is non NULL), that matches
  190. match (if match is non NULL). The search stops if it ever
  191. reaches the top of the tree, or the declared 'top' item.
  192. */
  193. fz_xml *fz_xml_find_dfs_top(fz_xml *item, const char *tag, const char *att, const char *match, fz_xml *top);
  194. /**
  195. Perform a depth first search onwards from item, returning the first
  196. child that matches the given tag (or any tag if tag is NULL),
  197. with the given attribute (if att is non NULL), that matches
  198. match (if match is non NULL).
  199. */
  200. fz_xml *fz_xml_find_next_dfs(fz_xml *item, const char *tag, const char *att, const char *match);
  201. /**
  202. Perform a depth first search onwards from item, returning the first
  203. child that matches the given tag (or any tag if tag is NULL),
  204. with the given attribute (if att is non NULL), that matches
  205. match (if match is non NULL). The search stops if it ever reaches
  206. the top of the tree, or the declared 'top' item.
  207. */
  208. fz_xml *fz_xml_find_next_dfs_top(fz_xml *item, const char *tag, const char *att, const char *match, fz_xml *top);
  209. /**
  210. DOM-like functions for html in xml.
  211. */
  212. /**
  213. Return a borrowed reference for the 'body' element of
  214. the given DOM.
  215. */
  216. fz_xml *fz_dom_body(fz_context *ctx, fz_xml *dom);
  217. /**
  218. Return a borrowed reference for the document (the top
  219. level element) of the DOM.
  220. */
  221. fz_xml *fz_dom_document_element(fz_context *ctx, fz_xml *dom);
  222. /**
  223. Create an element of a given tag type for the given DOM.
  224. The element is not linked into the DOM yet.
  225. */
  226. fz_xml *fz_dom_create_element(fz_context *ctx, fz_xml *dom, const char *tag);
  227. /**
  228. Create a text node for the given DOM.
  229. The text node is not linked into the DOM yet.
  230. */
  231. fz_xml *fz_dom_create_text_node(fz_context *ctx, fz_xml *dom, const char *text);
  232. /**
  233. Find the first element matching the requirements in a depth first traversal from elt.
  234. The tagname must match tag, unless tag is NULL, when all tag names are considered to match.
  235. If att is NULL, then all tags match.
  236. Otherwise:
  237. If match is NULL, then only nodes that have an att attribute match.
  238. If match is non-NULL, then only nodes that have an att attribute whose value matches 'match' are considered to match.
  239. Returns NULL (if no match found), or a borrowed reference to the first matching element.
  240. */
  241. fz_xml *fz_dom_find(fz_context *ctx, fz_xml *elt, const char *tag, const char *att, const char *match);
  242. /**
  243. Find the next element matching the requirements.
  244. */
  245. fz_xml *fz_dom_find_next(fz_context *ctx, fz_xml *elt, const char *tag, const char *att, const char *match);
  246. /**
  247. Insert an element as the last child of a parent, unlinking the
  248. child from its current position if required.
  249. */
  250. void fz_dom_append_child(fz_context *ctx, fz_xml *parent, fz_xml *child);
  251. /**
  252. Insert an element (new_elt), before another element (node),
  253. unlinking the new_elt from its current position if required.
  254. */
  255. void fz_dom_insert_before(fz_context *ctx, fz_xml *node, fz_xml *new_elt);
  256. /**
  257. Insert an element (new_elt), after another element (node),
  258. unlinking the new_elt from its current position if required.
  259. */
  260. void fz_dom_insert_after(fz_context *ctx, fz_xml *node, fz_xml *new_elt);
  261. /**
  262. Remove an element from the DOM. The element can be added back elsewhere
  263. if required.
  264. No reference counting changes for the element.
  265. */
  266. void fz_dom_remove(fz_context *ctx, fz_xml *elt);
  267. /**
  268. Clone an element (and its children).
  269. A borrowed reference to the clone is returned. The clone is not
  270. yet linked into the DOM.
  271. */
  272. fz_xml *fz_dom_clone(fz_context *ctx, fz_xml *elt);
  273. /**
  274. Return a borrowed reference to the first child of a node,
  275. or NULL if there isn't one.
  276. */
  277. fz_xml *fz_dom_first_child(fz_context *ctx, fz_xml *elt);
  278. /**
  279. Return a borrowed reference to the parent of a node,
  280. or NULL if there isn't one.
  281. */
  282. fz_xml *fz_dom_parent(fz_context *ctx, fz_xml *elt);
  283. /**
  284. Return a borrowed reference to the next sibling of a node,
  285. or NULL if there isn't one.
  286. */
  287. fz_xml *fz_dom_next(fz_context *ctx, fz_xml *elt);
  288. /**
  289. Return a borrowed reference to the previous sibling of a node,
  290. or NULL if there isn't one.
  291. */
  292. fz_xml *fz_dom_previous(fz_context *ctx, fz_xml *elt);
  293. /**
  294. Add an attribute to an element.
  295. Ownership of att and value remain with the caller.
  296. */
  297. void fz_dom_add_attribute(fz_context *ctx, fz_xml *elt, const char *att, const char *value);
  298. /**
  299. Remove an attribute from an element.
  300. */
  301. void fz_dom_remove_attribute(fz_context *ctx, fz_xml *elt, const char *att);
  302. /**
  303. Retrieve the value of a given attribute from a given element.
  304. Returns a borrowed pointer to the value or NULL if not found.
  305. */
  306. const char *fz_dom_attribute(fz_context *ctx, fz_xml *elt, const char *att);
  307. /**
  308. Enumerate through the attributes of an element.
  309. Call with i=0,1,2,3... to enumerate attributes.
  310. On return *att and the return value will be NULL if there are not
  311. that many attributes to read. Otherwise, *att will be filled in
  312. with a borrowed pointer to the attribute name, and the return
  313. value will be a borrowed pointer to the value.
  314. */
  315. const char *fz_dom_get_attribute(fz_context *ctx, fz_xml *elt, int i, const char **att);
  316. /**
  317. Make new xml dom root element.
  318. */
  319. fz_xml *fz_new_dom(fz_context *ctx, const char *tag);
  320. /**
  321. Create a new dom node.
  322. This will NOT be linked in yet.
  323. */
  324. fz_xml *fz_new_dom_node(fz_context *ctx, fz_xml *dom, const char *tag);
  325. /**
  326. Create a new dom text node.
  327. This will NOT be linked in yet.
  328. */
  329. fz_xml *fz_new_dom_text_node(fz_context *ctx, fz_xml *dom, const char *text);
  330. /**
  331. Write our xml structure out to an xml stream.
  332. Properly formatted XML is only allowed to have a single top-level node
  333. under which everything must sit. Our structures allow for multiple
  334. top level nodes. If required, we will output an extra 'ROOT' node
  335. at the top so that the xml is well-formed.
  336. If 'indented' is non-zero then additional whitespace will be added to
  337. make the XML easier to read in a text editor. The indented output
  338. will NOT be properly compliant.
  339. */
  340. void fz_write_xml(fz_context *ctx, fz_xml *root, fz_output *out, int indented);
  341. /**
  342. As for fz_write_xml, but direct to a file.
  343. */
  344. void fz_save_xml(fz_context *ctx, fz_xml *root, const char *path, int indented);
  345. #endif