xml.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560
  1. """
  2. :mod:`pandas.io.formats.xml` is a module for formatting data in XML.
  3. """
  4. from __future__ import annotations
  5. import codecs
  6. import io
  7. from typing import (
  8. TYPE_CHECKING,
  9. Any,
  10. final,
  11. )
  12. import warnings
  13. from pandas.errors import AbstractMethodError
  14. from pandas.util._decorators import (
  15. cache_readonly,
  16. doc,
  17. )
  18. from pandas.core.dtypes.common import is_list_like
  19. from pandas.core.dtypes.missing import isna
  20. from pandas.core.shared_docs import _shared_docs
  21. from pandas.io.common import get_handle
  22. from pandas.io.xml import (
  23. get_data_from_filepath,
  24. preprocess_data,
  25. )
  26. if TYPE_CHECKING:
  27. from pandas._typing import (
  28. CompressionOptions,
  29. FilePath,
  30. ReadBuffer,
  31. StorageOptions,
  32. WriteBuffer,
  33. )
  34. from pandas import DataFrame
  35. @doc(
  36. storage_options=_shared_docs["storage_options"],
  37. compression_options=_shared_docs["compression_options"] % "path_or_buffer",
  38. )
  39. class _BaseXMLFormatter:
  40. """
  41. Subclass for formatting data in XML.
  42. Parameters
  43. ----------
  44. path_or_buffer : str or file-like
  45. This can be either a string of raw XML, a valid URL,
  46. file or file-like object.
  47. index : bool
  48. Whether to include index in xml document.
  49. row_name : str
  50. Name for root of xml document. Default is 'data'.
  51. root_name : str
  52. Name for row elements of xml document. Default is 'row'.
  53. na_rep : str
  54. Missing data representation.
  55. attrs_cols : list
  56. List of columns to write as attributes in row element.
  57. elem_cols : list
  58. List of columns to write as children in row element.
  59. namespaces : dict
  60. The namespaces to define in XML document as dicts with key
  61. being namespace and value the URI.
  62. prefix : str
  63. The prefix for each element in XML document including root.
  64. encoding : str
  65. Encoding of xml object or document.
  66. xml_declaration : bool
  67. Whether to include xml declaration at top line item in xml.
  68. pretty_print : bool
  69. Whether to write xml document with line breaks and indentation.
  70. stylesheet : str or file-like
  71. A URL, file, file-like object, or a raw string containing XSLT.
  72. {compression_options}
  73. .. versionchanged:: 1.4.0 Zstandard support.
  74. {storage_options}
  75. See also
  76. --------
  77. pandas.io.formats.xml.EtreeXMLFormatter
  78. pandas.io.formats.xml.LxmlXMLFormatter
  79. """
  80. def __init__(
  81. self,
  82. frame: DataFrame,
  83. path_or_buffer: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None,
  84. index: bool = True,
  85. root_name: str | None = "data",
  86. row_name: str | None = "row",
  87. na_rep: str | None = None,
  88. attr_cols: list[str] | None = None,
  89. elem_cols: list[str] | None = None,
  90. namespaces: dict[str | None, str] | None = None,
  91. prefix: str | None = None,
  92. encoding: str = "utf-8",
  93. xml_declaration: bool | None = True,
  94. pretty_print: bool | None = True,
  95. stylesheet: FilePath | ReadBuffer[str] | ReadBuffer[bytes] | None = None,
  96. compression: CompressionOptions = "infer",
  97. storage_options: StorageOptions | None = None,
  98. ) -> None:
  99. self.frame = frame
  100. self.path_or_buffer = path_or_buffer
  101. self.index = index
  102. self.root_name = root_name
  103. self.row_name = row_name
  104. self.na_rep = na_rep
  105. self.attr_cols = attr_cols
  106. self.elem_cols = elem_cols
  107. self.namespaces = namespaces
  108. self.prefix = prefix
  109. self.encoding = encoding
  110. self.xml_declaration = xml_declaration
  111. self.pretty_print = pretty_print
  112. self.stylesheet = stylesheet
  113. self.compression: CompressionOptions = compression
  114. self.storage_options = storage_options
  115. self.orig_cols = self.frame.columns.tolist()
  116. self.frame_dicts = self._process_dataframe()
  117. self._validate_columns()
  118. self._validate_encoding()
  119. self.prefix_uri = self._get_prefix_uri()
  120. self._handle_indexes()
  121. def _build_tree(self) -> bytes:
  122. """
  123. Build tree from data.
  124. This method initializes the root and builds attributes and elements
  125. with optional namespaces.
  126. """
  127. raise AbstractMethodError(self)
  128. @final
  129. def _validate_columns(self) -> None:
  130. """
  131. Validate elems_cols and attrs_cols.
  132. This method will check if columns is list-like.
  133. Raises
  134. ------
  135. ValueError
  136. * If value is not a list and less then length of nodes.
  137. """
  138. if self.attr_cols and not is_list_like(self.attr_cols):
  139. raise TypeError(
  140. f"{type(self.attr_cols).__name__} is not a valid type for attr_cols"
  141. )
  142. if self.elem_cols and not is_list_like(self.elem_cols):
  143. raise TypeError(
  144. f"{type(self.elem_cols).__name__} is not a valid type for elem_cols"
  145. )
  146. @final
  147. def _validate_encoding(self) -> None:
  148. """
  149. Validate encoding.
  150. This method will check if encoding is among listed under codecs.
  151. Raises
  152. ------
  153. LookupError
  154. * If encoding is not available in codecs.
  155. """
  156. codecs.lookup(self.encoding)
  157. @final
  158. def _process_dataframe(self) -> dict[int | str, dict[str, Any]]:
  159. """
  160. Adjust Data Frame to fit xml output.
  161. This method will adjust underlying data frame for xml output,
  162. including optionally replacing missing values and including indexes.
  163. """
  164. df = self.frame
  165. if self.index:
  166. df = df.reset_index()
  167. if self.na_rep is not None:
  168. with warnings.catch_warnings():
  169. warnings.filterwarnings(
  170. "ignore",
  171. "Downcasting object dtype arrays",
  172. category=FutureWarning,
  173. )
  174. df = df.fillna(self.na_rep)
  175. return df.to_dict(orient="index")
  176. @final
  177. def _handle_indexes(self) -> None:
  178. """
  179. Handle indexes.
  180. This method will add indexes into attr_cols or elem_cols.
  181. """
  182. if not self.index:
  183. return
  184. first_key = next(iter(self.frame_dicts))
  185. indexes: list[str] = [
  186. x for x in self.frame_dicts[first_key].keys() if x not in self.orig_cols
  187. ]
  188. if self.attr_cols:
  189. self.attr_cols = indexes + self.attr_cols
  190. if self.elem_cols:
  191. self.elem_cols = indexes + self.elem_cols
  192. def _get_prefix_uri(self) -> str:
  193. """
  194. Get uri of namespace prefix.
  195. This method retrieves corresponding URI to prefix in namespaces.
  196. Raises
  197. ------
  198. KeyError
  199. *If prefix is not included in namespace dict.
  200. """
  201. raise AbstractMethodError(self)
  202. @final
  203. def _other_namespaces(self) -> dict:
  204. """
  205. Define other namespaces.
  206. This method will build dictionary of namespaces attributes
  207. for root element, conditionally with optional namespaces and
  208. prefix.
  209. """
  210. nmsp_dict: dict[str, str] = {}
  211. if self.namespaces:
  212. nmsp_dict = {
  213. f"xmlns{p if p=='' else f':{p}'}": n
  214. for p, n in self.namespaces.items()
  215. if n != self.prefix_uri[1:-1]
  216. }
  217. return nmsp_dict
  218. @final
  219. def _build_attribs(self, d: dict[str, Any], elem_row: Any) -> Any:
  220. """
  221. Create attributes of row.
  222. This method adds attributes using attr_cols to row element and
  223. works with tuples for multindex or hierarchical columns.
  224. """
  225. if not self.attr_cols:
  226. return elem_row
  227. for col in self.attr_cols:
  228. attr_name = self._get_flat_col_name(col)
  229. try:
  230. if not isna(d[col]):
  231. elem_row.attrib[attr_name] = str(d[col])
  232. except KeyError:
  233. raise KeyError(f"no valid column, {col}")
  234. return elem_row
  235. @final
  236. def _get_flat_col_name(self, col: str | tuple) -> str:
  237. flat_col = col
  238. if isinstance(col, tuple):
  239. flat_col = (
  240. "".join([str(c) for c in col]).strip()
  241. if "" in col
  242. else "_".join([str(c) for c in col]).strip()
  243. )
  244. return f"{self.prefix_uri}{flat_col}"
  245. @cache_readonly
  246. def _sub_element_cls(self):
  247. raise AbstractMethodError(self)
  248. @final
  249. def _build_elems(self, d: dict[str, Any], elem_row: Any) -> None:
  250. """
  251. Create child elements of row.
  252. This method adds child elements using elem_cols to row element and
  253. works with tuples for multindex or hierarchical columns.
  254. """
  255. sub_element_cls = self._sub_element_cls
  256. if not self.elem_cols:
  257. return
  258. for col in self.elem_cols:
  259. elem_name = self._get_flat_col_name(col)
  260. try:
  261. val = None if isna(d[col]) or d[col] == "" else str(d[col])
  262. sub_element_cls(elem_row, elem_name).text = val
  263. except KeyError:
  264. raise KeyError(f"no valid column, {col}")
  265. @final
  266. def write_output(self) -> str | None:
  267. xml_doc = self._build_tree()
  268. if self.path_or_buffer is not None:
  269. with get_handle(
  270. self.path_or_buffer,
  271. "wb",
  272. compression=self.compression,
  273. storage_options=self.storage_options,
  274. is_text=False,
  275. ) as handles:
  276. handles.handle.write(xml_doc)
  277. return None
  278. else:
  279. return xml_doc.decode(self.encoding).rstrip()
  280. class EtreeXMLFormatter(_BaseXMLFormatter):
  281. """
  282. Class for formatting data in xml using Python standard library
  283. modules: `xml.etree.ElementTree` and `xml.dom.minidom`.
  284. """
  285. def _build_tree(self) -> bytes:
  286. from xml.etree.ElementTree import (
  287. Element,
  288. SubElement,
  289. tostring,
  290. )
  291. self.root = Element(
  292. f"{self.prefix_uri}{self.root_name}", attrib=self._other_namespaces()
  293. )
  294. for d in self.frame_dicts.values():
  295. elem_row = SubElement(self.root, f"{self.prefix_uri}{self.row_name}")
  296. if not self.attr_cols and not self.elem_cols:
  297. self.elem_cols = list(d.keys())
  298. self._build_elems(d, elem_row)
  299. else:
  300. elem_row = self._build_attribs(d, elem_row)
  301. self._build_elems(d, elem_row)
  302. self.out_xml = tostring(
  303. self.root,
  304. method="xml",
  305. encoding=self.encoding,
  306. xml_declaration=self.xml_declaration,
  307. )
  308. if self.pretty_print:
  309. self.out_xml = self._prettify_tree()
  310. if self.stylesheet is not None:
  311. raise ValueError(
  312. "To use stylesheet, you need lxml installed and selected as parser."
  313. )
  314. return self.out_xml
  315. def _get_prefix_uri(self) -> str:
  316. from xml.etree.ElementTree import register_namespace
  317. uri = ""
  318. if self.namespaces:
  319. for p, n in self.namespaces.items():
  320. if isinstance(p, str) and isinstance(n, str):
  321. register_namespace(p, n)
  322. if self.prefix:
  323. try:
  324. uri = f"{{{self.namespaces[self.prefix]}}}"
  325. except KeyError:
  326. raise KeyError(f"{self.prefix} is not included in namespaces")
  327. elif "" in self.namespaces:
  328. uri = f'{{{self.namespaces[""]}}}'
  329. else:
  330. uri = ""
  331. return uri
  332. @cache_readonly
  333. def _sub_element_cls(self):
  334. from xml.etree.ElementTree import SubElement
  335. return SubElement
  336. def _prettify_tree(self) -> bytes:
  337. """
  338. Output tree for pretty print format.
  339. This method will pretty print xml with line breaks and indentation.
  340. """
  341. from xml.dom.minidom import parseString
  342. dom = parseString(self.out_xml)
  343. return dom.toprettyxml(indent=" ", encoding=self.encoding)
  344. class LxmlXMLFormatter(_BaseXMLFormatter):
  345. """
  346. Class for formatting data in xml using Python standard library
  347. modules: `xml.etree.ElementTree` and `xml.dom.minidom`.
  348. """
  349. def __init__(self, *args, **kwargs) -> None:
  350. super().__init__(*args, **kwargs)
  351. self._convert_empty_str_key()
  352. def _build_tree(self) -> bytes:
  353. """
  354. Build tree from data.
  355. This method initializes the root and builds attributes and elements
  356. with optional namespaces.
  357. """
  358. from lxml.etree import (
  359. Element,
  360. SubElement,
  361. tostring,
  362. )
  363. self.root = Element(f"{self.prefix_uri}{self.root_name}", nsmap=self.namespaces)
  364. for d in self.frame_dicts.values():
  365. elem_row = SubElement(self.root, f"{self.prefix_uri}{self.row_name}")
  366. if not self.attr_cols and not self.elem_cols:
  367. self.elem_cols = list(d.keys())
  368. self._build_elems(d, elem_row)
  369. else:
  370. elem_row = self._build_attribs(d, elem_row)
  371. self._build_elems(d, elem_row)
  372. self.out_xml = tostring(
  373. self.root,
  374. pretty_print=self.pretty_print,
  375. method="xml",
  376. encoding=self.encoding,
  377. xml_declaration=self.xml_declaration,
  378. )
  379. if self.stylesheet is not None:
  380. self.out_xml = self._transform_doc()
  381. return self.out_xml
  382. def _convert_empty_str_key(self) -> None:
  383. """
  384. Replace zero-length string in `namespaces`.
  385. This method will replace '' with None to align to `lxml`
  386. requirement that empty string prefixes are not allowed.
  387. """
  388. if self.namespaces and "" in self.namespaces.keys():
  389. self.namespaces[None] = self.namespaces.pop("", "default")
  390. def _get_prefix_uri(self) -> str:
  391. uri = ""
  392. if self.namespaces:
  393. if self.prefix:
  394. try:
  395. uri = f"{{{self.namespaces[self.prefix]}}}"
  396. except KeyError:
  397. raise KeyError(f"{self.prefix} is not included in namespaces")
  398. elif "" in self.namespaces:
  399. uri = f'{{{self.namespaces[""]}}}'
  400. else:
  401. uri = ""
  402. return uri
  403. @cache_readonly
  404. def _sub_element_cls(self):
  405. from lxml.etree import SubElement
  406. return SubElement
  407. def _transform_doc(self) -> bytes:
  408. """
  409. Parse stylesheet from file or buffer and run it.
  410. This method will parse stylesheet object into tree for parsing
  411. conditionally by its specific object type, then transforms
  412. original tree with XSLT script.
  413. """
  414. from lxml.etree import (
  415. XSLT,
  416. XMLParser,
  417. fromstring,
  418. parse,
  419. )
  420. style_doc = self.stylesheet
  421. assert style_doc is not None # is ensured by caller
  422. handle_data = get_data_from_filepath(
  423. filepath_or_buffer=style_doc,
  424. encoding=self.encoding,
  425. compression=self.compression,
  426. storage_options=self.storage_options,
  427. )
  428. with preprocess_data(handle_data) as xml_data:
  429. curr_parser = XMLParser(encoding=self.encoding)
  430. if isinstance(xml_data, io.StringIO):
  431. xsl_doc = fromstring(
  432. xml_data.getvalue().encode(self.encoding), parser=curr_parser
  433. )
  434. else:
  435. xsl_doc = parse(xml_data, parser=curr_parser)
  436. transformer = XSLT(xsl_doc)
  437. new_doc = transformer(self.root)
  438. return bytes(new_doc)