| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155 |
- """
- :mod:``pandas.io.xml`` is a module for reading XML.
- """
- from __future__ import annotations
- import io
- from os import PathLike
- from typing import (
- TYPE_CHECKING,
- Any,
- )
- from pandas._libs import lib
- from pandas.compat._optional import import_optional_dependency
- from pandas.errors import (
- AbstractMethodError,
- ParserError,
- )
- from pandas.util._decorators import set_module
- from pandas.util._validators import check_dtype_backend
- from pandas.core.dtypes.common import is_list_like
- from pandas.io.common import (
- get_handle,
- infer_compression,
- is_fsspec_url,
- is_url,
- stringify_path,
- )
- from pandas.io.parsers import TextParser
- if TYPE_CHECKING:
- from collections.abc import (
- Callable,
- Sequence,
- )
- from xml.etree.ElementTree import Element
- from lxml import etree
- from pandas._typing import (
- CompressionOptions,
- ConvertersArg,
- DtypeArg,
- DtypeBackend,
- FilePath,
- ParseDatesArg,
- ReadBuffer,
- StorageOptions,
- XMLParsers,
- )
- from pandas import DataFrame
- class _XMLFrameParser:
- """
- Internal subclass to parse XML into DataFrames.
- Parameters
- ----------
- path_or_buffer : a valid JSON ``str``, path object or file-like object
- Any valid string path is acceptable. The string could be a URL. Valid
- URL schemes include http, ftp, s3, and file.
- xpath : str or regex
- The ``XPath`` expression to parse required set of nodes for
- migration to :class:`~pandas.DataFrame`. ``etree`` supports limited ``XPath``.
- namespaces : dict
- The namespaces defined in XML document (``xmlns:namespace='URI'``)
- as dicts with key being namespace and value the URI.
- elems_only : bool
- Parse only the child elements at the specified ``xpath``.
- attrs_only : bool
- Parse only the attributes at the specified ``xpath``.
- names : list
- Column names for :class:`~pandas.DataFrame` of parsed XML data.
- dtype : dict
- Data type for data or columns. E.g. {{'a': np.float64,
- 'b': np.int32, 'c': 'Int64'}}
- converters : dict, optional
- Dict of functions for converting values in certain columns. Keys can
- either be integers or column labels.
- parse_dates : bool or list of int or names or list of lists or dict
- Converts either index or select columns to datetimes
- encoding : str
- Encoding of xml object or document.
- stylesheet : str or file-like
- URL, file, file-like object, or a raw string containing XSLT,
- ``etree`` does not support XSLT but retained for consistency.
- iterparse : dict, optional
- Dict with row element as key and list of descendant elements
- and/or attributes as value to be retrieved in iterparsing of
- XML document.
- compression : str or dict, default 'infer'
- For on-the-fly decompression of on-disk data. If 'infer' and
- 'path_or_buffer' is path-like, then detect compression from the
- following extensions: '.gz', '.bz2', '.zip', '.xz', '.zst', '.tar',
- '.tar.gz', '.tar.xz' or '.tar.bz2' (otherwise no compression).
- If using 'zip' or 'tar', the ZIP file must contain only one data
- file to be read in. Set to ``None`` for no decompression.
- Can also be a dict with key ``'method'`` set to one of
- {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'xz'``, ``'tar'``}
- and other key-value pairs are forwarded to ``zipfile.ZipFile``,
- ``gzip.GzipFile``, ``bz2.BZ2File``, ``zstandard.ZstdDecompressor``,
- ``lzma.LZMAFile`` or ``tarfile.TarFile``, respectively.
- As an example, the following could be passed for Zstandard
- decompression using a custom compression dictionary:
- ``compression={'method': 'zstd', 'dict_data': my_compression_dict}``.
- storage_options : dict, optional
- Extra options that make sense for a particular storage connection,
- e.g. host, port, username, password, etc. For HTTP(S) URLs the
- key-value pairs are forwarded to ``urllib.request.Request`` as header
- options. For other URLs (e.g. starting with "s3://", and "gcs://")
- the key-value pairs are forwarded to ``fsspec.open``. Please see
- ``fsspec`` and ``urllib`` for more details, and for more examples on
- storage options refer `here <https://pandas.pydata.org/docs/
- user_guide/io.html?highlight=storage_options#reading-writing-remote-
- files>`_.
- See also
- --------
- pandas.io.xml._EtreeFrameParser
- pandas.io.xml._LxmlFrameParser
- Notes
- -----
- To subclass this class effectively you must override the following methods:`
- * :func:`parse_data`
- * :func:`_parse_nodes`
- * :func:`_iterparse_nodes`
- * :func:`_parse_doc`
- * :func:`_validate_names`
- * :func:`_validate_path`
- See each method's respective documentation for details on their
- functionality.
- """
- def __init__(
- self,
- path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str],
- xpath: str,
- namespaces: dict[str, str] | None,
- elems_only: bool,
- attrs_only: bool,
- names: Sequence[str] | None,
- dtype: DtypeArg | None,
- converters: ConvertersArg | None,
- parse_dates: ParseDatesArg | None,
- encoding: str | None,
- stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None,
- iterparse: dict[str, list[str]] | None,
- compression: CompressionOptions,
- storage_options: StorageOptions,
- ) -> None:
- self.path_or_buffer = path_or_buffer
- self.xpath = xpath
- self.namespaces = namespaces
- self.elems_only = elems_only
- self.attrs_only = attrs_only
- self.names = names
- self.dtype = dtype
- self.converters = converters
- self.parse_dates = parse_dates
- self.encoding = encoding
- self.stylesheet = stylesheet
- self.iterparse = iterparse
- self.compression: CompressionOptions = compression
- self.storage_options = storage_options
- def parse_data(self) -> list[dict[str, str | None]]:
- """
- Parse xml data.
- This method will call the other internal methods to
- validate ``xpath``, names, parse and return specific nodes.
- """
- raise AbstractMethodError(self)
- def _parse_nodes(self, elems: list[Any]) -> list[dict[str, str | None]]:
- """
- Parse xml nodes.
- This method will parse the children and attributes of elements
- in ``xpath``, conditionally for only elements, only attributes
- or both while optionally renaming node names.
- Raises
- ------
- ValueError
- * If only elements and only attributes are specified.
- Notes
- -----
- Namespace URIs will be removed from return node values. Also,
- elements with missing children or attributes compared to siblings
- will have optional keys filled with None values.
- """
- dicts: list[dict[str, str | None]]
- if self.elems_only and self.attrs_only:
- raise ValueError("Either element or attributes can be parsed not both.")
- if self.elems_only:
- if self.names:
- dicts = [
- {
- **(
- {el.tag: el.text}
- if el.text and not el.text.isspace()
- else {}
- ),
- **{
- nm: ch.text if ch.text else None
- for nm, ch in zip(self.names, el.findall("*"), strict=True)
- },
- }
- for el in elems
- ]
- else:
- dicts = [
- {ch.tag: ch.text if ch.text else None for ch in el.findall("*")}
- for el in elems
- ]
- elif self.attrs_only:
- dicts = [
- {k: v if v else None for k, v in el.attrib.items()} for el in elems
- ]
- elif self.names:
- dicts = [
- {
- **el.attrib,
- **({el.tag: el.text} if el.text and not el.text.isspace() else {}),
- **{
- nm: ch.text if ch.text else None
- for nm, ch in zip(self.names, el.findall("*"), strict=False)
- },
- }
- for el in elems
- ]
- else:
- dicts = [
- {
- **el.attrib,
- **({el.tag: el.text} if el.text and not el.text.isspace() else {}),
- **{ch.tag: ch.text if ch.text else None for ch in el.findall("*")},
- }
- for el in elems
- ]
- dicts = [
- {k.split("}")[1] if "}" in k else k: v for k, v in d.items()} for d in dicts
- ]
- keys = list(dict.fromkeys([k for d in dicts for k in d.keys()]))
- dicts = [{k: d[k] if k in d.keys() else None for k in keys} for d in dicts]
- if self.names:
- dicts = [dict(zip(self.names, d.values(), strict=True)) for d in dicts]
- return dicts
- def _iterparse_nodes(self, iterparse: Callable) -> list[dict[str, str | None]]:
- """
- Iterparse xml nodes.
- This method will read in local disk, decompressed XML files for elements
- and underlying descendants using iterparse, a method to iterate through
- an XML tree without holding entire XML tree in memory.
- Raises
- ------
- TypeError
- * If ``iterparse`` is not a dict or its dict value is not list-like.
- ParserError
- * If ``path_or_buffer`` is not a physical file on disk or file-like object.
- * If no data is returned from selected items in ``iterparse``.
- Notes
- -----
- Namespace URIs will be removed from return node values. Also,
- elements with missing children or attributes in submitted list
- will have optional keys filled with None values.
- """
- dicts: list[dict[str, str | None]] = []
- row: dict[str, str | None] | None = None
- if not isinstance(self.iterparse, dict):
- raise TypeError(
- f"{type(self.iterparse).__name__} is not a valid type for iterparse"
- )
- row_node = next(iter(self.iterparse.keys())) if self.iterparse else ""
- if not is_list_like(self.iterparse[row_node]):
- raise TypeError(
- f"{type(self.iterparse[row_node])} is not a valid type "
- "for value in iterparse"
- )
- if (not hasattr(self.path_or_buffer, "read")) and (
- not isinstance(self.path_or_buffer, (str, PathLike))
- or is_url(self.path_or_buffer)
- or is_fsspec_url(self.path_or_buffer)
- or (
- isinstance(self.path_or_buffer, str)
- and self.path_or_buffer.startswith(("<?xml", "<"))
- )
- or infer_compression(self.path_or_buffer, "infer") is not None
- ):
- raise ParserError(
- "iterparse is designed for large XML files that are fully extracted on "
- "local disk and not as compressed files or online sources."
- )
- iterparse_repeats = len(self.iterparse[row_node]) != len(
- set(self.iterparse[row_node])
- )
- for event, elem in iterparse(self.path_or_buffer, events=("start", "end")):
- curr_elem = elem.tag.split("}")[1] if "}" in elem.tag else elem.tag
- if event == "start":
- if curr_elem == row_node:
- row = {}
- if row is not None:
- if self.names and iterparse_repeats:
- for col, nm in zip(
- self.iterparse[row_node], self.names, strict=True
- ):
- if curr_elem == col:
- elem_val = elem.text if elem.text else None
- if elem_val not in row.values() and nm not in row:
- row[nm] = elem_val
- if col in elem.attrib:
- if elem.attrib[col] not in row.values() and nm not in row:
- row[nm] = elem.attrib[col]
- else:
- for col in self.iterparse[row_node]:
- if curr_elem == col:
- row[col] = elem.text if elem.text else None
- if col in elem.attrib:
- row[col] = elem.attrib[col]
- if event == "end":
- if curr_elem == row_node and row is not None:
- dicts.append(row)
- row = None
- elem.clear()
- if hasattr(elem, "getprevious"):
- while (
- elem.getprevious() is not None and elem.getparent() is not None
- ):
- del elem.getparent()[0]
- if dicts == []:
- raise ParserError("No result from selected items in iterparse.")
- keys = list(dict.fromkeys([k for d in dicts for k in d.keys()]))
- dicts = [{k: d[k] if k in d.keys() else None for k in keys} for d in dicts]
- if self.names:
- dicts = [dict(zip(self.names, d.values(), strict=True)) for d in dicts]
- return dicts
- def _validate_path(self) -> list[Any]:
- """
- Validate ``xpath``.
- This method checks for syntax, evaluation, or empty nodes return.
- Raises
- ------
- SyntaxError
- * If xpah is not supported or issues with namespaces.
- ValueError
- * If xpah does not return any nodes.
- """
- raise AbstractMethodError(self)
- def _validate_names(self) -> None:
- """
- Validate names.
- This method will check if names is a list-like and aligns
- with length of parse nodes.
- Raises
- ------
- ValueError
- * If value is not a list and less then length of nodes.
- """
- raise AbstractMethodError(self)
- def _parse_doc(
- self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str]
- ) -> Element | etree._Element:
- """
- Build tree from path_or_buffer.
- This method will parse XML object into tree
- either from string/bytes or file location.
- """
- raise AbstractMethodError(self)
class _EtreeFrameParser(_XMLFrameParser):
    """
    Internal class to parse XML into DataFrames with the Python
    standard library XML module: `xml.etree.ElementTree`.
    """

    def parse_data(self) -> list[dict[str, str | None]]:
        """
        Parse the XML source into row dictionaries using ``xml.etree``.

        Without ``iterparse`` the document is parsed into a tree, ``xpath``
        and ``names`` are validated and the selected nodes are converted.
        With ``iterparse`` only ``names`` is validated and the document is
        streamed instead.

        Raises
        ------
        ValueError
            * If a ``stylesheet`` was supplied.
        """
        from xml.etree.ElementTree import iterparse

        if self.stylesheet is not None:
            raise ValueError(
                "To use stylesheet, you need lxml installed and selected as parser."
            )

        if self.iterparse is None:
            self.xml_doc = self._parse_doc(self.path_or_buffer)
            elems = self._validate_path()

        self._validate_names()

        xml_dicts: list[dict[str, str | None]] = (
            self._parse_nodes(elems)
            if self.iterparse is None
            else self._iterparse_nodes(iterparse)
        )

        return xml_dicts

    def _validate_path(self) -> list[Any]:
        """
        Evaluate ``xpath`` against the parsed document and return the nodes.

        Raises
        ------
        SyntaxError
            * If evaluating ``xpath`` raises ``KeyError`` or ``SyntaxError``.

        ValueError
            * If no nodes match, or the matched nodes carry none of the
              content requested (children for ``elems_only``, attributes
              for ``attrs_only``, or neither children nor attributes).

        Notes
        -----
        ``etree`` supports limited ``XPath``. If user attempts a more complex
        expression syntax error will raise.
        """
        msg = (
            "xpath does not return any nodes or attributes. "
            "Be sure to specify in `xpath` the parent nodes of "
            "children and attributes to parse. "
            "If document uses namespaces denoted with "
            "xmlns, be sure to define namespaces and "
            "use them in xpath."
        )
        # Only the xpath evaluation itself is translated into SyntaxError.
        try:
            elems = self.xml_doc.findall(self.xpath, namespaces=self.namespaces)
        except (KeyError, SyntaxError) as err:
            raise SyntaxError(
                "You have used an incorrect or unsupported XPath "
                "expression for etree library or you used an "
                "undeclared namespace prefix."
            ) from err

        # ``findall`` always returns a list (never ``None``), so "no match"
        # has to be detected as an empty list.
        if not elems:
            raise ValueError(msg)

        children = [ch for el in elems for ch in el.findall("*")]
        attrs = {k: v for el in elems for k, v in el.attrib.items()}

        if self.elems_only and children == []:
            raise ValueError(msg)
        if self.attrs_only and attrs == {}:
            raise ValueError(msg)
        if children == [] and attrs == {}:
            raise ValueError(msg)

        return elems

    def _validate_names(self) -> None:
        """
        Check ``names`` against the number of child nodes or iterparse items.

        Nothing is checked when ``names`` is falsy.

        Raises
        ------
        ValueError
            * If there are fewer names than children of the first node
              matched by ``xpath`` (or than items listed in ``iterparse``).

        TypeError
            * If ``names`` is not list-like.
        """
        children: list[Any]

        if self.names:
            if self.iterparse:
                children = self.iterparse[next(iter(self.iterparse))]
            else:
                parent = self.xml_doc.find(self.xpath, namespaces=self.namespaces)
                children = parent.findall("*") if parent is not None else []

            if is_list_like(self.names):
                if len(self.names) < len(children):
                    raise ValueError(
                        "names does not match length of child elements in xpath."
                    )
            else:
                raise TypeError(
                    f"{type(self.names).__name__} is not a valid type for names"
                )

    def _parse_doc(
        self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str]
    ) -> Element:
        """
        Read ``raw_doc`` and return the root element of the parsed tree.

        The data is obtained through ``get_data_from_filepath`` using the
        instance's encoding, compression and storage options.
        """
        from xml.etree.ElementTree import (
            XMLParser,
            parse,
        )

        handle_data = get_data_from_filepath(
            filepath_or_buffer=raw_doc,
            encoding=self.encoding,
            compression=self.compression,
            storage_options=self.storage_options,
        )

        with handle_data as xml_data:
            curr_parser = XMLParser(encoding=self.encoding)
            document = parse(xml_data, parser=curr_parser)

        return document.getroot()
class _LxmlFrameParser(_XMLFrameParser):
    """
    Internal class to parse XML into :class:`~pandas.DataFrame` with third-party
    full-featured XML library, ``lxml``, that supports
    ``XPath`` 1.0 and XSLT 1.0.
    """

    def parse_data(self) -> list[dict[str, str | None]]:
        """
        Parse the XML source into row dictionaries using ``lxml``.

        With ``iterparse`` set, ``names`` is validated and the document is
        streamed. Otherwise the document is parsed, optionally transformed
        with the XSLT ``stylesheet``, ``xpath`` and ``names`` are validated
        and the selected nodes are converted.
        """
        from lxml.etree import iterparse

        if self.iterparse is not None:
            # Streaming mode: no tree is built and xpath is not evaluated.
            self._validate_names()
            return self._iterparse_nodes(iterparse)

        self.xml_doc = self._parse_doc(self.path_or_buffer)

        if self.stylesheet:
            self.xsl_doc = self._parse_doc(self.stylesheet)
            self.xml_doc = self._transform_doc()

        selected = self._validate_path()
        self._validate_names()
        return self._parse_nodes(selected)

    def _validate_path(self) -> list[Any]:
        """
        Evaluate ``xpath`` against the parsed document and return the nodes.

        Raises
        ------
        ValueError
            * If no nodes match, or the matched nodes carry none of the
              content requested (children for ``elems_only``, attributes
              for ``attrs_only``, or neither children nor attributes).
        """
        msg = (
            "xpath does not return any nodes or attributes. "
            "Be sure to specify in `xpath` the parent nodes of "
            "children and attributes to parse. "
            "If document uses namespaces denoted with "
            "xmlns, be sure to define namespaces and "
            "use them in xpath."
        )

        nodes = self.xml_doc.xpath(self.xpath, namespaces=self.namespaces)
        if nodes == []:
            raise ValueError(msg)

        child_nodes = [kid for node in nodes for kid in node.xpath("*")]
        attributes = {
            key: val for node in nodes for key, val in node.attrib.items()
        }

        if self.elems_only and child_nodes == []:
            raise ValueError(msg)
        if self.attrs_only and attributes == {}:
            raise ValueError(msg)
        if child_nodes == [] and attributes == {}:
            raise ValueError(msg)

        return nodes

    def _validate_names(self) -> None:
        """
        Check ``names`` against the number of child nodes or iterparse items.

        Nothing is checked when ``names`` is falsy.

        Raises
        ------
        ValueError
            * If there are fewer names than children of the first node
              matched by ``xpath`` (or than items listed in ``iterparse``).

        TypeError
            * If ``names`` is not list-like.
        """
        if not self.names:
            return

        targets: list[Any]
        if self.iterparse:
            first_key = next(iter(self.iterparse))
            targets = self.iterparse[first_key]
        else:
            targets = self.xml_doc.xpath(
                self.xpath + "[1]/*", namespaces=self.namespaces
            )

        if not is_list_like(self.names):
            raise TypeError(
                f"{type(self.names).__name__} is not a valid type for names"
            )

        if len(self.names) < len(targets):
            raise ValueError(
                "names does not match length of child elements in xpath."
            )

    def _parse_doc(
        self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str]
    ) -> etree._Element:
        """
        Read ``raw_doc`` and parse it with ``lxml``.

        ``StringIO`` data is encoded with the instance's encoding and parsed
        via ``fromstring``; anything else is handed to ``parse``.

        Raises
        ------
        TypeError
            * If the data is a ``StringIO`` and ``encoding`` is None.
        """
        from lxml.etree import (
            XMLParser,
            fromstring,
            parse,
        )

        handle_data = get_data_from_filepath(
            filepath_or_buffer=raw_doc,
            encoding=self.encoding,
            compression=self.compression,
            storage_options=self.storage_options,
        )

        with handle_data as xml_data:
            lxml_parser = XMLParser(encoding=self.encoding)

            if not isinstance(xml_data, io.StringIO):
                return parse(xml_data, parser=lxml_parser)

            if self.encoding is None:
                raise TypeError(
                    "Can not pass encoding None when input is StringIO."
                )

            encoded = xml_data.getvalue().encode(self.encoding)
            return fromstring(encoded, parser=lxml_parser)

    def _transform_doc(self) -> etree._XSLTResultTree:
        """
        Apply the parsed stylesheet to the parsed document.

        Builds an ``XSLT`` transformer from ``self.xsl_doc`` and returns the
        result of running it on ``self.xml_doc``.
        """
        from lxml.etree import XSLT

        return XSLT(self.xsl_doc)(self.xml_doc)
def get_data_from_filepath(
    filepath_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str],
    encoding: str | None,
    compression: CompressionOptions,
    storage_options: StorageOptions,
):
    """
    Extract raw XML data.

    The input may be either:

    1. a filepath (string-like)
    2. a file-like object (e.g. open file object, StringIO)

    The input is opened in text mode through ``get_handle``. When the opened
    handle has a ``read`` method its full content is read and passed to
    ``preprocess_data``; otherwise the handle itself is returned.
    """
    target = stringify_path(filepath_or_buffer)

    with get_handle(
        target,
        "r",
        encoding=encoding,
        compression=compression,
        storage_options=storage_options,
    ) as io_handles:
        if hasattr(io_handles.handle, "read"):
            return preprocess_data(io_handles.handle.read())
        return io_handles.handle
- def preprocess_data(
- data: str | bytes | io.StringIO | io.BytesIO,
- ) -> io.StringIO | io.BytesIO:
- """
- Convert extracted raw data.
- This method will return underlying data of extracted XML content.
- The data either has a `read` attribute (e.g. a file object or a
- StringIO/BytesIO) or is a string or bytes that is an XML document.
- """
- if isinstance(data, str):
- data = io.StringIO(data)
- elif isinstance(data, bytes):
- data = io.BytesIO(data)
- return data
def _data_to_frame(data: list[dict[str, str | None]], **kwargs) -> DataFrame:
    """
    Convert parsed row dictionaries to a DataFrame.

    The first dictionary supplies the column names and every dictionary's
    values become one row. Both are handed to ``TextParser`` together with
    ``kwargs``, and the result of its ``read`` is returned.

    Raises
    ------
    ParserError
        * If ``TextParser`` raises ``ParserError`` while reading the rows.
    """
    column_source = next(iter(data))
    row_values = [list(record.values()) for record in data]

    try:
        with TextParser(row_values, names=column_source, **kwargs) as reader:
            return reader.read()
    except ParserError as err:
        raise ParserError(
            "XML document may be too complex for import. "
            "Try to flatten document and use distinct "
            "element and attribute names."
        ) from err
def _parse(
    path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str],
    xpath: str,
    namespaces: dict[str, str] | None,
    elems_only: bool,
    attrs_only: bool,
    names: Sequence[str] | None,
    dtype: DtypeArg | None,
    converters: ConvertersArg | None,
    parse_dates: ParseDatesArg | None,
    encoding: str | None,
    parser: XMLParsers,
    stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None,
    iterparse: dict[str, list[str]] | None,
    compression: CompressionOptions,
    storage_options: StorageOptions,
    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
    **kwargs,
) -> DataFrame:
    """
    Run the selected internal parser and build the DataFrame.

    ``parser`` picks between ``_LxmlFrameParser`` and ``_EtreeFrameParser``.
    The chosen parser's ``parse_data`` result is passed to ``_data_to_frame``
    along with ``dtype``, ``converters``, ``parse_dates``, ``dtype_backend``
    and any extra ``kwargs``.

    Raises
    ------
    ImportError
        * If lxml is not installed if selected as parser.

    ValueError
        * If parser is not lxml or etree.
    """
    parser_cls: type[_EtreeFrameParser] | type[_LxmlFrameParser]

    if parser == "lxml":
        # errors="ignore" yields None instead of raising when lxml is absent.
        if import_optional_dependency("lxml.etree", errors="ignore") is None:
            raise ImportError("lxml not found, please install or use the etree parser.")
        parser_cls = _LxmlFrameParser
    elif parser == "etree":
        parser_cls = _EtreeFrameParser
    else:
        raise ValueError("Values for parser can only be lxml or etree.")

    # Both parser classes take the same positional constructor arguments.
    frame_parser = parser_cls(
        path_or_buffer,
        xpath,
        namespaces,
        elems_only,
        attrs_only,
        names,
        dtype,
        converters,
        parse_dates,
        encoding,
        stylesheet,
        iterparse,
        compression,
        storage_options,
    )

    parsed_rows = frame_parser.parse_data()

    return _data_to_frame(
        data=parsed_rows,
        dtype=dtype,
        converters=converters,
        parse_dates=parse_dates,
        dtype_backend=dtype_backend,
        **kwargs,
    )
- @set_module("pandas")
- def read_xml(
- path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str],
- *,
- xpath: str = "./*",
- namespaces: dict[str, str] | None = None,
- elems_only: bool = False,
- attrs_only: bool = False,
- names: Sequence[str] | None = None,
- dtype: DtypeArg | None = None,
- converters: ConvertersArg | None = None,
- parse_dates: ParseDatesArg | None = None,
- # encoding can not be None for lxml and StringIO input
- encoding: str | None = "utf-8",
- parser: XMLParsers = "lxml",
- stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None = None,
- iterparse: dict[str, list[str]] | None = None,
- compression: CompressionOptions = "infer",
- storage_options: StorageOptions | None = None,
- dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
- ) -> DataFrame:
- r"""
- Read XML document into a :class:`~pandas.DataFrame` object.
- Parameters
- ----------
- path_or_buffer : str, path object, or file-like object
- String path, path object (implementing ``os.PathLike[str]``), or file-like
- object implementing a ``read()`` function. The string can be a path.
- The string can further be a URL. Valid URL schemes
- include http, ftp, s3, and file.
- xpath : str, optional, default './\*'
- The ``XPath`` to parse required set of nodes for migration to
- :class:`~pandas.DataFrame`.``XPath`` should return a collection of elements
- and not a single element. Note: The ``etree`` parser supports limited ``XPath``
- expressions. For more complex ``XPath``, use ``lxml`` which requires
- installation.
- namespaces : dict, optional
- The namespaces defined in XML document as dicts with key being
- namespace prefix and value the URI. There is no need to include all
- namespaces in XML, only the ones used in ``xpath`` expression.
- Note: if XML document uses default namespace denoted as
- `xmlns='<URI>'` without a prefix, you must assign any temporary
- namespace prefix such as 'doc' to the URI in order to parse
- underlying nodes and/or attributes.
- elems_only : bool, optional, default False
- Parse only the child elements at the specified ``xpath``. By default,
- all child elements and non-empty text nodes are returned.
- attrs_only : bool, optional, default False
- Parse only the attributes at the specified ``xpath``.
- By default, all attributes are returned.
- names : list-like, optional
- Column names for DataFrame of parsed XML data. Use this parameter to
- rename original element names and distinguish same named elements and
- attributes.
- dtype : Type name or dict of column -> type, optional
- Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32,
- 'c': 'Int64'}}
- Use `str` or `object` together with suitable `na_values` settings
- to preserve and not interpret dtype.
- If converters are specified, they will be applied INSTEAD
- of dtype conversion.
- converters : dict, optional
- Dict of functions for converting values in certain columns. Keys can either
- be integers or column labels.
- parse_dates : bool or list of int or names or list of lists or dict, default False
- Identifiers to parse index or columns to datetime. The behavior is as follows:
- * boolean. If True -> try parsing the index.
- * list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3
- each as a separate date column.
- * list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as
- a single date column.
- * dict, e.g. {{'foo' : [1, 3]}} -> parse columns 1, 3 as date and call
- result 'foo'
- encoding : str, optional, default 'utf-8'
- Encoding of XML document.
- parser : {{'lxml','etree'}}, default 'lxml'
- Parser module to use for retrieval of data. Only 'lxml' and
- 'etree' are supported. With 'lxml' more complex ``XPath`` searches
- and ability to use XSLT stylesheet are supported.
- stylesheet : str, path object or file-like object
- A URL, file-like object, or a string path containing an XSLT script.
- This stylesheet should flatten complex, deeply nested XML documents
- for easier parsing. To use this feature you must have ``lxml`` module
- installed and specify 'lxml' as ``parser``. The ``xpath`` must
- reference nodes of transformed XML document generated after XSLT
- transformation and not the original XML document. Only XSLT 1.0
- scripts and not later versions is currently supported.
- iterparse : dict, optional
- The nodes or attributes to retrieve in iterparsing of XML document
- as a dict with key being the name of repeating element and value being
- list of elements or attribute names that are descendants of the repeated
- element. Note: If this option is used, it will replace ``xpath`` parsing
- and unlike ``xpath``, descendants do not need to relate to each other but can
- exist any where in document under the repeating element. This memory-
- efficient method should be used for very large XML files (500MB, 1GB, or 5GB+).
- For example, ``{{"row_element": ["child_elem", "attr", "grandchild_elem"]}}``.
- compression : str or dict, default 'infer'
- For on-the-fly decompression of on-disk data. If 'infer' and
- 'path_or_buffer' is path-like, then detect compression from the
- following extensions: '.gz', '.bz2', '.zip', '.xz', '.zst', '.tar',
- '.tar.gz', '.tar.xz' or '.tar.bz2' (otherwise no compression).
- If using 'zip' or 'tar', the ZIP file must contain only one data
- file to be read in. Set to ``None`` for no decompression.
- Can also be a dict with key ``'method'`` set to one of
- {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'xz'``, ``'tar'``}
- and other key-value pairs are forwarded to ``zipfile.ZipFile``,
- ``gzip.GzipFile``, ``bz2.BZ2File``, ``zstandard.ZstdDecompressor``,
- ``lzma.LZMAFile`` or ``tarfile.TarFile``, respectively.
- As an example, the following could be passed for Zstandard
- decompression using a custom compression dictionary:
- ``compression={'method': 'zstd', 'dict_data': my_compression_dict}``.
- storage_options : dict, optional
- Extra options that make sense for a particular storage connection,
- e.g. host, port, username, password, etc. For HTTP(S) URLs the
- key-value pairs are forwarded to ``urllib.request.Request`` as header
- options. For other URLs (e.g. starting with "s3://", and "gcs://")
- the key-value pairs are forwarded to ``fsspec.open``. Please see
- ``fsspec`` and ``urllib`` for more details, and for more examples on
- storage options refer `here <https://pandas.pydata.org/docs/
- user_guide/io.html?highlight=storage_options#reading-writing-remote-
- files>`_.
- dtype_backend : {{'numpy_nullable', 'pyarrow'}}
- Back-end data type applied to the resultant :class:`DataFrame`
- (still experimental). If not specified, the default behavior
- is to not use nullable data types. If specified, the behavior
- is as follows:
- * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
- * ``"pyarrow"``: returns pyarrow-backed nullable
- :class:`ArrowDtype` :class:`DataFrame`
- .. versionadded:: 2.0
- Returns
- -------
- df
- A DataFrame.
- See Also
- --------
- read_json : Convert a JSON string to pandas object.
- read_html : Read HTML tables into a list of DataFrame objects.
- Notes
- -----
- This method is best designed to import shallow XML documents in the
- following format, which is the ideal fit for the two dimensions of a
- ``DataFrame`` (row by column). ::
- <root>
- <row>
- <column1>data</column1>
- <column2>data</column2>
- <column3>data</column3>
- ...
- </row>
- <row>
- ...
- </row>
- ...
- </root>
- As a file format, XML documents can be designed any way, including the
- layout of elements and attributes, as long as they conform to W3C
- specifications. Therefore, this method is a convenience handler for
- a specific flatter design and not all possible XML structures.
- However, for more complex XML documents, ``stylesheet`` allows you to
- temporarily redesign the original document with XSLT (a special-purpose
- language) into a flatter version for migration to a DataFrame.
- This function will *always* return a single :class:`DataFrame` or raise
- exceptions due to issues with the XML document, ``xpath``, or other
- parameters.
- See the :ref:`read_xml documentation in the IO section of the docs
- <io.read_xml>` for more information on using this method to parse XML
- files to DataFrames.
- Examples
- --------
- >>> from io import StringIO
- >>> xml = '''<?xml version='1.0' encoding='utf-8'?>
- ... <data xmlns="http://example.com">
- ... <row>
- ... <shape>square</shape>
- ... <degrees>360</degrees>
- ... <sides>4.0</sides>
- ... </row>
- ... <row>
- ... <shape>circle</shape>
- ... <degrees>360</degrees>
- ... <sides/>
- ... </row>
- ... <row>
- ... <shape>triangle</shape>
- ... <degrees>180</degrees>
- ... <sides>3.0</sides>
- ... </row>
- ... </data>'''
- >>> df = pd.read_xml(StringIO(xml))
- >>> df
- shape degrees sides
- 0 square 360 4.0
- 1 circle 360 NaN
- 2 triangle 180 3.0
- >>> xml = '''<?xml version='1.0' encoding='utf-8'?>
- ... <data>
- ... <row shape="square" degrees="360" sides="4.0"/>
- ... <row shape="circle" degrees="360"/>
- ... <row shape="triangle" degrees="180" sides="3.0"/>
- ... </data>'''
- >>> df = pd.read_xml(StringIO(xml), xpath=".//row")
- >>> df
- shape degrees sides
- 0 square 360 4.0
- 1 circle 360 NaN
- 2 triangle 180 3.0
- >>> xml = '''<?xml version='1.0' encoding='utf-8'?>
- ... <doc:data xmlns:doc="https://example.com">
- ... <doc:row>
- ... <doc:shape>square</doc:shape>
- ... <doc:degrees>360</doc:degrees>
- ... <doc:sides>4.0</doc:sides>
- ... </doc:row>
- ... <doc:row>
- ... <doc:shape>circle</doc:shape>
- ... <doc:degrees>360</doc:degrees>
- ... <doc:sides/>
- ... </doc:row>
- ... <doc:row>
- ... <doc:shape>triangle</doc:shape>
- ... <doc:degrees>180</doc:degrees>
- ... <doc:sides>3.0</doc:sides>
- ... </doc:row>
- ... </doc:data>'''
- >>> df = pd.read_xml(
- ... StringIO(xml),
- ... xpath="//doc:row",
- ... namespaces={"doc": "https://example.com"},
- ... )
- >>> df
- shape degrees sides
- 0 square 360 4.0
- 1 circle 360 NaN
- 2 triangle 180 3.0
- >>> xml_data = '''
- ... <data>
- ... <row>
- ... <index>0</index>
- ... <a>1</a>
- ... <b>2.5</b>
- ... <c>True</c>
- ... <d>a</d>
- ... <e>2019-12-31 00:00:00</e>
- ... </row>
- ... <row>
- ... <index>1</index>
- ... <b>4.5</b>
- ... <c>False</c>
- ... <d>b</d>
- ... <e>2019-12-31 00:00:00</e>
- ... </row>
- ... </data>
- ... '''
- >>> df = pd.read_xml(
- ... StringIO(xml_data), dtype_backend="numpy_nullable", parse_dates=["e"]
- ... )
- >>> df
- index a b c d e
- 0 0 1 2.5 True a 2019-12-31
- 1 1 <NA> 4.5 False b 2019-12-31
- """
- check_dtype_backend(dtype_backend)
- return _parse(
- path_or_buffer=path_or_buffer,
- xpath=xpath,
- namespaces=namespaces,
- elems_only=elems_only,
- attrs_only=attrs_only,
- names=names,
- dtype=dtype,
- converters=converters,
- parse_dates=parse_dates,
- encoding=encoding,
- parser=parser,
- stylesheet=stylesheet,
- iterparse=iterparse,
- compression=compression,
- storage_options=storage_options,
- dtype_backend=dtype_backend,
- )
|