Spaces:
Runtime error
Runtime error
""" | |
:mod:`pandas.io.xml` is a module for reading XML. | |
""" | |
from __future__ import annotations | |
import io | |
from typing import ( | |
Any, | |
Callable, | |
Sequence, | |
) | |
from pandas._typing import ( | |
TYPE_CHECKING, | |
CompressionOptions, | |
ConvertersArg, | |
DtypeArg, | |
FilePath, | |
ParseDatesArg, | |
ReadBuffer, | |
StorageOptions, | |
XMLParsers, | |
) | |
from pandas.compat._optional import import_optional_dependency | |
from pandas.errors import ( | |
AbstractMethodError, | |
ParserError, | |
) | |
from pandas.util._decorators import ( | |
deprecate_nonkeyword_arguments, | |
doc, | |
) | |
from pandas.core.dtypes.common import is_list_like | |
from pandas.core.shared_docs import _shared_docs | |
from pandas.io.common import ( | |
file_exists, | |
get_handle, | |
infer_compression, | |
is_fsspec_url, | |
is_url, | |
stringify_path, | |
) | |
from pandas.io.parsers import TextParser | |
if TYPE_CHECKING: | |
from xml.etree.ElementTree import Element | |
from lxml.etree import ( | |
_Element, | |
_XSLTResultTree, | |
) | |
from pandas import DataFrame | |
class _XMLFrameParser: | |
""" | |
Internal subclass to parse XML into DataFrames. | |
Parameters | |
---------- | |
path_or_buffer : a valid JSON str, path object or file-like object | |
Any valid string path is acceptable. The string could be a URL. Valid | |
URL schemes include http, ftp, s3, and file. | |
xpath : str or regex | |
The XPath expression to parse required set of nodes for | |
migration to `Data Frame`. `etree` supports limited XPath. | |
namespaces : dict | |
The namespaces defined in XML document (`xmlns:namespace='URI') | |
as dicts with key being namespace and value the URI. | |
elems_only : bool | |
Parse only the child elements at the specified `xpath`. | |
attrs_only : bool | |
Parse only the attributes at the specified `xpath`. | |
names : list | |
Column names for Data Frame of parsed XML data. | |
dtype : dict | |
Data type for data or columns. E.g. {{'a': np.float64, | |
'b': np.int32, 'c': 'Int64'}} | |
.. versionadded:: 1.5.0 | |
converters : dict, optional | |
Dict of functions for converting values in certain columns. Keys can | |
either be integers or column labels. | |
.. versionadded:: 1.5.0 | |
parse_dates : bool or list of int or names or list of lists or dict | |
Converts either index or select columns to datetimes | |
.. versionadded:: 1.5.0 | |
encoding : str | |
Encoding of xml object or document. | |
stylesheet : str or file-like | |
URL, file, file-like object, or a raw string containing XSLT, | |
`etree` does not support XSLT but retained for consistency. | |
iterparse : dict, optional | |
Dict with row element as key and list of descendant elements | |
and/or attributes as value to be retrieved in iterparsing of | |
XML document. | |
.. versionadded:: 1.5.0 | |
{decompression_options} | |
.. versionchanged:: 1.4.0 Zstandard support. | |
{storage_options} | |
See also | |
-------- | |
pandas.io.xml._EtreeFrameParser | |
pandas.io.xml._LxmlFrameParser | |
Notes | |
----- | |
To subclass this class effectively you must override the following methods:` | |
* :func:`parse_data` | |
* :func:`_parse_nodes` | |
* :func:`_iterparse_nodes` | |
* :func:`_parse_doc` | |
* :func:`_validate_names` | |
* :func:`_validate_path` | |
See each method's respective documentation for details on their | |
functionality. | |
""" | |
def __init__( | |
self, | |
path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str], | |
xpath: str, | |
namespaces: dict[str, str] | None, | |
elems_only: bool, | |
attrs_only: bool, | |
names: Sequence[str] | None, | |
dtype: DtypeArg | None, | |
converters: ConvertersArg | None, | |
parse_dates: ParseDatesArg | None, | |
encoding: str | None, | |
stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None, | |
iterparse: dict[str, list[str]] | None, | |
compression: CompressionOptions, | |
storage_options: StorageOptions, | |
) -> None: | |
self.path_or_buffer = path_or_buffer | |
self.xpath = xpath | |
self.namespaces = namespaces | |
self.elems_only = elems_only | |
self.attrs_only = attrs_only | |
self.names = names | |
self.dtype = dtype | |
self.converters = converters | |
self.parse_dates = parse_dates | |
self.encoding = encoding | |
self.stylesheet = stylesheet | |
self.iterparse = iterparse | |
self.is_style = None | |
self.compression = compression | |
self.storage_options = storage_options | |
def parse_data(self) -> list[dict[str, str | None]]: | |
""" | |
Parse xml data. | |
This method will call the other internal methods to | |
validate xpath, names, parse and return specific nodes. | |
""" | |
raise AbstractMethodError(self) | |
def _parse_nodes(self, elems: list[Any]) -> list[dict[str, str | None]]: | |
""" | |
Parse xml nodes. | |
This method will parse the children and attributes of elements | |
in xpath, conditionally for only elements, only attributes | |
or both while optionally renaming node names. | |
Raises | |
------ | |
ValueError | |
* If only elements and only attributes are specified. | |
Notes | |
----- | |
Namespace URIs will be removed from return node values. Also, | |
elements with missing children or attributes compared to siblings | |
will have optional keys filled with None values. | |
""" | |
dicts: list[dict[str, str | None]] | |
if self.elems_only and self.attrs_only: | |
raise ValueError("Either element or attributes can be parsed not both.") | |
elif self.elems_only: | |
if self.names: | |
dicts = [ | |
{ | |
**( | |
{el.tag: el.text.strip()} | |
if el.text and not el.text.isspace() | |
else {} | |
), | |
**{ | |
nm: ch.text.strip() if ch.text else None | |
for nm, ch in zip(self.names, el.findall("*")) | |
}, | |
} | |
for el in elems | |
] | |
else: | |
dicts = [ | |
{ | |
ch.tag: ch.text.strip() if ch.text else None | |
for ch in el.findall("*") | |
} | |
for el in elems | |
] | |
elif self.attrs_only: | |
dicts = [ | |
{k: v.strip() if v else None for k, v in el.attrib.items()} | |
for el in elems | |
] | |
else: | |
if self.names: | |
dicts = [ | |
{ | |
**el.attrib, | |
**( | |
{el.tag: el.text.strip()} | |
if el.text and not el.text.isspace() | |
else {} | |
), | |
**{ | |
nm: ch.text.strip() if ch.text else None | |
for nm, ch in zip(self.names, el.findall("*")) | |
}, | |
} | |
for el in elems | |
] | |
else: | |
dicts = [ | |
{ | |
**el.attrib, | |
**( | |
{el.tag: el.text.strip()} | |
if el.text and not el.text.isspace() | |
else {} | |
), | |
**{ | |
ch.tag: ch.text.strip() if ch.text else None | |
for ch in el.findall("*") | |
}, | |
} | |
for el in elems | |
] | |
dicts = [ | |
{k.split("}")[1] if "}" in k else k: v for k, v in d.items()} for d in dicts | |
] | |
keys = list(dict.fromkeys([k for d in dicts for k in d.keys()])) | |
dicts = [{k: d[k] if k in d.keys() else None for k in keys} for d in dicts] | |
if self.names: | |
dicts = [{nm: v for nm, v in zip(self.names, d.values())} for d in dicts] | |
return dicts | |
def _iterparse_nodes(self, iterparse: Callable) -> list[dict[str, str | None]]: | |
""" | |
Iterparse xml nodes. | |
This method will read in local disk, decompressed XML files for elements | |
and underlying descendants using iterparse, a method to iterate through | |
an XML tree without holding entire XML tree in memory. | |
Raises | |
------ | |
TypeError | |
* If `iterparse` is not a dict or its dict value is not list-like. | |
ParserError | |
* If `path_or_buffer` is not a physical, decompressed file on disk. | |
* If no data is returned from selected items in `iterparse`. | |
Notes | |
----- | |
Namespace URIs will be removed from return node values. Also, | |
elements with missing children or attributes in submitted list | |
will have optional keys filled with None values. | |
""" | |
dicts: list[dict[str, str | None]] = [] | |
row: dict[str, str | None] | None = None | |
if not isinstance(self.iterparse, dict): | |
raise TypeError( | |
f"{type(self.iterparse).__name__} is not a valid type for iterparse" | |
) | |
row_node = next(iter(self.iterparse.keys())) if self.iterparse else "" | |
if not is_list_like(self.iterparse[row_node]): | |
raise TypeError( | |
f"{type(self.iterparse[row_node])} is not a valid type " | |
"for value in iterparse" | |
) | |
if ( | |
not isinstance(self.path_or_buffer, str) | |
or is_url(self.path_or_buffer) | |
or is_fsspec_url(self.path_or_buffer) | |
or self.path_or_buffer.startswith(("<?xml", "<")) | |
or infer_compression(self.path_or_buffer, "infer") is not None | |
): | |
raise ParserError( | |
"iterparse is designed for large XML files that are fully extracted on " | |
"local disk and not as compressed files or online sources." | |
) | |
for event, elem in iterparse(self.path_or_buffer, events=("start", "end")): | |
curr_elem = elem.tag.split("}")[1] if "}" in elem.tag else elem.tag | |
if event == "start": | |
if curr_elem == row_node: | |
row = {} | |
if row is not None: | |
if self.names: | |
for col, nm in zip(self.iterparse[row_node], self.names): | |
if curr_elem == col: | |
elem_val = elem.text.strip() if elem.text else None | |
if row.get(nm) != elem_val and nm not in row: | |
row[nm] = elem_val | |
if col in elem.attrib: | |
if elem.attrib[col] not in row.values() and nm not in row: | |
row[nm] = elem.attrib[col] | |
else: | |
for col in self.iterparse[row_node]: | |
if curr_elem == col: | |
row[col] = elem.text.strip() if elem.text else None | |
if col in elem.attrib: | |
row[col] = elem.attrib[col] | |
if event == "end": | |
if curr_elem == row_node and row is not None: | |
dicts.append(row) | |
row = None | |
elem.clear() | |
if hasattr(elem, "getprevious"): | |
while ( | |
elem.getprevious() is not None and elem.getparent() is not None | |
): | |
del elem.getparent()[0] | |
if dicts == []: | |
raise ParserError("No result from selected items in iterparse.") | |
keys = list(dict.fromkeys([k for d in dicts for k in d.keys()])) | |
dicts = [{k: d[k] if k in d.keys() else None for k in keys} for d in dicts] | |
if self.names: | |
dicts = [{nm: v for nm, v in zip(self.names, d.values())} for d in dicts] | |
return dicts | |
def _validate_path(self) -> None: | |
""" | |
Validate xpath. | |
This method checks for syntax, evaluation, or empty nodes return. | |
Raises | |
------ | |
SyntaxError | |
* If xpah is not supported or issues with namespaces. | |
ValueError | |
* If xpah does not return any nodes. | |
""" | |
raise AbstractMethodError(self) | |
def _validate_names(self) -> None: | |
""" | |
Validate names. | |
This method will check if names is a list-like and aligns | |
with length of parse nodes. | |
Raises | |
------ | |
ValueError | |
* If value is not a list and less then length of nodes. | |
""" | |
raise AbstractMethodError(self) | |
def _parse_doc( | |
self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | |
) -> Element | _Element: | |
""" | |
Build tree from path_or_buffer. | |
This method will parse XML object into tree | |
either from string/bytes or file location. | |
""" | |
raise AbstractMethodError(self) | |
class _EtreeFrameParser(_XMLFrameParser): | |
""" | |
Internal class to parse XML into DataFrames with the Python | |
standard library XML module: `xml.etree.ElementTree`. | |
""" | |
def parse_data(self) -> list[dict[str, str | None]]: | |
from xml.etree.ElementTree import iterparse | |
if self.stylesheet is not None: | |
raise ValueError( | |
"To use stylesheet, you need lxml installed and selected as parser." | |
) | |
if self.iterparse is None: | |
self.xml_doc = self._parse_doc(self.path_or_buffer) | |
self._validate_path() | |
elems = self.xml_doc.findall(self.xpath, namespaces=self.namespaces) | |
self._validate_names() | |
xml_dicts: list[dict[str, str | None]] = ( | |
self._parse_nodes(elems) | |
if self.iterparse is None | |
else self._iterparse_nodes(iterparse) | |
) | |
return xml_dicts | |
def _validate_path(self) -> None: | |
""" | |
Notes | |
----- | |
`etree` supports limited XPath. If user attempts a more complex | |
expression syntax error will raise. | |
""" | |
msg = ( | |
"xpath does not return any nodes. " | |
"If document uses namespaces denoted with " | |
"xmlns, be sure to define namespaces and " | |
"use them in xpath." | |
) | |
try: | |
elems = self.xml_doc.find(self.xpath, namespaces=self.namespaces) | |
if elems is None: | |
raise ValueError(msg) | |
if elems is not None and elems.find("*") is None and elems.attrib is None: | |
raise ValueError(msg) | |
except (KeyError, SyntaxError): | |
raise SyntaxError( | |
"You have used an incorrect or unsupported XPath " | |
"expression for etree library or you used an " | |
"undeclared namespace prefix." | |
) | |
def _validate_names(self) -> None: | |
children: list[Any] | |
if self.names: | |
if self.iterparse: | |
children = self.iterparse[next(iter(self.iterparse))] | |
else: | |
parent = self.xml_doc.find(self.xpath, namespaces=self.namespaces) | |
children = parent.findall("*") if parent else [] | |
if is_list_like(self.names): | |
if len(self.names) < len(children): | |
raise ValueError( | |
"names does not match length of child elements in xpath." | |
) | |
else: | |
raise TypeError( | |
f"{type(self.names).__name__} is not a valid type for names" | |
) | |
def _parse_doc( | |
self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | |
) -> Element: | |
from xml.etree.ElementTree import ( | |
XMLParser, | |
parse, | |
) | |
handle_data = get_data_from_filepath( | |
filepath_or_buffer=raw_doc, | |
encoding=self.encoding, | |
compression=self.compression, | |
storage_options=self.storage_options, | |
) | |
with preprocess_data(handle_data) as xml_data: | |
curr_parser = XMLParser(encoding=self.encoding) | |
doc = parse(xml_data, parser=curr_parser) | |
return doc.getroot() | |
class _LxmlFrameParser(_XMLFrameParser): | |
""" | |
Internal class to parse XML into DataFrames with third-party | |
full-featured XML library, `lxml`, that supports | |
XPath 1.0 and XSLT 1.0. | |
""" | |
def parse_data(self) -> list[dict[str, str | None]]: | |
""" | |
Parse xml data. | |
This method will call the other internal methods to | |
validate xpath, names, optionally parse and run XSLT, | |
and parse original or transformed XML and return specific nodes. | |
""" | |
from lxml.etree import iterparse | |
if self.iterparse is None: | |
self.xml_doc = self._parse_doc(self.path_or_buffer) | |
if self.stylesheet: | |
self.xsl_doc = self._parse_doc(self.stylesheet) | |
self.xml_doc = self._transform_doc() | |
self._validate_path() | |
elems = self.xml_doc.xpath(self.xpath, namespaces=self.namespaces) | |
self._validate_names() | |
xml_dicts: list[dict[str, str | None]] = ( | |
self._parse_nodes(elems) | |
if self.iterparse is None | |
else self._iterparse_nodes(iterparse) | |
) | |
return xml_dicts | |
def _validate_path(self) -> None: | |
msg = ( | |
"xpath does not return any nodes. " | |
"Be sure row level nodes are in xpath. " | |
"If document uses namespaces denoted with " | |
"xmlns, be sure to define namespaces and " | |
"use them in xpath." | |
) | |
elems = self.xml_doc.xpath(self.xpath, namespaces=self.namespaces) | |
children = self.xml_doc.xpath(self.xpath + "/*", namespaces=self.namespaces) | |
attrs = self.xml_doc.xpath(self.xpath + "/@*", namespaces=self.namespaces) | |
if elems == []: | |
raise ValueError(msg) | |
if elems != [] and attrs == [] and children == []: | |
raise ValueError(msg) | |
def _validate_names(self) -> None: | |
children: list[Any] | |
if self.names: | |
if self.iterparse: | |
children = self.iterparse[next(iter(self.iterparse))] | |
else: | |
children = self.xml_doc.xpath( | |
self.xpath + "[1]/*", namespaces=self.namespaces | |
) | |
if is_list_like(self.names): | |
if len(self.names) < len(children): | |
raise ValueError( | |
"names does not match length of child elements in xpath." | |
) | |
else: | |
raise TypeError( | |
f"{type(self.names).__name__} is not a valid type for names" | |
) | |
def _parse_doc( | |
self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | |
) -> _Element: | |
from lxml.etree import ( | |
XMLParser, | |
fromstring, | |
parse, | |
) | |
handle_data = get_data_from_filepath( | |
filepath_or_buffer=raw_doc, | |
encoding=self.encoding, | |
compression=self.compression, | |
storage_options=self.storage_options, | |
) | |
with preprocess_data(handle_data) as xml_data: | |
curr_parser = XMLParser(encoding=self.encoding) | |
if isinstance(xml_data, io.StringIO): | |
if self.encoding is None: | |
raise TypeError( | |
"Can not pass encoding None when input is StringIO." | |
) | |
doc = fromstring( | |
xml_data.getvalue().encode(self.encoding), parser=curr_parser | |
) | |
else: | |
doc = parse(xml_data, parser=curr_parser) | |
return doc | |
def _transform_doc(self) -> _XSLTResultTree: | |
""" | |
Transform original tree using stylesheet. | |
This method will transform original xml using XSLT script into | |
am ideally flatter xml document for easier parsing and migration | |
to Data Frame. | |
""" | |
from lxml.etree import XSLT | |
transformer = XSLT(self.xsl_doc) | |
new_doc = transformer(self.xml_doc) | |
return new_doc | |
def get_data_from_filepath( | |
filepath_or_buffer: FilePath | bytes | ReadBuffer[bytes] | ReadBuffer[str], | |
encoding: str | None, | |
compression: CompressionOptions, | |
storage_options: StorageOptions, | |
) -> str | bytes | ReadBuffer[bytes] | ReadBuffer[str]: | |
""" | |
Extract raw XML data. | |
The method accepts three input types: | |
1. filepath (string-like) | |
2. file-like object (e.g. open file object, StringIO) | |
3. XML string or bytes | |
This method turns (1) into (2) to simplify the rest of the processing. | |
It returns input types (2) and (3) unchanged. | |
""" | |
if not isinstance(filepath_or_buffer, bytes): | |
filepath_or_buffer = stringify_path(filepath_or_buffer) | |
if ( | |
isinstance(filepath_or_buffer, str) | |
and not filepath_or_buffer.startswith(("<?xml", "<")) | |
) and ( | |
not isinstance(filepath_or_buffer, str) | |
or is_url(filepath_or_buffer) | |
or is_fsspec_url(filepath_or_buffer) | |
or file_exists(filepath_or_buffer) | |
): | |
with get_handle( | |
filepath_or_buffer, | |
"r", | |
encoding=encoding, | |
compression=compression, | |
storage_options=storage_options, | |
) as handle_obj: | |
filepath_or_buffer = ( | |
# error: Incompatible types in assignment (expression has type | |
# "Union[str, IO[str]]", variable has type "Union[Union[str, | |
# PathLike[str]], bytes, ReadBuffer[bytes], ReadBuffer[str]]") | |
handle_obj.handle.read() # type: ignore[assignment] | |
if hasattr(handle_obj.handle, "read") | |
else handle_obj.handle | |
) | |
return filepath_or_buffer | |
def preprocess_data(data) -> io.StringIO | io.BytesIO: | |
""" | |
Convert extracted raw data. | |
This method will return underlying data of extracted XML content. | |
The data either has a `read` attribute (e.g. a file object or a | |
StringIO/BytesIO) or is a string or bytes that is an XML document. | |
""" | |
if isinstance(data, str): | |
data = io.StringIO(data) | |
elif isinstance(data, bytes): | |
data = io.BytesIO(data) | |
return data | |
def _data_to_frame(data, **kwargs) -> DataFrame: | |
""" | |
Convert parsed data to Data Frame. | |
This method will bind xml dictionary data of keys and values | |
into named columns of Data Frame using the built-in TextParser | |
class that build Data Frame and infers specific dtypes. | |
""" | |
tags = next(iter(data)) | |
nodes = [list(d.values()) for d in data] | |
try: | |
with TextParser(nodes, names=tags, **kwargs) as tp: | |
return tp.read() | |
except ParserError: | |
raise ParserError( | |
"XML document may be too complex for import. " | |
"Try to flatten document and use distinct " | |
"element and attribute names." | |
) | |
def _parse( | |
path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str], | |
xpath: str, | |
namespaces: dict[str, str] | None, | |
elems_only: bool, | |
attrs_only: bool, | |
names: Sequence[str] | None, | |
dtype: DtypeArg | None, | |
converters: ConvertersArg | None, | |
parse_dates: ParseDatesArg | None, | |
encoding: str | None, | |
parser: XMLParsers, | |
stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None, | |
iterparse: dict[str, list[str]] | None, | |
compression: CompressionOptions, | |
storage_options: StorageOptions, | |
**kwargs, | |
) -> DataFrame: | |
""" | |
Call internal parsers. | |
This method will conditionally call internal parsers: | |
LxmlFrameParser and/or EtreeParser. | |
Raises | |
------ | |
ImportError | |
* If lxml is not installed if selected as parser. | |
ValueError | |
* If parser is not lxml or etree. | |
""" | |
p: _EtreeFrameParser | _LxmlFrameParser | |
if parser == "lxml": | |
lxml = import_optional_dependency("lxml.etree", errors="ignore") | |
if lxml is not None: | |
p = _LxmlFrameParser( | |
path_or_buffer, | |
xpath, | |
namespaces, | |
elems_only, | |
attrs_only, | |
names, | |
dtype, | |
converters, | |
parse_dates, | |
encoding, | |
stylesheet, | |
iterparse, | |
compression, | |
storage_options, | |
) | |
else: | |
raise ImportError("lxml not found, please install or use the etree parser.") | |
elif parser == "etree": | |
p = _EtreeFrameParser( | |
path_or_buffer, | |
xpath, | |
namespaces, | |
elems_only, | |
attrs_only, | |
names, | |
dtype, | |
converters, | |
parse_dates, | |
encoding, | |
stylesheet, | |
iterparse, | |
compression, | |
storage_options, | |
) | |
else: | |
raise ValueError("Values for parser can only be lxml or etree.") | |
data_dicts = p.parse_data() | |
return _data_to_frame( | |
data=data_dicts, | |
dtype=dtype, | |
converters=converters, | |
parse_dates=parse_dates, | |
**kwargs, | |
) | |
def read_xml( | |
path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str], | |
xpath: str = "./*", | |
namespaces: dict[str, str] | None = None, | |
elems_only: bool = False, | |
attrs_only: bool = False, | |
names: Sequence[str] | None = None, | |
dtype: DtypeArg | None = None, | |
converters: ConvertersArg | None = None, | |
parse_dates: ParseDatesArg | None = None, | |
# encoding can not be None for lxml and StringIO input | |
encoding: str | None = "utf-8", | |
parser: XMLParsers = "lxml", | |
stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None = None, | |
iterparse: dict[str, list[str]] | None = None, | |
compression: CompressionOptions = "infer", | |
storage_options: StorageOptions = None, | |
) -> DataFrame: | |
r""" | |
Read XML document into a ``DataFrame`` object. | |
.. versionadded:: 1.3.0 | |
Parameters | |
---------- | |
path_or_buffer : str, path object, or file-like object | |
String, path object (implementing ``os.PathLike[str]``), or file-like | |
object implementing a ``read()`` function. The string can be any valid XML | |
string or a path. The string can further be a URL. Valid URL schemes | |
include http, ftp, s3, and file. | |
xpath : str, optional, default './\*' | |
The XPath to parse required set of nodes for migration to DataFrame. | |
XPath should return a collection of elements and not a single | |
element. Note: The ``etree`` parser supports limited XPath | |
expressions. For more complex XPath, use ``lxml`` which requires | |
installation. | |
namespaces : dict, optional | |
The namespaces defined in XML document as dicts with key being | |
namespace prefix and value the URI. There is no need to include all | |
namespaces in XML, only the ones used in ``xpath`` expression. | |
Note: if XML document uses default namespace denoted as | |
`xmlns='<URI>'` without a prefix, you must assign any temporary | |
namespace prefix such as 'doc' to the URI in order to parse | |
underlying nodes and/or attributes. For example, :: | |
namespaces = {{"doc": "https://example.com"}} | |
elems_only : bool, optional, default False | |
Parse only the child elements at the specified ``xpath``. By default, | |
all child elements and non-empty text nodes are returned. | |
attrs_only : bool, optional, default False | |
Parse only the attributes at the specified ``xpath``. | |
By default, all attributes are returned. | |
names : list-like, optional | |
Column names for DataFrame of parsed XML data. Use this parameter to | |
rename original element names and distinguish same named elements and | |
attributes. | |
dtype : Type name or dict of column -> type, optional | |
Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32, | |
'c': 'Int64'}} | |
Use `str` or `object` together with suitable `na_values` settings | |
to preserve and not interpret dtype. | |
If converters are specified, they will be applied INSTEAD | |
of dtype conversion. | |
.. versionadded:: 1.5.0 | |
converters : dict, optional | |
Dict of functions for converting values in certain columns. Keys can either | |
be integers or column labels. | |
.. versionadded:: 1.5.0 | |
parse_dates : bool or list of int or names or list of lists or dict, default False | |
Identifiers to parse index or columns to datetime. The behavior is as follows: | |
* boolean. If True -> try parsing the index. | |
* list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3 | |
each as a separate date column. | |
* list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as | |
a single date column. | |
* dict, e.g. {{'foo' : [1, 3]}} -> parse columns 1, 3 as date and call | |
result 'foo' | |
.. versionadded:: 1.5.0 | |
encoding : str, optional, default 'utf-8' | |
Encoding of XML document. | |
parser : {{'lxml','etree'}}, default 'lxml' | |
Parser module to use for retrieval of data. Only 'lxml' and | |
'etree' are supported. With 'lxml' more complex XPath searches | |
and ability to use XSLT stylesheet are supported. | |
stylesheet : str, path object or file-like object | |
A URL, file-like object, or a raw string containing an XSLT script. | |
This stylesheet should flatten complex, deeply nested XML documents | |
for easier parsing. To use this feature you must have ``lxml`` module | |
installed and specify 'lxml' as ``parser``. The ``xpath`` must | |
reference nodes of transformed XML document generated after XSLT | |
transformation and not the original XML document. Only XSLT 1.0 | |
scripts and not later versions is currently supported. | |
iterparse : dict, optional | |
The nodes or attributes to retrieve in iterparsing of XML document | |
as a dict with key being the name of repeating element and value being | |
list of elements or attribute names that are descendants of the repeated | |
element. Note: If this option is used, it will replace ``xpath`` parsing | |
and unlike xpath, descendants do not need to relate to each other but can | |
exist any where in document under the repeating element. This memory- | |
efficient method should be used for very large XML files (500MB, 1GB, or 5GB+). | |
For example, :: | |
iterparse = {{"row_element": ["child_elem", "attr", "grandchild_elem"]}} | |
.. versionadded:: 1.5.0 | |
{decompression_options} | |
.. versionchanged:: 1.4.0 Zstandard support. | |
{storage_options} | |
Returns | |
------- | |
df | |
A DataFrame. | |
See Also | |
-------- | |
read_json : Convert a JSON string to pandas object. | |
read_html : Read HTML tables into a list of DataFrame objects. | |
Notes | |
----- | |
This method is best designed to import shallow XML documents in | |
following format which is the ideal fit for the two-dimensions of a | |
``DataFrame`` (row by column). :: | |
<root> | |
<row> | |
<column1>data</column1> | |
<column2>data</column2> | |
<column3>data</column3> | |
... | |
</row> | |
<row> | |
... | |
</row> | |
... | |
</root> | |
As a file format, XML documents can be designed any way including | |
layout of elements and attributes as long as it conforms to W3C | |
specifications. Therefore, this method is a convenience handler for | |
a specific flatter design and not all possible XML structures. | |
However, for more complex XML documents, ``stylesheet`` allows you to | |
temporarily redesign original document with XSLT (a special purpose | |
language) for a flatter version for migration to a DataFrame. | |
This function will *always* return a single :class:`DataFrame` or raise | |
exceptions due to issues with XML document, ``xpath``, or other | |
parameters. | |
See the :ref:`read_xml documentation in the IO section of the docs | |
<io.read_xml>` for more information in using this method to parse XML | |
files to DataFrames. | |
Examples | |
-------- | |
>>> xml = '''<?xml version='1.0' encoding='utf-8'?> | |
... <data xmlns="http://example.com"> | |
... <row> | |
... <shape>square</shape> | |
... <degrees>360</degrees> | |
... <sides>4.0</sides> | |
... </row> | |
... <row> | |
... <shape>circle</shape> | |
... <degrees>360</degrees> | |
... <sides/> | |
... </row> | |
... <row> | |
... <shape>triangle</shape> | |
... <degrees>180</degrees> | |
... <sides>3.0</sides> | |
... </row> | |
... </data>''' | |
>>> df = pd.read_xml(xml) | |
>>> df | |
shape degrees sides | |
0 square 360 4.0 | |
1 circle 360 NaN | |
2 triangle 180 3.0 | |
>>> xml = '''<?xml version='1.0' encoding='utf-8'?> | |
... <data> | |
... <row shape="square" degrees="360" sides="4.0"/> | |
... <row shape="circle" degrees="360"/> | |
... <row shape="triangle" degrees="180" sides="3.0"/> | |
... </data>''' | |
>>> df = pd.read_xml(xml, xpath=".//row") | |
>>> df | |
shape degrees sides | |
0 square 360 4.0 | |
1 circle 360 NaN | |
2 triangle 180 3.0 | |
>>> xml = '''<?xml version='1.0' encoding='utf-8'?> | |
... <doc:data xmlns:doc="https://example.com"> | |
... <doc:row> | |
... <doc:shape>square</doc:shape> | |
... <doc:degrees>360</doc:degrees> | |
... <doc:sides>4.0</doc:sides> | |
... </doc:row> | |
... <doc:row> | |
... <doc:shape>circle</doc:shape> | |
... <doc:degrees>360</doc:degrees> | |
... <doc:sides/> | |
... </doc:row> | |
... <doc:row> | |
... <doc:shape>triangle</doc:shape> | |
... <doc:degrees>180</doc:degrees> | |
... <doc:sides>3.0</doc:sides> | |
... </doc:row> | |
... </doc:data>''' | |
>>> df = pd.read_xml(xml, | |
... xpath="//doc:row", | |
... namespaces={{"doc": "https://example.com"}}) | |
>>> df | |
shape degrees sides | |
0 square 360 4.0 | |
1 circle 360 NaN | |
2 triangle 180 3.0 | |
""" | |
return _parse( | |
path_or_buffer=path_or_buffer, | |
xpath=xpath, | |
namespaces=namespaces, | |
elems_only=elems_only, | |
attrs_only=attrs_only, | |
names=names, | |
dtype=dtype, | |
converters=converters, | |
parse_dates=parse_dates, | |
encoding=encoding, | |
parser=parser, | |
stylesheet=stylesheet, | |
iterparse=iterparse, | |
compression=compression, | |
storage_options=storage_options, | |
) | |