From 72891a72676ed41bba78b5b9c750f4aa384a57c2 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 30 Dec 2022 11:15:33 +0100 Subject: [PATCH 1/4] ENH: Add use_nullable_dtypes to read_xml --- doc/source/whatsnew/v2.0.0.rst | 2 + pandas/_libs/ops.pyx | 2 +- pandas/io/xml.py | 17 +++++++ pandas/tests/io/xml/test_xml.py | 83 ++++++++++++++++++++++++++++++++- 4 files changed, 102 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index b1387e9717079..712635d7a7e2a 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -39,6 +39,7 @@ The ``use_nullable_dtypes`` keyword argument has been expanded to the following * :func:`read_fwf` * :func:`read_excel` * :func:`read_html` +* :func:`read_xml` * :func:`read_sql` * :func:`read_sql_query` * :func:`read_sql_table` @@ -49,6 +50,7 @@ to select the nullable dtypes implementation. * :func:`read_csv` (with ``engine="pyarrow"`` or ``engine="python"``) * :func:`read_excel` * :func:`read_html` +* :func:`read_xml` * :func:`read_parquet` * :func:`read_orc` diff --git a/pandas/_libs/ops.pyx b/pandas/_libs/ops.pyx index 478e7eaee90c1..9154e836b3477 100644 --- a/pandas/_libs/ops.pyx +++ b/pandas/_libs/ops.pyx @@ -292,7 +292,7 @@ def maybe_convert_bool(ndarray[object] arr, result[i] = 1 elif val in false_vals: result[i] = 0 - elif is_nan(val): + elif is_nan(val) or val is None: mask[i] = 1 result[i] = 0 # Value here doesn't matter, will be replaced w/ nan has_na = True diff --git a/pandas/io/xml.py b/pandas/io/xml.py index 4f61455826286..1368a407fa494 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -774,6 +774,7 @@ def _parse( iterparse: dict[str, list[str]] | None, compression: CompressionOptions, storage_options: StorageOptions, + use_nullable_dtypes: bool = False, **kwargs, ) -> DataFrame: """ @@ -843,6 +844,7 @@ def _parse( dtype=dtype, converters=converters, parse_dates=parse_dates, + use_nullable_dtypes=use_nullable_dtypes, **kwargs, ) @@ -869,6 +871,7 @@ def read_xml( iterparse: dict[str, list[str]] | None = None, compression: CompressionOptions = "infer", storage_options: StorageOptions = None, + use_nullable_dtypes: bool = False, ) -> DataFrame: r""" Read XML document into a ``DataFrame`` object. @@ -980,6 +983,19 @@ def read_xml( {storage_options} + use_nullable_dtypes : bool = False + Whether or not to use nullable dtypes as default when reading data. If + set to True, nullable dtypes are used for all dtypes that have a nullable + implementation, even if no nulls are present. + + The nullable dtype implementation can be configured by calling + ``pd.set_option("mode.dtype_backend", "pandas")`` to use + numpy-backed nullable dtypes or + ``pd.set_option("mode.dtype_backend", "pyarrow")`` to use + pyarrow-backed nullable dtypes (using ``pd.ArrowDtype``). + + .. versionadded:: 2.0 + Returns ------- df @@ -1113,4 +1129,5 @@ def read_xml( iterparse=iterparse, compression=compression, storage_options=storage_options, + use_nullable_dtypes=use_nullable_dtypes, ) diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index aeaf2d3b7edbf..b442b0dead376 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -21,8 +21,17 @@ ) import pandas.util._test_decorators as td -from pandas import DataFrame +import pandas as pd +from pandas import ( + NA, + DataFrame, + Series, +) import pandas._testing as tm +from pandas.core.arrays import ( + ArrowStringArray, + StringArray, +) from pandas.io.common import get_handle from pandas.io.xml import read_xml @@ -1702,3 +1711,75 @@ def test_s3_parser_consistency(): ) tm.assert_frame_equal(df_lxml, df_etree) + + +@pytest.mark.parametrize("dtype_backend", ["pandas", "pyarrow"]) +def test_read_xml_nullable_dtypes(parser, string_storage, dtype_backend): + # GH# + data = """ + + + x + 1 + 4.0 + x + 2 + 4.0 + + True + False + + + y + 2 + 5.0 + + + + + False + + +""" + + if string_storage == "python": + string_array = StringArray(np.array(["x", "y"], dtype=np.object_)) + string_array_na = StringArray(np.array(["x", NA], dtype=np.object_)) + + else: + pa = pytest.importorskip("pyarrow") + string_array = ArrowStringArray(pa.array(["x", "y"])) + string_array_na = ArrowStringArray(pa.array(["x", None])) + + with pd.option_context("mode.string_storage", string_storage): + with pd.option_context("mode.dtype_backend", dtype_backend): + result = read_xml(data, parser=parser, use_nullable_dtypes=True) + + expected = DataFrame( + { + "a": string_array, + "b": Series([1, 2], dtype="Int64"), + "c": Series([4.0, 5.0], dtype="Float64"), + "d": string_array_na, + "e": Series([2, NA], dtype="Int64"), + "f": Series([4.0, NA], dtype="Float64"), + "g": Series([NA, NA], dtype="Int64"), + "h": Series([True, False], dtype="boolean"), + "i": Series([False, NA], dtype="boolean"), + } + ) + + if dtype_backend == "pyarrow": + import pyarrow as pa + + from pandas.arrays import ArrowExtensionArray + + expected = DataFrame( + { + col: ArrowExtensionArray(pa.array(expected[col], from_pandas=True)) + for col in expected.columns + } + ) + expected["g"] = ArrowExtensionArray(pa.array([None, None])) + + tm.assert_frame_equal(result, expected) From 437426043dba0083f063f1b716f313e573f5a07c Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 30 Dec 2022 11:16:26 +0100 Subject: [PATCH 2/4] Add gh ref --- pandas/tests/io/xml/test_xml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index b442b0dead376..a7411c560c1f4 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -1715,7 +1715,7 @@ def test_s3_parser_consistency(): @pytest.mark.parametrize("dtype_backend", ["pandas", "pyarrow"]) def test_read_xml_nullable_dtypes(parser, string_storage, dtype_backend): - # GH# + # GH#50500 data = """ From 847d8bd0027ac07eb3dd5c47d8c2c819d530afa7 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 30 Dec 2022 12:23:02 +0100 Subject: [PATCH 3/4] Move import --- pandas/tests/io/xml/test_xml.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index a7411c560c1f4..6265ca48b745a 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -1716,6 +1716,8 @@ def test_s3_parser_consistency(): @pytest.mark.parametrize("dtype_backend", ["pandas", "pyarrow"]) def test_read_xml_nullable_dtypes(parser, string_storage, dtype_backend): # GH#50500 + if string_storage == "pyarrow" or dtype_backend == "pyarrow": + pa = pytest.importorskip("pyarrow") data = """ @@ -1747,7 +1749,6 @@ def test_read_xml_nullable_dtypes(parser, string_storage, dtype_backend): string_array_na = StringArray(np.array(["x", NA], dtype=np.object_)) else: - pa = pytest.importorskip("pyarrow") string_array = ArrowStringArray(pa.array(["x", "y"])) string_array_na = ArrowStringArray(pa.array(["x", None])) From 9f35147f58d06a59cb43f3defbdd3eaeac2b2bc2 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 3 Jan 2023 22:46:00 +0100 Subject: [PATCH 4/4] Remove import --- pandas/tests/io/xml/test_xml.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index 6265ca48b745a..d65b9b8af4365 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -1771,8 +1771,6 @@ def test_read_xml_nullable_dtypes(parser, string_storage, dtype_backend): ) if dtype_backend == "pyarrow": - import pyarrow as pa - from pandas.arrays import ArrowExtensionArray expected = DataFrame(