From 557b0a00bcbeaf75d9176083d27be0c119f8b807 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Thu, 21 Dec 2023 17:31:51 +0100
Subject: [PATCH 1/2] ENH: support the Arrow PyCapsule Interface on pandas.DataFrame

---
 pandas/compat/_optional.py                 |  5 ++-
 pandas/core/frame.py                       | 24 ++++++++++++
 pandas/tests/frame/test_arrow_interface.py | 45 ++++++++++++++++++++++
 pandas/tests/test_optional_dependency.py   | 14 +++++++
 4 files changed, 86 insertions(+), 2 deletions(-)
 create mode 100644 pandas/tests/frame/test_arrow_interface.py

diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py
index 9d04d7c0a1216..2bc6cd46f09a7 100644
--- a/pandas/compat/_optional.py
+++ b/pandas/compat/_optional.py
@@ -120,9 +120,8 @@ def import_optional_dependency(
     The imported module, when found and the version is correct.
     None is returned when the package is not found and `errors`
     is False, or when the package's version is too old and `errors`
-    is ``'warn'``.
+    is ``'warn'`` or ``'ignore'``.
     """
-
     assert errors in {"warn", "raise", "ignore"}
 
     package_name = INSTALL_MAPPING.get(name)
@@ -163,5 +162,7 @@ def import_optional_dependency(
                 return None
             elif errors == "raise":
                 raise ImportError(msg)
+            else:
+                return None
 
     return module
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 0d0dc476ee76a..5d0a048417fee 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -971,6 +971,30 @@ def __dataframe_consortium_standard__(
         )
         return convert_to_standard_compliant_dataframe(self, api_version=api_version)
 
+    def __arrow_c_stream__(self, requested_schema=None):
+        """
+        Export the pandas DataFrame as an Arrow C stream PyCapsule.
+
+        This relies on pyarrow to convert the pandas DataFrame to the Arrow
+        format. This conversion is not necessarily zero-copy.
+
+        Parameters
+        ----------
+        requested_schema : PyCapsule, default None
+            The schema to which the dataframe should be cast, passed as a
+            PyCapsule containing a C ArrowSchema representation of the
+            requested schema.
+
+        Returns
+        -------
+        PyCapsule
+        """
+        pa = import_optional_dependency("pyarrow", min_version="14.0.0")
+        if requested_schema is not None:
+            requested_schema = pa.Schema._import_from_c_capsule(requested_schema)
+        table = pa.Table.from_pandas(self, schema=requested_schema)
+        return table.__arrow_c_stream__()
+
     # ----------------------------------------------------------------------
 
     @property
diff --git a/pandas/tests/frame/test_arrow_interface.py b/pandas/tests/frame/test_arrow_interface.py
new file mode 100644
index 0000000000000..ac7b51cbdfa92
--- /dev/null
+++ b/pandas/tests/frame/test_arrow_interface.py
@@ -0,0 +1,45 @@
+import ctypes
+
+import pytest
+
+import pandas.util._test_decorators as td
+
+import pandas as pd
+
+pa = pytest.importorskip("pyarrow")
+
+
+@td.skip_if_no("pyarrow", min_version="14.0")
+def test_dataframe_arrow_interface():
+    df = pd.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]})
+
+    capsule = df.__arrow_c_stream__()
+    assert (
+        ctypes.pythonapi.PyCapsule_IsValid(
+            ctypes.py_object(capsule), b"arrow_array_stream"
+        )
+        == 1
+    )
+
+    table = pa.table(df)
+    expected = pa.table({"a": [1, 2, 3], "b": ["a", "b", "c"]})
+    assert table.equals(expected)
+
+    schema = pa.schema([("a", pa.int8()), ("b", pa.string())])
+    table = pa.table(df, schema=schema)
+    expected = expected.cast(schema)
+    assert table.equals(expected)
+
+
+@td.skip_if_no("pyarrow", min_version="15.0")
+def test_dataframe_to_arrow():
+    df = pd.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]})
+
+    table = pa.RecordBatchReader.from_stream(df).read_all()
+    expected = pa.table({"a": [1, 2, 3], "b": ["a", "b", "c"]})
+    assert table.equals(expected)
+
+    schema = pa.schema([("a", pa.int8()), ("b", pa.string())])
+    table = pa.RecordBatchReader.from_stream(df, schema=schema).read_all()
+    expected = expected.cast(schema)
+    assert table.equals(expected)
diff --git a/pandas/tests/test_optional_dependency.py b/pandas/tests/test_optional_dependency.py
index c1d1948d6c31a..52b5f636b1254 100644
--- a/pandas/tests/test_optional_dependency.py
+++ b/pandas/tests/test_optional_dependency.py
@@ -50,6 +50,20 @@ def test_bad_version(monkeypatch):
     result = import_optional_dependency("fakemodule")
     assert result is module
 
+    with pytest.raises(ImportError, match="Pandas requires version '1.1.0'"):
+        import_optional_dependency("fakemodule", min_version="1.1.0")
+
+    with tm.assert_produces_warning(UserWarning):
+        result = import_optional_dependency(
+            "fakemodule", errors="warn", min_version="1.1.0"
+        )
+    assert result is None
+
+    result = import_optional_dependency(
+        "fakemodule", errors="ignore", min_version="1.1.0"
+    )
+    assert result is None
+
 
 def test_submodule(monkeypatch):
     # Create a fake module with a submodule

From 05fec034ebf6824e1b9e78c4f788f7231fdb8313 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Thu, 18 Jan 2024 14:33:32 +0100
Subject: [PATCH 2/2] expand documentation on how index is handled

---
 pandas/core/frame.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index dfbcead972813..e093d551f3ead 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -993,7 +993,10 @@ def __arrow_c_stream__(self, requested_schema=None):
         Export the pandas DataFrame as an Arrow C stream PyCapsule.
 
         This relies on pyarrow to convert the pandas DataFrame to the Arrow
-        format. This conversion is not necessarily zero-copy.
+        format (and follows the default behaviour of ``pyarrow.Table.from_pandas``
+        in its handling of the index, i.e. store the index as a column except
+        for RangeIndex).
+        This conversion is not necessarily zero-copy.
 
         Parameters
         ----------