diff --git a/arro3-core/python/arro3/core/_core.pyi b/arro3-core/python/arro3/core/_core.pyi index 93a21dc..8dddd1f 100644 --- a/arro3-core/python/arro3/core/_core.pyi +++ b/arro3-core/python/arro3/core/_core.pyi @@ -16,7 +16,7 @@ class Array: obj: A sequence of input objects. type: Explicit type to attempt to coerce to. """ - def __array__(self) -> NDArray: ... + def __array__(self, dtype=None, copy=None) -> NDArray: ... def __arrow_c_array__( self, requested_schema: object | None = None ) -> tuple[object, object]: ... @@ -101,7 +101,7 @@ class ChunkedArray: arrays: Sequence[ArrowArrayExportable], type: ArrowSchemaExportable | None = None, ) -> None: ... - def __array__(self) -> NDArray: ... + def __array__(self, dtype=None, copy=None) -> NDArray: ... def __arrow_c_stream__(self, requested_schema: object | None = None) -> object: ... def __eq__(self, other) -> bool: ... def __len__(self) -> int: ... @@ -1007,6 +1007,13 @@ class Table: Returns: _description_ """ + @classmethod + def from_batches( + cls, + batches: Sequence[ArrowArrayExportable], + *, + schema: ArrowSchemaExportable | None = None, + ) -> Table: ... @overload @classmethod def from_pydict( diff --git a/pyo3-arrow/src/array.rs b/pyo3-arrow/src/array.rs index 9c0507b..7a17510 100644 --- a/pyo3-arrow/src/array.rs +++ b/pyo3-arrow/src/array.rs @@ -197,7 +197,14 @@ impl PyArray { /// An implementation of the Array interface, for interoperability with numpy and other /// array libraries. - pub fn __array__(&self, py: Python) -> PyResult { + #[pyo3(signature = (dtype=None, copy=None))] + #[allow(unused_variables)] + pub fn __array__( + &self, + py: Python, + dtype: Option, + copy: Option, + ) -> PyResult { to_numpy(py, &self.array) } @@ -289,7 +296,7 @@ impl PyArray { /// Copy this array to a `numpy` NDArray pub fn to_numpy(&self, py: Python) -> PyResult { - self.__array__(py) + self.__array__(py, None, None) } #[getter] diff --git a/pyo3-arrow/src/chunked.rs b/pyo3-arrow/src/chunked.rs index e0270cf..9badb25 100644 --- a/pyo3-arrow/src/chunked.rs +++ b/pyo3-arrow/src/chunked.rs @@ -241,7 +241,14 @@ impl PyChunkedArray { /// An implementation of the Array interface, for interoperability with numpy and other /// array libraries. - pub fn __array__(&self, py: Python) -> PyResult { + #[pyo3(signature = (dtype=None, copy=None))] + #[allow(unused_variables)] + pub fn __array__( + &self, + py: Python, + dtype: Option, + copy: Option, + ) -> PyResult { let chunk_refs = self .chunks .iter() @@ -386,7 +393,7 @@ impl PyChunkedArray { /// Copy this array to a `numpy` NDArray pub fn to_numpy(&self, py: Python) -> PyResult { - self.__array__(py) + self.__array__(py, None, None) } pub fn r#type(&self, py: Python) -> PyResult { diff --git a/pyo3-arrow/src/interop/numpy/to_numpy.rs b/pyo3-arrow/src/interop/numpy/to_numpy.rs index afe2d25..924a5b1 100644 --- a/pyo3-arrow/src/interop/numpy/to_numpy.rs +++ b/pyo3-arrow/src/interop/numpy/to_numpy.rs @@ -3,8 +3,8 @@ use arrow::datatypes::*; use arrow_array::Array; use arrow_schema::DataType; use numpy::ToPyArray; -use pyo3::exceptions::PyValueError; -use pyo3::types::PyAnyMethods; +use pyo3::exceptions::{PyNotImplementedError, PyValueError}; +use pyo3::types::{PyAnyMethods, PyBytes, PyDict, PyList, PyString, PyTuple}; use pyo3::{intern, PyObject, PyResult, Python, ToPyObject}; pub fn to_numpy(py: Python, arr: &dyn Array) -> PyResult { @@ -39,7 +39,77 @@ pub fn to_numpy(py: Python, arr: &dyn Array) -> PyResult { let bools = arr.as_boolean().values().iter().collect::>(); bools.to_pyarray_bound(py).to_object(py) } - _ => todo!(), + // For other data types we create Python objects and then create an object-typed numpy + // array + DataType::Binary => { + let mut py_bytes = Vec::with_capacity(arr.len()); + arr.as_binary::() + .iter() + .for_each(|x| py_bytes.push(PyBytes::new_bound(py, x.unwrap()))); + let py_list = PyList::new_bound(py, py_bytes); + let numpy_mod = py.import_bound(intern!(py, "numpy"))?; + let kwargs = PyDict::new_bound(py); + kwargs.set_item("dtype", numpy_mod.getattr(intern!(py, "object_"))?)?; + let np_arr = numpy_mod.call_method( + intern!(py, "array"), + PyTuple::new_bound(py, vec![py_list]), + Some(&kwargs), + )?; + np_arr.into() + } + DataType::LargeBinary => { + let mut py_bytes = Vec::with_capacity(arr.len()); + arr.as_binary::() + .iter() + .for_each(|x| py_bytes.push(PyBytes::new_bound(py, x.unwrap()))); + let py_list = PyList::new_bound(py, py_bytes); + let numpy_mod = py.import_bound(intern!(py, "numpy"))?; + let kwargs = PyDict::new_bound(py); + kwargs.set_item("dtype", numpy_mod.getattr(intern!(py, "object_"))?)?; + let np_arr = numpy_mod.call_method( + intern!(py, "array"), + PyTuple::new_bound(py, vec![py_list]), + Some(&kwargs), + )?; + np_arr.into() + } + DataType::Utf8 => { + let mut py_bytes = Vec::with_capacity(arr.len()); + arr.as_string::() + .iter() + .for_each(|x| py_bytes.push(PyString::new_bound(py, x.unwrap()))); + let py_list = PyList::new_bound(py, py_bytes); + let numpy_mod = py.import_bound(intern!(py, "numpy"))?; + let kwargs = PyDict::new_bound(py); + kwargs.set_item("dtype", numpy_mod.getattr(intern!(py, "object_"))?)?; + let np_arr = numpy_mod.call_method( + intern!(py, "array"), + PyTuple::new_bound(py, vec![py_list]), + Some(&kwargs), + )?; + np_arr.into() + } + DataType::LargeUtf8 => { + let mut py_bytes = Vec::with_capacity(arr.len()); + arr.as_string::() + .iter() + .for_each(|x| py_bytes.push(PyString::new_bound(py, x.unwrap()))); + let py_list = PyList::new_bound(py, py_bytes); + let numpy_mod = py.import_bound(intern!(py, "numpy"))?; + let kwargs = PyDict::new_bound(py); + kwargs.set_item("dtype", numpy_mod.getattr(intern!(py, "object_"))?)?; + let np_arr = numpy_mod.call_method( + intern!(py, "array"), + PyTuple::new_bound(py, vec![py_list]), + Some(&kwargs), + )?; + np_arr.into() + } + dt => { + return Err(PyNotImplementedError::new_err(format!( + "Unsupported type in to_numpy {dt}" + ))) + } }; Ok(result) } diff --git a/pyo3-arrow/src/table.rs b/pyo3-arrow/src/table.rs index d072083..d9f2f36 100644 --- a/pyo3-arrow/src/table.rs +++ b/pyo3-arrow/src/table.rs @@ -146,6 +146,30 @@ impl PyTable { Ok(Self::new(batches, schema)) } + #[classmethod] + #[pyo3(signature = (batches, *, schema=None))] + pub fn from_batches( + _cls: &Bound, + batches: Vec, + schema: Option, + ) -> PyArrowResult { + if batches.is_empty() { + let schema = schema.ok_or(PyValueError::new_err( + "schema must be passed for an empty list of batches", + ))?; + return Ok(Self::new(vec![], schema.into_inner())); + } + + let batches = batches + .into_iter() + .map(|batch| batch.into_inner()) + .collect::>(); + let schema = schema + .map(|s| s.into_inner()) + .unwrap_or(batches.first().unwrap().schema()); + Ok(Self::new(batches, schema)) + } + #[classmethod] #[pyo3(signature = (mapping, *, schema=None, metadata=None))] pub fn from_pydict(