fix: Allow init from BigQuery Arrow data containing ExtensionType cols with irrelevant metadata #21492

Open · wants to merge 1 commit into main
crates/polars-python/src/interop/arrow/to_rust.rs (+43 −1)
@@ -17,7 +17,49 @@ pub fn field_to_rust_arrow(obj: Bound<'_, PyAny>) -> PyResult<ArrowField> {
// make the conversion through PyArrow's private API
obj.call_method1("_export_to_c", (schema_ptr as Py_uintptr_t,))?;
let field = unsafe { ffi::import_field_from_c(schema.as_ref()).map_err(PyPolarsErr::from)? };
- Ok(field.clone())
+ Ok(normalize_arrow_fields(&field))
}

fn normalize_arrow_fields(field: &ArrowField) -> ArrowField {
// Normalize fields whose extension dtype wraps a standard dtype and carries only
// (for us) irrelevant metadata; recreate such fields using the inner (standard) dtype.
match field {
ArrowField {
dtype: ArrowDataType::Struct(ref fields),
..
} => {
let mut normalized = false;
let normalized_fields: Vec<_> = fields
.iter()
.map(|f| {
// Note: Google BigQuery returns column data as a standard Arrow dtype, but the
// SQL type it was loaded from is attached as metadata (resulting in an extension dtype).
if let ArrowDataType::Extension(ext_type) = &f.dtype {
if ext_type.name.starts_with("google:sqlType:") {
normalized = true;
return ArrowField::new(
f.name.clone(),
ext_type.inner.clone(),
f.is_nullable,
);
}
}
f.clone()
})
.collect();

if normalized {
ArrowField::new(
field.name.clone(),
ArrowDataType::Struct(normalized_fields),
field.is_nullable,
)
} else {
field.clone()
}
},
_ => field.clone(),
}
}

pub fn field_to_rust(obj: Bound<'_, PyAny>) -> PyResult<Field> {
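For context, a minimal Python sketch of the input this change targets (it mirrors the new test added below, and assumes pyarrow is installed): BigQuery's Arrow output marks otherwise-standard columns with an extension name under the "ARROW:extension:name" metadata key, which previously surfaced in Polars as an unsupported extension dtype.

import pyarrow as pa

import polars as pl

schema = pa.schema(
    [
        pa.field("id", pa.int64()).with_metadata(
            {"ARROW:extension:name": "google:sqlType:integer"}
        )
    ]
)
tbl = pa.Table.from_pylist([{"id": 1}, {"id": 2}], schema=schema)

# Before this fix, importing such a schema failed on the unrecognized
# extension dtype; normalize_arrow_fields now falls back to the inner int64.
df = pl.from_arrow(tbl)
print(df.schema)  # expected: {'id': Int64}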
py-polars/polars/selectors.py (+5 −5)
@@ -127,13 +127,13 @@ def expand_selector(
Parameters
----------
target
- A polars DataFrame, LazyFrame or schema.
+ A Polars DataFrame, LazyFrame or Schema.
selector
An arbitrary polars selector (or compound selector).
strict
- Setting False will additionally allow for a broader range of column selection
- expressions (such as bare columns or use of `.exclude()`) to be expanded, not
- just the dedicated selectors.
+ Setting False additionally allows for a broader range of column selection
+ expressions (such as bare columns or use of `.exclude()`) to be expanded,
+ not just the dedicated selectors.

Examples
--------
Expand All @@ -158,7 +158,7 @@ def expand_selector(
>>> cs.expand_selector(df.lazy(), ~(cs.first() | cs.last()))
('coly',)

- Expand selector with respect to a standalone schema:
+ Expand selector with respect to a standalone `Schema` dict:

>>> schema = {
... "id": pl.Int64,
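As a quick illustration of the `strict` behavior described above (a toy frame with hypothetical column names; the call shape follows the updated docstring):

import polars as pl
import polars.selectors as cs

df = pl.DataFrame({"colx": [0], "coly": [1], "colz": [2]})

# With strict=False, bare column expressions expand too, not just selectors.
print(cs.expand_selector(df, pl.col("colx", "colz"), strict=False))
# expected: ('colx', 'colz')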
py-polars/tests/unit/interop/test_interop.py (+25 −0)
@@ -235,6 +235,31 @@ def test_from_arrow() -> None:
assert df.schema == {"a": pl.UInt32, "b": pl.UInt64} # type: ignore[union-attr]


def test_from_arrow_with_bigquery_metadata() -> None:
arrow_schema = pa.schema(
[
pa.field("id", pa.int64()).with_metadata(
{"ARROW:extension:name": "google:sqlType:integer"}
),
pa.field(
"misc",
pa.struct([("num", pa.int32()), ("val", pa.string())]),
).with_metadata({"ARROW:extension:name": "google:sqlType:struct"}),
]
)
arrow_tbl = pa.Table.from_pylist(
[{"id": 1, "misc": None}, {"id": 2, "misc": None}],
schema=arrow_schema,
)

expected_data = {"id": [1, 2], "num": [None, None], "val": [None, None]}
expected_schema = {"id": pl.Int64, "num": pl.Int32, "val": pl.String}
assert_frame_equal(
pl.DataFrame(expected_data, schema=expected_schema),
pl.from_arrow(arrow_tbl).unnest("misc"), # type: ignore[union-attr]
)


def test_from_optional_not_available() -> None:
from polars.dependencies import _LazyModule
