fix: Allow init from BigQuery Arrow data containing ExtensionType cols with irrelevant metadata #21492

Open · wants to merge 1 commit into main
crates/polars-python/src/interop/arrow/to_rust.rs (+43 −1)
@@ -17,7 +17,49 @@ pub fn field_to_rust_arrow(obj: Bound<'_, PyAny>) -> PyResult<ArrowField> {
// make the conversion through PyArrow's private API
obj.call_method1("_export_to_c", (schema_ptr as Py_uintptr_t,))?;
let field = unsafe { ffi::import_field_from_c(schema.as_ref()).map_err(PyPolarsErr::from)? };
- Ok(field.clone())
+ Ok(normalize_arrow_fields(&field))
}

fn normalize_arrow_fields(field: &ArrowField) -> ArrowField {
// Normalize fields whose extension dtype wraps a standard dtype and carries only
// (for us) irrelevant metadata; recreate such fields using the inner (standard) dtype.
match field {
ArrowField {
dtype: ArrowDataType::Struct(ref fields),
..
} => {
let mut normalized = false;
let normalized_fields: Vec<_> = fields
.iter()
.map(|f| {
// Note: Google BigQuery returns column data as a standard Arrow dtype, but the
// SQL type it was loaded from is attached as metadata (resulting in an extension dtype).
if let ArrowDataType::Extension(ext_type) = &f.dtype {
if ext_type.name.starts_with("google:sqlType:") {
normalized = true;
return ArrowField::new(
f.name.clone(),
ext_type.inner.clone(),
f.is_nullable,
);
}
}
f.clone()
})
.collect();

if normalized {
ArrowField::new(
field.name.clone(),
ArrowDataType::Struct(normalized_fields),
field.is_nullable,
)
} else {
field.clone()
}
},
_ => field.clone(),
}
}

pub fn field_to_rust(obj: Bound<'_, PyAny>) -> PyResult<Field> {
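For context, a minimal Python sketch of the input this change targets (it mirrors the new test added below, and assumes pyarrow is installed): BigQuery's Arrow output marks otherwise-standard columns with an extension name under the "ARROW:extension:name" metadata key, which previously surfaced in Polars as an unsupported extension dtype.

import pyarrow as pa

import polars as pl

schema = pa.schema(
    [
        pa.field("id", pa.int64()).with_metadata(
            {"ARROW:extension:name": "google:sqlType:integer"}
        )
    ]
)
tbl = pa.Table.from_pylist([{"id": 1}, {"id": 2}], schema=schema)

# Before this fix, importing such a schema failed on the unrecognized
# extension dtype; normalize_arrow_fields now falls back to the inner int64.
df = pl.from_arrow(tbl)
print(df.schema)  # expected: {'id': Int64}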
py-polars/polars/selectors.py (+5 −5)
@@ -127,13 +127,13 @@ def expand_selector(
Parameters
----------
target
- A polars DataFrame, LazyFrame or schema.
+ A Polars DataFrame, LazyFrame or Schema.
selector
An arbitrary polars selector (or compound selector).
strict
- Setting False will additionally allow for a broader range of column selection
- expressions (such as bare columns or use of `.exclude()`) to be expanded, not
- just the dedicated selectors.
+ Setting False additionally allows for a broader range of column selection
+ expressions (such as bare columns or use of `.exclude()`) to be expanded,
+ not just the dedicated selectors.

Examples
--------
Expand All @@ -158,7 +158,7 @@ def expand_selector(
>>> cs.expand_selector(df.lazy(), ~(cs.first() | cs.last()))
('coly',)

- Expand selector with respect to a standalone schema:
+ Expand selector with respect to a standalone `Schema` dict:

>>> schema = {
... "id": pl.Int64,
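As a quick illustration of the `strict` behavior described above (a toy frame with hypothetical column names; the call shape follows the updated docstring):

import polars as pl
import polars.selectors as cs

df = pl.DataFrame({"colx": [0], "coly": [1], "colz": [2]})

# With strict=False, bare column expressions expand too, not just selectors.
print(cs.expand_selector(df, pl.col("colx", "colz"), strict=False))
# expected: ('colx', 'colz')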
py-polars/tests/unit/interop/test_interop.py (+25 −0)
@@ -235,6 +235,31 @@ def test_from_arrow() -> None:
assert df.schema == {"a": pl.UInt32, "b": pl.UInt64} # type: ignore[union-attr]


def test_from_arrow_with_bigquery_metadata() -> None:
arrow_schema = pa.schema(
[
pa.field("id", pa.int64()).with_metadata(
{"ARROW:extension:name": "google:sqlType:integer"}
),
pa.field(
"misc",
pa.struct([("num", pa.int32()), ("val", pa.string())]),
).with_metadata({"ARROW:extension:name": "google:sqlType:struct"}),
]
)
arrow_tbl = pa.Table.from_pylist(
[{"id": 1, "misc": None}, {"id": 2, "misc": None}],
schema=arrow_schema,
)

expected_data = {"id": [1, 2], "num": [None, None], "val": [None, None]}
expected_schema = {"id": pl.Int64, "num": pl.Int32, "val": pl.String}
assert_frame_equal(
pl.DataFrame(expected_data, schema=expected_schema),
pl.from_arrow(arrow_tbl).unnest("misc"), # type: ignore[union-attr]
)


def test_from_optional_not_available() -> None:
from polars.dependencies import _LazyModule
