From 8978e1809f7001cb5384485458031e62094f42fc Mon Sep 17 00:00:00 2001
From: Jakub Valtar
Date: Mon, 17 Feb 2025 16:53:39 +0100
Subject: [PATCH] fix: Panic in `to_physical` for series of arrays and lists (#21289)

---
 .../src/chunked_array/array/mod.rs            | 25 ++++++++++++++-----
 .../polars-core/src/chunked_array/list/mod.rs | 11 ++++++--
 py-polars/tests/unit/series/test_series.py    | 19 ++++++++++++++
 3 files changed, 47 insertions(+), 8 deletions(-)

diff --git a/crates/polars-core/src/chunked_array/array/mod.rs b/crates/polars-core/src/chunked_array/array/mod.rs
index 26ef1d00ac32..4d1510a8d0e3 100644
--- a/crates/polars-core/src/chunked_array/array/mod.rs
+++ b/crates/polars-core/src/chunked_array/array/mod.rs
@@ -4,6 +4,8 @@ mod iterator;
 
 use std::borrow::Cow;
 
+use either::Either;
+
 use crate::prelude::*;
 
 impl ArrayChunked {
@@ -37,13 +39,24 @@ impl ArrayChunked {
             return Cow::Borrowed(self);
         };
 
-        assert_eq!(self.chunks().len(), physical_repr.chunks().len());
+        let chunk_len_validity_iter =
+            if physical_repr.chunks().len() == 1 && self.chunks().len() > 1 {
+                // Physical repr got rechunked, rechunk our validity as well.
+                Either::Left(std::iter::once((self.len(), self.rechunk_validity())))
+            } else {
+                // No rechunking, expect the same number of chunks.
+                assert_eq!(self.chunks().len(), physical_repr.chunks().len());
+                Either::Right(
+                    self.chunks()
+                        .iter()
+                        .map(|c| (c.len(), c.validity().cloned())),
+                )
+            };
 
         let width = self.width();
-        let chunks: Vec<_> = self
-            .downcast_iter()
+        let chunks: Vec<_> = chunk_len_validity_iter
             .zip(physical_repr.into_chunks())
-            .map(|(chunk, values)| {
+            .map(|((len, validity), values)| {
                 FixedSizeListArray::new(
                     ArrowDataType::FixedSizeList(
                         Box::new(ArrowField::new(
@@ -53,9 +66,9 @@ impl ArrayChunked {
                         )),
                         width,
                     ),
-                    chunk.len(),
+                    len,
                     values,
-                    chunk.validity().cloned(),
+                    validity,
                 )
                 .to_boxed()
             })
diff --git a/crates/polars-core/src/chunked_array/list/mod.rs b/crates/polars-core/src/chunked_array/list/mod.rs
index 5c76ae4b9cef..b00589fa104b 100644
--- a/crates/polars-core/src/chunked_array/list/mod.rs
+++ b/crates/polars-core/src/chunked_array/list/mod.rs
@@ -44,9 +44,16 @@ impl ListChunked {
             return Cow::Borrowed(self);
         };
 
-        assert_eq!(self.chunks().len(), physical_repr.chunks().len());
+        let ca = if physical_repr.chunks().len() == 1 && self.chunks().len() > 1 {
+            // Physical repr got rechunked, rechunk self as well.
+            self.rechunk()
+        } else {
+            Cow::Borrowed(self)
+        };
 
-        let chunks: Vec<_> = self
+        assert_eq!(ca.chunks().len(), physical_repr.chunks().len());
+
+        let chunks: Vec<_> = ca
             .downcast_iter()
             .zip(physical_repr.into_chunks())
             .map(|(chunk, values)| {
diff --git a/py-polars/tests/unit/series/test_series.py b/py-polars/tests/unit/series/test_series.py
index a0d705bb7cff..0640247ee3d9 100644
--- a/py-polars/tests/unit/series/test_series.py
+++ b/py-polars/tests/unit/series/test_series.py
@@ -1672,6 +1672,25 @@ def test_to_physical() -> None:
     assert_series_equal(s.to_physical(), expected)
 
 
+def test_to_physical_rechunked_21285() -> None:
+    # A series with multiple chunks, dtype is array or list of structs with a
+    # null field (causes rechunking) and a field with a different physical and
+    # logical repr (causes the full body of `to_physical_repr` to run).
+    arr_dtype = pl.Array(pl.Struct({"f0": pl.Time, "f1": pl.Null}), shape=(1,))
+    s = pl.Series("a", [None], arr_dtype)  # content doesn't matter
+    s = s.append(s)
+    expected_arr_dtype = pl.Array(pl.Struct({"f0": Int64, "f1": pl.Null}), shape=(1,))
+    expected = pl.Series("a", [None, None], expected_arr_dtype)
+    assert_series_equal(s.to_physical(), expected)
+
+    list_dtype = pl.List(pl.Struct({"f0": pl.Time, "f1": pl.Null}))
+    s = pl.Series("a", [None], list_dtype)  # content doesn't matter
+    s = s.append(s)
+    expected_list_dtype = pl.List(pl.Struct({"f0": Int64, "f1": pl.Null}))
+    expected = pl.Series("a", [None, None], expected_list_dtype)
+    assert_series_equal(s.to_physical(), expected)
+
+
 def test_is_between_datetime() -> None:
     s = pl.Series("a", [datetime(2020, 1, 1, 10, 0, 0), datetime(2020, 1, 1, 20, 0, 0)])
     start = datetime(2020, 1, 1, 12, 0, 0)