diff --git a/crates/polars-core/src/chunked_array/ops/row_encode.rs b/crates/polars-core/src/chunked_array/ops/row_encode.rs index 5ac627327389..31b76357c470 100644 --- a/crates/polars-core/src/chunked_array/ops/row_encode.rs +++ b/crates/polars-core/src/chunked_array/ops/row_encode.rs @@ -134,7 +134,14 @@ pub fn encode_rows_unordered(by: &[Series]) -> PolarsResult pub fn _get_rows_encoded_unordered(by: &[Series]) -> PolarsResult { let mut cols = Vec::with_capacity(by.len()); let mut fields = Vec::with_capacity(by.len()); + + // Since ZFS exists, we might not actually have any arrays and need to get the length from the + // columns. + let num_rows = by.first().map_or(0, |c| c.len()); + for by in by { + debug_assert_eq!(by.len(), num_rows); + let arr = _get_rows_encoded_compat_array(by)?; let field = EncodingField::new_unsorted(); match arr.dtype() { @@ -152,7 +159,7 @@ pub fn _get_rows_encoded_unordered(by: &[Series]) -> PolarsResult { }, } } - Ok(convert_columns(&cols, &fields)) + Ok(convert_columns(num_rows, &cols, &fields)) } pub fn _get_rows_encoded( @@ -166,7 +173,13 @@ pub fn _get_rows_encoded( let mut cols = Vec::with_capacity(by.len()); let mut fields = Vec::with_capacity(by.len()); + // Since ZFS exists, we might not actually have any arrays and need to get the length from the + // columns. + let num_rows = by.first().map_or(0, |c| c.len()); + for ((by, desc), null_last) in by.iter().zip(descending).zip(nulls_last) { + debug_assert_eq!(by.len(), num_rows); + let by = by.as_materialized_series(); let arr = _get_rows_encoded_compat_array(by)?; let sort_field = EncodingField { @@ -190,7 +203,7 @@ pub fn _get_rows_encoded( }, } } - Ok(convert_columns(&cols, &fields)) + Ok(convert_columns(num_rows, &cols, &fields)) } pub fn _get_rows_encoded_ca( diff --git a/crates/polars-pipe/src/executors/sinks/group_by/generic/eval.rs b/crates/polars-pipe/src/executors/sinks/group_by/generic/eval.rs index f81366a34641..8f89fbf8619c 100644 --- a/crates/polars-pipe/src/executors/sinks/group_by/generic/eval.rs +++ b/crates/polars-pipe/src/executors/sinks/group_by/generic/eval.rs @@ -85,6 +85,7 @@ impl Eval { } polars_row::convert_columns_amortized( + keys_columns[0].len(), // @NOTE: does not work for ZFS keys_columns, &self.key_fields, &mut self.rows_encoded, diff --git a/crates/polars-pipe/src/executors/sinks/joins/generic_build.rs b/crates/polars-pipe/src/executors/sinks/joins/generic_build.rs index ff6d66f91e0b..85e7ad581ad6 100644 --- a/crates/polars-pipe/src/executors/sinks/joins/generic_build.rs +++ b/crates/polars-pipe/src/executors/sinks/joins/generic_build.rs @@ -141,7 +141,11 @@ impl GenericBuild { let arr = s.to_physical_repr().rechunk().array_ref(0).clone(); self.join_columns.push(arr); } - let rows_encoded = polars_row::convert_columns_no_order(&self.join_columns).into_array(); + let rows_encoded = polars_row::convert_columns_no_order( + self.join_columns[0].len(), // @NOTE: does not work for ZFS + &self.join_columns, + ) + .into_array(); self.materialized_join_cols.push(rows_encoded); Ok(self.materialized_join_cols.last().unwrap()) } diff --git a/crates/polars-pipe/src/executors/sinks/joins/row_values.rs b/crates/polars-pipe/src/executors/sinks/joins/row_values.rs index 72824d75d3b7..acf64b5f1715 100644 --- a/crates/polars-pipe/src/executors/sinks/joins/row_values.rs +++ b/crates/polars-pipe/src/executors/sinks/joins/row_values.rs @@ -74,6 +74,7 @@ impl RowValues { self.join_column_idx = Some(idx); } polars_row::convert_columns_amortized_no_order( + self.join_columns_material[0].len(), // @NOTE: does not work for ZFS &self.join_columns_material, &mut self.current_rows, ); diff --git a/crates/polars-pipe/src/executors/sinks/sort/sink_multiple.rs b/crates/polars-pipe/src/executors/sinks/sort/sink_multiple.rs index 7c0a35db38a1..fc1a5744b154 100644 --- a/crates/polars-pipe/src/executors/sinks/sort/sink_multiple.rs +++ b/crates/polars-pipe/src/executors/sinks/sort/sink_multiple.rs @@ -233,7 +233,11 @@ impl SortSinkMultiple { let column = if chunk.data.height() == 0 && chunk.data.width() > 0 { Column::new_empty(name, &DataType::BinaryOffset) } else { - let rows_encoded = polars_row::convert_columns(&self.sort_column, &self.sort_fields); + let rows_encoded = polars_row::convert_columns( + self.sort_column[0].len(), // @NOTE: does not work for ZFS + &self.sort_column, + &self.sort_fields, + ); let series = unsafe { Series::from_chunks_and_dtype_unchecked( name, diff --git a/crates/polars-python/src/dataframe/general.rs b/crates/polars-python/src/dataframe/general.rs index 0494d80bacea..9363d98fe76c 100644 --- a/crates/polars-python/src/dataframe/general.rs +++ b/crates/polars-python/src/dataframe/general.rs @@ -740,7 +740,7 @@ impl PyDataFrame { ) .collect::>(); - let rows = polars_row::convert_columns(&chunks, &fields); + let rows = polars_row::convert_columns(df.height(), &chunks, &fields); Ok(unsafe { Series::from_chunks_and_dtype_unchecked( diff --git a/crates/polars-row/src/encode.rs b/crates/polars-row/src/encode.rs index 840f6617bbf4..57ede510fb11 100644 --- a/crates/polars-row/src/encode.rs +++ b/crates/polars-row/src/encode.rs @@ -13,20 +13,29 @@ use crate::fixed::FixedLengthEncoding; use crate::row::{EncodingField, RowsEncoded}; use crate::{with_match_arrow_primitive_type, ArrayRef}; -pub fn convert_columns(columns: &[ArrayRef], fields: &[EncodingField]) -> RowsEncoded { +pub fn convert_columns( + num_rows: usize, + columns: &[ArrayRef], + fields: &[EncodingField], +) -> RowsEncoded { let mut rows = RowsEncoded::new(vec![], vec![]); - convert_columns_amortized(columns, fields, &mut rows); + convert_columns_amortized(num_rows, columns, fields, &mut rows); rows } -pub fn convert_columns_no_order(columns: &[ArrayRef]) -> RowsEncoded { +pub fn convert_columns_no_order(num_rows: usize, columns: &[ArrayRef]) -> RowsEncoded { let mut rows = RowsEncoded::new(vec![], vec![]); - convert_columns_amortized_no_order(columns, &mut rows); + convert_columns_amortized_no_order(num_rows, columns, &mut rows); rows } -pub fn convert_columns_amortized_no_order(columns: &[ArrayRef], rows: &mut RowsEncoded) { +pub fn convert_columns_amortized_no_order( + num_rows: usize, + columns: &[ArrayRef], + rows: &mut RowsEncoded, +) { convert_columns_amortized( + num_rows, columns, std::iter::repeat(&EncodingField::default()).take(columns.len()), rows, @@ -83,13 +92,6 @@ impl Encoder { } } - fn len(&self) -> usize { - match self { - Encoder::List { original, .. } => original.len(), - Encoder::Leaf(arr) => arr.len(), - } - } - fn dtype(&self) -> &ArrowDataType { match self { Encoder::List { original, .. } => original.dtype(), @@ -155,6 +157,7 @@ fn get_encoders(arr: &dyn Array, encoders: &mut Vec, field: &EncodingFi } pub fn convert_columns_amortized<'a, I: IntoIterator>( + num_rows: usize, columns: &'a [ArrayRef], fields: I, rows: &mut RowsEncoded, @@ -177,6 +180,7 @@ pub fn convert_columns_amortized<'a, I: IntoIterator>( } } let values_size = allocate_rows_buf( + num_rows, &mut flattened_columns, &flattened_fields, &mut rows.values, @@ -195,8 +199,13 @@ pub fn convert_columns_amortized<'a, I: IntoIterator>( .map(|arr| Encoder::Leaf(arr.clone())) .collect::>(); let fields = fields.cloned().collect::>(); - let values_size = - allocate_rows_buf(&mut encoders, &fields, &mut rows.values, &mut rows.offsets); + let values_size = allocate_rows_buf( + num_rows, + &mut encoders, + &fields, + &mut rows.values, + &mut rows.offsets, + ); for (enc, field) in encoders.iter().zip(fields) { // SAFETY: // we allocated rows with enough bytes. @@ -221,7 +230,7 @@ fn encode_primitive( } } -/// Ecnodes an array into `out` +/// Encodes an array into `out` /// /// # Safety /// `out` must have enough bytes allocated otherwise it will be out of bounds. @@ -294,6 +303,7 @@ pub fn encoded_size(dtype: &ArrowDataType) -> usize { // Returns the length that the caller must set on the `values` buf once the bytes // are initialized. fn allocate_rows_buf( + num_rows: usize, columns: &mut [Encoder], fields: &[EncodingField], values: &mut Vec, @@ -301,7 +311,6 @@ fn allocate_rows_buf( ) -> usize { let has_variable = columns.iter().any(|enc| enc.is_variable()); - let num_rows = columns[0].len(); if has_variable { // row size of the fixed-length columns // those can be determined without looping over the arrays @@ -350,6 +359,7 @@ fn allocate_rows_buf( // Allocate and immediately row-encode the inner types recursively. let values_size = allocate_rows_buf( + original.values().len(), inner_enc, &fields, &mut values_rows.values, @@ -521,7 +531,7 @@ mod test { let b = Int32Array::from_vec(vec![213, 12, 12]); let c = Utf8ViewArray::from_slice([Some("a"), Some(""), Some("meep")]); - let encoded = convert_columns_no_order(&[Box::new(a), Box::new(b), Box::new(c)]); + let encoded = convert_columns_no_order(a.len(), &[Box::new(a), Box::new(b), Box::new(c)]); assert_eq!(encoded.offsets, &[0, 44, 55, 99]); assert_eq!(encoded.values.len(), 99); assert!(encoded.values.ends_with(&[0, 0, 0, 4])); @@ -537,7 +547,7 @@ mod test { let field = EncodingField::new_sorted(false, false); let arr = arrow::compute::cast::cast(&arr, &ArrowDataType::BinaryView, Default::default()) .unwrap(); - let rows_encoded = convert_columns(&[arr], &[field]); + let rows_encoded = convert_columns(arr.len(), &[arr], &[field]); let row1 = rows_encoded.get(0); // + 2 for the start valid byte and for the continuation token @@ -586,7 +596,7 @@ mod test { let field = EncodingField::new_sorted(false, false); let arr = BinaryViewArray::from_slice_values(a); - let rows_encoded = convert_columns_no_order(&[arr.clone().boxed()]); + let rows_encoded = convert_columns_no_order(arr.len(), &[arr.clone().boxed()]); let mut rows = rows_encoded.iter().collect::>(); let decoded = unsafe { decode_binview(&mut rows, &field) }; @@ -602,7 +612,7 @@ mod test { let dtypes = [ArrowDataType::Utf8View]; unsafe { - let encoded = convert_columns(&[Box::new(a.clone())], fields); + let encoded = convert_columns(a.len(), &[Box::new(a.clone())], fields); let out = decode_rows_from_binary(&encoded.into_array(), fields, &dtypes, &mut vec![]); let arr = &out[0]; @@ -627,7 +637,7 @@ mod test { ); let fields = &[EncodingField::new_sorted(true, false)]; - let out = convert_columns(&[array.boxed()], fields); + let out = convert_columns(array.len(), &[array.boxed()], fields); let out = out.into_array(); assert_eq!( out.values().iter().map(|v| *v as usize).sum::(), diff --git a/py-polars/tests/unit/test_row_encoding.py b/py-polars/tests/unit/test_row_encoding.py index 3c94b53a1bf2..0ae0cb9c34f4 100644 --- a/py-polars/tests/unit/test_row_encoding.py +++ b/py-polars/tests/unit/test_row_encoding.py @@ -134,13 +134,16 @@ def test_str(field: tuple[bool, bool, bool]) -> None: ) -# def test_struct() -> None: -# # @TODO: How do we deal with zero-field structs? -# # roundtrip_re(pl.Series('a', [], pl.Struct({})).to_frame()) -# # roundtrip_re(pl.Series('a', [{}], pl.Struct({})).to_frame()) -# roundtrip_re(pl.Series("a", [{"x": 1}], pl.Struct({"x": pl.Int32})).to_frame()) -# roundtrip_re( -# pl.Series( -# "a", [{"x": 1}, {"y": 2}], pl.Struct({"x": pl.Int32, "y": pl.Int32}) -# ).to_frame() -# ) +@pytest.mark.parametrize("field", FIELD_COMBS) +def test_struct(field: tuple[bool, bool, bool]) -> None: + roundtrip_re(pl.Series("a", [], pl.Struct({})).to_frame()) + roundtrip_re(pl.Series("a", [{}], pl.Struct({})).to_frame()) + roundtrip_re( + pl.Series("a", [{"x": 1}], pl.Struct({"x": pl.Int32})).to_frame(), [field] + ) + roundtrip_re( + pl.Series( + "a", [{"x": 1}, {"y": 2}], pl.Struct({"x": pl.Int32, "y": pl.Int32}) + ).to_frame(), + [field], + )