Skip to content

Commit

Permalink
Specializations
Browse files Browse the repository at this point in the history
  • Loading branch information
stijnherfst committed Nov 21, 2024
1 parent c9a4f79 commit ad22190
Show file tree
Hide file tree
Showing 2 changed files with 70 additions and 9 deletions.
55 changes: 47 additions & 8 deletions crates/polars-ops/src/chunked_array/strings/namespace.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
use arrow::array::{Array, ValueSize};
use arrow::compute::utils::combine_validities_and;
use arrow::legacy::kernels::string::*;
#[cfg(feature = "string_encoding")]
use base64::engine::general_purpose;
Expand All @@ -8,13 +9,14 @@ use base64::Engine as _;
use polars_core::export::num::Num;
use polars_core::export::regex::Regex;
use polars_core::prelude::arity::*;
use polars_core::utils::align_chunks_binary;
use polars_utils::cache::FastFixedCache;
use regex::escape;

use super::*;
#[cfg(feature = "binary_encoding")]
use crate::chunked_array::binary::BinaryNameSpaceImpl;
use crate::prelude::strings::starts_with::starts_with;
use crate::prelude::strings::starts_with::{starts_with_str, starts_with_view};

// We need this to infer the right lifetimes for the match closure.
#[inline(always)]
Expand Down Expand Up @@ -221,12 +223,22 @@ pub trait StringNameSpaceImpl: AsString {
fn starts_with(&self, sub: &str) -> BooleanChunked {
let ca = self.as_string();

let iter = ca.downcast_iter().map(|arr| {
let out: <BooleanType as PolarsDataType>::Array = arr
.views()
.iter()
.map(|view| starts_with(*view, sub, arr.data_buffers()))
.collect_arr_with_dtype(DataType::Boolean.to_arrow(CompatLevel::newest()));
let iter = ca.downcast_iter().map(|arr| unsafe {
// If there are no data buffers then every string is stored inline in its view, so we can skip the per-element inline/non-inline branch, which may allow the loop to be vectorized
let out: <BooleanType as PolarsDataType>::Array = if arr.data_buffers().is_empty() {
arr.views()
.iter()
.map(|view| {
view.get_inlined_slice_unchecked()
.starts_with(sub.as_bytes())
})
.collect_arr_with_dtype(DataType::Boolean.to_arrow(CompatLevel::newest()))
} else {
arr.views()
.iter()
.map(|view| starts_with_str(*view, sub, arr.data_buffers()))
.collect_arr_with_dtype(DataType::Boolean.to_arrow(CompatLevel::newest()))
};
out.with_validity_typed(arr.validity().cloned())
});

Expand All @@ -242,7 +254,34 @@ pub trait StringNameSpaceImpl: AsString {
Some(s) => self.starts_with(s),
None => BooleanChunked::full_null(ca.name().clone(), ca.len()),
},
_ => broadcast_binary_elementwise_values(ca, prefix, |s, sub| s.starts_with(sub)),
_ => {
let (lhs, rhs) = align_chunks_binary(ca, prefix);

let iter =
lhs.downcast_iter()
.zip(rhs.downcast_iter())
.map(|(lhs_arr, rhs_arr)| {
let validity =
combine_validities_and(lhs_arr.validity(), rhs_arr.validity());

let element_iter =
lhs_arr.views().iter().zip(rhs_arr.views().iter()).map(
|(lhs_val, rhs_val)| {
starts_with_view(
*lhs_val,
*rhs_val,
lhs_arr.data_buffers(),
rhs_arr.data_buffers(),
)
},
);

let array: <BooleanType as PolarsDataType>::Array =
element_iter.collect_arr();
array.with_validity_typed(validity)
});
ChunkedArray::from_chunk_iter(lhs.name().clone(), iter)
},
}
}

Expand Down
24 changes: 23 additions & 1 deletion crates/polars-ops/src/chunked_array/strings/starts_with.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ use polars_utils::slice::SliceAble;

/// Checks if the string starts with the prefix
/// When the prefix is smaller than View::MAX_INLINE_SIZE then this will be very fast
pub fn starts_with(view: View, prefix: &str, buffers: &[Buffer<u8>]) -> bool {
pub(crate) fn starts_with_str(view: View, prefix: &str, buffers: &[Buffer<u8>]) -> bool {
unsafe {
if view.length <= View::MAX_INLINE_SIZE {
view.get_inlined_slice_unchecked()
Expand All @@ -23,3 +23,25 @@ pub fn starts_with(view: View, prefix: &str, buffers: &[Buffer<u8>]) -> bool {
}
}
}

/// Checks if the string referenced by `view` starts with the string referenced by `prefix`.
///
/// `left_buffers` are the data buffers used to resolve `view`, and `right_buffers` the ones
/// used to resolve `prefix`.
///
/// If you call this in a loop and the prefix doesn't change then prefer starts_with_str()
pub(crate) fn starts_with_view(
    view: View,
    prefix: View,
    left_buffers: &[Buffer<u8>],
    right_buffers: &[Buffer<u8>],
) -> bool {
    // A prefix that is longer than the string itself can never match.
    if prefix.length > view.length {
        return false;
    }

    // Fast reject on the leading bytes cached in both views. Only the first
    // `prefix.length` bytes (at most 4) of the prefix are meaningful; comparing more would
    // pit the prefix's padding bytes against real string bytes and wrongly reject e.g.
    // "abcdef" starting with "ab". Because `prefix.length <= view.length` holds here, the same
    // number of bytes is meaningful on the string side as well.
    let head_len = prefix.length.min(4) as usize;
    if view.prefix.to_le_bytes()[..head_len] != prefix.prefix.to_le_bytes()[..head_len] {
        return false;
    }

    // SAFETY: NOTE(review): this relies on `view` being valid for `left_buffers` and `prefix`
    // being valid for `right_buffers`; the caller must guarantee that pairing - confirm at
    // the call sites.
    unsafe {
        let left_buffer = view.get_slice_unchecked(left_buffers);
        let right_buffer = prefix.get_slice_unchecked(right_buffers);

        left_buffer.starts_with(right_buffer)
    }
}

0 comments on commit ad22190

Please sign in to comment.