Skip to content

Commit

Permalink
feat: Add a dedicated remove method for DataFrame and LazyFrame (
Browse files Browse the repository at this point in the history
  • Loading branch information
alexander-beedie authored Feb 18, 2025
1 parent e0a3bb5 commit bb8efc5
Show file tree
Hide file tree
Showing 13 changed files with 559 additions and 172 deletions.
26 changes: 24 additions & 2 deletions crates/polars-lazy/src/frame/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -928,9 +928,10 @@ impl LazyFrame {
Ok(())
}

/// Filter by some predicate expression.
/// Filter frame rows that match a predicate expression.
///
/// The expression must yield boolean values.
/// The expression must yield boolean values (note that rows where the
/// predicate resolves to `null` are *not* included in the resulting frame).
///
/// # Example
///
Expand All @@ -950,6 +951,27 @@ impl LazyFrame {
Self::from_logical_plan(lp, opt_state)
}

/// Drop the rows for which a predicate expression evaluates to `true`.
///
/// This is the inverse of [`filter`](LazyFrame::filter). The predicate must
/// produce boolean values; rows for which it resolves to `null` are treated
/// as "not matched" and are therefore *kept* in the resulting frame.
///
/// # Example
///
/// ```rust
/// use polars_core::prelude::*;
/// use polars_lazy::prelude::*;
///
/// fn example(df: DataFrame) -> LazyFrame {
///     df.lazy()
///         .remove(col("sepal_width").is_null())
///         .select([col("sepal_width"), col("sepal_length")])
/// }
/// ```
pub fn remove(self, predicate: Expr) -> Self {
    // Invert the predicate with a null-aware comparison so that a `null`
    // result counts as "not true" and the row survives the filter.
    let keep_mask = predicate.neq_missing(lit(true));
    self.filter(keep_mask)
}

/// Select (and optionally rename, with [`alias`](crate::dsl::Expr::alias)) columns from the query.
///
/// Columns can be selected with [`col`];
Expand Down
5 changes: 5 additions & 0 deletions crates/polars-python/src/lazyframe/general.rs
Original file line number Diff line number Diff line change
Expand Up @@ -875,6 +875,11 @@ impl PyLazyFrame {
ldf.filter(predicate.inner).into()
}

fn remove(&mut self, predicate: PyExpr) -> Self {
    // Operate on a clone of the wrapped lazy frame so `self` is left as-is,
    // then convert the resulting frame back into the Python-facing wrapper.
    self.ldf.clone().remove(predicate.inner).into()
}

fn select(&mut self, exprs: Vec<PyExpr>) -> Self {
let ldf = self.ldf.clone();
let exprs = exprs.to_exprs();
Expand Down
10 changes: 5 additions & 5 deletions crates/polars-sql/src/context.rs
Original file line number Diff line number Diff line change
Expand Up @@ -996,11 +996,11 @@ impl SQLContext {
filter_expression = all_horizontal([filter_expression])?;
}
lf = self.process_subqueries(lf, vec![&mut filter_expression]);
if invert_filter {
// negate the filter (being careful about null values)
filter_expression = filter_expression.neq_missing(lit(true))
}
lf = lf.filter(filter_expression);
lf = if invert_filter {
lf.remove(filter_expression)
} else {
lf.filter(filter_expression)
};
}
Ok(lf)
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ Manipulation/selection
DataFrame.pipe
DataFrame.pivot
DataFrame.rechunk
DataFrame.remove
DataFrame.rename
DataFrame.replace_column
DataFrame.reverse
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ Manipulation/selection
LazyFrame.limit
LazyFrame.melt
LazyFrame.merge_sorted
LazyFrame.remove
LazyFrame.rename
LazyFrame.reverse
LazyFrame.rolling
Expand Down
189 changes: 169 additions & 20 deletions py-polars/polars/dataframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -4892,30 +4892,33 @@ def filter(
**constraints: Any,
) -> DataFrame:
"""
Filter the rows in the DataFrame based on one or more predicate expressions.
Filter rows, retaining those that match the given predicate expression(s).
The original order of the remaining rows is preserved.
Rows where the filter does not evaluate to True are discarded, including nulls.
Only rows where the predicate resolves as True are retained; when the
predicate result is False (or null), the row is discarded.
Parameters
----------
predicates
Expression(s) that evaluates to a boolean Series.
Expression(s) that evaluate to a boolean Series.
constraints
Column filters; use `name = value` to filter columns by the supplied value.
Each constraint will behave the same as `pl.col(name).eq(value)`, and
will be implicitly joined with the other filter conditions using `&`.
be implicitly joined with the other filter conditions using `&`.
Notes
-----
If you are transitioning from pandas and performing filter operations based on
the comparison of two or more columns, please note that in Polars,
any comparison involving null values will always result in null.
As a result, these rows will be filtered out.
Ensure to handle null values appropriately to avoid unintended filtering
(See examples below).
If you are transitioning from Pandas, and performing filter operations based on
the comparison of two or more columns, please note that in Polars any comparison
involving `null` values will result in a `null` result, *not* boolean True or
False. As a result, these rows will not be retained. Ensure that null values
are handled appropriately to avoid unexpected behaviour (see examples below).
See Also
--------
remove
Examples
--------
Expand All @@ -4927,7 +4930,7 @@ def filter(
... }
... )
Filter on one condition:
Filter rows matching a condition:
>>> df.filter(pl.col("foo") > 1)
shape: (3, 3)
Expand All @@ -4943,7 +4946,9 @@ def filter(
Filter on multiple conditions, combined with and/or operators:
>>> df.filter((pl.col("foo") < 3) & (pl.col("ham") == "a"))
>>> df.filter(
... (pl.col("foo") < 3) & (pl.col("ham") == "a"),
... )
shape: (1, 3)
┌─────┬─────┬─────┐
│ foo ┆ bar ┆ ham │
Expand All @@ -4953,7 +4958,9 @@ def filter(
│ 1 ┆ 6 ┆ a │
└─────┴─────┴─────┘
>>> df.filter((pl.col("foo") == 1) | (pl.col("ham") == "c"))
>>> df.filter(
... (pl.col("foo") == 1) | (pl.col("ham") == "c"),
... )
shape: (2, 3)
┌─────┬─────┬─────┐
│ foo ┆ bar ┆ ham │
Expand Down Expand Up @@ -4992,9 +4999,11 @@ def filter(
│ 2 ┆ 7 ┆ b │
└─────┴─────┴─────┘
Filter by comparing two columns against each other
Filter by comparing two columns against each other:
>>> df.filter(pl.col("foo") == pl.col("bar"))
>>> df.filter(
... pl.col("foo") == pl.col("bar"),
... )
shape: (1, 3)
┌─────┬─────┬─────┐
│ foo ┆ bar ┆ ham │
Expand All @@ -5004,7 +5013,9 @@ def filter(
│ 0 ┆ 0 ┆ f │
└─────┴─────┴─────┘
>>> df.filter(pl.col("foo") != pl.col("bar"))
>>> df.filter(
... pl.col("foo") != pl.col("bar"),
... )
shape: (3, 3)
┌─────┬─────┬─────┐
│ foo ┆ bar ┆ ham │
Expand All @@ -5019,7 +5030,9 @@ def filter(
Notice how the row with `None` values is filtered out. In order to keep the
same behavior as pandas, use:
>>> df.filter(pl.col("foo").ne_missing(pl.col("bar")))
>>> df.filter(
... pl.col("foo").ne_missing(pl.col("bar")),
... )
shape: (5, 3)
┌──────┬──────┬─────┐
│ foo ┆ bar ┆ ham │
Expand All @@ -5032,10 +5045,148 @@ def filter(
│ 4 ┆ null ┆ d │
│ null ┆ 9 ┆ e │
└──────┴──────┴─────┘
"""
return self.lazy().filter(*predicates, **constraints).collect(_eager=True)

def remove(
self,
*predicates: (
IntoExprColumn
| Iterable[IntoExprColumn]
| bool
| list[bool]
| np.ndarray[Any, Any]
),
**constraints: Any,
) -> DataFrame:
"""
Remove rows, dropping those that match the given predicate expression(s).
The original order of the remaining rows is preserved.
Rows where the filter predicate does not evaluate to True are retained
(this includes rows where the predicate evaluates as `null`).
Parameters
----------
predicates
Expression(s) that evaluate to a boolean Series.
constraints
Column filters; use `name = value` to filter columns using the supplied
value. Each constraint behaves the same as `pl.col(name).eq(value)`,
and is implicitly joined with the other filter conditions using `&`.
Notes
-----
If you are transitioning from Pandas, and performing filter operations based on
the comparison of two or more columns, please note that in Polars any comparison
involving `null` values will result in a `null` result, *not* boolean True or
False. As a result, these rows will not be removed. Ensure that null values
are handled appropriately to avoid unexpected behaviour (see examples below).
See Also
--------
filter
Examples
--------
>>> df = pl.DataFrame(
... {
... "foo": [2, 3, None, 4, 0],
... "bar": [5, 6, None, None, 0],
... "ham": ["a", "b", None, "c", "d"],
... }
... )
Remove rows matching a condition:
>>> df.remove(pl.col("bar") >= 5)
shape: (3, 3)
┌──────┬──────┬──────┐
│ foo ┆ bar ┆ ham │
│ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ str │
╞══════╪══════╪══════╡
│ null ┆ null ┆ null │
│ 4 ┆ null ┆ c │
│ 0 ┆ 0 ┆ d │
└──────┴──────┴──────┘
Discard rows based on multiple conditions, combined with and/or operators:
>>> df.remove(
... (pl.col("foo") >= 0) & (pl.col("bar") >= 0),
... )
shape: (2, 3)
┌──────┬──────┬──────┐
│ foo ┆ bar ┆ ham │
│ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ str │
╞══════╪══════╪══════╡
│ null ┆ null ┆ null │
│ 4 ┆ null ┆ c │
└──────┴──────┴──────┘
>>> df.remove(
... (pl.col("foo") >= 0) | (pl.col("bar") >= 0),
... )
shape: (1, 3)
┌──────┬──────┬──────┐
│ foo ┆ bar ┆ ham │
│ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ str │
╞══════╪══════╪══════╡
│ null ┆ null ┆ null │
└──────┴──────┴──────┘
Provide multiple constraints using `*args` syntax:
>>> df.remove(
... pl.col("ham").is_not_null(),
... pl.col("bar") >= 0,
... )
shape: (2, 3)
┌──────┬──────┬──────┐
│ foo ┆ bar ┆ ham │
│ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ str │
╞══════╪══════╪══════╡
│ null ┆ null ┆ null │
│ 4 ┆ null ┆ c │
└──────┴──────┴──────┘
Provide constraint(s) using `**kwargs` syntax:
>>> df.remove(foo=0, bar=0)
shape: (4, 3)
┌──────┬──────┬──────┐
│ foo ┆ bar ┆ ham │
│ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ str │
╞══════╪══════╪══════╡
│ 2 ┆ 5 ┆ a │
│ 3 ┆ 6 ┆ b │
│ null ┆ null ┆ null │
│ 4 ┆ null ┆ c │
└──────┴──────┴──────┘
Remove rows by comparing two columns against each other:
>>> df.remove(
... pl.col("foo").ne_missing(pl.col("bar")),
... )
shape: (2, 3)
┌──────┬──────┬──────┐
│ foo ┆ bar ┆ ham │
│ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ str │
╞══════╪══════╪══════╡
│ null ┆ null ┆ null │
│ 0 ┆ 0 ┆ d │
└──────┴──────┴──────┘
"""
# Delegate to the LazyFrame implementation, then collect the result eagerly.
return self.lazy().remove(*predicates, **constraints).collect(_eager=True)

@overload
def glimpse(
self,
Expand Down Expand Up @@ -7293,7 +7444,6 @@ def join_asof(
│ Netherlands ┆ 2018-08-01 ┆ 17.32 ┆ 910 │
│ Netherlands ┆ 2019-01-01 ┆ 17.4 ┆ 910 │
└─────────────┴────────────┴────────────┴──────┘
"""
if not isinstance(other, DataFrame):
msg = f"expected `other` join table to be a DataFrame, got {type(other).__name__!r}"
Expand Down Expand Up @@ -7628,7 +7778,6 @@ def join_where(
│ 101 ┆ 140 ┆ 14 ┆ 8 ┆ 676 ┆ 150 ┆ 15 ┆ 1 │
│ 101 ┆ 140 ┆ 14 ┆ 8 ┆ 742 ┆ 170 ┆ 16 ┆ 4 │
└─────┴─────┴─────┴───────┴──────┴──────┴──────┴─────────────┘
"""
if not isinstance(other, DataFrame):
msg = f"expected `other` join table to be a DataFrame, got {type(other).__name__!r}"
Expand Down
Loading

0 comments on commit bb8efc5

Please sign in to comment.