From ba514a5e09604a7b0be5a1121ed8aa43c6898e77 Mon Sep 17 00:00:00 2001 From: Alexander Beedie Date: Tue, 11 Feb 2025 18:14:09 +0400 Subject: [PATCH 1/3] feat: Add SQL support for the `DELETE` statement --- crates/polars-sql/src/context.rs | 87 ++++++++++++++++--- .../tests/unit/sql/test_table_operations.py | 36 ++++++++ 2 files changed, 112 insertions(+), 11 deletions(-) diff --git a/crates/polars-sql/src/context.rs b/crates/polars-sql/src/context.rs index a799640060bd..389b9d62217a 100644 --- a/crates/polars-sql/src/context.rs +++ b/crates/polars-sql/src/context.rs @@ -9,10 +9,10 @@ use polars_plan::dsl::function_expr::StructFunction; use polars_plan::prelude::*; use polars_utils::format_pl_smallstr; use sqlparser::ast::{ - BinaryOperator, CreateTable, Distinct, ExcludeSelectItem, Expr as SQLExpr, FunctionArg, - GroupByExpr, Ident, JoinConstraint, JoinOperator, ObjectName, ObjectType, Offset, OrderBy, - Query, RenameSelectItem, Select, SelectItem, SetExpr, SetOperator, SetQuantifier, Statement, - TableAlias, TableFactor, TableWithJoins, UnaryOperator, Value as SQLValue, Values, + BinaryOperator, CreateTable, Delete, Distinct, ExcludeSelectItem, Expr as SQLExpr, FromTable, + FunctionArg, GroupByExpr, Ident, JoinConstraint, JoinOperator, ObjectName, ObjectType, Offset, + OrderBy, Query, RenameSelectItem, Select, SelectItem, SetExpr, SetOperator, SetQuantifier, + Statement, TableAlias, TableFactor, TableWithJoins, UnaryOperator, Value as SQLValue, Values, WildcardAdditionalOptions, }; use sqlparser::dialect::GenericDialect; @@ -201,8 +201,9 @@ impl SQLContext { } => self.execute_drop_table(stmt)?, stmt @ Statement::Explain { .. } => self.execute_explain(stmt)?, stmt @ Statement::Truncate { .. } => self.execute_truncate_table(stmt)?, + stmt @ Statement::Delete { .. } => self.execute_delete_from_table(stmt)?, _ => polars_bail!( - SQLInterface: "statement type {:?} is not supported", ast, + SQLInterface: "statement type is not supported:\n{:?}", ast, ), }) } @@ -471,7 +472,7 @@ impl SQLContext { let df = DataFrame::new(vec![plan])?; Ok(df.lazy()) }, - _ => unreachable!(), + _ => polars_bail!(SQLInterface: "unexpected statement type; expected EXPLAIN"), } } @@ -482,6 +483,7 @@ impl SQLContext { Ok(df.lazy()) } + // DROP TABLE fn execute_drop_table(&mut self, stmt: &Statement) -> PolarsResult { match stmt { Statement::Drop { names, .. } => { @@ -490,10 +492,68 @@ impl SQLContext { }); Ok(DataFrame::empty().lazy()) }, - _ => unreachable!(), + _ => polars_bail!(SQLInterface: "unexpected statement type; expected DROP"), } } + // DELETE FROM [WHERE ...] + fn execute_delete_from_table(&mut self, stmt: &Statement) -> PolarsResult { + if let Statement::Delete(Delete { + tables, + from, + using, + selection, + returning, + order_by, + limit, + }) = stmt + { + if !tables.is_empty() + || using.is_some() + || returning.is_some() + || limit.is_some() + || !order_by.is_empty() + { + let error_message = match () { + _ if !tables.is_empty() => "DELETE expects exactly one table name", + _ if using.is_some() => "DELETE does not support the USING clause", + _ if returning.is_some() => "DELETE does not support the RETURNING clause", + _ if limit.is_some() => "DELETE does not support the LIMIT clause", + _ if !order_by.is_empty() => "DELETE does not support the ORDER BY clause", + _ => unreachable!(), + }; + polars_bail!(SQLInterface: error_message); + } + let from_tables = match &from { + FromTable::WithFromKeyword(from) => from, + FromTable::WithoutKeyword(from) => from, + }; + if from_tables.len() > 1 { + polars_bail!(SQLInterface: "cannot have multiple tables in DELETE FROM (found {})", from_tables.len()) + } + let tbl_expr = from_tables.first().unwrap(); + if !tbl_expr.joins.is_empty() { + polars_bail!(SQLInterface: "DELETE does not support table JOINs") + } + let (_, mut lf) = self.get_table(&tbl_expr.relation)?; + if selection.is_none() { + // no WHERE clause; equivalent to TRUNCATE (drop all rows) + Ok(DataFrame::empty_with_schema( + lf.schema_with_arenas(&mut self.lp_arena, &mut self.expr_arena) + .unwrap() + .as_ref(), + ) + .lazy()) + } else { + // apply constraint as inverted filter (drops rows matching the selection) + Ok(self.process_where(lf.clone(), selection, true)?) + } + } else { + polars_bail!(SQLInterface: "unexpected statement type; expected DELETE") + } + } + + // TRUNCATE fn execute_truncate_table(&mut self, stmt: &Statement) -> PolarsResult { if let Statement::Truncate { table_names, @@ -524,7 +584,7 @@ impl SQLContext { }, } } else { - unreachable!() + polars_bail!(SQLInterface: "unexpected statement type; expected TRUNCATE") } } @@ -645,7 +705,7 @@ impl SQLContext { // Filter expression (WHERE clause) let schema = self.get_frame_schema(&mut lf)?; - lf = self.process_where(lf, &select_stmt.selection)?; + lf = self.process_where(lf, &select_stmt.selection, false)?; // 'SELECT *' modifiers let mut select_modifiers = SelectModifiers { @@ -907,6 +967,7 @@ impl SQLContext { &mut self, mut lf: LazyFrame, expr: &Option, + invert_filter: bool, ) -> PolarsResult { if let Some(expr) = expr { let schema = self.get_frame_schema(&mut lf)?; @@ -923,9 +984,9 @@ impl SQLContext { }, _ => (false, false), }; - if all_true { + if (all_true && !invert_filter) || (all_false && invert_filter) { return Ok(lf); - } else if all_false { + } else if (all_false && !invert_filter) || (all_true && invert_filter) { return Ok(DataFrame::empty_with_schema(schema.as_ref()).lazy()); } @@ -935,6 +996,10 @@ impl SQLContext { filter_expression = all_horizontal([filter_expression])?; } lf = self.process_subqueries(lf, vec![&mut filter_expression]); + if invert_filter { + // negate the filter (being careful about null values) + filter_expression = filter_expression.neq_missing(lit(true)) + } lf = lf.filter(filter_expression); } Ok(lf) diff --git a/py-polars/tests/unit/sql/test_table_operations.py b/py-polars/tests/unit/sql/test_table_operations.py index 8c3862b85b35..7220a5809ea4 100644 --- a/py-polars/tests/unit/sql/test_table_operations.py +++ b/py-polars/tests/unit/sql/test_table_operations.py @@ -22,6 +22,42 @@ def test_frame() -> pl.LazyFrame: ) +@pytest.mark.parametrize( + ("delete_constraint", "expected_ids"), + [ + # basic constraints + ("WHERE id = 200", {100, 300}), + ("WHERE id = 200 OR id = 300", {100}), + ("WHERE id IN (200, 300, 400)", {100}), + ("WHERE id NOT IN (200, 300, 400)", {200, 300}), + # more involved constraints + ("WHERE EXTRACT(year FROM dt) >= 2000", {200}), + # null-handling (in the data) + ("WHERE v1 < 0", {100, 300}), + ("WHERE v1 > 0", {200, 300}), + # null handling (in the constraint) + ("WHERE v1 IS NULL", {100, 200}), + ("WHERE v1 IS NOT NULL", {300}), + # boolean handling (delete all/none) + ("WHERE FALSE", {100, 200, 300}), + ("WHERE TRUE", set()), + # no constraint; equivalent to TRUNCATE (drop all rows) + ("", set()), + ], +) +def test_delete_clause(delete_constraint: str, expected_ids: set[int]) -> None: + df = pl.DataFrame( + { + "id": [100, 200, 300], + "dt": [date(2020, 10, 10), date(1999, 1, 2), date(2001, 7, 5)], + "v1": [3.5, -4.0, None], + "v2": [10.0, 2.5, -1.5], + } + ) + res = df.sql(f"DELETE FROM self {delete_constraint}") + assert set(res["id"]) == expected_ids + + def test_drop_table(test_frame: pl.LazyFrame) -> None: # 'drop' completely removes the table from sql context expected = pl.DataFrame() From ce5cf80341318354975ee00255c2caca5541e8a4 Mon Sep 17 00:00:00 2001 From: Alexander Beedie Date: Tue, 11 Feb 2025 18:22:42 +0400 Subject: [PATCH 2/3] add sql docs entry (table operations) --- .../source/reference/sql/table_operations.rst | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/py-polars/docs/source/reference/sql/table_operations.rst b/py-polars/docs/source/reference/sql/table_operations.rst index 5d7a3595fde5..b0b2f530e940 100644 --- a/py-polars/docs/source/reference/sql/table_operations.rst +++ b/py-polars/docs/source/reference/sql/table_operations.rst @@ -9,6 +9,8 @@ Table Operations - Description * - :ref:`CREATE TABLE ` - Create a new table and its columns from a SQL query executed against an existing table. + * - :ref:`DELETE FROM ` + - Remove specific rows of data from a table using an (optional) constraint. * - :ref:`DROP TABLES ` - Deletes the specified table, unregistering it. * - :ref:`EXPLAIN ` @@ -34,6 +36,19 @@ Create a new table and its columns from a SQL query executed against an existing CREATE TABLE new_table AS SELECT * FROM existing_table WHERE value > 42 +.. _delete_from_table: + +DELETE +-------- +Remove specific rows from a table using an (optional) constraint. +Omitting the constraint deletes all rows, equivalent to TRUNCATE. + +**Example:** + +.. code-block:: sql + + DELETE FROM some_table WHERE value < 0 + .. _drop_tables: DROP TABLES From 3365d2083fea33fd542e7cf08c08afdb5bf04681 Mon Sep 17 00:00:00 2001 From: alexander-beedie Date: Tue, 11 Feb 2025 19:21:06 +0400 Subject: [PATCH 3/3] drive-by docs fix (notes) --- .../docs/source/reference/sql/table_operations.rst | 2 +- py-polars/polars/dataframe/frame.py | 12 ++++++++---- py-polars/polars/series/series.py | 5 +++++ 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/py-polars/docs/source/reference/sql/table_operations.rst b/py-polars/docs/source/reference/sql/table_operations.rst index b0b2f530e940..bd959e8e0190 100644 --- a/py-polars/docs/source/reference/sql/table_operations.rst +++ b/py-polars/docs/source/reference/sql/table_operations.rst @@ -39,7 +39,7 @@ Create a new table and its columns from a SQL query executed against an existing .. _delete_from_table: DELETE --------- +------ Remove specific rows from a table using an (optional) constraint. Omitting the constraint deletes all rows, equivalent to TRUNCATE. diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index 0eebc0f52a2c..f0ab632a9e92 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -4574,10 +4574,10 @@ def estimated_size(self, unit: SizeUnit = "b") -> int | float: FFI buffers are included in this estimation. - Note - ---- - For objects, the estimated size only reports the pointer size, which is - a huge underestimation. + Notes + ----- + For data with Object dtype, the estimated size only reports the pointer + size, which is a huge underestimation. Parameters ---------- @@ -10741,6 +10741,7 @@ def rows_by_key( include_key: bool = ..., unique: Literal[False] = ..., ) -> dict[Any, list[Any]]: ... + @overload def rows_by_key( self, @@ -10750,6 +10751,7 @@ def rows_by_key( include_key: bool = ..., unique: Literal[True], ) -> dict[Any, Any]: ... + @overload def rows_by_key( self, @@ -10759,6 +10761,7 @@ def rows_by_key( include_key: bool = ..., unique: Literal[False] = ..., ) -> dict[Any, list[dict[str, Any]]]: ... + @overload def rows_by_key( self, @@ -10768,6 +10771,7 @@ def rows_by_key( include_key: bool = ..., unique: Literal[True], ) -> dict[Any, dict[str, Any]]: ... + def rows_by_key( self, key: ColumnNameOrSelector | Sequence[ColumnNameOrSelector], diff --git a/py-polars/polars/series/series.py b/py-polars/polars/series/series.py index 172946aec2e3..6d4baf5c0ead 100644 --- a/py-polars/polars/series/series.py +++ b/py-polars/polars/series/series.py @@ -1565,6 +1565,11 @@ def estimated_size(self, unit: SizeUnit = "b") -> int | float: FFI buffers are included in this estimation. + Notes + ----- + For data with Object dtype, the estimated size only reports the pointer + size, which is a huge underestimation. + Parameters ---------- unit : {'b', 'kb', 'mb', 'gb', 'tb'}