From 1cca63012baad4f143789959ce1c46a620ff16cd Mon Sep 17 00:00:00 2001 From: Marshall Date: Sat, 22 Feb 2025 05:21:12 -0500 Subject: [PATCH] fix: Don't silently produce null values from invalid input to `pl.datetime` and `pl.date` (#21013) --- crates/polars-time/src/chunkedarray/date.rs | 16 +++--- .../polars-time/src/chunkedarray/datetime.rs | 39 ++++++++----- .../functions/as_datatype/test_datetime.py | 47 ++++++++++++++++ .../unit/functions/range/test_date_range.py | 5 -- .../namespaces/temporal/test_replace.py | 55 +++++++++++++++++++ 5 files changed, 137 insertions(+), 25 deletions(-) diff --git a/crates/polars-time/src/chunkedarray/date.rs b/crates/polars-time/src/chunkedarray/date.rs index 944648311528..d58cca038727 100644 --- a/crates/polars-time/src/chunkedarray/date.rs +++ b/crates/polars-time/src/chunkedarray/date.rs @@ -1,6 +1,5 @@ use arrow::temporal_conversions::{EPOCH_DAYS_FROM_CE, MILLISECONDS, SECONDS_IN_DAY}; use chrono::{Datelike, NaiveDate}; -use polars_core::utils::CustomIterTools; use super::*; @@ -84,20 +83,23 @@ pub trait DateMethods: AsDate { day: &Int8Chunked, name: PlSmallStr, ) -> PolarsResult { - let mut ca: Int32Chunked = year + let ca: Int32Chunked = year .into_iter() .zip(month) .zip(day) .map(|((y, m), d)| { if let (Some(y), Some(m), Some(d)) = (y, m, d) { - NaiveDate::from_ymd_opt(y, m as u32, d as u32) - .map(|t| t.num_days_from_ce() - EPOCH_DAYS_FROM_CE) + NaiveDate::from_ymd_opt(y, m as u32, d as u32).map_or_else( + // We have an invalid date. + || Err(polars_err!(ComputeError: format!("Invalid date components ({}, {}, {}) supplied", y, m, d))), + // We have a valid date. + |date| Ok(Some(date.num_days_from_ce() - EPOCH_DAYS_FROM_CE)), + ) } else { - None + Ok(None) } }) - .collect_trusted(); - ca.rename(name); + .try_collect_ca_with_dtype(name, DataType::Int32)?; Ok(ca.into_date()) } } diff --git a/crates/polars-time/src/chunkedarray/datetime.rs b/crates/polars-time/src/chunkedarray/datetime.rs index ca4691cc6280..a361b101c9bb 100644 --- a/crates/polars-time/src/chunkedarray/datetime.rs +++ b/crates/polars-time/src/chunkedarray/datetime.rs @@ -179,22 +179,36 @@ pub trait DatetimeMethods: AsDatetime { if let (Some(y), Some(m), Some(d), Some(h), Some(mnt), Some(s), Some(ns)) = (y, m, d, h, mnt, s, ns) { - NaiveDate::from_ymd_opt(y, m as u32, d as u32) - .and_then(|nd| { - nd.and_hms_nano_opt(h as u32, mnt as u32, s as u32, ns as u32) - }) - .map(|ndt| match time_unit { - TimeUnit::Milliseconds => ndt.and_utc().timestamp_millis(), - TimeUnit::Microseconds => ndt.and_utc().timestamp_micros(), - TimeUnit::Nanoseconds => ndt.and_utc().timestamp_nanos_opt().unwrap(), - }) + NaiveDate::from_ymd_opt(y, m as u32, d as u32).map_or_else( + // We have an invalid date. + || Err(polars_err!(ComputeError: format!("Invalid date components ({}, {}, {}) supplied", y, m, d))), + // We have a valid date. + |date| { + date.and_hms_nano_opt(h as u32, mnt as u32, s as u32, ns as u32) + .map_or_else( + // We have invalid time components for the specified date. + || Err(polars_err!(ComputeError: format!("Invalid time components ({}, {}, {}, {}) supplied", h, mnt, s, ns))), + // We have a valid time. + |ndt| { + let t = ndt.and_utc(); + Ok(Some(match time_unit { + TimeUnit::Milliseconds => t.timestamp_millis(), + TimeUnit::Microseconds => t.timestamp_micros(), + TimeUnit::Nanoseconds => { + t.timestamp_nanos_opt().unwrap() + }, + })) + }, + ) + }, + ) } else { - None + Ok(None) } }) - .collect_trusted(); + .try_collect_ca_with_dtype(name, DataType::Int64)?; - let mut ca = match time_zone { + let ca = match time_zone { #[cfg(feature = "timezones")] Some(_) => { let mut ca = ca.into_datetime(*time_unit, None); @@ -209,7 +223,6 @@ pub trait DatetimeMethods: AsDatetime { ca.into_datetime(*time_unit, None) }, }; - ca.rename(name); Ok(ca) } } diff --git a/py-polars/tests/unit/functions/as_datatype/test_datetime.py b/py-polars/tests/unit/functions/as_datatype/test_datetime.py index f315ccad19d5..688ecc2ec7ce 100644 --- a/py-polars/tests/unit/functions/as_datatype/test_datetime.py +++ b/py-polars/tests/unit/functions/as_datatype/test_datetime.py @@ -32,6 +32,53 @@ def test_date_datetime() -> None: assert_series_equal(out["h2"], df["hour"].rename("h2")) +@pytest.mark.parametrize( + "components", + [ + [2025, 13, 1], + [2025, 1, 32], + [2025, 2, 29], + ], +) +def test_date_invalid_component(components: list[int]) -> None: + y, m, d = components + msg = rf"Invalid date components \({y}, {m}, {d}\) supplied" + with pytest.raises(ComputeError, match=msg): + pl.select(pl.date(*components)) + + +@pytest.mark.parametrize( + "components", + [ + [2025, 13, 1, 0, 0, 0, 0], + [2025, 1, 32, 0, 0, 0, 0], + [2025, 2, 29, 0, 0, 0, 0], + ], +) +def test_datetime_invalid_date_component(components: list[int]) -> None: + y, m, d = components[0:3] + msg = rf"Invalid date components \({y}, {m}, {d}\) supplied" + with pytest.raises(ComputeError, match=msg): + pl.select(pl.datetime(*components)) + + +@pytest.mark.parametrize( + "components", + [ + [2025, 1, 1, 25, 0, 0, 0], + [2025, 1, 1, 0, 60, 0, 0], + [2025, 1, 1, 0, 0, 60, 0], + [2025, 1, 1, 0, 0, 0, 2_000_000], + ], +) +def test_datetime_invalid_time_component(components: list[int]) -> None: + h, mnt, s, us = components[3:] + ns = us * 1_000 + msg = rf"Invalid time components \({h}, {mnt}, {s}, {ns}\) supplied" + with pytest.raises(ComputeError, match=msg): + pl.select(pl.datetime(*components)) + + @pytest.mark.parametrize("time_unit", ["ms", "us", "ns"]) def test_datetime_time_unit(time_unit: TimeUnit) -> None: result = pl.datetime(2022, 1, 2, time_unit=time_unit) diff --git a/py-polars/tests/unit/functions/range/test_date_range.py b/py-polars/tests/unit/functions/range/test_date_range.py index f8ef8f7075b1..758b604aeba5 100644 --- a/py-polars/tests/unit/functions/range/test_date_range.py +++ b/py-polars/tests/unit/functions/range/test_date_range.py @@ -30,11 +30,6 @@ def test_date_range_invalid_time_unit() -> None: ) -def test_date_range_invalid_time() -> None: - with pytest.raises(ComputeError, match="end is an out-of-range time"): - pl.date_range(pl.date(2024, 1, 1), pl.date(2024, 2, 30), eager=True) - - def test_date_range_lazy_with_literals() -> None: df = pl.DataFrame({"misc": ["x"]}).with_columns( pl.date_ranges( diff --git a/py-polars/tests/unit/operations/namespaces/temporal/test_replace.py b/py-polars/tests/unit/operations/namespaces/temporal/test_replace.py index 4d144ba2662c..61a95a894ea2 100644 --- a/py-polars/tests/unit/operations/namespaces/temporal/test_replace.py +++ b/py-polars/tests/unit/operations/namespaces/temporal/test_replace.py @@ -298,3 +298,58 @@ def test_replace_preserve_tu_and_tz(tu: TimeUnit, tzinfo: str) -> None: result = s.dt.replace(year=2000) assert result.dtype.time_unit == tu # type: ignore[attr-defined] assert result.dtype.time_zone == tzinfo # type: ignore[attr-defined] + + +def test_replace_date_invalid_components() -> None: + df = pl.DataFrame({"a": [date(2025, 1, 1)]}) + + with pytest.raises( + ComputeError, match=r"Invalid date components \(2025, 13, 1\) supplied" + ): + df.select(pl.col("a").dt.replace(month=13)) + with pytest.raises( + ComputeError, match=r"Invalid date components \(2025, 1, 32\) supplied" + ): + df.select(pl.col("a").dt.replace(day=32)) + + +def test_replace_datetime_invalid_date_components() -> None: + df = pl.DataFrame({"a": [datetime(2025, 1, 1)]}) + + with pytest.raises( + ComputeError, match=r"Invalid date components \(2025, 13, 1\) supplied" + ): + df.select(pl.col("a").dt.replace(month=13)) + with pytest.raises( + ComputeError, match=r"Invalid date components \(2025, 1, 32\) supplied" + ): + df.select(pl.col("a").dt.replace(day=32)) + + +def test_replace_datetime_invalid_time_components() -> None: + df = pl.DataFrame({"a": [datetime(2025, 1, 1)]}) + + # hour + with pytest.raises( + ComputeError, match=r"Invalid time components \(25, 0, 0, 0\) supplied" + ): + df.select(pl.col("a").dt.replace(hour=25)) + + # minute + with pytest.raises( + ComputeError, match=r"Invalid time components \(0, 61, 0, 0\) supplied" + ): + df.select(pl.col("a").dt.replace(minute=61)) + + # second + with pytest.raises( + ComputeError, match=r"Invalid time components \(0, 0, 61, 0\) supplied" + ): + df.select(pl.col("a").dt.replace(second=61)) + + # microsecond + with pytest.raises( + ComputeError, + match=r"Invalid time components \(0, 0, 0, 2000000000\) supplied", + ): + df.select(pl.col("a").dt.replace(microsecond=2_000_000))