Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix: date_part to extract only the requested part (not the overall interval) #7189

Open
wants to merge 9 commits into
base: main
Choose a base branch
from
141 changes: 91 additions & 50 deletions arrow-arith/src/temporal.rs
Original file line number Diff line number Diff line change
Expand Up @@ -464,11 +464,15 @@ impl ExtractDatePartExt for PrimitiveArray<IntervalDayTimeType> {
DatePart::Week => Ok(self.unary_opt(|d| Some(d.days / 7))),
DatePart::Day => Ok(self.unary_opt(|d| Some(d.days))),
DatePart::Hour => Ok(self.unary_opt(|d| Some(d.milliseconds / (60 * 60 * 1_000)))),
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

D SELECT datepart('hour', interval '25 hour');
┌───────────────────────────────────────────────┐
│ datepart('hour', CAST('25 hour' AS INTERVAL)) │
│                     int64                     │
├───────────────────────────────────────────────┤
│                      25                       │
└───────────────────────────────────────────────┘
D SELECT datepart('day', interval '25 hour');
┌──────────────────────────────────────────────┐
│ datepart('day', CAST('25 hour' AS INTERVAL)) │
│                    int64                     │
├──────────────────────────────────────────────┤
│                      0                       │
└──────────────────────────────────────────────┘

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For the record, the current arrow behavior is the same it seems:

> SELECT datepart('hour', interval '25 hour');
+--------------------------------------------------------------------------------------------------------------------------+
| date_part(Utf8("hour"),IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 90000000000000 }")) |
+--------------------------------------------------------------------------------------------------------------------------+
| 25                                                                                                                       |
+--------------------------------------------------------------------------------------------------------------------------+
1 row(s) fetched.
Elapsed 0.030 seconds.

> SELECT datepart('day', interval '25 hour');
+-------------------------------------------------------------------------------------------------------------------------+
| date_part(Utf8("day"),IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 90000000000000 }")) |
+-------------------------------------------------------------------------------------------------------------------------+
| 0                                                                                                                       |
+-------------------------------------------------------------------------------------------------------------------------+

DatePart::Minute => Ok(self.unary_opt(|d| Some(d.milliseconds / (60 * 1_000)))),
DatePart::Second => Ok(self.unary_opt(|d| Some(d.milliseconds / 1_000))),
DatePart::Millisecond => Ok(self.unary_opt(|d| Some(d.milliseconds))),
DatePart::Microsecond => Ok(self.unary_opt(|d| d.milliseconds.checked_mul(1_000))),
DatePart::Nanosecond => Ok(self.unary_opt(|d| d.milliseconds.checked_mul(1_000_000))),
DatePart::Minute => Ok(self.unary_opt(|d| Some(d.milliseconds / (60 * 1_000) % 60))),
DatePart::Second => Ok(self.unary_opt(|d| Some(d.milliseconds / 1_000 % 60))),
DatePart::Millisecond => Ok(self.unary_opt(|d| Some(d.milliseconds % (60 * 1_000)))),
DatePart::Microsecond => {
Ok(self.unary_opt(|d| (d.milliseconds % (60 * 1_000)).checked_mul(1_000)))
}
DatePart::Nanosecond => {
Ok(self.unary_opt(|d| (d.milliseconds % (60 * 1_000)).checked_mul(1_000_000)))
}
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

D SELECT datepart('m', interval '1m 61s 33ms 44us');
┌─────────────────────────────────────────────────────┐
│ datepart('m', CAST('1m 61s 33ms 44us' AS INTERVAL)) │
│                        int64                        │
├─────────────────────────────────────────────────────┤
│                          2                          │
└─────────────────────────────────────────────────────┘
D SELECT datepart('s', interval '1m 61s 33ms 44us');
┌─────────────────────────────────────────────────────┐
│ datepart('s', CAST('1m 61s 33ms 44us' AS INTERVAL)) │
│                        int64                        │
├─────────────────────────────────────────────────────┤
│                          1                          │
└─────────────────────────────────────────────────────┘
D SELECT datepart('ms', interval '1m 61s 33ms 44us');
┌──────────────────────────────────────────────────────┐
│ datepart('ms', CAST('1m 61s 33ms 44us' AS INTERVAL)) │
│                        int64                         │
├──────────────────────────────────────────────────────┤
│                         1033                         │
└──────────────────────────────────────────────────────┘
D SELECT datepart('us', interval '1m 61s 33ms 44us');
┌──────────────────────────────────────────────────────┐
│ datepart('us', CAST('1m 61s 33ms 44us' AS INTERVAL)) │
│                        int64                         │
├──────────────────────────────────────────────────────┤
│                       1033044                        │
│                    (1.03 million)                    │
└──────────────────────────────────────────────────────┘

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a behavior change for arrow-rs.

duckdb says 1

SELECT datepart('s', interval '1m 61s 33ms 44us'); ┌─────────────────────────────────────────────────────┐ │ datepart('s', CAST('1m 61s 33ms 44us' AS INTERVAL)) │ │ int64 │ ├─────────────────────────────────────────────────────┤ │ 1 │ └─────────────────────────────────────────────────────┘

postgres says almost 1:

postgres=# SELECT extract (seconds from  interval '1m 61s 33ms 44us');
 extract
----------
 1.033044
(1 row)

Arrow says 121 (before this PR)

> SELECT datepart('s', interval '1m 61s 33ms 44us');
+---------------------------------------------------------------------------------------------------------------------+
| date_part(Utf8("s"),IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 121033044000 }")) |
+---------------------------------------------------------------------------------------------------------------------+
| 121                                                                                                                 |
+---------------------------------------------------------------------------------------------------------------------+
1 row(s) fetched.
Elapsed 0.004 seconds.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same thing for millisecond and microseconds


DatePart::Quarter
| DatePart::Year
Expand All @@ -488,25 +492,31 @@ impl ExtractDatePartExt for PrimitiveArray<IntervalMonthDayNanoType> {
fn date_part(&self, part: DatePart) -> Result<Int32Array, ArrowError> {
match part {
DatePart::Year => Ok(self.unary_opt(|d: IntervalMonthDayNano| Some(d.months / 12))),
DatePart::Month => Ok(self.unary_opt(|d: IntervalMonthDayNano| Some(d.months))),
DatePart::Month => Ok(self.unary_opt(|d: IntervalMonthDayNano| Some(d.months % 12))),
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

D SELECT datepart('year', interval '12 month');
┌────────────────────────────────────────────────┐
│ datepart('year', CAST('12 month' AS INTERVAL)) │
│                     int64                      │
├────────────────────────────────────────────────┤
│                       1                        │
└────────────────────────────────────────────────┘
D SELECT datepart('month', interval '12 month');
┌─────────────────────────────────────────────────┐
│ datepart('month', CAST('12 month' AS INTERVAL)) │
│                      int64                      │
├─────────────────────────────────────────────────┤
│                        0                        │
└─────────────────────────────────────────────────┘

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Postgres agrees

postgres=# SELECT extract (month from interval '12 month');
 extract
---------
       0
(1 row)

Before this PR arrow also currently says 12.

> SELECT datepart('month', interval '12 month');
+---------------------------------------------------------------------------------------------------------------+
| date_part(Utf8("month"),IntervalMonthDayNano("IntervalMonthDayNano { months: 12, days: 0, nanoseconds: 0 }")) |
+---------------------------------------------------------------------------------------------------------------+
| 12                                                                                                            |
+---------------------------------------------------------------------------------------------------------------+
1 row(s) fetched.
Elapsed 0.001 seconds.

I think this change makes sense

DatePart::Week => Ok(self.unary_opt(|d: IntervalMonthDayNano| Some(d.days / 7))),
DatePart::Day => Ok(self.unary_opt(|d: IntervalMonthDayNano| Some(d.days))),
DatePart::Hour => {
Ok(self.unary_opt(|d| (d.nanoseconds / (60 * 60 * 1_000_000_000)).try_into().ok()))
}
DatePart::Minute => {
Ok(self.unary_opt(|d| (d.nanoseconds / (60 * 1_000_000_000)).try_into().ok()))
Ok(self.unary_opt(|d| (d.nanoseconds / (60 * 1_000_000_000) % 60).try_into().ok()))
}
DatePart::Second => {
Ok(self.unary_opt(|d| (d.nanoseconds / 1_000_000_000).try_into().ok()))
Ok(self.unary_opt(|d| ((d.nanoseconds / 1_000_000_000) % 60).try_into().ok()))
}
DatePart::Millisecond => {
Ok(self.unary_opt(|d| (d.nanoseconds / 1_000_000).try_into().ok()))
}
DatePart::Microsecond => {
Ok(self.unary_opt(|d| (d.nanoseconds / 1_000).try_into().ok()))
DatePart::Millisecond => Ok(self.unary_opt(|d| {
(d.nanoseconds % (60 * 1_000_000_000) / 1_000_000)
.try_into()
.ok()
})),
DatePart::Microsecond => Ok(self.unary_opt(|d| {
(d.nanoseconds % (60 * 1_000_000_000) / 1_000)
.try_into()
.ok()
})),
DatePart::Nanosecond => {
Ok(self.unary_opt(|d| (d.nanoseconds % (60 * 1_000_000_000)).try_into().ok()))
}
DatePart::Nanosecond => Ok(self.unary_opt(|d| d.nanoseconds.try_into().ok())),

DatePart::Quarter
| DatePart::WeekISO
Expand Down Expand Up @@ -1778,9 +1788,14 @@ mod tests {
fn test_interval_day_time_array() {
let input: IntervalDayTimeArray = vec![
IntervalDayTime::ZERO,
IntervalDayTime::new(10, 42),
IntervalDayTime::new(10, 1042),
IntervalDayTime::new(10, MILLISECONDS_IN_DAY as i32 + 1),
IntervalDayTime::new(10, 42), // 10d, 42ms
IntervalDayTime::new(10, 1042), // 10d, 1s, 42ms
IntervalDayTime::new(10, MILLISECONDS_IN_DAY as i32 + 1), // 10d, 24h, 1ms
IntervalDayTime::new(
6,
(MILLISECONDS * 60 * 60 * 4 + MILLISECONDS * 60 * 22 + MILLISECONDS * 11 + 3)
as i32,
), // 6d, 4h, 22m, 11s, 3ms
]
.into();

Expand All @@ -1791,58 +1806,65 @@ mod tests {
assert_eq!(10, actual.value(1));
assert_eq!(10, actual.value(2));
assert_eq!(10, actual.value(3));
assert_eq!(6, actual.value(4));

let actual = date_part(&input, DatePart::Week).unwrap();
let actual = actual.as_primitive::<Int32Type>();
assert_eq!(0, actual.value(0));
assert_eq!(1, actual.value(1));
assert_eq!(1, actual.value(2));
assert_eq!(1, actual.value(3));
assert_eq!(0, actual.value(4));

// Days doesn't affect time.
let actual = date_part(&input, DatePart::Nanosecond).unwrap();
let actual = date_part(&input, DatePart::Hour).unwrap();
let actual = actual.as_primitive::<Int32Type>();
assert_eq!(0, actual.value(0));
assert_eq!(42_000_000, actual.value(1));
assert_eq!(1_042_000_000, actual.value(2));
// Overflow returns zero.
assert_eq!(0, actual.value(1));
assert_eq!(0, actual.value(2));
assert_eq!(24, actual.value(3));
assert_eq!(4, actual.value(4));

let actual = date_part(&input, DatePart::Minute).unwrap();
let actual = actual.as_primitive::<Int32Type>();
assert_eq!(0, actual.value(0));
assert_eq!(0, actual.value(1));
assert_eq!(0, actual.value(2));
assert_eq!(0, actual.value(3));
assert_eq!(22, actual.value(4));

let actual = date_part(&input, DatePart::Microsecond).unwrap();
let actual = date_part(&input, DatePart::Second).unwrap();
let actual = actual.as_primitive::<Int32Type>();
assert_eq!(0, actual.value(0));
assert_eq!(42_000, actual.value(1));
assert_eq!(1_042_000, actual.value(2));
// Overflow returns zero.
assert_eq!(0, actual.value(1));
assert_eq!(1, actual.value(2));
assert_eq!(0, actual.value(3));
assert_eq!(11, actual.value(4));

let actual = date_part(&input, DatePart::Millisecond).unwrap();
let actual = actual.as_primitive::<Int32Type>();
assert_eq!(0, actual.value(0));
assert_eq!(42, actual.value(1));
assert_eq!(1042, actual.value(2));
assert_eq!(MILLISECONDS_IN_DAY as i32 + 1, actual.value(3));

let actual = date_part(&input, DatePart::Second).unwrap();
let actual = actual.as_primitive::<Int32Type>();
assert_eq!(0, actual.value(0));
assert_eq!(0, actual.value(1));
assert_eq!(1, actual.value(2));
assert_eq!(24 * 60 * 60, actual.value(3));
assert_eq!(1, actual.value(3));
assert_eq!(11003, actual.value(4));

let actual = date_part(&input, DatePart::Minute).unwrap();
let actual = date_part(&input, DatePart::Microsecond).unwrap();
let actual = actual.as_primitive::<Int32Type>();
assert_eq!(0, actual.value(0));
assert_eq!(0, actual.value(1));
assert_eq!(0, actual.value(2));
assert_eq!(24 * 60, actual.value(3));
assert_eq!(42_000, actual.value(1));
assert_eq!(1_042_000, actual.value(2));
assert_eq!(1_000, actual.value(3));
assert_eq!(11_003_000, actual.value(4));

let actual = date_part(&input, DatePart::Hour).unwrap();
let actual = date_part(&input, DatePart::Nanosecond).unwrap();
let actual = actual.as_primitive::<Int32Type>();
assert_eq!(0, actual.value(0));
assert_eq!(0, actual.value(1));
assert_eq!(0, actual.value(2));
assert_eq!(24, actual.value(3));
assert_eq!(42_000_000, actual.value(1));
assert_eq!(1_042_000_000, actual.value(2));
assert_eq!(1_000_000, actual.value(3));
// Overflow returns zero.
assert_eq!(0, actual.value(4));

// Month and year are not valid (since days in month varies).
assert!(date_part(&input, DatePart::Month).is_err());
Expand All @@ -1855,8 +1877,18 @@ mod tests {
fn test_interval_month_day_nano_array() {
let input: IntervalMonthDayNanoArray = vec![
IntervalMonthDayNano::ZERO,
IntervalMonthDayNano::new(5, 10, 42),
IntervalMonthDayNano::new(16, 35, MILLISECONDS_IN_DAY * 1_000_000 + 1),
IntervalMonthDayNano::new(5, 10, 42), // 5m, 1w, 3d, 42ns
IntervalMonthDayNano::new(16, 35, NANOSECONDS_IN_DAY + 1), // 1y, 4m, 5w, 24h, 1ns
IntervalMonthDayNano::new(
0,
0,
NANOSECONDS * 60 * 60 * 4
+ NANOSECONDS * 60 * 22
+ NANOSECONDS * 11
+ 1_000_000 * 33
+ 1_000 * 44
+ 5,
), // 4hr, 22m, 11s, 33ms, 44us, 5ns
]
.into();

Expand All @@ -1866,64 +1898,73 @@ mod tests {
assert_eq!(0, actual.value(0));
assert_eq!(0, actual.value(1));
assert_eq!(1, actual.value(2));
assert_eq!(0, actual.value(3));

let actual = date_part(&input, DatePart::Month).unwrap();
let actual = actual.as_primitive::<Int32Type>();
assert_eq!(0, actual.value(0));
assert_eq!(5, actual.value(1));
assert_eq!(16, actual.value(2));
assert_eq!(4, actual.value(2));
assert_eq!(0, actual.value(3));

// Week and day follow from day, but are not affected by months or nanos.
let actual = date_part(&input, DatePart::Week).unwrap();
let actual = actual.as_primitive::<Int32Type>();
assert_eq!(0, actual.value(0));
assert_eq!(1, actual.value(1));
assert_eq!(5, actual.value(2));
assert_eq!(0, actual.value(3));

let actual = date_part(&input, DatePart::Day).unwrap();
let actual = actual.as_primitive::<Int32Type>();
assert_eq!(0, actual.value(0));
assert_eq!(10, actual.value(1));
assert_eq!(35, actual.value(2));
assert_eq!(0, actual.value(3));

// Times follow from nanos, but are not affected by months or days.
let actual = date_part(&input, DatePart::Hour).unwrap();
let actual = actual.as_primitive::<Int32Type>();
assert_eq!(0, actual.value(0));
assert_eq!(0, actual.value(1));
assert_eq!(24, actual.value(2));
assert_eq!(4, actual.value(3));

let actual = date_part(&input, DatePart::Minute).unwrap();
let actual = actual.as_primitive::<Int32Type>();
assert_eq!(0, actual.value(0));
assert_eq!(0, actual.value(1));
assert_eq!(24 * 60, actual.value(2));
assert_eq!(0, actual.value(2));
assert_eq!(22, actual.value(3));

let actual = date_part(&input, DatePart::Second).unwrap();
let actual = actual.as_primitive::<Int32Type>();
assert_eq!(0, actual.value(0));
assert_eq!(0, actual.value(1));
assert_eq!(24 * 60 * 60, actual.value(2));
assert_eq!(0, actual.value(2));
assert_eq!(11, actual.value(3));

let actual = date_part(&input, DatePart::Millisecond).unwrap();
let actual = actual.as_primitive::<Int32Type>();
assert_eq!(0, actual.value(0));
assert_eq!(0, actual.value(1));
assert_eq!(24 * 60 * 60 * 1_000, actual.value(2));
assert_eq!(0, actual.value(2));
assert_eq!(11_033, actual.value(3));

let actual = date_part(&input, DatePart::Microsecond).unwrap();
let actual = actual.as_primitive::<Int32Type>();
assert_eq!(0, actual.value(0));
assert_eq!(0, actual.value(1));
// Overflow gives zero.
assert_eq!(0, actual.value(2));
assert_eq!(11_033_044, actual.value(3));

let actual = date_part(&input, DatePart::Nanosecond).unwrap();
let actual = actual.as_primitive::<Int32Type>();
assert_eq!(0, actual.value(0));
assert_eq!(42, actual.value(1));
// Overflow gives zero.
assert_eq!(0, actual.value(2));
assert_eq!(1, actual.value(2));
// Overflow returns zero.
assert_eq!(0, actual.value(3));
}

#[test]
Expand Down
Loading