Skip to content

Commit 712b231

Browse files
committed
add date_trunc spark function
fix style fix fix fix fix fix fix update for reuse fix style fix fix fix fix fix style update update fix fix update update fix fix update update update update add ut add ut
1 parent 46c5fbd commit 712b231

File tree

10 files changed

+583
-189
lines changed

10 files changed

+583
-189
lines changed

velox/docs/functions/spark/datetime.rst

+37-2
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,41 @@ These functions support TIMESTAMP and DATE input types.
6161
``num_days`` can be positive or negative.
6262
Supported types for ``num_days`` are: TINYINT, SMALLINT, INTEGER.
6363

64+
.. spark:function:: date_trunc(fmt, ts) -> timestamp
65+
66+
Returns timestamp ``ts`` truncated to the unit specified by the format model ``fmt``.
67+
Returns null if ``fmt`` is invalid.
68+
69+
``fmt`` is case insensitive and must be one of the following:
70+
* "YEAR", "YYYY", "YY" - truncate to the first date of the year that the ``ts`` falls in, the time part will be zeroed out
71+
* "QUARTER" - truncate to the first date of the quarter that the ``ts`` falls in, the time part will be zeroed out
72+
* "MONTH", "MM", "MON" - truncate to the first date of the month that the ``ts`` falls in, the time part will be zeroed out
73+
* "WEEK" - truncate to the Monday of the week that the ``ts`` falls in, the time part will be zeroed out
74+
* "DAY", "DD" - zero out the time part
75+
* "HOUR" - zero out the minute and second with fraction part
76+
* "MINUTE" - zero out the second with fraction part
77+
* "SECOND" - zero out the second fraction part
78+
* "MILLISECOND" - zero out the microseconds
79+
* "MICROSECOND" - no truncation is applied; the timestamp is returned as is.
80+
81+
::
82+
83+
SELECT date_trunc('YEAR', '2015-03-05T09:32:05.359'); -- 2015-01-01 00:00:00
84+
SELECT date_trunc('YYYY', '2015-03-05T09:32:05.359'); -- 2015-01-01 00:00:00
85+
SELECT date_trunc('YY', '2015-03-05T09:32:05.359'); -- 2015-01-01 00:00:00
86+
SELECT date_trunc('QUARTER', '2015-03-05T09:32:05.359'); -- 2015-01-01 00:00:00
87+
SELECT date_trunc('MONTH', '2015-03-05T09:32:05.359'); -- 2015-03-01 00:00:00
88+
SELECT date_trunc('MM', '2015-03-05T09:32:05.359'); -- 2015-03-01 00:00:00
89+
SELECT date_trunc('MON', '2015-03-05T09:32:05.359'); -- 2015-03-01 00:00:00
90+
SELECT date_trunc('WEEK', '2015-03-05T09:32:05.359'); -- 2015-03-02 00:00:00
91+
SELECT date_trunc('DAY', '2015-03-05T09:32:05.359'); -- 2015-03-05 00:00:00
92+
SELECT date_trunc('DD', '2015-03-05T09:32:05.359'); -- 2015-03-05 00:00:00
93+
SELECT date_trunc('HOUR', '2015-03-05T09:32:05.359'); -- 2015-03-05 09:00:00
94+
SELECT date_trunc('MINUTE', '2015-03-05T09:32:05.359'); -- 2015-03-05 09:32:00
95+
SELECT date_trunc('SECOND', '2015-03-05T09:32:05.359'); -- 2015-03-05 09:32:05
96+
SELECT date_trunc('MILLISECOND', '2015-03-05T09:32:05.123456'); -- 2015-03-05 09:32:05.123
97+
SELECT date_trunc('MICROSECOND', '2015-03-05T09:32:05.123456'); -- 2015-03-05 09:32:05.123456
98+
6499
.. spark:function:: datediff(endDate, startDate) -> integer
65100
66101
Returns the number of days from startDate to endDate. Only DATE type is allowed
@@ -285,7 +320,7 @@ These functions support TIMESTAMP and DATE input types.
285320
SELECT unix_millis('1970-01-01 00:00:01'); -- 1000
286321

287322
.. spark:function:: unix_seconds(timestamp) -> bigint
288-
323+
289324
Returns the number of seconds since 1970-01-01 00:00:00 UTC. ::
290325

291326
SELECT unix_seconds('1970-01-01 00:00:01'); -- 1
@@ -297,7 +332,7 @@ These functions support TIMESTAMP and DATE input types.
297332
.. spark:function:: unix_timestamp(string) -> integer
298333
:noindex:
299334

300-
Returns the UNIX timestamp of time specified by ``string``. Assumes the
335+
Returns the UNIX timestamp of time specified by ``string``. Assumes the
301336
format ``yyyy-MM-dd HH:mm:ss``. Returns null if ``string`` does not match
302337
``format``.
303338

velox/functions/lib/DateTimeFormatter.h

+13
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,19 @@ enum class DateTimeFormatSpecifier : uint8_t {
118118
WEEK_OF_MONTH = 24
119119
};
120120

121+
/// Granularity to which a date/time value can be truncated. Enumerators are
/// listed from the finest unit (microsecond) to the coarsest (year), so their
/// underlying values increase with the size of the unit.
///
/// The underlying type is fixed to uint8_t to match DateTimeFormatSpecifier
/// declared in this header and to keep the enum one byte wide.
enum class DateTimeUnit : uint8_t {
  kMicrosecond,
  kMillisecond,
  kSecond,
  kMinute,
  kHour,
  kDay,
  kWeek,
  kMonth,
  kQuarter,
  kYear
};
133+
121134
struct FormatPattern {
122135
DateTimeFormatSpecifier specifier;
123136

velox/functions/lib/TimeUtils.h

+204
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,14 @@
1515
*/
1616
#pragma once
1717

18+
#include <boost/algorithm/string/case_conv.hpp>
1819
#include <velox/type/Timestamp.h>
1920
#include "velox/core/QueryConfig.h"
21+
#include "velox/expression/ComplexViewTypes.h"
2022
#include "velox/external/date/date.h"
2123
#include "velox/external/date/iso_week.h"
2224
#include "velox/functions/Macros.h"
25+
#include "velox/functions/lib/DateTimeFormatter.h"
2326
#include "velox/type/tz/TimeZoneMap.h"
2427

2528
namespace facebook::velox::functions {
@@ -123,4 +126,205 @@ struct InitSessionTimezone {
123126
timeZone_ = getTimeZoneFromConfig(config);
124127
}
125128
};
129+
130+
/// Converts string as date time unit. Throws for invalid input string.
131+
///
132+
/// @param unitString The input string to represent date time unit.
133+
/// @param throwIfInvalid Whether to throw an exception for invalid input
134+
/// string.
135+
/// @param allowMicro Whether to allow microsecond.
136+
/// @param allowAbbreviated Whether to allow abbreviated unit string.
137+
FOLLY_ALWAYS_INLINE std::optional<DateTimeUnit> fromDateTimeUnitString(
138+
const StringView& unitString,
139+
bool throwIfInvalid,
140+
bool allowMicro = false,
141+
bool allowAbbreviated = false) {
142+
const auto unit = boost::algorithm::to_lower_copy(unitString.str());
143+
144+
if (unit == "microsecond" && allowMicro) {
145+
return DateTimeUnit::kMicrosecond;
146+
}
147+
if (unit == "millisecond") {
148+
return DateTimeUnit::kMillisecond;
149+
}
150+
if (unit == "second") {
151+
return DateTimeUnit::kSecond;
152+
}
153+
if (unit == "minute") {
154+
return DateTimeUnit::kMinute;
155+
}
156+
if (unit == "hour") {
157+
return DateTimeUnit::kHour;
158+
}
159+
if (unit == "day") {
160+
return DateTimeUnit::kDay;
161+
}
162+
if (unit == "week") {
163+
return DateTimeUnit::kWeek;
164+
}
165+
if (unit == "month") {
166+
return DateTimeUnit::kMonth;
167+
}
168+
if (unit == "quarter") {
169+
return DateTimeUnit::kQuarter;
170+
}
171+
if (unit == "year") {
172+
return DateTimeUnit::kYear;
173+
}
174+
if (allowAbbreviated) {
175+
if (unit == "dd") {
176+
return DateTimeUnit::kDay;
177+
}
178+
if (unit == "mon" || unit == "mm") {
179+
return DateTimeUnit::kMonth;
180+
}
181+
if (unit == "yyyy" || unit == "yy") {
182+
return DateTimeUnit::kYear;
183+
}
184+
}
185+
if (throwIfInvalid) {
186+
VELOX_UNSUPPORTED("Unsupported datetime unit: {}", unitString);
187+
}
188+
return std::nullopt;
189+
}
190+
191+
/// Adjusts the given date time object to the start of the specified date time
192+
/// unit (e.g., year, quarter, month, week, day, hour, minute).
193+
FOLLY_ALWAYS_INLINE void adjustDateTime(
194+
std::tm& dateTime,
195+
const DateTimeUnit& unit) {
196+
switch (unit) {
197+
case DateTimeUnit::kYear:
198+
dateTime.tm_mon = 0;
199+
dateTime.tm_yday = 0;
200+
FMT_FALLTHROUGH;
201+
case DateTimeUnit::kQuarter:
202+
dateTime.tm_mon = dateTime.tm_mon / 3 * 3;
203+
FMT_FALLTHROUGH;
204+
case DateTimeUnit::kMonth:
205+
dateTime.tm_mday = 1;
206+
dateTime.tm_hour = 0;
207+
dateTime.tm_min = 0;
208+
dateTime.tm_sec = 0;
209+
break;
210+
case DateTimeUnit::kWeek:
211+
// Subtract the truncation.
212+
dateTime.tm_mday -= dateTime.tm_wday == 0 ? 6 : dateTime.tm_wday - 1;
213+
// Setting the day of the week to Monday.
214+
dateTime.tm_wday = 1;
215+
216+
// If the adjusted day of the month falls in the previous month
217+
// Move to the previous month.
218+
if (dateTime.tm_mday < 1) {
219+
dateTime.tm_mon -= 1;
220+
221+
// If the adjusted month falls in the previous year
222+
// Set to December and Move to the previous year.
223+
if (dateTime.tm_mon < 0) {
224+
dateTime.tm_mon = 11;
225+
dateTime.tm_year -= 1;
226+
}
227+
228+
// Calculate the correct day of the month based on the number of days
229+
// in the adjusted month.
230+
static const int daysInMonth[] = {
231+
31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31};
232+
int daysInPrevMonth = daysInMonth[dateTime.tm_mon];
233+
234+
// Adjust for leap year if February.
235+
if (dateTime.tm_mon == 1 && (dateTime.tm_year + 1900) % 4 == 0 &&
236+
((dateTime.tm_year + 1900) % 100 != 0 ||
237+
(dateTime.tm_year + 1900) % 400 == 0)) {
238+
daysInPrevMonth = 29;
239+
}
240+
// Set to the correct day in the previous month.
241+
dateTime.tm_mday += daysInPrevMonth;
242+
}
243+
dateTime.tm_hour = 0;
244+
dateTime.tm_min = 0;
245+
dateTime.tm_sec = 0;
246+
break;
247+
case DateTimeUnit::kDay:
248+
dateTime.tm_hour = 0;
249+
FMT_FALLTHROUGH;
250+
case DateTimeUnit::kHour:
251+
dateTime.tm_min = 0;
252+
FMT_FALLTHROUGH;
253+
case DateTimeUnit::kMinute:
254+
dateTime.tm_sec = 0;
255+
break;
256+
default:
257+
VELOX_UNREACHABLE();
258+
}
259+
}
260+
261+
/// Returns timestamp with seconds adjusted to the nearest lower multiple of the
262+
/// specified interval. If the given seconds is negative and not an exact
263+
/// multiple of the interval, it adjusts further down.
264+
FOLLY_ALWAYS_INLINE Timestamp
265+
adjustEpoch(int64_t seconds, int64_t intervalSeconds) {
266+
int64_t s = seconds / intervalSeconds;
267+
if (seconds < 0 && seconds % intervalSeconds) {
268+
s = s - 1;
269+
}
270+
int64_t truncatedSeconds = s * intervalSeconds;
271+
return Timestamp(truncatedSeconds, 0);
272+
}
273+
274+
// Returns timestamp truncated to the specified unit.
275+
FOLLY_ALWAYS_INLINE Timestamp truncateTimestamp(
276+
const Timestamp& timestamp,
277+
DateTimeUnit unit,
278+
const tz::TimeZone* timeZone) {
279+
Timestamp result;
280+
switch (unit) {
281+
// For seconds ,millisecond, microsecond we just truncate the nanoseconds
282+
// part of the timestamp; no timezone conversion required.
283+
case DateTimeUnit::kMicrosecond:
284+
return Timestamp(
285+
timestamp.getSeconds(), timestamp.getNanos() / 1000 * 1000);
286+
287+
case DateTimeUnit::kMillisecond:
288+
return Timestamp(
289+
timestamp.getSeconds(), timestamp.getNanos() / 1000000 * 1000000);
290+
291+
case DateTimeUnit::kSecond:
292+
return Timestamp(timestamp.getSeconds(), 0);
293+
294+
// Same for minutes; timezones and daylight savings time are at least in
295+
// the granularity of 30 mins, so we can just truncate the epoch directly.
296+
case DateTimeUnit::kMinute:
297+
return adjustEpoch(timestamp.getSeconds(), 60);
298+
299+
// Hour truncation has to handle the corner case of daylight savings time
300+
// boundaries. Since conversions from local timezone to UTC may be
301+
// ambiguous, we need to be carefull about the roundtrip of converting to
302+
// local time and back. So what we do is to calculate the truncation delta
303+
// in UTC, then applying it to the input timestamp.
304+
case DateTimeUnit::kHour: {
305+
auto epochToAdjust = getSeconds(timestamp, timeZone);
306+
auto secondsDelta =
307+
epochToAdjust - adjustEpoch(epochToAdjust, 60 * 60).getSeconds();
308+
return Timestamp(timestamp.getSeconds() - secondsDelta, 0);
309+
}
310+
311+
// For the truncations below, we may first need to convert to the local
312+
// timestamp, truncate, then convert back to GMT.
313+
case DateTimeUnit::kDay:
314+
result = adjustEpoch(getSeconds(timestamp, timeZone), 24 * 60 * 60);
315+
break;
316+
317+
default:
318+
auto dateTime = getDateTime(timestamp, timeZone);
319+
adjustDateTime(dateTime, unit);
320+
result = Timestamp(Timestamp::calendarUtcToEpoch(dateTime), 0);
321+
break;
322+
}
323+
324+
if (timeZone != nullptr) {
325+
result.toGMT(*timeZone);
326+
}
327+
return result;
328+
}
329+
126330
} // namespace facebook::velox::functions

velox/functions/lib/tests/CMakeLists.txt

+1
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ add_executable(
2424
Re2FunctionsTest.cpp
2525
RepeatTest.cpp
2626
TDigestTest.cpp
27+
TimeUtilsTest.cpp
2728
Utf8Test.cpp
2829
ZetaDistributionTest.cpp)
2930

0 commit comments

Comments
 (0)