Skip to content

Commit 15510df

Browse files
committed
feat: Add Spark date_trunc and trunc function
1 parent 704f67b commit 15510df

File tree

7 files changed

+429
-217
lines changed

7 files changed

+429
-217
lines changed

velox/docs/functions/spark/datetime.rst

+14
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,13 @@ These functions support TIMESTAMP and DATE input types.
5151
SELECT date_from_unix_date(1); -- '1970-01-02'
5252
SELECT date_from_unix_date(-1); -- '1969-12-31'
5353

54+
.. spark:function:: date_trunc(unit_string, ts) -> timestamp
55+
56+
Returns timestamp ``ts`` truncated to the unit specified by the format model ``unit_string``. ::
57+
58+
SELECT date_trunc('YEAR', '2025-02-15 12:05:30.127127'); -- '2025-01-01 00:00:00.000'
59+
SELECT date_trunc('HOUR', '2025-02-15 12:05:30.127127'); -- '2025-02-15 12:00:00.000'
60+
5461
.. spark:function:: date_sub(start_date, num_days) -> date
5562
5663
Returns the date that is ``num_days`` before ``start_date``. According to the inputs,
@@ -263,6 +270,13 @@ These functions support TIMESTAMP and DATE input types.
263270

264271
SELECT to_utc_timestamp('2015-07-24 00:00:00', 'America/Los_Angeles'); -- '2015-07-24 07:00:00'
265272

273+
.. spark:function:: trunc(date, unit_string) -> date
274+
275+
Returns ``date`` with the time portion of the day truncated to the unit specified by the format model ``unit_string``. ::
276+
277+
SELECT trunc('2025-02-15', 'MONTH'); -- '2025-02-01'
278+
SELECT trunc('2025-02-15', 'QUARTER'); -- '2025-01-01'
279+
266280
.. spark:function:: unix_date(date) -> integer
267281
268282
Returns the number of days since 1970-01-01. ::

velox/functions/lib/TimeUtils.h

+262
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,11 @@
1515
*/
1616
#pragma once
1717

18+
#include <boost/algorithm/string/case_conv.hpp>
19+
1820
#include <velox/type/Timestamp.h>
1921
#include "velox/core/QueryConfig.h"
22+
#include "velox/expression/ComplexViewTypes.h"
2023
#include "velox/external/date/date.h"
2124
#include "velox/external/date/iso_week.h"
2225
#include "velox/functions/Macros.h"
@@ -123,4 +126,263 @@ struct InitSessionTimezone {
123126
timeZone_ = getTimeZoneFromConfig(config);
124127
}
125128
};
129+
130+
enum class DateTimeUnit {
131+
kMicrosecond,
132+
kMillisecond,
133+
kSecond,
134+
kMinute,
135+
kHour,
136+
kDay,
137+
kWeek,
138+
kMonth,
139+
kQuarter,
140+
kYear
141+
};
142+
143+
FOLLY_ALWAYS_INLINE std::string dateTimeUnitString(DateTimeUnit unit) {
144+
switch (unit) {
145+
case DateTimeUnit::kMicrosecond:
146+
return "microsecond";
147+
case DateTimeUnit::kMillisecond:
148+
return "millisecond";
149+
case DateTimeUnit::kSecond:
150+
return "second";
151+
case DateTimeUnit::kMinute:
152+
return "minute";
153+
case DateTimeUnit::kHour:
154+
return "hour";
155+
case DateTimeUnit::kDay:
156+
return "day";
157+
case DateTimeUnit::kWeek:
158+
return "week";
159+
case DateTimeUnit::kMonth:
160+
return "month";
161+
case DateTimeUnit::kQuarter:
162+
return "quarter";
163+
case DateTimeUnit::kYear:
164+
return "year";
165+
default:
166+
return fmt::format("UNKNOWN: {}", static_cast<int>(unit));
167+
}
168+
}
169+
170+
FOLLY_ALWAYS_INLINE std::optional<DateTimeUnit> fromDateTimeUnitString(
171+
const StringView& unitString,
172+
bool isSparkUnitString) {
173+
static const StringView kMicrosecond("microsecond");
174+
static const StringView kMillisecond("millisecond");
175+
static const StringView kSecond("second");
176+
static const StringView kMinute("minute");
177+
static const StringView kHour("hour");
178+
static const StringView kDay("day");
179+
static const StringView kWeek("week");
180+
static const StringView kMonth("month");
181+
static const StringView kQuarter("quarter");
182+
static const StringView kYear("year");
183+
184+
const auto unit = boost::algorithm::to_lower_copy(unitString.str());
185+
186+
if (unit == kMillisecond) {
187+
return DateTimeUnit::kMillisecond;
188+
}
189+
if (unit == kSecond) {
190+
return DateTimeUnit::kSecond;
191+
}
192+
if (unit == kMinute) {
193+
return DateTimeUnit::kMinute;
194+
}
195+
if (unit == kHour) {
196+
return DateTimeUnit::kHour;
197+
}
198+
if (unit == kDay) {
199+
return DateTimeUnit::kDay;
200+
}
201+
if (unit == kWeek) {
202+
return DateTimeUnit::kWeek;
203+
}
204+
if (unit == kMonth) {
205+
return DateTimeUnit::kMonth;
206+
}
207+
if (unit == kQuarter) {
208+
return DateTimeUnit::kQuarter;
209+
}
210+
if (unit == kYear) {
211+
return DateTimeUnit::kYear;
212+
}
213+
if (isSparkUnitString) {
214+
if (unit == kMicrosecond) {
215+
return DateTimeUnit::kMicrosecond;
216+
}
217+
if (unit == "dd") {
218+
return DateTimeUnit::kDay;
219+
}
220+
if (unit == "mon" || unit == "mm") {
221+
return DateTimeUnit::kMonth;
222+
}
223+
if (unit == "yyyy" || unit == "yy") {
224+
return DateTimeUnit::kYear;
225+
}
226+
}
227+
return std::nullopt;
228+
}
229+
230+
FOLLY_ALWAYS_INLINE std::optional<DateTimeUnit> getDateTimeUnit(
231+
const StringView& unitString,
232+
bool throwIfInvalid,
233+
const DateTimeUnit& minUnit = DateTimeUnit::kMillisecond,
234+
bool isSparkUnitString = false) {
235+
std::optional<DateTimeUnit> unit =
236+
fromDateTimeUnitString(unitString, isSparkUnitString);
237+
if (unit.has_value() && unit.value() >= minUnit) {
238+
return unit;
239+
}
240+
if (throwIfInvalid) {
241+
if (unit.has_value()) {
242+
VELOX_USER_FAIL(
243+
"{} is not a valid datetime unit field, minimum unit is {}",
244+
unitString,
245+
dateTimeUnitString(minUnit));
246+
} else {
247+
VELOX_UNSUPPORTED("Unsupported datetime unit: {}", unitString);
248+
}
249+
}
250+
return std::nullopt;
251+
}
252+
253+
FOLLY_ALWAYS_INLINE void adjustDateTime(
254+
std::tm& dateTime,
255+
const DateTimeUnit& unit) {
256+
switch (unit) {
257+
case DateTimeUnit::kYear:
258+
dateTime.tm_mon = 0;
259+
dateTime.tm_yday = 0;
260+
FMT_FALLTHROUGH;
261+
case DateTimeUnit::kQuarter:
262+
dateTime.tm_mon = dateTime.tm_mon / 3 * 3;
263+
FMT_FALLTHROUGH;
264+
case DateTimeUnit::kMonth:
265+
dateTime.tm_mday = 1;
266+
dateTime.tm_hour = 0;
267+
dateTime.tm_min = 0;
268+
dateTime.tm_sec = 0;
269+
break;
270+
case DateTimeUnit::kWeek:
271+
// Subtract the truncation
272+
dateTime.tm_mday -= dateTime.tm_wday == 0 ? 6 : dateTime.tm_wday - 1;
273+
// Setting the day of the week to Monday
274+
dateTime.tm_wday = 1;
275+
276+
// If the adjusted day of the month falls in the previous month
277+
// Move to the previous month
278+
if (dateTime.tm_mday < 1) {
279+
dateTime.tm_mon -= 1;
280+
281+
// If the adjusted month falls in the previous year
282+
// Set to December and Move to the previous year
283+
if (dateTime.tm_mon < 0) {
284+
dateTime.tm_mon = 11;
285+
dateTime.tm_year -= 1;
286+
}
287+
288+
// Calculate the correct day of the month based on the number of days
289+
// in the adjusted month
290+
static const int daysInMonth[] = {
291+
31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31};
292+
int daysInPrevMonth = daysInMonth[dateTime.tm_mon];
293+
294+
// Adjust for leap year if February
295+
if (dateTime.tm_mon == 1 && (dateTime.tm_year + 1900) % 4 == 0 &&
296+
((dateTime.tm_year + 1900) % 100 != 0 ||
297+
(dateTime.tm_year + 1900) % 400 == 0)) {
298+
daysInPrevMonth = 29;
299+
}
300+
// Set to the correct day in the previous month
301+
dateTime.tm_mday += daysInPrevMonth;
302+
}
303+
dateTime.tm_hour = 0;
304+
dateTime.tm_min = 0;
305+
dateTime.tm_sec = 0;
306+
break;
307+
case DateTimeUnit::kDay:
308+
dateTime.tm_hour = 0;
309+
FMT_FALLTHROUGH;
310+
case DateTimeUnit::kHour:
311+
dateTime.tm_min = 0;
312+
FMT_FALLTHROUGH;
313+
case DateTimeUnit::kMinute:
314+
dateTime.tm_sec = 0;
315+
break;
316+
default:
317+
VELOX_UNREACHABLE();
318+
}
319+
}
320+
321+
/// For fixed interval like second, minute, hour, day and week
322+
/// we can truncate date by a simple arithmetic expression:
323+
/// floor(seconds / intervalSeconds) * intervalSeconds.
324+
FOLLY_ALWAYS_INLINE Timestamp
325+
adjustEpoch(int64_t seconds, int64_t intervalSeconds) {
326+
int64_t s = seconds / intervalSeconds;
327+
if (seconds < 0 && seconds % intervalSeconds) {
328+
s = s - 1;
329+
}
330+
int64_t truncedSeconds = s * intervalSeconds;
331+
return Timestamp(truncedSeconds, 0);
332+
}
333+
334+
FOLLY_ALWAYS_INLINE Timestamp truncateTimestamp(
335+
const Timestamp& timestamp,
336+
const DateTimeUnit& unit,
337+
const tz::TimeZone* timeZone) {
338+
Timestamp result;
339+
switch (unit) {
340+
case DateTimeUnit::kMicrosecond:
341+
return Timestamp(
342+
timestamp.getSeconds(), timestamp.getNanos() / 1000 * 1000);
343+
344+
case DateTimeUnit::kMillisecond:
345+
return Timestamp(
346+
timestamp.getSeconds(), timestamp.getNanos() / 1000000 * 1000000);
347+
348+
// For seconds, we just truncate the nanoseconds part of the timestamp; no
349+
// timezone conversion required.
350+
case DateTimeUnit::kSecond:
351+
return Timestamp(timestamp.getSeconds(), 0);
352+
353+
// Same for minutes; timezones and daylight savings time are at least in
354+
// the granularity of 30 mins, so we can just truncate the epoch directly.
355+
case DateTimeUnit::kMinute:
356+
return adjustEpoch(timestamp.getSeconds(), 60);
357+
358+
// Hour truncation has to handle the corner case of daylight savings time
359+
// boundaries. Since conversions from local timezone to UTC may be
360+
// ambiguous, we need to be carefull about the roundtrip of converting to
361+
// local time and back. So what we do is to calculate the truncation delta
362+
// in UTC, then applying it to the input timestamp.
363+
case DateTimeUnit::kHour: {
364+
auto epochToAdjust = getSeconds(timestamp, timeZone);
365+
auto secondsDelta =
366+
epochToAdjust - adjustEpoch(epochToAdjust, 60 * 60).getSeconds();
367+
return Timestamp(timestamp.getSeconds() - secondsDelta, 0);
368+
}
369+
370+
// For the truncations below, we may first need to convert to the local
371+
// timestamp, truncate, then convert back to GMT.
372+
case DateTimeUnit::kDay:
373+
result = adjustEpoch(getSeconds(timestamp, timeZone), 24 * 60 * 60);
374+
break;
375+
376+
default:
377+
auto dateTime = getDateTime(timestamp, timeZone);
378+
adjustDateTime(dateTime, unit);
379+
result = Timestamp(Timestamp::calendarUtcToEpoch(dateTime), 0);
380+
break;
381+
}
382+
383+
if (timeZone != nullptr) {
384+
result.toGMT(*timeZone);
385+
}
386+
return result;
387+
}
126388
} // namespace facebook::velox::functions

0 commit comments

Comments
 (0)