Skip to content

Commit e10c0a3

Browse files
Bikramjeet Vigfacebook-github-bot
Bikramjeet Vig
authored andcommitted
refactor: move json_extract_scalar back to simple function
Summary: We initially moved json_extract_scalar to a vector function implementation so that we can reuse the JSON parsing logic and apply it to VARCHAR inputs. JSON parsing achieves the following for VARCHAR input: (1) it formats spaces, (2) it sorts the keys inside objects, (3) it flags invalid JSON, and (4) it unescapes string scalars. For json_extract_scalar, we only care about (3) and (4) since the output is a scalar. Therefore, we can avoid the costly operation of rewriting the large input JSON in a normalized form and only apply the operations we require. To achieve (3), SIMD JSON can be used to quickly check for valid JSON, and for (4), SIMD JSON provides an API to extract a string scalar after removing escaping. This change moves json_extract_scalar back to simple function implementation and ensures (3) and (4) are applied. Differential Revision: D70978059
1 parent 7f1e4ef commit e10c0a3

File tree

4 files changed

+118
-140
lines changed

4 files changed

+118
-140
lines changed

velox/functions/prestosql/JsonFunctions.cpp

+20-137
Original file line numberDiff line numberDiff line change
@@ -541,9 +541,6 @@ class JsonParseFunction : public exec::VectorFunction {
541541

542542
class JsonExtractFunction : public exec::VectorFunction {
543543
public:
544-
JsonExtractFunction(bool extractScalarOnly)
545-
: extractScalarOnly_(extractScalarOnly) {}
546-
547544
void apply(
548545
const SelectivityVector& rows,
549546
std::vector<VectorPtr>& args,
@@ -564,21 +561,7 @@ class JsonExtractFunction : public exec::VectorFunction {
564561
context.moveOrCopyResult(localResult, rows, result);
565562
}
566563

567-
static std::vector<std::shared_ptr<exec::FunctionSignature>> signatures(
568-
bool extractScalarOnly) {
569-
if (extractScalarOnly) {
570-
return {
571-
exec::FunctionSignatureBuilder()
572-
.returnType("varchar")
573-
.argumentType("json")
574-
.argumentType("varchar")
575-
.build(),
576-
exec::FunctionSignatureBuilder()
577-
.returnType("varchar")
578-
.argumentType("varchar")
579-
.argumentType("varchar")
580-
.build()};
581-
}
564+
static std::vector<std::shared_ptr<exec::FunctionSignature>> signatures() {
582565
return {
583566
exec::FunctionSignatureBuilder()
584567
.returnType("json")
@@ -612,13 +595,8 @@ class JsonExtractFunction : public exec::VectorFunction {
612595
auto jsonValue = json->as<ConstantVector<StringView>>()->valueAt(0);
613596
auto pathValue = path->as<ConstantVector<StringView>>()->valueAt(0);
614597
try {
615-
if (extractScalarOnly_) {
616-
nullResult = processJsonExtractScalar(
617-
jsonValue, pathValue, output) != simdjson::SUCCESS;
618-
} else {
619-
nullResult = processJsonExtract(jsonValue, pathValue, output) !=
620-
simdjson::SUCCESS;
621-
}
598+
nullResult = processJsonExtract(jsonValue, pathValue, output) !=
599+
simdjson::SUCCESS;
622600
} catch (const VeloxException& e) {
623601
if (!e.isUserError()) {
624602
throw;
@@ -645,39 +623,21 @@ class JsonExtractFunction : public exec::VectorFunction {
645623
exec::VectorWriter<Json> resultWriter;
646624
resultWriter.init(*flatResult);
647625

648-
if (extractScalarOnly_) {
649-
context.applyToSelectedNoThrow(rows, [&](auto row) {
650-
VELOX_DCHECK(!decodedPath->isNullAt(row));
651-
resultWriter.setOffset(row);
652-
std::string output;
653-
if (!decodedJson->isNullAt(row) &&
654-
processJsonExtractScalar(
655-
decodedJson->valueAt<StringView>(row),
656-
decodedPath->valueAt<StringView>(row),
657-
output) == simdjson::SUCCESS) {
658-
resultWriter.current() = output;
659-
resultWriter.commit(true);
660-
} else {
661-
resultWriter.commit(false);
662-
}
663-
});
664-
} else {
665-
context.applyToSelectedNoThrow(rows, [&](auto row) {
666-
VELOX_DCHECK(!decodedPath->isNullAt(row));
667-
resultWriter.setOffset(row);
668-
std::string output;
669-
if (!decodedJson->isNullAt(row) &&
670-
processJsonExtract(
671-
decodedJson->valueAt<StringView>(row),
672-
decodedPath->valueAt<StringView>(row),
673-
output) == simdjson::SUCCESS) {
674-
resultWriter.current() = output;
675-
resultWriter.commit(true);
676-
} else {
677-
resultWriter.commit(false);
678-
}
679-
});
680-
}
626+
context.applyToSelectedNoThrow(rows, [&](auto row) {
627+
VELOX_DCHECK(!decodedPath->isNullAt(row));
628+
resultWriter.setOffset(row);
629+
std::string output;
630+
if (!decodedJson->isNullAt(row) &&
631+
processJsonExtract(
632+
decodedJson->valueAt<StringView>(row),
633+
decodedPath->valueAt<StringView>(row),
634+
output) == simdjson::SUCCESS) {
635+
resultWriter.current() = output;
636+
resultWriter.commit(true);
637+
} else {
638+
resultWriter.commit(false);
639+
}
640+
});
681641
resultWriter.finish();
682642
}
683643

@@ -750,59 +710,6 @@ class JsonExtractFunction : public exec::VectorFunction {
750710
return simdjson::SUCCESS;
751711
}
752712

753-
FOLLY_ALWAYS_INLINE simdjson::error_code processJsonExtractScalar(
754-
const StringView& json,
755-
const StringView& jsonPath,
756-
std::string& output) const {
757-
bool resultPopulated = false;
758-
std::optional<std::string> resultStr;
759-
auto consumer = [&resultStr, &resultPopulated](auto& v) {
760-
if (resultPopulated) {
761-
// We should just get a single value, if we see multiple, it's an error
762-
// and we should return null.
763-
resultStr = std::nullopt;
764-
return simdjson::SUCCESS;
765-
}
766-
767-
resultPopulated = true;
768-
769-
SIMDJSON_ASSIGN_OR_RAISE(auto vtype, v.type());
770-
switch (vtype) {
771-
case simdjson::ondemand::json_type::boolean: {
772-
SIMDJSON_ASSIGN_OR_RAISE(bool vbool, v.get_bool());
773-
resultStr = vbool ? "true" : "false";
774-
break;
775-
}
776-
case simdjson::ondemand::json_type::string: {
777-
SIMDJSON_ASSIGN_OR_RAISE(resultStr, v.get_string());
778-
break;
779-
}
780-
case simdjson::ondemand::json_type::object:
781-
case simdjson::ondemand::json_type::array:
782-
case simdjson::ondemand::json_type::null:
783-
// Do nothing.
784-
break;
785-
default: {
786-
SIMDJSON_ASSIGN_OR_RAISE(resultStr, simdjson::to_json_string(v));
787-
}
788-
}
789-
return simdjson::SUCCESS;
790-
};
791-
792-
auto& extractor = SIMDJsonExtractor::getInstance(jsonPath);
793-
bool isDefinitePath = true;
794-
simdjson::padded_string paddedJson(json.data(), json.size());
795-
SIMDJSON_TRY(extractor.extract(paddedJson, consumer, isDefinitePath));
796-
797-
if (resultStr.has_value()) {
798-
output = std::move(resultStr.value());
799-
return simdjson::SUCCESS;
800-
} else {
801-
return simdjson::NO_SUCH_FIELD;
802-
}
803-
}
804-
805-
bool extractScalarOnly_{false};
806713
JsonParseImpl parser_;
807714
};
808715

@@ -856,37 +763,13 @@ VELOX_DECLARE_VECTOR_FUNCTION(
856763
JsonFormatFunction::signatures(),
857764
std::make_unique<JsonFormatFunction>());
858765

859-
VELOX_DECLARE_STATEFUL_VECTOR_FUNCTION(
860-
udf_json_extract_scalar,
861-
JsonExtractFunction::signatures(true),
862-
[](const std::string& /*name*/,
863-
const std::vector<exec::VectorFunctionArg>&,
864-
const velox::core::QueryConfig&) {
865-
return std::make_shared<JsonExtractFunction>(true);
866-
});
867-
868-
// Only used internally at Meta.
869-
VELOX_DECLARE_STATEFUL_VECTOR_FUNCTION(
870-
udf_json_extract_scalar_varchar_only,
871-
(std::vector<std::shared_ptr<exec::FunctionSignature>>{
872-
facebook::velox::exec::FunctionSignatureBuilder()
873-
.returnType("varchar")
874-
.argumentType("varchar")
875-
.argumentType("varchar")
876-
.build()}),
877-
[](const std::string& /*name*/,
878-
const std::vector<exec::VectorFunctionArg>&,
879-
const velox::core::QueryConfig&) {
880-
return std::make_shared<JsonExtractFunction>(true);
881-
});
882-
883766
VELOX_DECLARE_STATEFUL_VECTOR_FUNCTION(
884767
udf_json_extract,
885-
JsonExtractFunction::signatures(false),
768+
JsonExtractFunction::signatures(),
886769
[](const std::string& /*name*/,
887770
const std::vector<exec::VectorFunctionArg>&,
888771
const velox::core::QueryConfig&) {
889-
return std::make_shared<JsonExtractFunction>(false);
772+
return std::make_shared<JsonExtractFunction>();
890773
});
891774

892775
VELOX_DECLARE_STATEFUL_VECTOR_FUNCTION(

velox/functions/prestosql/JsonFunctions.h

+80
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,86 @@ struct JsonArrayGetFunction {
169169
}
170170
};
171171

172+
// json_extract_scalar(json, json_path) -> varchar
173+
// Like json_extract(), but returns the result value as a string (as opposed
174+
// to being encoded as JSON). The value referenced by json_path must be a scalar
175+
// (boolean, number or string)
176+
template <typename T>
177+
struct JsonExtractScalarFunction {
178+
VELOX_DEFINE_FUNCTION_TYPES(T);
179+
180+
FOLLY_ALWAYS_INLINE bool call(
181+
out_type<Varchar>& result,
182+
const arg_type<Json>& json,
183+
const arg_type<Varchar>& jsonPath) {
184+
return callImpl(result, json, jsonPath) == simdjson::SUCCESS;
185+
}
186+
187+
private:
188+
FOLLY_ALWAYS_INLINE bool callImpl(
189+
out_type<Varchar>& result,
190+
const arg_type<Json>& json,
191+
const arg_type<Varchar>& jsonPath) {
192+
bool resultPopulated = false;
193+
std::optional<std::string> resultStr;
194+
195+
auto consumer = [&resultStr, &resultPopulated](auto& v) {
196+
if (resultPopulated) {
197+
// We should just get a single value, if we see multiple, it's an error
198+
// and we should return null.
199+
resultStr = std::nullopt;
200+
return simdjson::SUCCESS;
201+
}
202+
203+
resultPopulated = true;
204+
205+
SIMDJSON_ASSIGN_OR_RAISE(auto vtype, v.type());
206+
switch (vtype) {
207+
case simdjson::ondemand::json_type::boolean: {
208+
SIMDJSON_ASSIGN_OR_RAISE(bool vbool, v.get_bool());
209+
resultStr = vbool ? "true" : "false";
210+
break;
211+
}
212+
case simdjson::ondemand::json_type::string: {
213+
SIMDJSON_ASSIGN_OR_RAISE(resultStr, v.get_string());
214+
break;
215+
}
216+
case simdjson::ondemand::json_type::object:
217+
case simdjson::ondemand::json_type::array:
218+
case simdjson::ondemand::json_type::null:
219+
// Do nothing.
220+
break;
221+
default: {
222+
SIMDJSON_ASSIGN_OR_RAISE(resultStr, simdjson::to_json_string(v));
223+
}
224+
}
225+
return simdjson::SUCCESS;
226+
};
227+
228+
auto& extractor = SIMDJsonExtractor::getInstance(jsonPath);
229+
230+
// Check for valid json
231+
simdjson::padded_string paddedJson(json.data(), json.size());
232+
{
233+
SIMDJSON_ASSIGN_OR_RAISE(auto jsonDoc, simdjsonParse(paddedJson));
234+
simdjson::ondemand::document parsedDoc;
235+
if (simdjsonParse(paddedJson).get(parsedDoc)) {
236+
return false;
237+
}
238+
}
239+
240+
bool isDefinitePath = true;
241+
SIMDJSON_TRY(extractor.extract(paddedJson, consumer, isDefinitePath));
242+
243+
if (resultStr.has_value()) {
244+
result.copy_from(*resultStr);
245+
return simdjson::SUCCESS;
246+
} else {
247+
return simdjson::NO_SUCH_FIELD;
248+
}
249+
}
250+
};
251+
172252
template <typename T>
173253
struct JsonSizeFunction {
174254
VELOX_DEFINE_FUNCTION_TYPES(T);

velox/functions/prestosql/registration/JsonFunctionsRegistration.cpp

+5-3
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,11 @@ void registerJsonFunctions(const std::string& prefix) {
2727
registerFunction<IsJsonScalarFunction, bool, Varchar>(
2828
{prefix + "is_json_scalar"});
2929

30+
registerFunction<JsonExtractScalarFunction, Varchar, Json, Varchar>(
31+
{prefix + "json_extract_scalar"});
32+
registerFunction<JsonExtractScalarFunction, Varchar, Varchar, Varchar>(
33+
{prefix + "json_extract_scalar"});
34+
3035
registerFunction<JsonArrayLengthFunction, int64_t, Json>(
3136
{prefix + "json_array_length"});
3237
registerFunction<JsonArrayLengthFunction, int64_t, Varchar>(
@@ -61,9 +66,6 @@ void registerJsonFunctions(const std::string& prefix) {
6166

6267
VELOX_REGISTER_VECTOR_FUNCTION(udf_json_extract, prefix + "json_extract");
6368

64-
VELOX_REGISTER_VECTOR_FUNCTION(
65-
udf_json_extract_scalar, prefix + "json_extract_scalar");
66-
6769
VELOX_REGISTER_VECTOR_FUNCTION(udf_json_format, prefix + "json_format");
6870

6971
VELOX_REGISTER_VECTOR_FUNCTION(udf_json_parse, prefix + "json_parse");

velox/functions/prestosql/tests/JsonExtractScalarTest.cpp

+13
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,19 @@ TEST_F(JsonExtractScalarTest, invalidPath) {
193193
jsonExtractScalar(R"({"k1":"v1)", "$.k1]"), "Invalid JSON path");
194194
}
195195

196+
TEST_F(JsonExtractScalarTest, invalidJson) {
197+
// Verify that we return null on invalid JSON regardless of whether the
198+
// invalid section is after the target path.
199+
EXPECT_EQ(jsonExtractScalar(R"({"a": "b", "c": "d})", "$.a"), std::nullopt);
200+
EXPECT_EQ(jsonExtractScalar(R"([["a"], ["b]])", "$[0][0]"), std::nullopt);
201+
}
202+
203+
TEST_F(JsonExtractScalarTest, escapedString) {
204+
// Verify the the returned string is unescaped.
205+
EXPECT_EQ(
206+
jsonExtractScalar(R"({"x": {"a" : 1, "b" : "b\/c"} })", "$.x.b"), "b/c");
207+
}
208+
196209
// simdjson, like Presto java, returns the large number as-is as a string,
197210
// without trying to convert it to an integer.
198211
TEST_F(JsonExtractScalarTest, overflow) {

0 commit comments

Comments
 (0)