diff --git a/velox/docs/functions/presto/json.rst b/velox/docs/functions/presto/json.rst index cd02588552d0..a7d50f9e170d 100644 --- a/velox/docs/functions/presto/json.rst +++ b/velox/docs/functions/presto/json.rst @@ -156,8 +156,6 @@ JSON Functions SELECT json_extract(json, '$.store.book'); Current implementation supports limited subset of JSONPath syntax. - If ``json`` is a varchar then it is expected to conform to `RFC 7159`_ and will be converted to its canonical - format before extraction. .. _JSONPath: http://goessner.net/articles/JsonPath/ @@ -171,10 +169,6 @@ JSON Functions SELECT json_extract_scalar('[1, 2, 3]', '$[2]'); SELECT json_extract_scalar(json, '$.store.book[0].author'); - Current implementation supports limited subset of JSONPath syntax. - If ``json`` is a varchar then it is expected to conform to `RFC 7159`_ and will be converted to its canonical - format before extraction. - .. _JSONPath: http://goessner.net/articles/JsonPath/ .. function:: json_format(json) -> varchar @@ -189,9 +183,8 @@ JSON Functions .. function:: json_parse(varchar) -> json - Expects a JSON text conforming to `RFC 7159`_, and returns the JSON value (in its canonical form) deserialized - from the JSON text. The JSON value can be a JSON object, a JSON array, a JSON string, a JSON number, ``true``, - ``false`` or ``null``:: + expects a JSON text conforming to `RFC 7159`_, and returns the JSON value deserialized from the JSON text. + The JSON value can be a JSON object, a JSON array, a JSON string, a JSON number, ``true``, ``false`` or ``null``:: SELECT json_parse('[1, 2, 3]'); -- JSON '[1,2,3]' SELECT json_parse('"abc"'); -- JSON '"abc"' diff --git a/velox/functions/prestosql/JsonFunctions.cpp b/velox/functions/prestosql/JsonFunctions.cpp index 21eb54e847fa..f91cdf59ca6d 100644 --- a/velox/functions/prestosql/JsonFunctions.cpp +++ b/velox/functions/prestosql/JsonFunctions.cpp @@ -17,10 +17,8 @@ #include "velox/common/base/SortingNetwork.h" #include "velox/expression/VectorFunction.h" -#include "velox/expression/VectorWriters.h" #include "velox/functions/lib/string/StringImpl.h" #include "velox/functions/prestosql/json/JsonStringUtil.h" -#include "velox/functions/prestosql/json/SIMDJsonExtractor.h" #include "velox/functions/prestosql/json/SIMDJsonUtil.h" #include "velox/functions/prestosql/types/JsonCastOperator.h" #include "velox/functions/prestosql/types/JsonType.h" @@ -136,23 +134,23 @@ class JsonFormatFunction : public exec::VectorFunction { } }; -// A performant json parsing implementation. This is also leveraged by json -// functions other than json_parse that need to parse a varchar input. If -// `nullOnError` is true, the result will have null values for invalid jsons -// otherwise it will set exceptions for those rows in 'context'. -class JsonParseImpl { +class JsonParseFunction : public exec::VectorFunction { public: void apply( const SelectivityVector& rows, - const VectorPtr& arg, + std::vector& args, const TypePtr& /* outputType */, exec::EvalCtx& context, - VectorPtr& localResult, - bool nullOnError) const { + VectorPtr& result) const override { // Initialize errors here so that we get the proper exception context. folly::call_once( initializeErrors_, [this] { simdjsonErrorsToExceptions(errors_); }); + VectorPtr localResult; + + // Input can be constant or flat. + assert(args.size() > 0); + const auto& arg = args[0]; if (arg->isConstantEncoding()) { auto value = arg->as>()->valueAt(0); auto size = value.size(); @@ -160,6 +158,7 @@ class JsonParseImpl { BufferPtr stringViews = AlignedBuffer::allocate(1, context.pool()); auto rawStringViews = stringViews->asMutable(); + try { bool needNormalize = needNormalizeForJsonParse(value.data(), value.size()); @@ -171,13 +170,8 @@ class JsonParseImpl { VELOX_CHECK_EQ(prepareInput(value, needNormalize), size); if (auto error = parse(size, needNormalize)) { + context.setErrors(rows, errors_[error]); clearState(); - if (nullOnError) { - localResult = BaseVector::createNullConstant( - JSON(), rows.end(), context.pool()); - } else { - context.setErrors(rows, errors_[error]); - } return; } auto* output = buffer->asMutable(); @@ -189,16 +183,14 @@ class JsonParseImpl { if (!e.isUserError()) { throw; } - if (nullOnError) { - localResult = BaseVector::createNullConstant( - JSON(), rows.end(), context.pool()); - } else { - context.setErrors(rows, std::current_exception()); - } + context.setErrors(rows, std::current_exception()); + FB_LOG_EVERY_MS(WARNING, 1000) << "Caught user error in json_parse: " << e.message(); + return; } + auto constantBase = std::make_shared>( context.pool(), JSON(), @@ -208,100 +200,96 @@ class JsonParseImpl { std::vector{buffer}); localResult = BaseVector::wrapInConstant(rows.end(), 0, constantBase); - } - VectorPtr jsonInput = arg; - if (!arg->isFlatEncoding()) { - BaseVector::flattenVector(jsonInput); - } - auto flatInput = jsonInput->asFlatVector(); - BufferPtr stringViews = AlignedBuffer::allocate( - rows.end(), context.pool(), StringView()); - auto rawStringViews = stringViews->asMutable(); - - VELOX_CHECK_LE(rows.end(), flatInput->size()); - - size_t maxSize = 0; - size_t totalOutputSize = 0; - std::vector needNormalizes(rows.end()); - auto nullsOnErrors = AlignedBuffer::allocate( - rows.end(), context.pool(), bits::kNotNull); - auto rawNullsOnErrors = nullsOnErrors->asMutable(); - rows.applyToSelected([&](auto row) { - auto value = flatInput->valueAt(row); - bool needNormalize = - needNormalizeForJsonParse(value.data(), value.size()); - auto size = value.size(); - if (needNormalize) { - try { - size = normalizedSizeForJsonParse(value.data(), value.size()); - } catch (const VeloxException& e) { - if (!e.isUserError()) { - throw; - } - if (!nullOnError) { + + } else { + auto flatInput = arg->asFlatVector(); + BufferPtr stringViews = AlignedBuffer::allocate( + rows.end(), context.pool(), StringView()); + auto rawStringViews = stringViews->asMutable(); + + VELOX_CHECK_LE(rows.end(), flatInput->size()); + + size_t maxSize = 0; + size_t totalOutputSize = 0; + std::vector needNormalizes(rows.end()); + std::vector hasError(rows.end()); + rows.applyToSelected([&](auto row) { + auto value = flatInput->valueAt(row); + bool needNormalize = + needNormalizeForJsonParse(value.data(), value.size()); + auto size = value.size(); + if (needNormalize) { + try { + size = normalizedSizeForJsonParse(value.data(), value.size()); + } catch (const VeloxException& e) { + if (!e.isUserError()) { + throw; + } context.setVeloxExceptionError(row, std::current_exception()); + hasError[row] = true; + return; } - // We use this to skip error-ed out rows when generating output. - bits::setNull(rawNullsOnErrors, row, true); - return; } - } - needNormalizes[row] = needNormalize; - maxSize = std::max(maxSize, size); - totalOutputSize += size; - }); + needNormalizes[row] = needNormalize; + maxSize = std::max(maxSize, size); + totalOutputSize += size; + }); - paddedInput_.resize(maxSize + simdjson::SIMDJSON_PADDING); - BufferPtr buffer = - AlignedBuffer::allocate(totalOutputSize, context.pool()); - auto* output = buffer->asMutable(); + paddedInput_.resize(maxSize + simdjson::SIMDJSON_PADDING); + BufferPtr buffer = + AlignedBuffer::allocate(totalOutputSize, context.pool()); + auto* output = buffer->asMutable(); - rows.applyToSelected([&](auto row) { - if (bits::isBitNull(rawNullsOnErrors, row)) { - // Skip if error-ed out earlier. - return; - } + rows.applyToSelected([&](auto row) { + if (hasError[row]) { + return; + } - try { - auto value = flatInput->valueAt(row); - auto size = prepareInput(value, needNormalizes[row]); - if (auto error = parse(size, needNormalizes[row])) { - if (!nullOnError) { + try { + auto value = flatInput->valueAt(row); + auto size = prepareInput(value, needNormalizes[row]); + if (auto error = parse(size, needNormalizes[row])) { context.setVeloxExceptionError(row, errors_[error]); - } else { - bits::setNull(rawNullsOnErrors, row, true); + clearState(); + return; + } + auto outputSize = concatViews(views_, output); + rawStringViews[row] = StringView(output, outputSize); + if (!StringView::isInline(outputSize)) { + output += outputSize; } + } catch (const VeloxException& e) { clearState(); - return; - } - auto outputSize = concatViews(views_, output); - rawStringViews[row] = StringView(output, outputSize); - if (!StringView::isInline(outputSize)) { - output += outputSize; - } - } catch (const VeloxException& e) { - clearState(); - if (!e.isUserError()) { - throw; - } - if (!nullOnError) { + if (!e.isUserError()) { + throw; + } + context.setVeloxExceptionError(row, std::current_exception()); - } else { - bits::setNull(rawNullsOnErrors, row, true); + + FB_LOG_EVERY_MS(WARNING, 1000) + << "Caught user error in json_parse: " << e.message(); } - FB_LOG_EVERY_MS(WARNING, 1000) - << "Caught user error in json_parse: " << e.message(); - } - }); + }); + + localResult = std::make_shared>( + context.pool(), + JSON(), + nullptr, + rows.end(), + stringViews, + std::vector{buffer}); + } - localResult = std::make_shared>( - context.pool(), - JSON(), - nullOnError ? nullsOnErrors : nullptr, - rows.end(), - stringViews, - std::vector{buffer}); + context.moveOrCopyResult(localResult, rows, result); + } + + static std::vector> signatures() { + // varchar -> json + return {exec::FunctionSignatureBuilder() + .returnType("json") + .argumentType("varchar") + .build()}; } private: @@ -510,302 +498,6 @@ class JsonParseImpl { mutable std::vector fastSortKeys_; }; -class JsonParseFunction : public exec::VectorFunction { - public: - void apply( - const SelectivityVector& rows, - std::vector& args, - const TypePtr& outputType, - exec::EvalCtx& context, - VectorPtr& result) const override { - // Input can be constant or flat. - assert(args.size() > 0); - const auto& arg = args[0]; - VectorPtr localResult; - parser.apply( - rows, arg, outputType, context, localResult, false /* nullOnError */); - context.moveOrCopyResult(localResult, rows, result); - } - - static std::vector> signatures() { - // varchar -> json - return {exec::FunctionSignatureBuilder() - .returnType("json") - .argumentType("varchar") - .build()}; - } - - private: - JsonParseImpl parser; -}; - -class JsonExtractFunction : public exec::VectorFunction { - public: - JsonExtractFunction(bool extractScalarOnly) - : extractScalarOnly_(extractScalarOnly) {} - - void apply( - const SelectivityVector& rows, - std::vector& args, - const TypePtr& outputType, - exec::EvalCtx& context, - VectorPtr& result) const override { - VELOX_CHECK_EQ(args.size(), 2); - VectorPtr jsonInput = args[0]; - if (jsonInput->type() != JSON()) { - VELOX_CHECK_EQ(args[0]->type(), VARCHAR()); - VectorPtr parsedJson; - parser_.apply( - rows, jsonInput, JSON(), context, parsedJson, true /* nullOnError */); - jsonInput = parsedJson; - } - VectorPtr localResult; - applyImpl(rows, jsonInput, args[1], outputType, context, localResult); - context.moveOrCopyResult(localResult, rows, result); - } - - static std::vector> signatures( - bool extractScalarOnly) { - if (extractScalarOnly) { - return { - exec::FunctionSignatureBuilder() - .returnType("varchar") - .argumentType("json") - .argumentType("varchar") - .build(), - exec::FunctionSignatureBuilder() - .returnType("varchar") - .argumentType("varchar") - .argumentType("varchar") - .build()}; - } - return { - exec::FunctionSignatureBuilder() - .returnType("json") - .argumentType("json") - .argumentType("varchar") - .build(), - exec::FunctionSignatureBuilder() - .returnType("json") - .argumentType("varchar") - .argumentType("varchar") - .build()}; - } - - private: - void applyImpl( - const SelectivityVector& rows, - const VectorPtr& json, - const VectorPtr& path, - const TypePtr& outputType, - exec::EvalCtx& context, - VectorPtr& localResult) const { - VELOX_CHECK_EQ(json->type(), JSON()); - - if (json->isConstantEncoding() && path->isConstantEncoding()) { - bool nullResult = false; - std::string output; - if (json->as>()->isNullAt(0) || - path->as>()->isNullAt(0)) { - nullResult = true; - } else { - auto jsonValue = json->as>()->valueAt(0); - auto pathValue = path->as>()->valueAt(0); - try { - if (extractScalarOnly_) { - nullResult = processJsonExtractScalar( - jsonValue, pathValue, output) != simdjson::SUCCESS; - } else { - nullResult = processJsonExtract(jsonValue, pathValue, output) != - simdjson::SUCCESS; - } - } catch (const VeloxException& e) { - if (!e.isUserError()) { - throw; - } - nullResult = true; - context.setErrors(rows, std::current_exception()); - } - } - - if (nullResult) { - localResult = BaseVector::createNullConstant( - outputType, rows.end(), context.pool()); - } else { - localResult = BaseVector::createConstant( - outputType, output, rows.end(), context.pool()); - } - return; - } - localResult = context.getVector(outputType, rows.end()); - auto flatResult = localResult->asFlatVector(); - VELOX_CHECK_NOT_NULL(flatResult); - exec::LocalDecodedVector decodedJson(context, *json, rows); - exec::LocalDecodedVector decodedPath(context, *path, rows); - exec::VectorWriter resultWriter; - resultWriter.init(*flatResult); - - if (extractScalarOnly_) { - context.applyToSelectedNoThrow(rows, [&](auto row) { - VELOX_DCHECK(!decodedPath->isNullAt(row)); - resultWriter.setOffset(row); - std::string output; - if (!decodedJson->isNullAt(row) && - processJsonExtractScalar( - decodedJson->valueAt(row), - decodedPath->valueAt(row), - output) == simdjson::SUCCESS) { - resultWriter.current() = output; - resultWriter.commit(true); - } else { - resultWriter.commit(false); - } - }); - } else { - context.applyToSelectedNoThrow(rows, [&](auto row) { - VELOX_DCHECK(!decodedPath->isNullAt(row)); - resultWriter.setOffset(row); - std::string output; - if (!decodedJson->isNullAt(row) && - processJsonExtract( - decodedJson->valueAt(row), - decodedPath->valueAt(row), - output) == simdjson::SUCCESS) { - resultWriter.current() = output; - resultWriter.commit(true); - } else { - resultWriter.commit(false); - } - }); - } - resultWriter.finish(); - } - - FOLLY_ALWAYS_INLINE simdjson::error_code processJsonExtract( - const StringView& json, - const StringView& jsonPath, - std::string& output) const { - static constexpr std::string_view kNullString{"null"}; - static constexpr std::string_view emptyArrayString{"[]"}; - std::vector results; - auto consumer = [&results](auto& v) { - // We could just convert v to a string using to_json_string directly, but - // in that case the JSON wouldn't be parsed (it would just return the - // contents directly) and we might miss invalid JSON. - SIMDJSON_ASSIGN_OR_RAISE(auto vtype, v.type()); - switch (vtype) { - case simdjson::ondemand::json_type::object: { - SIMDJSON_ASSIGN_OR_RAISE( - auto jsonStr, simdjson::to_json_string(v.get_object())); - results.push_back(std::move(jsonStr)); - break; - } - case simdjson::ondemand::json_type::array: { - SIMDJSON_ASSIGN_OR_RAISE( - auto jsonStr, simdjson::to_json_string(v.get_array())); - results.push_back(std::move(jsonStr)); - break; - } - case simdjson::ondemand::json_type::string: - case simdjson::ondemand::json_type::number: - case simdjson::ondemand::json_type::boolean: { - SIMDJSON_ASSIGN_OR_RAISE(auto jsonStr, simdjson::to_json_string(v)); - results.push_back(std::move(jsonStr)); - break; - } - case simdjson::ondemand::json_type::null: - results.push_back(kNullString); - break; - } - return simdjson::SUCCESS; - }; - - auto& extractor = SIMDJsonExtractor::getInstance(jsonPath); - bool isDefinitePath = true; - simdjson::padded_string paddedJson(json.data(), json.size()); - SIMDJSON_TRY(extractor.extract(paddedJson, consumer, isDefinitePath)); - - if (results.size() == 0) { - if (isDefinitePath) { - // If the path didn't map to anything in the JSON object, return null. - return simdjson::NO_SUCH_FIELD; - } - output = emptyArrayString; - return simdjson::SUCCESS; - } - std::stringstream ss; - if (!isDefinitePath) { - ss << "["; - } - for (int i = 0; i < results.size(); i++) { - if (i > 0) { - ss << ","; - } - ss << results[i]; - } - if (!isDefinitePath) { - ss << "]"; - } - output = ss.str(); - return simdjson::SUCCESS; - } - - FOLLY_ALWAYS_INLINE simdjson::error_code processJsonExtractScalar( - const StringView& json, - const StringView& jsonPath, - std::string& output) const { - bool resultPopulated = false; - std::optional resultStr; - auto consumer = [&resultStr, &resultPopulated](auto& v) { - if (resultPopulated) { - // We should just get a single value, if we see multiple, it's an error - // and we should return null. - resultStr = std::nullopt; - return simdjson::SUCCESS; - } - - resultPopulated = true; - - SIMDJSON_ASSIGN_OR_RAISE(auto vtype, v.type()); - switch (vtype) { - case simdjson::ondemand::json_type::boolean: { - SIMDJSON_ASSIGN_OR_RAISE(bool vbool, v.get_bool()); - resultStr = vbool ? "true" : "false"; - break; - } - case simdjson::ondemand::json_type::string: { - SIMDJSON_ASSIGN_OR_RAISE(resultStr, v.get_string()); - break; - } - case simdjson::ondemand::json_type::object: - case simdjson::ondemand::json_type::array: - case simdjson::ondemand::json_type::null: - // Do nothing. - break; - default: { - SIMDJSON_ASSIGN_OR_RAISE(resultStr, simdjson::to_json_string(v)); - } - } - return simdjson::SUCCESS; - }; - - auto& extractor = SIMDJsonExtractor::getInstance(jsonPath); - bool isDefinitePath = true; - simdjson::padded_string paddedJson(json.data(), json.size()); - SIMDJSON_TRY(extractor.extract(paddedJson, consumer, isDefinitePath)); - - if (resultStr.has_value()) { - output = std::move(resultStr.value()); - return simdjson::SUCCESS; - } else { - return simdjson::NO_SUCH_FIELD; - } - } - - bool extractScalarOnly_{false}; - JsonParseImpl parser_; -}; - // This function is called when $internal$json_string_to_array/map/row // is called. It is used for expressions like 'Cast(json_parse(x) as // ARRAY<...>)' etc. This is an optimization to avoid parsing the json string @@ -856,39 +548,6 @@ VELOX_DECLARE_VECTOR_FUNCTION( JsonFormatFunction::signatures(), std::make_unique()); -VELOX_DECLARE_STATEFUL_VECTOR_FUNCTION( - udf_json_extract_scalar, - JsonExtractFunction::signatures(true), - [](const std::string& /*name*/, - const std::vector&, - const velox::core::QueryConfig&) { - return std::make_shared(true); - }); - -// Only used internally at Meta. -VELOX_DECLARE_STATEFUL_VECTOR_FUNCTION( - udf_json_extract_scalar_varchar_only, - (std::vector>{ - facebook::velox::exec::FunctionSignatureBuilder() - .returnType("varchar") - .argumentType("varchar") - .argumentType("varchar") - .build()}), - [](const std::string& /*name*/, - const std::vector&, - const velox::core::QueryConfig&) { - return std::make_shared(true); - }); - -VELOX_DECLARE_STATEFUL_VECTOR_FUNCTION( - udf_json_extract, - JsonExtractFunction::signatures(false), - [](const std::string& /*name*/, - const std::vector&, - const velox::core::QueryConfig&) { - return std::make_shared(false); - }); - VELOX_DECLARE_STATEFUL_VECTOR_FUNCTION( udf_json_parse, JsonParseFunction::signatures(), diff --git a/velox/functions/prestosql/JsonFunctions.h b/velox/functions/prestosql/JsonFunctions.h index 5e5de3b11609..be951943d798 100644 --- a/velox/functions/prestosql/JsonFunctions.h +++ b/velox/functions/prestosql/JsonFunctions.h @@ -169,6 +169,155 @@ struct JsonArrayGetFunction { } }; +// jsonExtractScalar(json, json_path) -> varchar +// Like jsonExtract(), but returns the result value as a string (as opposed +// to being encoded as JSON). The value referenced by json_path must be a scalar +// (boolean, number or string) +template +struct JsonExtractScalarFunction { + VELOX_DEFINE_FUNCTION_TYPES(T); + + FOLLY_ALWAYS_INLINE bool call( + out_type& result, + const arg_type& json, + const arg_type& jsonPath) { + return callImpl(result, json, jsonPath) == simdjson::SUCCESS; + } + + private: + FOLLY_ALWAYS_INLINE bool callImpl( + out_type& result, + const arg_type& json, + const arg_type& jsonPath) { + bool resultPopulated = false; + std::optional resultStr; + auto consumer = [&resultStr, &resultPopulated](auto& v) { + if (resultPopulated) { + // We should just get a single value, if we see multiple, it's an error + // and we should return null. + resultStr = std::nullopt; + return simdjson::SUCCESS; + } + + resultPopulated = true; + + SIMDJSON_ASSIGN_OR_RAISE(auto vtype, v.type()); + switch (vtype) { + case simdjson::ondemand::json_type::boolean: { + SIMDJSON_ASSIGN_OR_RAISE(bool vbool, v.get_bool()); + resultStr = vbool ? "true" : "false"; + break; + } + case simdjson::ondemand::json_type::string: { + SIMDJSON_ASSIGN_OR_RAISE(resultStr, v.get_string()); + break; + } + case simdjson::ondemand::json_type::object: + case simdjson::ondemand::json_type::array: + case simdjson::ondemand::json_type::null: + // Do nothing. + break; + default: { + SIMDJSON_ASSIGN_OR_RAISE(resultStr, simdjson::to_json_string(v)); + } + } + return simdjson::SUCCESS; + }; + + auto& extractor = SIMDJsonExtractor::getInstance(jsonPath); + bool isDefinitePath = true; + SIMDJSON_TRY(extractor.extract(json, consumer, isDefinitePath)); + + if (resultStr.has_value()) { + result.copy_from(*resultStr); + return simdjson::SUCCESS; + } else { + return simdjson::NO_SUCH_FIELD; + } + } +}; + +template +struct JsonExtractFunction { + VELOX_DEFINE_FUNCTION_TYPES(T); + + bool call( + out_type& result, + const arg_type& json, + const arg_type& jsonPath) { + return callImpl(result, json, jsonPath) == simdjson::SUCCESS; + } + + private: + simdjson::error_code callImpl( + out_type& result, + const arg_type& json, + const arg_type& jsonPath) { + static constexpr std::string_view kNullString{"null"}; + std::string results; + size_t resultSize = 0; + auto consumer = [&results, &resultSize](auto& v) { + // Add the separator for the JSON array. + if (resultSize++ > 0) { + results += ","; + } + // We could just convert v to a string using to_json_string directly, but + // in that case the JSON wouldn't be parsed (it would just return the + // contents directly) and we might miss invalid JSON. + SIMDJSON_ASSIGN_OR_RAISE(auto vtype, v.type()); + switch (vtype) { + case simdjson::ondemand::json_type::object: { + SIMDJSON_ASSIGN_OR_RAISE( + auto jsonStr, simdjson::to_json_string(v.get_object())); + results += jsonStr; + break; + } + case simdjson::ondemand::json_type::array: { + SIMDJSON_ASSIGN_OR_RAISE( + auto jsonStr, simdjson::to_json_string(v.get_array())); + results += jsonStr; + break; + } + case simdjson::ondemand::json_type::string: + case simdjson::ondemand::json_type::number: + case simdjson::ondemand::json_type::boolean: { + SIMDJSON_ASSIGN_OR_RAISE(auto jsonStr, simdjson::to_json_string(v)); + results += jsonStr; + break; + } + case simdjson::ondemand::json_type::null: + results += kNullString; + break; + } + return simdjson::SUCCESS; + }; + + auto& extractor = SIMDJsonExtractor::getInstance(jsonPath); + bool isDefinitePath = true; + SIMDJSON_TRY(extractor.extract(json, consumer, isDefinitePath)); + + if (resultSize == 0) { + if (isDefinitePath) { + // If the path didn't map to anything in the JSON object, return null. + return simdjson::NO_SUCH_FIELD; + } + + result.copy_from("[]"); + } else if (resultSize == 1 && isDefinitePath) { + // If there was only one value mapped to by the path, don't wrap it in an + // array. + result.copy_from(results); + } else { + // Add the square brackets to make it a valid JSON array. + result.reserve(2 + results.size()); + result.append("["); + result.append(results); + result.append("]"); + } + return simdjson::SUCCESS; + } +}; + template struct JsonSizeFunction { VELOX_DEFINE_FUNCTION_TYPES(T); @@ -217,8 +366,7 @@ struct JsonSizeFunction { auto& extractor = SIMDJsonExtractor::getInstance(jsonPath); bool isDefinitePath = true; - simdjson::padded_string paddedJson(json.data(), json.size()); - SIMDJSON_TRY(extractor.extract(paddedJson, consumer, isDefinitePath)); + SIMDJSON_TRY(extractor.extract(json, consumer, isDefinitePath)); if (resultCount == 0) { // If the path didn't map to anything in the JSON object, return null. diff --git a/velox/functions/prestosql/benchmarks/JsonExprBenchmark.cpp b/velox/functions/prestosql/benchmarks/JsonExprBenchmark.cpp index b766aa02726d..f05b2456a099 100644 --- a/velox/functions/prestosql/benchmarks/JsonExprBenchmark.cpp +++ b/velox/functions/prestosql/benchmarks/JsonExprBenchmark.cpp @@ -26,14 +26,6 @@ #include "velox/functions/prestosql/types/JsonRegistration.h" #include "velox/functions/prestosql/types/JsonType.h" -namespace facebook::velox::functions { -void registerJsonVectorFunctions() { - VELOX_REGISTER_VECTOR_FUNCTION( - udf_json_extract_scalar, "json_extract_scalar"); - VELOX_REGISTER_VECTOR_FUNCTION(udf_json_extract, "json_extract"); -} -} // namespace facebook::velox::functions - namespace facebook::velox::functions::prestosql { namespace { @@ -196,14 +188,17 @@ class JsonBenchmark : public velox::functions::test::FunctionBenchmarkBase { {"json_array_length"}); registerFunction( {"folly_json_array_length"}); + registerFunction( + {"json_extract_scalar"}); registerFunction( {"folly_json_extract_scalar"}); + registerFunction( + {"json_extract"}); registerFunction( {"folly_json_extract"}); registerFunction({"json_size"}); registerFunction( {"folly_json_size"}); - registerJsonVectorFunctions(); } std::string prepareData(int jsonSize) { diff --git a/velox/functions/prestosql/json/SIMDJsonExtractor.h b/velox/functions/prestosql/json/SIMDJsonExtractor.h index 7ebca1a18894..e45bc156eb2e 100644 --- a/velox/functions/prestosql/json/SIMDJsonExtractor.h +++ b/velox/functions/prestosql/json/SIMDJsonExtractor.h @@ -31,7 +31,7 @@ class SIMDJsonExtractor { public: /** * Extract element(s) from a JSON object using the given path. - * @param json: A json string of type simdjson::padded_string + * @param json: A JSON object * @param path: Path to locate a JSON object. Following operators are * supported. * "$" Root member of a JSON structure no matter if it's an @@ -43,12 +43,10 @@ class SIMDJsonExtractor { * @param consumer: Function to consume the extracted elements. Should be able * to take an argument that can either be a * simdjson::ondemand::document or a - * simdjson::ondemand::value. Note: If the consumer holds - * onto string_view(s) generated from applying - * simdjson::to_json_string to the arguments, then those - * string_view(s) will reference the original `json` - * padded_string and remain valid as long as it remains - * in scope. + * simdjson::ondemand::value. Note that once consumer + * returns, it should be assumed that the argument passed in + * is no longer valid, so do not attempt to store it as is + * in the consumer. * @param isDefinitePath is an output param that will get set to * false if a token is evaluated which can return * multiple results like '*'. @@ -57,7 +55,7 @@ class SIMDJsonExtractor { */ template simdjson::error_code extract( - const simdjson::padded_string& json, + const velox::StringView& json, TConsumer& consumer, bool& isDefinitePath); @@ -106,9 +104,10 @@ simdjson::error_code extractArray( template simdjson::error_code SIMDJsonExtractor::extract( - const simdjson::padded_string& paddedJson, + const velox::StringView& json, TConsumer& consumer, bool& isDefinitePath) { + simdjson::padded_string paddedJson(json.data(), json.size()); SIMDJSON_ASSIGN_OR_RAISE(auto jsonDoc, simdjsonParse(paddedJson)); SIMDJSON_ASSIGN_OR_RAISE(auto isScalar, jsonDoc.is_scalar()); if (isScalar) { diff --git a/velox/functions/prestosql/json/tests/SIMDJsonExtractorTest.cpp b/velox/functions/prestosql/json/tests/SIMDJsonExtractorTest.cpp index 7a1a33dce68f..1a5d0000269c 100644 --- a/velox/functions/prestosql/json/tests/SIMDJsonExtractorTest.cpp +++ b/velox/functions/prestosql/json/tests/SIMDJsonExtractorTest.cpp @@ -32,9 +32,10 @@ simdjson::error_code simdJsonExtract( TConsumer&& consumer) { auto& extractor = SIMDJsonExtractor::getInstance(path); bool isDefinitePath = true; - simdjson::padded_string paddedJson(json.data(), json.size()); return extractor.extract( - paddedJson, std::forward(consumer), isDefinitePath); + velox::StringView(json), + std::forward(consumer), + isDefinitePath); } class SIMDJsonExtractorTest : public testing::Test { diff --git a/velox/functions/prestosql/registration/JsonFunctionsRegistration.cpp b/velox/functions/prestosql/registration/JsonFunctionsRegistration.cpp index 16fccaaef4f5..23ba1b1d195f 100644 --- a/velox/functions/prestosql/registration/JsonFunctionsRegistration.cpp +++ b/velox/functions/prestosql/registration/JsonFunctionsRegistration.cpp @@ -27,6 +27,16 @@ void registerJsonFunctions(const std::string& prefix) { registerFunction( {prefix + "is_json_scalar"}); + registerFunction( + {prefix + "json_extract_scalar"}); + registerFunction( + {prefix + "json_extract_scalar"}); + + registerFunction( + {prefix + "json_extract"}); + registerFunction( + {prefix + "json_extract"}); + registerFunction( {prefix + "json_array_length"}); registerFunction( @@ -59,11 +69,6 @@ void registerJsonFunctions(const std::string& prefix) { registerFunction( {prefix + "json_size"}); - VELOX_REGISTER_VECTOR_FUNCTION(udf_json_extract, prefix + "json_extract"); - - VELOX_REGISTER_VECTOR_FUNCTION( - udf_json_extract_scalar, prefix + "json_extract_scalar"); - VELOX_REGISTER_VECTOR_FUNCTION(udf_json_format, prefix + "json_format"); VELOX_REGISTER_VECTOR_FUNCTION(udf_json_parse, prefix + "json_parse"); diff --git a/velox/functions/prestosql/tests/JsonFunctionsTest.cpp b/velox/functions/prestosql/tests/JsonFunctionsTest.cpp index 2ecc631f9380..134942e26ba1 100644 --- a/velox/functions/prestosql/tests/JsonFunctionsTest.cpp +++ b/velox/functions/prestosql/tests/JsonFunctionsTest.cpp @@ -162,27 +162,6 @@ class JsonFunctionsTest : public functions::test::FunctionBaseTest { exprSet.eval(rows, evalCtx, result); velox::test::assertEqualVectors(expected, result[0]); }; - - // Utility function to evaluate json_extract both with and without constant - // inputs. Ensures that the results are same in both cases. 'wrapInTry' is - // used to test the cases where json_extract throws a user error and verify - // that they are captured by the TRY operator. The inputs are expected to have - // only one row. - std::optional - jsonExtract(VectorPtr json, VectorPtr path, bool wrapInTry = false) { - std::string expr = - !wrapInTry ? "json_extract(c0, c1)" : "try(json_extract(c0, c1))"; - - auto result = evaluateOnce(expr, makeRowVector({json, path})); - auto resultConstantInput = evaluateOnce( - expr, - makeRowVector( - {BaseVector::wrapInConstant(1, 0, json), - BaseVector::wrapInConstant(1, 0, path)})); - EXPECT_EQ(result, resultConstantInput) - << "Equal results expected for constant and non constant inputs"; - return result; - } }; TEST_F(JsonFunctionsTest, jsonFormat) { @@ -953,11 +932,11 @@ TEST_F(JsonFunctionsTest, invalidPath) { TEST_F(JsonFunctionsTest, jsonExtract) { auto jsonExtract = [&](std::optional json, - const std::string& path, - bool wrapInTry = false) { - auto jsonInput = makeJsonVector(json); - auto pathInput = makeFlatVector({path}); - return JsonFunctionsTest::jsonExtract(jsonInput, pathInput, wrapInTry); + const std::string& path) { + return evaluateOnce( + "json_extract(c0, c1)", + makeRowVector( + {makeJsonVector(json), makeFlatVector({path})})); }; EXPECT_EQ( @@ -1046,50 +1025,6 @@ TEST_F(JsonFunctionsTest, jsonExtract) { VELOX_ASSERT_THROW( jsonExtract(kJson, "concat($..category)"), "Invalid JSON path"); VELOX_ASSERT_THROW(jsonExtract(kJson, "$.store.keys()"), "Invalid JSON path"); - - // Ensure User errors are captured in try() and not thrown. - EXPECT_EQ(std::nullopt, jsonExtract(kJson, "$..price", true)); - EXPECT_EQ( - std::nullopt, - jsonExtract(kJson, "$.store.book[?(@.price <10)].title", true)); - EXPECT_EQ(std::nullopt, jsonExtract(kJson, "max($..price)", true)); - EXPECT_EQ(std::nullopt, jsonExtract(kJson, "concat($..category)", true)); - EXPECT_EQ(std::nullopt, jsonExtract(kJson, "$.store.keys()", true)); -} - -TEST_F(JsonFunctionsTest, jsonExtractVarcharInput) { - auto jsonExtract = [&](std::optional json, - const std::string& path, - bool wrapInTry = false) { - std::optional s = json.has_value() - ? std::make_optional(StringView(json.value())) - : std::nullopt; - auto varcharInput = makeNullableFlatVector({s}, VARCHAR()); - auto pathInput = makeFlatVector({path}); - return JsonFunctionsTest::jsonExtract(varcharInput, pathInput, wrapInTry); - }; - - // Valid json - EXPECT_EQ( - R"({"x":{"a":1,"b":2}})", - jsonExtract(R"({"x": {"a" : 1, "b" : 2} })", "$")); - EXPECT_EQ( - R"({"a":1,"b":2})", jsonExtract(R"({"x": {"a" : 1, "b" : 2} })", "$.x")); - - // Invalid JSON - EXPECT_EQ(std::nullopt, jsonExtract(R"({"x": {"a" : 1, "b" : "2""} })", "$")); - // Non-canonicalized json - EXPECT_EQ( - R"({"x":{"a":1,"b":2}})", - jsonExtract(R"({"x": {"b" : 2, "a" : 1} })", "$")); - // Input has escape characters - EXPECT_EQ( - R"({"x":{"a":"/1","b":"/2"}})", - jsonExtract(R"({"x": {"a" : "\/1", "b" : "\/2"} })", "$")); - // Invalid path - VELOX_ASSERT_THROW(jsonExtract(kJson, "$..price"), "Invalid JSON path"); - // Ensure User error is captured in try() and not thrown. - EXPECT_EQ(std::nullopt, jsonExtract(kJson, "$..price", true)); } // The following tests ensure that the internal json functions