From 737c9d5071d36a82239c8218b1a201d1ee49a354 Mon Sep 17 00:00:00 2001 From: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> Date: Fri, 28 Feb 2025 11:49:11 -0500 Subject: [PATCH] feat!: Add support for hierarchical and auto-generated keys for log filtering. (#62) Co-authored-by: davemarco <83603688+davemarco@users.noreply.github.com> --- src/clp_ffi_js/ir/StreamReader.cpp | 3 +- .../ir/StructuredIrStreamReader.cpp | 43 +++- src/clp_ffi_js/ir/StructuredIrUnitHandler.cpp | 239 +++++++++++++----- src/clp_ffi_js/ir/StructuredIrUnitHandler.hpp | 117 ++++++--- src/submodules/clp | 2 +- 5 files changed, 306 insertions(+), 98 deletions(-) diff --git a/src/clp_ffi_js/ir/StreamReader.cpp b/src/clp_ffi_js/ir/StreamReader.cpp index 0359000f..7b30ecb6 100644 --- a/src/clp_ffi_js/ir/StreamReader.cpp +++ b/src/clp_ffi_js/ir/StreamReader.cpp @@ -118,7 +118,8 @@ EMSCRIPTEN_BINDINGS(ClpStreamReader) { emscripten::register_type("Uint8Array"); emscripten::register_type("number[] | null"); emscripten::register_type( - "{logLevelKey: string, timestampKey: string} | null" + "{logLevelKey: {isAutoGenerated: boolean; parts: string[];} | null;" + " timestampKey: {isAutoGenerated: boolean; parts: string[];} | null;}" ); // JS types used as outputs diff --git a/src/clp_ffi_js/ir/StructuredIrStreamReader.cpp b/src/clp_ffi_js/ir/StructuredIrStreamReader.cpp index 15735437..3fdb4db8 100644 --- a/src/clp_ffi_js/ir/StructuredIrStreamReader.cpp +++ b/src/clp_ffi_js/ir/StructuredIrStreamReader.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -11,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -27,6 +29,8 @@ namespace clp_ffi_js::ir { namespace { constexpr std::string_view cEmptyJsonStr{"{}"}; +constexpr std::string_view cFilterOptionIsAutoGeneratedKey{"isAutoGenerated"}; +constexpr std::string_view cFilterOptionPartsKey{"parts"}; constexpr std::string_view cReaderOptionsLogLevelKey{"logLevelKey"}; constexpr std::string_view cReaderOptionsTimestampKey{"timestampKey"}; constexpr std::string_view cMergedKvPairsAutoGeneratedKey{"auto-generated"}; @@ -39,12 +43,39 @@ constexpr std::string_view cMergedKvPairsUserGeneratedKey{"user-generated"}; * @param json * @return Serialized JSON. */ -auto dump_json_with_replace(nlohmann::json const& json) -> std::string; +[[nodiscard]] auto dump_json_with_replace(nlohmann::json const& json) -> std::string; + +/** + * @param filter_option The JavaScript object representing a filter option. + * @param leaf_node_type The type of the leaf node for the schema tree full branch. + * @return A schema tree full branch constructed based on the provided filter option and leaf node + * type. + * @return A schema tree full branch constructed based on the provided reader option. + * @return std::nullopt if `option` is `null`. + */ +[[nodiscard]] auto get_schema_tree_full_branch_from_filter_option( + emscripten::val const& filter_option, + clp::ffi::SchemaTree::Node::Type leaf_node_type +) -> std::optional; auto dump_json_with_replace(nlohmann::json const& json) -> std::string { return json.dump(-1, ' ', false, nlohmann::json::error_handler_t::replace); } +auto get_schema_tree_full_branch_from_filter_option( + emscripten::val const& filter_option, + clp::ffi::SchemaTree::Node::Type leaf_node_type +) -> std::optional { + if (filter_option.isNull() || filter_option.isUndefined()) { + return std::nullopt; + } + return StructuredIrUnitHandler::SchemaTreeFullBranch{ + filter_option[cFilterOptionIsAutoGeneratedKey.data()].as(), + emscripten::vecFromJSArray(filter_option[cFilterOptionPartsKey.data()]), + leaf_node_type + }; +} + EMSCRIPTEN_BINDINGS(ClpStructuredIrStreamReader) { emscripten::constant( "MERGED_KV_PAIRS_AUTO_GENERATED_KEY", @@ -67,8 +98,14 @@ auto StructuredIrStreamReader::create( *zstd_decompressor, StructuredIrUnitHandler{ deserialized_log_events, - reader_options[cReaderOptionsLogLevelKey.data()].as(), - reader_options[cReaderOptionsTimestampKey.data()].as() + get_schema_tree_full_branch_from_filter_option( + reader_options[cReaderOptionsLogLevelKey.data()], + clp::ffi::SchemaTree::Node::Type::Str + ), + get_schema_tree_full_branch_from_filter_option( + reader_options[cReaderOptionsTimestampKey.data()], + clp::ffi::SchemaTree::Node::Type::Int + ) } )}; if (result.has_error()) { diff --git a/src/clp_ffi_js/ir/StructuredIrUnitHandler.cpp b/src/clp_ffi_js/ir/StructuredIrUnitHandler.cpp index 16de62d5..df93388d 100644 --- a/src/clp_ffi_js/ir/StructuredIrUnitHandler.cpp +++ b/src/clp_ffi_js/ir/StructuredIrUnitHandler.cpp @@ -5,15 +5,16 @@ #include #include #include +#include #include #include #include #include #include -#include #include #include +#include #include #include #include @@ -28,11 +29,23 @@ namespace { * Parses a string to determine the corresponding `LogLevel` enum value. * @param str * @return `LogLevel` enum corresponding to `str` if `str` matches a string in `cLogLevelNames`. - * @return `LogLevel::NONE` otherwise. + * @return std::nullopt otherwise. */ -auto parse_log_level(std::string_view str) -> LogLevel; +[[nodiscard]] auto parse_log_level(std::string_view str) -> std::optional; -auto parse_log_level(std::string_view str) -> LogLevel { +/** + * Parses the log level from the given value. + * @param value + * @return The parsed log level forwarded from `parse_log_level`. + * @return std::nullopt on failures: + * - The given value's type cannot be decoded as a string. + * - Forwards `clp::ir::EncodedTextAst::decode_and_unparse`'s return values. + * - Forwards `parse_log_level`'s return values. + */ +[[nodiscard]] auto parse_log_level_from_value(clp::ffi::Value const& value +) -> std::optional; + +auto parse_log_level(std::string_view str) -> std::optional { // Convert the string to uppercase. std::string log_level_name_upper_case{str}; std::ranges::transform( @@ -48,18 +61,78 @@ auto parse_log_level(std::string_view str) -> LogLevel { log_level_name_upper_case ); if (it == cLogLevelNames.end()) { - return LogLevel::NONE; + return std::nullopt; } return static_cast(std::distance(cLogLevelNames.begin(), it)); } + +auto parse_log_level_from_value(clp::ffi::Value const& value) -> std::optional { + if (value.is()) { + return parse_log_level(value.get_immutable_view()); + } + + if (value.is()) { + auto const optional_log_level + = value.get_immutable_view().decode_and_unparse(); + if (false == optional_log_level.has_value()) { + return std::nullopt; + } + return parse_log_level(optional_log_level.value()); + } + + if (value.is()) { + auto const optional_log_level + = value.get_immutable_view().decode_and_unparse(); + if (false == optional_log_level.has_value()) { + return std::nullopt; + } + return parse_log_level(optional_log_level.value()); + } + + SPDLOG_ERROR("Protocol Error: The log level value must be a valid string-convertible type."); + return std::nullopt; +} } // namespace +auto StructuredIrUnitHandler::SchemaTreeFullBranch::match( + clp::ffi::SchemaTree const& schema_tree, + clp::ffi::SchemaTree::NodeLocator const& leaf_locator +) const -> bool { + if (leaf_locator.get_type() != m_leaf_type) { + return false; + } + + auto const optional_node_id{schema_tree.try_get_node_id(leaf_locator)}; + if (false == optional_node_id.has_value()) { + return false; + } + auto node_id{optional_node_id.value()}; + for (auto const& key : m_leaf_to_root_path) { + auto const& node{schema_tree.get_node(node_id)}; + if (node.is_root()) { + // Reaching root before matching all the keys in the expected path. + return false; + } + if (node.get_key_name() != key) { + return false; + } + node_id = node.get_parent_id_unsafe(); + } + + if (false == schema_tree.get_node(node_id).is_root()) { + // The root is not reached yet. + // The expected leaf-to-root path only matches the bottom branch from the leaf. + return false; + } + + return true; +} + auto StructuredIrUnitHandler::handle_log_event(StructuredLogEvent&& log_event ) -> clp::ffi::ir_stream::IRErrorCode { - auto const& id_value_pairs{log_event.get_user_gen_node_id_value_pairs()}; - auto const timestamp = get_timestamp(id_value_pairs); - auto const log_level = get_log_level(id_value_pairs); + auto const timestamp = get_timestamp(log_event); + auto const log_level = get_log_level(log_event); m_deserialized_log_events->emplace_back(std::move(log_event), log_level, timestamp); @@ -76,91 +149,129 @@ auto StructuredIrUnitHandler::handle_utc_offset_change( auto StructuredIrUnitHandler::handle_schema_tree_node_insertion( bool is_auto_generated, - clp::ffi::SchemaTree::NodeLocator schema_tree_node_locator + clp::ffi::SchemaTree::NodeLocator schema_tree_node_locator, + std::shared_ptr const& schema_tree ) -> clp::ffi::ir_stream::IRErrorCode { - if (is_auto_generated) { - // TODO: Currently, all auto-generated keys are ignored. - return clp::ffi::ir_stream::IRErrorCode::IRErrorCode_Success; + auto const optional_inserted_node_id{schema_tree->try_get_node_id(schema_tree_node_locator)}; + if (false == optional_inserted_node_id.has_value()) { + return clp::ffi::ir_stream::IRErrorCode_Corrupted_IR; } + auto const inserted_node_id{optional_inserted_node_id.value()}; - ++m_current_node_id; + if (false == m_optional_log_level_node_id.has_value() + && m_optional_log_level_full_branch.has_value() + && is_auto_generated == m_optional_log_level_full_branch->is_auto_generated()) + { + if (m_optional_log_level_full_branch->match(*schema_tree, schema_tree_node_locator)) { + m_optional_log_level_node_id.emplace(inserted_node_id); + } + } - auto const& key_name{schema_tree_node_locator.get_key_name()}; - if (key_name == m_log_level_key) { - m_log_level_node_id.emplace(m_current_node_id); - } else if (key_name == m_timestamp_key) { - m_timestamp_node_id.emplace(m_current_node_id); + if (false == m_optional_timestamp_node_id.has_value() + && m_optional_timestamp_full_branch.has_value() + && is_auto_generated == m_optional_timestamp_full_branch->is_auto_generated()) + { + if (m_optional_timestamp_full_branch->match(*schema_tree, schema_tree_node_locator)) { + m_optional_timestamp_node_id.emplace(inserted_node_id); + } } return clp::ffi::ir_stream::IRErrorCode::IRErrorCode_Success; } -auto StructuredIrUnitHandler::handle_end_of_stream() -> clp::ffi::ir_stream::IRErrorCode { +auto StructuredIrUnitHandler::handle_end_of_stream() const -> clp::ffi::ir_stream::IRErrorCode { + if (m_optional_log_level_full_branch.has_value() + && false == m_optional_log_level_node_id.has_value()) + { + SPDLOG_WARN("Log-level filter option is given, but the key is not found in the IR stream."); + } + if (m_optional_timestamp_full_branch.has_value() + && false == m_optional_timestamp_node_id.has_value()) + { + SPDLOG_WARN("Timestamp filter option is given, but the key is not found in the IR stream."); + } return clp::ffi::ir_stream::IRErrorCode::IRErrorCode_Success; } -auto StructuredIrUnitHandler::get_log_level( - StructuredLogEvent::NodeIdValuePairs const& id_value_pairs -) const -> LogLevel { - LogLevel log_level{LogLevel::NONE}; +auto StructuredIrUnitHandler::get_log_level(StructuredLogEvent const& log_event) const -> LogLevel { + constexpr LogLevel cDefaultLogLevel{LogLevel::NONE}; - if (false == m_log_level_node_id.has_value()) { - return log_level; + if (false == m_optional_log_level_full_branch.has_value()) { + return cDefaultLogLevel; } - auto const& optional_log_level_value{id_value_pairs.at(m_log_level_node_id.value())}; + + if (false == m_optional_log_level_node_id.has_value()) { + return cDefaultLogLevel; + } + + auto const log_level_node_id = m_optional_log_level_node_id.value(); + auto const& node_id_value_pairs = m_optional_log_level_full_branch->is_auto_generated() + ? log_event.get_auto_gen_node_id_value_pairs() + : log_event.get_user_gen_node_id_value_pairs(); + if (false == node_id_value_pairs.contains(log_level_node_id)) { + return cDefaultLogLevel; + } + + auto const& optional_log_level_value = node_id_value_pairs.at(log_level_node_id); if (false == optional_log_level_value.has_value()) { - return log_level; - } - auto const log_level_value = optional_log_level_value.value(); - - if (log_level_value.is()) { - auto const& log_level_str = log_level_value.get_immutable_view(); - log_level = parse_log_level(log_level_str); - } else if (log_level_value.is()) { - auto const& log_level_int = log_level_value.get_immutable_view(); - if (log_level_int >= clp::enum_to_underlying_type(cValidLogLevelsBeginIdx) - && log_level_int < clp::enum_to_underlying_type(LogLevel::LENGTH)) - { - log_level = static_cast(log_level_int); - } - } else { - auto log_event_idx = m_deserialized_log_events->size(); SPDLOG_ERROR( - "Authoritative log level's value is not an int or string for log event index {}", - log_event_idx + "Protocol error: The log level cannot be an empty value. Log event index: {}", + m_deserialized_log_events->size() ); + return cDefaultLogLevel; } - return log_level; + auto const optional_log_level = parse_log_level_from_value(optional_log_level_value.value()); + if (false == optional_log_level.has_value()) { + SPDLOG_DEBUG( + "Failed to parse log level for log event index {}", + m_deserialized_log_events->size() + ); + return cDefaultLogLevel; + } + + return optional_log_level.value(); } -auto StructuredIrUnitHandler::get_timestamp( - StructuredLogEvent::NodeIdValuePairs const& id_value_pairs +auto StructuredIrUnitHandler::get_timestamp(StructuredLogEvent const& log_event ) const -> clp::ir::epoch_time_ms_t { - clp::ir::epoch_time_ms_t timestamp{0}; + constexpr clp::ir::epoch_time_ms_t cDefaultTimestamp{0}; - if (false == m_timestamp_node_id.has_value()) { - return timestamp; + if (false == m_optional_timestamp_full_branch.has_value()) { + return cDefaultTimestamp; } - auto const& optional_timestamp_value{id_value_pairs.at(m_timestamp_node_id.value())}; - if (false == optional_timestamp_value.has_value()) { - return timestamp; + + if (false == m_optional_timestamp_node_id.has_value()) { + return cDefaultTimestamp; } - auto const timestamp_value = optional_timestamp_value.value(); - if (timestamp_value.is()) { - timestamp = static_cast( - timestamp_value.get_immutable_view() - ); - } else { - // TODO: Add support for parsing string-type timestamp values. - auto log_event_idx = m_deserialized_log_events->size(); + auto const timestamp_node_id = m_optional_timestamp_node_id.value(); + auto const& node_id_value_pairs = m_optional_timestamp_full_branch->is_auto_generated() + ? log_event.get_auto_gen_node_id_value_pairs() + : log_event.get_user_gen_node_id_value_pairs(); + if (false == node_id_value_pairs.contains(timestamp_node_id)) { + return cDefaultTimestamp; + } + + auto const& optional_ts = node_id_value_pairs.at(timestamp_node_id); + if (false == optional_ts.has_value()) { SPDLOG_ERROR( - "Authoritative timestamp's value is not an int for log event index {}", - log_event_idx + "Protocol error: The timestamp cannot be an empty value. Log event index: {}", + m_deserialized_log_events->size() ); + return cDefaultTimestamp; } - return timestamp; + auto const& timestamp{optional_ts.value()}; + if (false == timestamp.is()) { + SPDLOG_ERROR( + "Protocol error: The timestamp value must be a valid integer. Log event index: {}", + m_deserialized_log_events->size() + ); + return cDefaultTimestamp; + } + return static_cast( + timestamp.get_immutable_view() + ); } } // namespace clp_ffi_js::ir diff --git a/src/clp_ffi_js/ir/StructuredIrUnitHandler.hpp b/src/clp_ffi_js/ir/StructuredIrUnitHandler.hpp index 411c0652..914f558e 100644 --- a/src/clp_ffi_js/ir/StructuredIrUnitHandler.hpp +++ b/src/clp_ffi_js/ir/StructuredIrUnitHandler.hpp @@ -1,6 +1,7 @@ #ifndef CLP_FFI_JS_IR_STRUCTUREDIRUNITHANDLER_HPP #define CLP_FFI_JS_IR_STRUCTUREDIRUNITHANDLER_HPP +#include #include #include #include @@ -9,7 +10,6 @@ #include #include -#include #include #include #include @@ -18,28 +18,84 @@ #include namespace clp_ffi_js::ir { -using schema_tree_node_id_t = std::optional; - /** * Class that implements the `clp::ffi::ir_stream::IrUnitHandlerInterface` to buffer log events and * determine the schema-tree node IDs of the log level and timestamp kv-pairs. */ class StructuredIrUnitHandler { public: + // Types + /** + * Class to represent a full branch from the root to a leaf node in a schema tree. + * A branch is uniquely identified by the sequence of key names along the path and the type of + * the leaf node. All non-leaf nodes are implicitly of type `Obj`. + */ + class SchemaTreeFullBranch { + public: + // Constructor + /** + * @param is_auto_gen + * @param root_to_leaf_path + * @param leaf_type + */ + SchemaTreeFullBranch( + bool is_auto_gen, + std::vector root_to_leaf_path, + clp::ffi::SchemaTree::Node::Type leaf_type + ) + : m_is_auto_generated{is_auto_gen}, + m_leaf_to_root_path{std::move(root_to_leaf_path)}, + m_leaf_type{leaf_type} { + std::ranges::reverse(m_leaf_to_root_path); + } + + // Default move constructor and assignment operator + SchemaTreeFullBranch(SchemaTreeFullBranch&&) = default; + auto operator=(SchemaTreeFullBranch&&) -> SchemaTreeFullBranch& = default; + + // Delete copy constructor and assignment operator + SchemaTreeFullBranch(SchemaTreeFullBranch const&) = delete; + auto operator=(SchemaTreeFullBranch const&) -> SchemaTreeFullBranch& = delete; + + // Destructor + ~SchemaTreeFullBranch() = default; + + /** + * @return Whether this branch belongs to the auto-generated schema tree. + */ + [[nodiscard]] auto is_auto_generated() const -> bool { return m_is_auto_generated; } + + /** + * @param schema_tree + * @param leaf_locator + * @return Whether the branch from root to the leaf located by `leaf_locator` + * matches the underlying full branch. + */ + [[nodiscard]] auto match( + clp::ffi::SchemaTree const& schema_tree, + clp::ffi::SchemaTree::NodeLocator const& leaf_locator + ) const -> bool; + + private: + bool m_is_auto_generated; + std::vector m_leaf_to_root_path; + clp::ffi::SchemaTree::Node::Type m_leaf_type; + }; + // Constructors /** * @param deserialized_log_events The vector in which to store deserialized log events. - * @param log_level_key Key name of schema-tree node that contains the authoritative log level. - * @param timestamp_key Key name of schema-tree node that contains the authoritative timestamp. + * @param log_level_full_branch A schema tree full branch for the authoritative log level. + * @param timestamp_full_branch A schema tree full branch for the authoritative timestamp. */ StructuredIrUnitHandler( std::shared_ptr>> deserialized_log_events, - std::string log_level_key, - std::string timestamp_key + std::optional log_level_full_branch, + std::optional timestamp_full_branch ) - : m_log_level_key{std::move(log_level_key)}, - m_timestamp_key{std::move(timestamp_key)}, + : m_optional_log_level_full_branch{std::move(log_level_full_branch)}, + m_optional_timestamp_full_branch{std::move(timestamp_full_branch)}, m_deserialized_log_events{std::move(deserialized_log_events)} {} // Methods implementing `clp::ffi::ir_stream::IrUnitHandlerInterface`. @@ -67,47 +123,50 @@ class StructuredIrUnitHandler { * kv-pair. * @param is_auto_generated * @param schema_tree_node_locator + * @param schema_tree * @return IRErrorCode::IRErrorCode_Success */ [[nodiscard]] auto handle_schema_tree_node_insertion( bool is_auto_generated, - clp::ffi::SchemaTree::NodeLocator schema_tree_node_locator + clp::ffi::SchemaTree::NodeLocator schema_tree_node_locator, + std::shared_ptr const& schema_tree ) -> clp::ffi::ir_stream::IRErrorCode; /** - * Dummy implementation that does nothing but conforms to the interface. + * Handles end-of-stream IR unit. * @return IRErrorCode::IRErrorCode_Success */ - [[nodiscard]] static auto handle_end_of_stream() -> clp::ffi::ir_stream::IRErrorCode; + [[nodiscard]] auto handle_end_of_stream() const -> clp::ffi::ir_stream::IRErrorCode; private: // Methods /** - * @param id_value_pairs - * @return `LogLevel::NONE` if `m_log_level_node_id` is unset, the node has no value, or the - * node's value is not an integer or string. - * @return `LogLevel` from node with id `m_log_level_node_id` otherwise. + * @param log_event + * @return `LogLevel` of the given `log_event` on success. + * @return LogLevel::None by default if: + * - `m_optional_log_level_node_id` is unset. + * - `m_optional_log_level_node_id` is set but not appearing in the given node-id-value pairs. + * - `parse_log_level_from_value` fails. */ - [[nodiscard]] auto get_log_level(StructuredLogEvent::NodeIdValuePairs const& id_value_pairs - ) const -> LogLevel; + [[nodiscard]] auto get_log_level(StructuredLogEvent const& log_event) const -> LogLevel; /** - * @param id_value_pairs - * @return 0 if `m_timestamp_node_id` is unset, the node has no value, or the node's value is - * not an integer. - * @return Timestamp from node with ID `m_timestamp_node_id` otherwise. + * @param log_event + * @return Timestamp from the given `log_event` on success. + * @return 0 by default if: + * - `m_optional_timestamp_id` is unset. + * - `m_optional_timestamp_id` is set but not appearing in the given node-id-value pairs. + * - The value is not a valid integer. */ - [[nodiscard]] auto get_timestamp(StructuredLogEvent::NodeIdValuePairs const& id_value_pairs + [[nodiscard]] auto get_timestamp(StructuredLogEvent const& log_event ) const -> clp::ir::epoch_time_ms_t; // Variables - std::string m_log_level_key; - std::string m_timestamp_key; - - clp::ffi::SchemaTree::Node::id_t m_current_node_id{clp::ffi::SchemaTree::cRootId}; + std::optional m_optional_log_level_full_branch; + std::optional m_optional_timestamp_full_branch; - schema_tree_node_id_t m_log_level_node_id; - schema_tree_node_id_t m_timestamp_node_id; + std::optional m_optional_log_level_node_id; + std::optional m_optional_timestamp_node_id; // TODO: Technically, we don't need to use a `shared_ptr` since the parent stream reader will // have a longer lifetime than this class. Instead, we could use `gsl::not_null` once we add diff --git a/src/submodules/clp b/src/submodules/clp index 2aa5c5c7..5dc26c2c 160000 --- a/src/submodules/clp +++ b/src/submodules/clp @@ -1 +1 @@ -Subproject commit 2aa5c5c713b15f7b7dd911a746e386b306a95242 +Subproject commit 5dc26c2c4a00b3ededc247a23a2c83e2a5473894