From 0d673e6241064e00265471753e2f9d903080d469 Mon Sep 17 00:00:00 2001 From: Anton Rubin Date: Thu, 10 Oct 2024 15:42:59 +0100 Subject: [PATCH 1/4] add path-hierarchy-tokenizer-docs Signed-off-by: Anton Rubin --- _analyzers/tokenizers/index.md | 2 +- _analyzers/tokenizers/path-hierarchy.md | 181 ++++++++++++++++++++++++ 2 files changed, 182 insertions(+), 1 deletion(-) create mode 100644 _analyzers/tokenizers/path-hierarchy.md diff --git a/_analyzers/tokenizers/index.md b/_analyzers/tokenizers/index.md index d401851f60..1abc5ee7ff 100644 --- a/_analyzers/tokenizers/index.md +++ b/_analyzers/tokenizers/index.md @@ -2,7 +2,7 @@ layout: default title: Tokenizers nav_order: 60 -has_children: false +has_children: true has_toc: false --- diff --git a/_analyzers/tokenizers/path-hierarchy.md b/_analyzers/tokenizers/path-hierarchy.md new file mode 100644 index 0000000000..3327cb2a0e --- /dev/null +++ b/_analyzers/tokenizers/path-hierarchy.md @@ -0,0 +1,181 @@ +--- +layout: default +title: Path hierarchy +parent: Tokenizers +nav_order: 90 +--- + +# Pattern tokenizer + +The `path_hierarchy` tokenizer is designed to tokenize file system-like paths (or similar hierarchical structures) by breaking them down into tokens at each level of the hierarchy. This tokenizer is particularly useful when working with hierarchical data such as file paths, URLs, or any other delimited paths. 
+ +## Example usage + +The following example request creates a new index named `my_index` and configures an analyzer with `path_hierarchy` tokenizer: + +```json +PUT /my_index +{ + "settings": { + "analysis": { + "tokenizer": { + "my_path_tokenizer": { + "type": "path_hierarchy" + } + }, + "analyzer": { + "my_path_analyzer": { + "type": "custom", + "tokenizer": "my_path_tokenizer" + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the created analyzer: + +```json +POST /my_index/_analyze +{ + "analyzer": "my_path_analyzer", + "text": "/users/john/documents/report.txt" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "/users", + "start_offset": 0, + "end_offset": 6, + "type": "word", + "position": 0 + }, + { + "token": "/users/john", + "start_offset": 0, + "end_offset": 11, + "type": "word", + "position": 0 + }, + { + "token": "/users/john/documents", + "start_offset": 0, + "end_offset": 21, + "type": "word", + "position": 0 + }, + { + "token": "/users/john/documents/report.txt", + "start_offset": 0, + "end_offset": 32, + "type": "word", + "position": 0 + } + ] +} +``` + +## Configuration + +The `path_hierarchy` tokenizer can be configured with the following parameters: + +- `delimiter`: specifies the character used to separate path components. Default is `/`. (String, _Optional_) +- `replacement`: configures the character used to replace the delimiter in the tokens. The default is `/`. (String, _Optional_) +- `buffer_size`: specifies the size of the buffer. Default is `1024`. (Integer, _Optional_) +- `reverse`: produces tokens in reverse order if set to `true`. Default is `false`. (Boolean, _Optional_) +- `skip`: specifies the number of initial tokens (levels) to skip when tokenizing. The default is `0`. 
(Integer, _Optional_) + +## Example using custom parameter + +The following example configures custom `delimiter` and `replacement`: + +```json +PUT /my_index +{ + "settings": { + "analysis": { + "tokenizer": { + "my_path_tokenizer": { + "type": "path_hierarchy", + "delimiter": "\\", + "replacement": "\\" + } + }, + "analyzer": { + "my_path_analyzer": { + "type": "custom", + "tokenizer": "my_path_tokenizer" + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the created analyzer: + +```json +POST /my_index/_analyze +{ + "analyzer": "my_path_analyzer", + "text": "C:\\users\\john\\documents\\report.txt" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "C:", + "start_offset": 0, + "end_offset": 2, + "type": "word", + "position": 0 + }, + { + "token": "C:\\users", + "start_offset": 0, + "end_offset": 8, + "type": "word", + "position": 0 + }, + { + "token": "C:\\users\\john", + "start_offset": 0, + "end_offset": 13, + "type": "word", + "position": 0 + }, + { + "token": "C:\\users\\john\\documents", + "start_offset": 0, + "end_offset": 23, + "type": "word", + "position": 0 + }, + { + "token": "C:\\users\\john\\documents\\report.txt", + "start_offset": 0, + "end_offset": 34, + "type": "word", + "position": 0 + } + ] +} +``` \ No newline at end of file From 9b94bf70d23f54f9839862b24840d4857c1f907f Mon Sep 17 00:00:00 2001 From: Anton Rubin Date: Wed, 16 Oct 2024 17:06:58 +0100 Subject: [PATCH 2/4] updating parameter table Signed-off-by: Anton Rubin --- _analyzers/tokenizers/path-hierarchy.md | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/_analyzers/tokenizers/path-hierarchy.md b/_analyzers/tokenizers/path-hierarchy.md index 3327cb2a0e..fc134b3dbb 100644 --- a/_analyzers/tokenizers/path-hierarchy.md +++ b/_analyzers/tokenizers/path-hierarchy.md @@ -87,13 +87,15 @@ 
The response contains the generated tokens: ## Configuration -The `path_hierarchy` tokenizer can be configured with the following parameters: - -- `delimiter`: specifies the character used to separate path components. Default is `/`. (String, _Optional_) -- `replacement`: configures the character used to replace the delimiter in the tokens. The default is `/`. (String, _Optional_) -- `buffer_size`: specifies the size of the buffer. Default is `1024`. (Integer, _Optional_) -- `reverse`: produces tokens in reverse order if set to `true`. Default is `false`. (Boolean, _Optional_) -- `skip`: specifies the number of initial tokens (levels) to skip when tokenizing. The default is `0`. (Integer, _Optional_) +The `path_hierarchy` tokenizer can be configured with the following parameters. + +Parameter | Required/Optional | Data type | Description +:--- | :--- | :--- | :--- +`delimiter` | Optional | String | Specifies the character used to separate path components. Default is `/`. +`replacement` | Optional | String | Configures the character used to replace the delimiter in the tokens. Default is `/`. +`buffer_size` | Optional | Integer | Specifies the size of the buffer. Default is `1024`. +`reverse` | Optional | Boolean | Produces tokens in reverse order if set to `true`. Default is `false`. +`skip` | Optional | Integer | Specifies the number of initial tokens (levels) to skip when tokenizing. Default is `0`. 
## Example using custom parameter From c222d0a5c2718af8b5fa4cfead7353069bd2dd91 Mon Sep 17 00:00:00 2001 From: Fanit Kolchina Date: Thu, 5 Dec 2024 14:38:59 -0500 Subject: [PATCH 3/4] Doc review Signed-off-by: Fanit Kolchina --- _analyzers/tokenizers/path-hierarchy.md | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/_analyzers/tokenizers/path-hierarchy.md b/_analyzers/tokenizers/path-hierarchy.md index fc134b3dbb..bab4bfe2b8 100644 --- a/_analyzers/tokenizers/path-hierarchy.md +++ b/_analyzers/tokenizers/path-hierarchy.md @@ -7,11 +7,11 @@ nav_order: 90 # Pattern tokenizer -The `path_hierarchy` tokenizer is designed to tokenize file system-like paths (or similar hierarchical structures) by breaking them down into tokens at each level of the hierarchy. This tokenizer is particularly useful when working with hierarchical data such as file paths, URLs, or any other delimited paths. +The `path_hierarchy` tokenizer is designed to tokenize file-system-like paths (or similar hierarchical structures) by breaking them down into tokens at each hierarchy level. This tokenizer is particularly useful when working with hierarchical data such as file paths, URLs, or any other delimited paths. ## Example usage -The following example request creates a new index named `my_index` and configures an analyzer with `path_hierarchy` tokenizer: +The following example request creates a new index named `my_index` and configures an analyzer with a `path_hierarchy` tokenizer: ```json PUT /my_index @@ -37,7 +37,7 @@ PUT /my_index ## Generated tokens -Use the following request to examine the tokens generated using the created analyzer: +Use the following request to examine the tokens generated using the analyzer: ```json POST /my_index/_analyze @@ -85,7 +85,7 @@ The response contains the generated tokens: } ``` -## Configuration +## Parameters The `path_hierarchy` tokenizer can be configured with the following parameters. 
@@ -93,13 +93,13 @@ Parameter | Required/Optional | Data type | Description :--- | :--- | :--- | :--- `delimiter` | Optional | String | Specifies the character used to separate path components. Default is `/`. `replacement` | Optional | String | Configures the character used to replace the delimiter in the tokens. Default is `/`. -`buffer_size` | Optional | Integer | Specifies the size of the buffer. Default is `1024`. -`reverse` | Optional | Boolean | Produces tokens in reverse order if set to `true`. Default is `false`. +`buffer_size` | Optional | Integer | Specifies the buffer size. Default is `1024`. +`reverse` | Optional | Boolean | If `true`, specifies to generate tokens in reverse order. Default is `false`. `skip` | Optional | Integer | Specifies the number of initial tokens (levels) to skip when tokenizing. Default is `0`. -## Example using custom parameter +## Example using delimiter and replacement parameters -The following example configures custom `delimiter` and `replacement`: +The following example configures custom `delimiter` and `replacement` parameters: ```json PUT /my_index @@ -125,9 +125,8 @@ PUT /my_index ``` {% include copy-curl.html %} -## Generated tokens -Use the following request to examine the tokens generated using the created analyzer: +Use the following request to examine the tokens generated using the analyzer: ```json POST /my_index/_analyze From f7cb1772bb06cdb9a37df3d656ae7bc60595b26b Mon Sep 17 00:00:00 2001 From: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Date: Mon, 9 Dec 2024 13:02:40 -0500 Subject: [PATCH 4/4] Apply suggestions from code review Co-authored-by: Nathan Bower Signed-off-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> --- _analyzers/tokenizers/path-hierarchy.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/_analyzers/tokenizers/path-hierarchy.md b/_analyzers/tokenizers/path-hierarchy.md index bab4bfe2b8..a6609f30cd 100644 --- 
a/_analyzers/tokenizers/path-hierarchy.md +++ b/_analyzers/tokenizers/path-hierarchy.md @@ -5,9 +5,9 @@ parent: Tokenizers nav_order: 90 --- -# Pattern tokenizer +# Path hierarchy tokenizer -The `path_hierarchy` tokenizer is designed to tokenize file-system-like paths (or similar hierarchical structures) by breaking them down into tokens at each hierarchy level. This tokenizer is particularly useful when working with hierarchical data such as file paths, URLs, or any other delimited paths. +The `path_hierarchy` tokenizer tokenizes file-system-like paths (or similar hierarchical structures) by breaking them down into tokens at each hierarchy level. This tokenizer is particularly useful when working with hierarchical data such as file paths, URLs, or any other delimited paths. ## Example usage @@ -94,12 +94,12 @@ Parameter | Required/Optional | Data type | Description `delimiter` | Optional | String | Specifies the character used to separate path components. Default is `/`. `replacement` | Optional | String | Configures the character used to replace the delimiter in the tokens. Default is `/`. `buffer_size` | Optional | Integer | Specifies the buffer size. Default is `1024`. -`reverse` | Optional | Boolean | If `true`, specifies to generate tokens in reverse order. Default is `false`. +`reverse` | Optional | Boolean | If `true`, tokenizes the path from right to left, so that each token is a trailing portion of the path (for example, `one/two/three` produces `one/two/three`, `two/three`, and `three`). Default is `false`. `skip` | Optional | Integer | Specifies the number of initial tokens (levels) to skip when tokenizing. Default is `0`. ## Example using delimiter and replacement parameters -The following example configures custom `delimiter` and `replacement` parameters: +The following example request configures custom `delimiter` and `replacement` parameters: ```json PUT /my_index