From 0d673e6241064e00265471753e2f9d903080d469 Mon Sep 17 00:00:00 2001
From: Anton Rubin <anton.rubin@eliatra.com>
Date: Thu, 10 Oct 2024 15:42:59 +0100
Subject: [PATCH 1/4] add path-hierarchy-tokenizer-docs

Signed-off-by: Anton Rubin <anton.rubin@eliatra.com>
---
 _analyzers/tokenizers/index.md          |   2 +-
 _analyzers/tokenizers/path-hierarchy.md | 181 ++++++++++++++++++++++++
 2 files changed, 182 insertions(+), 1 deletion(-)
 create mode 100644 _analyzers/tokenizers/path-hierarchy.md

diff --git a/_analyzers/tokenizers/index.md b/_analyzers/tokenizers/index.md
index d401851f60..1abc5ee7ff 100644
--- a/_analyzers/tokenizers/index.md
+++ b/_analyzers/tokenizers/index.md
@@ -2,7 +2,7 @@
 layout: default
 title: Tokenizers
 nav_order: 60
-has_children: false
+has_children: true
 has_toc: false
 ---
 
diff --git a/_analyzers/tokenizers/path-hierarchy.md b/_analyzers/tokenizers/path-hierarchy.md
new file mode 100644
index 0000000000..3327cb2a0e
--- /dev/null
+++ b/_analyzers/tokenizers/path-hierarchy.md
@@ -0,0 +1,181 @@
+---
+layout: default
+title: Path hierarchy
+parent: Tokenizers
+nav_order: 90
+---
+
+# Pattern tokenizer
+
+The `path_hierarchy` tokenizer is designed to tokenize file system-like paths (or similar hierarchical structures) by breaking them down into tokens at each level of the hierarchy. This tokenizer is particularly useful when working with hierarchical data such as file paths, URLs, or any other delimited paths.
+
+## Example usage
+
+The following example request creates a new index named `my_index` and configures an analyzer with `path_hierarchy` tokenizer:
+
+```json
+PUT /my_index
+{
+  "settings": {
+    "analysis": {
+      "tokenizer": {
+        "my_path_tokenizer": {
+          "type": "path_hierarchy"
+        }
+      },
+      "analyzer": {
+        "my_path_analyzer": {
+          "type": "custom",
+          "tokenizer": "my_path_tokenizer"
+        }
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Generated tokens
+
+Use the following request to examine the tokens generated using the created analyzer:
+
+```json
+POST /my_index/_analyze
+{
+  "analyzer": "my_path_analyzer",
+  "text": "/users/john/documents/report.txt"
+}
+```
+{% include copy-curl.html %}
+
+The response contains the generated tokens:
+
+```json
+{
+  "tokens": [
+    {
+      "token": "/users",
+      "start_offset": 0,
+      "end_offset": 6,
+      "type": "word",
+      "position": 0
+    },
+    {
+      "token": "/users/john",
+      "start_offset": 0,
+      "end_offset": 11,
+      "type": "word",
+      "position": 0
+    },
+    {
+      "token": "/users/john/documents",
+      "start_offset": 0,
+      "end_offset": 21,
+      "type": "word",
+      "position": 0
+    },
+    {
+      "token": "/users/john/documents/report.txt",
+      "start_offset": 0,
+      "end_offset": 32,
+      "type": "word",
+      "position": 0
+    }
+  ]
+}
+```
+
+## Configuration
+
+The `path_hierarchy` tokenizer can be configured with the following parameters:
+
+- `delimiter`: specifies the character used to separate path components. Default is `/`. (String, _Optional_)
+- `replacement`: configures the character used to replace the delimiter in the tokens. The default is `/`. (String, _Optional_)
+- `buffer_size`: specifies the size of the buffer. Default is `1024`. (Integer, _Optional_)
+- `reverse`: produces tokens in reverse order if set to `true`. Default is `false`. (Boolean, _Optional_)
+- `skip`: specifies the number of initial tokens (levels) to skip when tokenizing. The default is `0`. (Integer, _Optional_)
+
+## Example using custom parameter
+
+The following example configures custom `delimiter` and `replacement`:
+
+```json
+PUT /my_index
+{
+  "settings": {
+    "analysis": {
+      "tokenizer": {
+        "my_path_tokenizer": {
+          "type": "path_hierarchy",
+          "delimiter": "\\",
+          "replacement": "\\"
+        }
+      },
+      "analyzer": {
+        "my_path_analyzer": {
+          "type": "custom",
+          "tokenizer": "my_path_tokenizer"
+        }
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Generated tokens
+
+Use the following request to examine the tokens generated using the created analyzer:
+
+```json
+POST /my_index/_analyze
+{
+  "analyzer": "my_path_analyzer",
+  "text": "C:\\users\\john\\documents\\report.txt"
+}
+```
+{% include copy-curl.html %}
+
+The response contains the generated tokens:
+
+```json
+{
+  "tokens": [
+    {
+      "token": "C:",
+      "start_offset": 0,
+      "end_offset": 2,
+      "type": "word",
+      "position": 0
+    },
+    {
+      "token": "C:\\users",
+      "start_offset": 0,
+      "end_offset": 8,
+      "type": "word",
+      "position": 0
+    },
+    {
+      "token": "C:\\users\\john",
+      "start_offset": 0,
+      "end_offset": 13,
+      "type": "word",
+      "position": 0
+    },
+    {
+      "token": "C:\\users\\john\\documents",
+      "start_offset": 0,
+      "end_offset": 23,
+      "type": "word",
+      "position": 0
+    },
+    {
+      "token": "C:\\users\\john\\documents\\report.txt",
+      "start_offset": 0,
+      "end_offset": 34,
+      "type": "word",
+      "position": 0
+    }
+  ]
+}
+```
\ No newline at end of file

From 9b94bf70d23f54f9839862b24840d4857c1f907f Mon Sep 17 00:00:00 2001
From: Anton Rubin <anton.rubin@eliatra.com>
Date: Wed, 16 Oct 2024 17:06:58 +0100
Subject: [PATCH 2/4] updating parameter table

Signed-off-by: Anton Rubin <anton.rubin@eliatra.com>
---
 _analyzers/tokenizers/path-hierarchy.md | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/_analyzers/tokenizers/path-hierarchy.md b/_analyzers/tokenizers/path-hierarchy.md
index 3327cb2a0e..fc134b3dbb 100644
--- a/_analyzers/tokenizers/path-hierarchy.md
+++ b/_analyzers/tokenizers/path-hierarchy.md
@@ -87,13 +87,15 @@ The response contains the generated tokens:
 
 ## Configuration
 
-The `path_hierarchy` tokenizer can be configured with the following parameters:
-
-- `delimiter`: specifies the character used to separate path components. Default is `/`. (String, _Optional_)
-- `replacement`: configures the character used to replace the delimiter in the tokens. The default is `/`. (String, _Optional_)
-- `buffer_size`: specifies the size of the buffer. Default is `1024`. (Integer, _Optional_)
-- `reverse`: produces tokens in reverse order if set to `true`. Default is `false`. (Boolean, _Optional_)
-- `skip`: specifies the number of initial tokens (levels) to skip when tokenizing. The default is `0`. (Integer, _Optional_)
+The `path_hierarchy` tokenizer can be configured with the following parameters.
+
+Parameter | Required/Optional | Data type | Description
+:--- | :--- | :--- | :--- 
+`delimiter` | Optional | String | Specifies the character used to separate path components. Default is `/`.
+`replacement` | Optional | String | Configures the character used to replace the delimiter in the tokens. Default is `/`.
+`buffer_size` | Optional | Integer | Specifies the size of the buffer. Default is `1024`.
+`reverse` | Optional | Boolean | Produces tokens in reverse order if set to `true`. Default is `false`.
+`skip` | Optional | Integer | Specifies the number of initial tokens (levels) to skip when tokenizing. Default is `0`.
 
 ## Example using custom parameter
 

From c222d0a5c2718af8b5fa4cfead7353069bd2dd91 Mon Sep 17 00:00:00 2001
From: Fanit Kolchina <kolchfa@amazon.com>
Date: Thu, 5 Dec 2024 14:38:59 -0500
Subject: [PATCH 3/4] Doc review

Signed-off-by: Fanit Kolchina <kolchfa@amazon.com>
---
 _analyzers/tokenizers/path-hierarchy.md | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/_analyzers/tokenizers/path-hierarchy.md b/_analyzers/tokenizers/path-hierarchy.md
index fc134b3dbb..bab4bfe2b8 100644
--- a/_analyzers/tokenizers/path-hierarchy.md
+++ b/_analyzers/tokenizers/path-hierarchy.md
@@ -7,11 +7,11 @@ nav_order: 90
 
 # Pattern tokenizer
 
-The `path_hierarchy` tokenizer is designed to tokenize file system-like paths (or similar hierarchical structures) by breaking them down into tokens at each level of the hierarchy. This tokenizer is particularly useful when working with hierarchical data such as file paths, URLs, or any other delimited paths.
+The `path_hierarchy` tokenizer is designed to tokenize file-system-like paths (or similar hierarchical structures) by breaking them down into tokens at each hierarchy level. This tokenizer is particularly useful when working with hierarchical data such as file paths, URLs, or any other delimited paths.
 
 ## Example usage
 
-The following example request creates a new index named `my_index` and configures an analyzer with `path_hierarchy` tokenizer:
+The following example request creates a new index named `my_index` and configures an analyzer with a `path_hierarchy` tokenizer:
 
 ```json
 PUT /my_index
@@ -37,7 +37,7 @@ PUT /my_index
 
 ## Generated tokens
 
-Use the following request to examine the tokens generated using the created analyzer:
+Use the following request to examine the tokens generated using the analyzer:
 
 ```json
 POST /my_index/_analyze
@@ -85,7 +85,7 @@ The response contains the generated tokens:
 }
 ```
 
-## Configuration
+## Parameters
 
 The `path_hierarchy` tokenizer can be configured with the following parameters.
 
@@ -93,13 +93,13 @@ Parameter | Required/Optional | Data type | Description
 :--- | :--- | :--- | :--- 
 `delimiter` | Optional | String | Specifies the character used to separate path components. Default is `/`.
 `replacement` | Optional | String | Configures the character used to replace the delimiter in the tokens. Default is `/`.
-`buffer_size` | Optional | Integer | Specifies the size of the buffer. Default is `1024`.
-`reverse` | Optional | Boolean | Produces tokens in reverse order if set to `true`. Default is `false`.
+`buffer_size` | Optional | Integer | Specifies the buffer size. Default is `1024`.
+`reverse` | Optional | Boolean | If `true`, specifies to generate tokens in reverse order. Default is `false`.
 `skip` | Optional | Integer | Specifies the number of initial tokens (levels) to skip when tokenizing. Default is `0`.
 
-## Example using custom parameter
+## Example using delimiter and replacement parameters
 
-The following example configures custom `delimiter` and `replacement`:
+The following example configures custom `delimiter` and `replacement` parameters:
 
 ```json
 PUT /my_index
@@ -125,9 +125,8 @@ PUT /my_index
 ```
 {% include copy-curl.html %}
 
-## Generated tokens
 
-Use the following request to examine the tokens generated using the created analyzer:
+Use the following request to examine the tokens generated using the analyzer:
 
 ```json
 POST /my_index/_analyze

From f7cb1772bb06cdb9a37df3d656ae7bc60595b26b Mon Sep 17 00:00:00 2001
From: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com>
Date: Mon, 9 Dec 2024 13:02:40 -0500
Subject: [PATCH 4/4] Apply suggestions from code review

Co-authored-by: Nathan Bower <nbower@amazon.com>
Signed-off-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com>
---
 _analyzers/tokenizers/path-hierarchy.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/_analyzers/tokenizers/path-hierarchy.md b/_analyzers/tokenizers/path-hierarchy.md
index bab4bfe2b8..a6609f30cd 100644
--- a/_analyzers/tokenizers/path-hierarchy.md
+++ b/_analyzers/tokenizers/path-hierarchy.md
@@ -5,9 +5,9 @@ parent: Tokenizers
 nav_order: 90
 ---
 
-# Pattern tokenizer
+# Path hierarchy tokenizer
 
-The `path_hierarchy` tokenizer is designed to tokenize file-system-like paths (or similar hierarchical structures) by breaking them down into tokens at each hierarchy level. This tokenizer is particularly useful when working with hierarchical data such as file paths, URLs, or any other delimited paths.
+The `path_hierarchy` tokenizer tokenizes file-system-like paths (or similar hierarchical structures) by breaking them down into tokens at each hierarchy level. This tokenizer is particularly useful when working with hierarchical data such as file paths, URLs, or any other delimited paths.
 
 ## Example usage
 
@@ -94,12 +94,12 @@ Parameter | Required/Optional | Data type | Description
 `delimiter` | Optional | String | Specifies the character used to separate path components. Default is `/`.
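-`replacement` | Optional | String | Configures the character used to replace the delimiter in the tokens. Default is `/`.
+`replacement` | Optional | String | Specifies the character used to replace the delimiter in the generated tokens. Defaults to the value of `delimiter`.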
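-`buffer_size` | Optional | Integer | Specifies the buffer size. Default is `1024`.
+`buffer_size` | Optional | Integer | Specifies the number of characters read into the term buffer in a single pass. Default is `1024`.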
-`reverse` | Optional | Boolean | If `true`, specifies to generate tokens in reverse order. Default is `false`.
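+`reverse` | Optional | Boolean | If `true`, builds the token hierarchy starting from the end of the path instead of the beginning (for example, `a/b/c` produces `a/b/c`, `b/c`, and `c`), which is useful for domain-like hierarchies. Default is `false`.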
 `skip` | Optional | Integer | Specifies the number of initial tokens (levels) to skip when tokenizing. Default is `0`.
 
 ## Example using delimiter and replacement parameters
 
-The following example configures custom `delimiter` and `replacement` parameters:
+The following example request configures custom `delimiter` and `replacement` parameters:
 
 ```json
 PUT /my_index