Skip to content

Commit 038b1ec

Browse files
authored
Optimize parameter parsing in text chunking processor (#733)
* Optimize parameter parsing in text chunking processor Signed-off-by: yuye-aws <yuyezhu@amazon.com> * add change log Signed-off-by: yuye-aws <yuyezhu@amazon.com> * fix unit tests in delimiter chunker Signed-off-by: yuye-aws <yuyezhu@amazon.com> * fix unit tests in fixed token length chunker Signed-off-by: yuye-aws <yuyezhu@amazon.com> * remove redundant Signed-off-by: yuye-aws <yuyezhu@amazon.com> * refactor chunker parameter parser Signed-off-by: yuye-aws <yuyezhu@amazon.com> * unit tests for chunker parameter parser Signed-off-by: yuye-aws <yuyezhu@amazon.com> * fix comment Signed-off-by: yuye-aws <yuyezhu@amazon.com> * spotless apply Signed-off-by: yuye-aws <yuyezhu@amazon.com> --------- Signed-off-by: yuye-aws <yuyezhu@amazon.com>
1 parent 7c54c86 commit 038b1ec

File tree

9 files changed

+421
-93
lines changed

9 files changed

+421
-93
lines changed

CHANGELOG.md

+1
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
1616
### Features
1717
### Enhancements
1818
- Pass empty doc collector instead of top docs collector to improve hybrid query latencies by 20% ([#731](https://github.com/opensearch-project/neural-search/pull/731))
19+
- Optimize parameter parsing in text chunking processor ([#733](https://github.com/opensearch-project/neural-search/pull/733))
1920
### Bug Fixes
2021
### Infrastructure
2122
### Documentation

src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java

+6-5
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,8 @@
3030
import static org.opensearch.neuralsearch.processor.chunker.Chunker.DEFAULT_MAX_CHUNK_LIMIT;
3131
import static org.opensearch.neuralsearch.processor.chunker.Chunker.DISABLED_MAX_CHUNK_LIMIT;
3232
import static org.opensearch.neuralsearch.processor.chunker.Chunker.CHUNK_STRING_COUNT_FIELD;
33-
import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parseIntegerParameter;
33+
import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parseInteger;
34+
import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parseIntegerWithDefault;
3435

3536
/**
3637
* This processor is used for text chunking.
@@ -115,8 +116,8 @@ private void parseAlgorithmMap(final Map<String, Object> algorithmMap) {
115116
}
116117
Map<String, Object> chunkerParameters = (Map<String, Object>) algorithmValue;
117118
// parse processor level max chunk limit
118-
this.maxChunkLimit = parseIntegerParameter(chunkerParameters, MAX_CHUNK_LIMIT_FIELD, DEFAULT_MAX_CHUNK_LIMIT);
119-
if (maxChunkLimit < 0 && maxChunkLimit != DISABLED_MAX_CHUNK_LIMIT) {
119+
this.maxChunkLimit = parseIntegerWithDefault(chunkerParameters, MAX_CHUNK_LIMIT_FIELD, DEFAULT_MAX_CHUNK_LIMIT);
120+
if (maxChunkLimit <= 0 && maxChunkLimit != DISABLED_MAX_CHUNK_LIMIT) {
120121
throw new IllegalArgumentException(
121122
String.format(
122123
Locale.ROOT,
@@ -309,10 +310,10 @@ private List<String> chunkString(final String content, final Map<String, Object>
309310
}
310311
List<String> contentResult = chunker.chunk(content, runTimeParameters);
311312
// update chunk_string_count for each string
312-
int chunkStringCount = parseIntegerParameter(runTimeParameters, CHUNK_STRING_COUNT_FIELD, 1);
313+
int chunkStringCount = parseInteger(runTimeParameters, CHUNK_STRING_COUNT_FIELD);
313314
runTimeParameters.put(CHUNK_STRING_COUNT_FIELD, chunkStringCount - 1);
314315
// update runtime max_chunk_limit if not disabled
315-
int runtimeMaxChunkLimit = parseIntegerParameter(runTimeParameters, MAX_CHUNK_LIMIT_FIELD, maxChunkLimit);
316+
int runtimeMaxChunkLimit = parseInteger(runTimeParameters, MAX_CHUNK_LIMIT_FIELD);
316317
if (runtimeMaxChunkLimit != DISABLED_MAX_CHUNK_LIMIT) {
317318
runTimeParameters.put(MAX_CHUNK_LIMIT_FIELD, runtimeMaxChunkLimit - contentResult.size());
318319
}

src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterParser.java

+60-15
Original file line numberDiff line numberDiff line change
@@ -22,11 +22,7 @@ private ChunkerParameterParser() {} // no instance of this util class
2222
* Parse String type parameter.
2323
* Throw IllegalArgumentException if the parameter is not a string or is an empty string.
2424
*/
25-
public static String parseStringParameter(final Map<String, Object> parameters, final String fieldName, final String defaultValue) {
26-
if (!parameters.containsKey(fieldName)) {
27-
// all string parameters are optional
28-
return defaultValue;
29-
}
25+
public static String parseString(final Map<String, Object> parameters, final String fieldName) {
3026
Object fieldValue = parameters.get(fieldName);
3127
if (!(fieldValue instanceof String)) {
3228
throw new IllegalArgumentException(
@@ -40,14 +36,23 @@ public static String parseStringParameter(final Map<String, Object> parameters,
4036
}
4137

4238
/**
43-
* Parse integer type parameter.
44-
* Throw IllegalArgumentException if parameter is not an integer.
39+
* Parse String type parameter.
40+
* Return default value if the parameter is missing.
41+
* Throw IllegalArgumentException if the parameter is not a string or is an empty string.
4542
*/
46-
public static int parseIntegerParameter(final Map<String, Object> parameters, final String fieldName, final int defaultValue) {
43+
public static String parseStringWithDefault(final Map<String, Object> parameters, final String fieldName, final String defaultValue) {
4744
if (!parameters.containsKey(fieldName)) {
48-
// all integer parameters are optional
45+
// all string parameters are optional
4946
return defaultValue;
5047
}
48+
return parseString(parameters, fieldName);
49+
}
50+
51+
/**
52+
* Parse integer type parameter.
53+
* Throw IllegalArgumentException if the parameter is not an integer.
54+
*/
55+
public static int parseInteger(final Map<String, Object> parameters, final String fieldName) {
5156
String fieldValueString = parameters.get(fieldName).toString();
5257
try {
5358
return NumberUtils.createInteger(fieldValueString);
@@ -58,27 +63,54 @@ public static int parseIntegerParameter(final Map<String, Object> parameters, fi
5863
}
5964
}
6065

66+
/**
67+
* Parse integer type parameter with default value.
68+
* Return default value if the parameter is missing.
69+
* Throw IllegalArgumentException if the parameter is not an integer.
70+
*/
71+
public static int parseIntegerWithDefault(final Map<String, Object> parameters, final String fieldName, final int defaultValue) {
72+
if (!parameters.containsKey(fieldName)) {
73+
// return the default value when parameter is missing
74+
return defaultValue;
75+
}
76+
return parseInteger(parameters, fieldName);
77+
}
78+
6179
/**
6280
* Parse integer type parameter with positive value.
63-
* Throw IllegalArgumentException if parameter is not a positive integer.
81+
* The parameter must be present; no default value is applied.
82+
* Throw IllegalArgumentException if the parameter is not a positive integer.
6483
*/
65-
public static int parsePositiveIntegerParameter(final Map<String, Object> parameters, final String fieldName, final int defaultValue) {
66-
int fieldValueInt = parseIntegerParameter(parameters, fieldName, defaultValue);
84+
public static int parsePositiveInteger(final Map<String, Object> parameters, final String fieldName) {
85+
int fieldValueInt = parseInteger(parameters, fieldName);
6786
if (fieldValueInt <= 0) {
6887
throw new IllegalArgumentException(String.format(Locale.ROOT, "Parameter [%s] must be positive.", fieldName));
6988
}
7089
return fieldValueInt;
7190
}
7291

7392
/**
74-
* Parse double type parameter.
75-
* Throw IllegalArgumentException if parameter is not a double.
93+
* Parse integer type parameter with positive value.
94+
* Return default value if the parameter is missing.
95+
* Throw IllegalArgumentException if the parameter is not a positive integer.
7696
*/
77-
public static double parseDoubleParameter(final Map<String, Object> parameters, final String fieldName, final double defaultValue) {
97+
public static int parsePositiveIntegerWithDefault(
98+
final Map<String, Object> parameters,
99+
final String fieldName,
100+
final Integer defaultValue
101+
) {
78102
if (!parameters.containsKey(fieldName)) {
79103
// return the default value when parameter is missing
80104
return defaultValue;
81105
}
106+
return parsePositiveInteger(parameters, fieldName);
107+
}
108+
109+
/**
110+
* Parse double type parameter.
111+
* Throw IllegalArgumentException if parameter is not a double.
112+
*/
113+
public static double parseDouble(final Map<String, Object> parameters, final String fieldName) {
82114
String fieldValueString = parameters.get(fieldName).toString();
83115
try {
84116
return NumberUtils.createDouble(fieldValueString);
@@ -88,4 +120,17 @@ public static double parseDoubleParameter(final Map<String, Object> parameters,
88120
);
89121
}
90122
}
123+
124+
/**
125+
* Parse double type parameter.
126+
* Return default value if the parameter is missing.
127+
* Throw IllegalArgumentException if parameter is not a double.
128+
*/
129+
public static double parseDoubleWithDefault(final Map<String, Object> parameters, final String fieldName, final double defaultValue) {
130+
if (!parameters.containsKey(fieldName)) {
131+
// all double parameters are optional
132+
return defaultValue;
133+
}
134+
return parseDouble(parameters, fieldName);
135+
}
91136
}

src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java

+5-7
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,8 @@
88
import java.util.List;
99
import java.util.ArrayList;
1010

11-
import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parseIntegerParameter;
12-
import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parseStringParameter;
11+
import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parseInteger;
12+
import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parseStringWithDefault;
1313

1414
/**
1515
* The implementation {@link Chunker} for delimiter algorithm
@@ -23,7 +23,6 @@ public final class DelimiterChunker implements Chunker {
2323
public static final String DEFAULT_DELIMITER = "\n\n";
2424

2525
private String delimiter;
26-
private int maxChunkLimit;
2726

2827
public DelimiterChunker(final Map<String, Object> parameters) {
2928
parseParameters(parameters);
@@ -39,8 +38,7 @@ public DelimiterChunker(final Map<String, Object> parameters) {
3938
*/
4039
@Override
4140
public void parseParameters(Map<String, Object> parameters) {
42-
this.delimiter = parseStringParameter(parameters, DELIMITER_FIELD, DEFAULT_DELIMITER);
43-
this.maxChunkLimit = parseIntegerParameter(parameters, MAX_CHUNK_LIMIT_FIELD, DEFAULT_MAX_CHUNK_LIMIT);
41+
this.delimiter = parseStringWithDefault(parameters, DELIMITER_FIELD, DEFAULT_DELIMITER);
4442
}
4543

4644
/**
@@ -53,8 +51,8 @@ public void parseParameters(Map<String, Object> parameters) {
5351
*/
5452
@Override
5553
public List<String> chunk(final String content, final Map<String, Object> runtimeParameters) {
56-
int runtimeMaxChunkLimit = parseIntegerParameter(runtimeParameters, MAX_CHUNK_LIMIT_FIELD, maxChunkLimit);
57-
int chunkStringCount = parseIntegerParameter(runtimeParameters, CHUNK_STRING_COUNT_FIELD, 1);
54+
int runtimeMaxChunkLimit = parseInteger(runtimeParameters, MAX_CHUNK_LIMIT_FIELD);
55+
int chunkStringCount = parseInteger(runtimeParameters, CHUNK_STRING_COUNT_FIELD);
5856

5957
List<String> chunkResult = new ArrayList<>();
6058
int start = 0, end;

src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java

+11-14
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,10 @@
1414
import org.opensearch.action.admin.indices.analyze.AnalyzeAction;
1515
import org.opensearch.action.admin.indices.analyze.AnalyzeAction.AnalyzeToken;
1616
import static org.opensearch.action.admin.indices.analyze.TransportAnalyzeAction.analyze;
17-
import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parseStringParameter;
18-
import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parseDoubleParameter;
19-
import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parseIntegerParameter;
20-
import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parsePositiveIntegerParameter;
17+
import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parseInteger;
18+
import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parseStringWithDefault;
19+
import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parseDoubleWithDefault;
20+
import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parsePositiveIntegerWithDefault;
2121

2222
/**
2323
* The implementation {@link Chunker} for fixed token length algorithm.
@@ -33,10 +33,9 @@ public final class FixedTokenLengthChunker implements Chunker {
3333
public static final String MAX_TOKEN_COUNT_FIELD = "max_token_count";
3434
public static final String TOKENIZER_FIELD = "tokenizer";
3535

36-
// default values for each parameter
36+
// default values for each non-runtime parameter
3737
private static final int DEFAULT_TOKEN_LIMIT = 384;
3838
private static final double DEFAULT_OVERLAP_RATE = 0.0;
39-
private static final int DEFAULT_MAX_TOKEN_COUNT = 10000;
4039
private static final String DEFAULT_TOKENIZER = "standard";
4140

4241
// parameter restrictions
@@ -54,7 +53,6 @@ public final class FixedTokenLengthChunker implements Chunker {
5453

5554
// parameter value
5655
private int tokenLimit;
57-
private int maxChunkLimit;
5856
private String tokenizer;
5957
private double overlapRate;
6058
private final AnalysisRegistry analysisRegistry;
@@ -81,10 +79,9 @@ public FixedTokenLengthChunker(final Map<String, Object> parameters) {
8179
*/
8280
@Override
8381
public void parseParameters(Map<String, Object> parameters) {
84-
this.tokenLimit = parsePositiveIntegerParameter(parameters, TOKEN_LIMIT_FIELD, DEFAULT_TOKEN_LIMIT);
85-
this.overlapRate = parseDoubleParameter(parameters, OVERLAP_RATE_FIELD, DEFAULT_OVERLAP_RATE);
86-
this.tokenizer = parseStringParameter(parameters, TOKENIZER_FIELD, DEFAULT_TOKENIZER);
87-
this.maxChunkLimit = parseIntegerParameter(parameters, MAX_CHUNK_LIMIT_FIELD, DEFAULT_MAX_CHUNK_LIMIT);
82+
this.tokenLimit = parsePositiveIntegerWithDefault(parameters, TOKEN_LIMIT_FIELD, DEFAULT_TOKEN_LIMIT);
83+
this.overlapRate = parseDoubleWithDefault(parameters, OVERLAP_RATE_FIELD, DEFAULT_OVERLAP_RATE);
84+
this.tokenizer = parseStringWithDefault(parameters, TOKENIZER_FIELD, DEFAULT_TOKENIZER);
8885
if (overlapRate < OVERLAP_RATE_LOWER_BOUND || overlapRate > OVERLAP_RATE_UPPER_BOUND) {
8986
throw new IllegalArgumentException(
9087
String.format(
@@ -121,9 +118,9 @@ public void parseParameters(Map<String, Object> parameters) {
121118
*/
122119
@Override
123120
public List<String> chunk(final String content, final Map<String, Object> runtimeParameters) {
124-
int maxTokenCount = parsePositiveIntegerParameter(runtimeParameters, MAX_TOKEN_COUNT_FIELD, DEFAULT_MAX_TOKEN_COUNT);
125-
int runtimeMaxChunkLimit = parseIntegerParameter(runtimeParameters, MAX_CHUNK_LIMIT_FIELD, this.maxChunkLimit);
126-
int chunkStringCount = parseIntegerParameter(runtimeParameters, CHUNK_STRING_COUNT_FIELD, 1);
121+
int maxTokenCount = parseInteger(runtimeParameters, MAX_TOKEN_COUNT_FIELD);
122+
int runtimeMaxChunkLimit = parseInteger(runtimeParameters, MAX_CHUNK_LIMIT_FIELD);
123+
int chunkStringCount = parseInteger(runtimeParameters, CHUNK_STRING_COUNT_FIELD);
127124

128125
List<AnalyzeToken> tokens = tokenize(content, tokenizer, maxTokenCount);
129126
List<String> chunkResult = new ArrayList<>();

0 commit comments

Comments
 (0)