Skip to content

Commit a91843c

Browse files
offline batch ingestion API actions and data ingesters (opensearch-project#2844) (opensearch-project#2885)
* batch ingest API rest and transport actions Signed-off-by: Xun Zhang <xunzh@amazon.com> * add openAI ingester Signed-off-by: Xun Zhang <xunzh@amazon.com> * update batch ingestion field mapping interphase and address comments Signed-off-by: Xun Zhang <xunzh@amazon.com> * support multiple data sources as ingestion inputs Signed-off-by: Xun Zhang <xunzh@amazon.com> * use dedicated thread pool for ingestion Signed-off-by: Xun Zhang <xunzh@amazon.com> --------- Signed-off-by: Xun Zhang <xunzh@amazon.com> (cherry picked from commit 33a7c96) Co-authored-by: Xun Zhang <xunzh@amazon.com>
1 parent 41596c8 commit a91843c

26 files changed

+2314
-5
lines changed

common/src/main/java/org/opensearch/ml/common/MLTaskType.java

+2-1
Original file line numberDiff line numberDiff line change
@@ -15,5 +15,6 @@ public enum MLTaskType {
1515
@Deprecated
1616
LOAD_MODEL,
1717
REGISTER_MODEL,
18-
DEPLOY_MODEL
18+
DEPLOY_MODEL,
19+
BATCH_INGEST
1920
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
/*
2+
* Copyright OpenSearch Contributors
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
6+
package org.opensearch.ml.common.transport.batch;
7+
8+
import org.opensearch.action.ActionType;
9+
10+
public class MLBatchIngestionAction extends ActionType<MLBatchIngestionResponse> {
11+
public static MLBatchIngestionAction INSTANCE = new MLBatchIngestionAction();
12+
public static final String NAME = "cluster:admin/opensearch/ml/batch_ingestion";
13+
14+
private MLBatchIngestionAction() {
15+
super(NAME, MLBatchIngestionResponse::new);
16+
}
17+
18+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
/*
2+
* Copyright OpenSearch Contributors
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
6+
package org.opensearch.ml.common.transport.batch;
7+
8+
import static org.opensearch.core.xcontent.XContentParserUtils.ensureExpectedToken;
9+
10+
import java.io.IOException;
11+
import java.util.HashMap;
12+
import java.util.Map;
13+
14+
import org.opensearch.core.common.io.stream.StreamInput;
15+
import org.opensearch.core.common.io.stream.StreamOutput;
16+
import org.opensearch.core.common.io.stream.Writeable;
17+
import org.opensearch.core.xcontent.ToXContentObject;
18+
import org.opensearch.core.xcontent.XContentBuilder;
19+
import org.opensearch.core.xcontent.XContentParser;
20+
21+
import lombok.Builder;
22+
import lombok.Getter;
23+
24+
/**
25+
* ML batch ingestion data: index, field mapping and input and out files.
26+
*/
27+
public class MLBatchIngestionInput implements ToXContentObject, Writeable {
28+
29+
public static final String INDEX_NAME_FIELD = "index_name";
30+
public static final String FIELD_MAP_FIELD = "field_map";
31+
public static final String DATA_SOURCE_FIELD = "data_source";
32+
public static final String CONNECTOR_CREDENTIAL_FIELD = "credential";
33+
@Getter
34+
private String indexName;
35+
@Getter
36+
private Map<String, Object> fieldMapping;
37+
@Getter
38+
private Map<String, Object> dataSources;
39+
@Getter
40+
private Map<String, String> credential;
41+
42+
@Builder(toBuilder = true)
43+
public MLBatchIngestionInput(
44+
String indexName,
45+
Map<String, Object> fieldMapping,
46+
Map<String, Object> dataSources,
47+
Map<String, String> credential
48+
) {
49+
if (indexName == null) {
50+
throw new IllegalArgumentException(
51+
"The index name for data ingestion is missing. Please provide a valid index name to proceed."
52+
);
53+
}
54+
if (dataSources == null) {
55+
throw new IllegalArgumentException(
56+
"No data sources were provided for ingestion. Please specify at least one valid data source to proceed."
57+
);
58+
}
59+
this.indexName = indexName;
60+
this.fieldMapping = fieldMapping;
61+
this.dataSources = dataSources;
62+
this.credential = credential;
63+
}
64+
65+
public static MLBatchIngestionInput parse(XContentParser parser) throws IOException {
66+
String indexName = null;
67+
Map<String, Object> fieldMapping = null;
68+
Map<String, Object> dataSources = null;
69+
Map<String, String> credential = new HashMap<>();
70+
71+
ensureExpectedToken(XContentParser.Token.START_OBJECT, parser.currentToken(), parser);
72+
while (parser.nextToken() != XContentParser.Token.END_OBJECT) {
73+
String fieldName = parser.currentName();
74+
parser.nextToken();
75+
76+
switch (fieldName) {
77+
case INDEX_NAME_FIELD:
78+
indexName = parser.text();
79+
break;
80+
case FIELD_MAP_FIELD:
81+
fieldMapping = parser.map();
82+
break;
83+
case CONNECTOR_CREDENTIAL_FIELD:
84+
credential = parser.mapStrings();
85+
break;
86+
case DATA_SOURCE_FIELD:
87+
dataSources = parser.map();
88+
break;
89+
default:
90+
parser.skipChildren();
91+
break;
92+
}
93+
}
94+
return new MLBatchIngestionInput(indexName, fieldMapping, dataSources, credential);
95+
}
96+
97+
@Override
98+
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
99+
builder.startObject();
100+
if (indexName != null) {
101+
builder.field(INDEX_NAME_FIELD, indexName);
102+
}
103+
if (fieldMapping != null) {
104+
builder.field(FIELD_MAP_FIELD, fieldMapping);
105+
}
106+
if (credential != null) {
107+
builder.field(CONNECTOR_CREDENTIAL_FIELD, credential);
108+
}
109+
if (dataSources != null) {
110+
builder.field(DATA_SOURCE_FIELD, dataSources);
111+
}
112+
builder.endObject();
113+
return builder;
114+
}
115+
116+
@Override
117+
public void writeTo(StreamOutput output) throws IOException {
118+
output.writeOptionalString(indexName);
119+
if (fieldMapping != null) {
120+
output.writeBoolean(true);
121+
output.writeMap(fieldMapping, StreamOutput::writeString, StreamOutput::writeGenericValue);
122+
} else {
123+
output.writeBoolean(false);
124+
}
125+
if (credential != null) {
126+
output.writeBoolean(true);
127+
output.writeMap(credential, StreamOutput::writeString, StreamOutput::writeString);
128+
} else {
129+
output.writeBoolean(false);
130+
}
131+
if (dataSources != null) {
132+
output.writeBoolean(true);
133+
output.writeMap(dataSources, StreamOutput::writeString, StreamOutput::writeGenericValue);
134+
} else {
135+
output.writeBoolean(false);
136+
}
137+
}
138+
139+
public MLBatchIngestionInput(StreamInput input) throws IOException {
140+
indexName = input.readOptionalString();
141+
if (input.readBoolean()) {
142+
fieldMapping = input.readMap(s -> s.readString(), s -> s.readGenericValue());
143+
}
144+
if (input.readBoolean()) {
145+
credential = input.readMap(s -> s.readString(), s -> s.readString());
146+
}
147+
if (input.readBoolean()) {
148+
dataSources = input.readMap(s -> s.readString(), s -> s.readGenericValue());
149+
}
150+
}
151+
152+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
/*
2+
* Copyright OpenSearch Contributors
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
6+
package org.opensearch.ml.common.transport.batch;
7+
8+
import static org.opensearch.action.ValidateActions.addValidationError;
9+
10+
import java.io.ByteArrayInputStream;
11+
import java.io.ByteArrayOutputStream;
12+
import java.io.IOException;
13+
import java.io.UncheckedIOException;
14+
15+
import org.opensearch.action.ActionRequest;
16+
import org.opensearch.action.ActionRequestValidationException;
17+
import org.opensearch.core.common.io.stream.InputStreamStreamInput;
18+
import org.opensearch.core.common.io.stream.OutputStreamStreamOutput;
19+
import org.opensearch.core.common.io.stream.StreamInput;
20+
import org.opensearch.core.common.io.stream.StreamOutput;
21+
22+
import lombok.AccessLevel;
23+
import lombok.Builder;
24+
import lombok.Getter;
25+
import lombok.ToString;
26+
import lombok.experimental.FieldDefaults;
27+
28+
@Getter
29+
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
30+
@ToString
31+
public class MLBatchIngestionRequest extends ActionRequest {
32+
33+
private MLBatchIngestionInput mlBatchIngestionInput;
34+
35+
@Builder
36+
public MLBatchIngestionRequest(MLBatchIngestionInput mlBatchIngestionInput) {
37+
this.mlBatchIngestionInput = mlBatchIngestionInput;
38+
}
39+
40+
public MLBatchIngestionRequest(StreamInput in) throws IOException {
41+
super(in);
42+
this.mlBatchIngestionInput = new MLBatchIngestionInput(in);
43+
}
44+
45+
@Override
46+
public void writeTo(StreamOutput out) throws IOException {
47+
super.writeTo(out);
48+
this.mlBatchIngestionInput.writeTo(out);
49+
}
50+
51+
@Override
52+
public ActionRequestValidationException validate() {
53+
ActionRequestValidationException exception = null;
54+
if (mlBatchIngestionInput == null) {
55+
exception = addValidationError("The input for ML batch ingestion cannot be null.", exception);
56+
}
57+
if (mlBatchIngestionInput != null && mlBatchIngestionInput.getCredential() == null) {
58+
exception = addValidationError("The credential for ML batch ingestion cannot be null", exception);
59+
}
60+
if (mlBatchIngestionInput != null && mlBatchIngestionInput.getDataSources() == null) {
61+
exception = addValidationError("The data sources for ML batch ingestion cannot be null", exception);
62+
}
63+
64+
return exception;
65+
}
66+
67+
public static MLBatchIngestionRequest fromActionRequest(ActionRequest actionRequest) {
68+
if (actionRequest instanceof MLBatchIngestionRequest) {
69+
return (MLBatchIngestionRequest) actionRequest;
70+
}
71+
72+
try (ByteArrayOutputStream baos = new ByteArrayOutputStream(); OutputStreamStreamOutput osso = new OutputStreamStreamOutput(baos)) {
73+
actionRequest.writeTo(osso);
74+
try (StreamInput input = new InputStreamStreamInput(new ByteArrayInputStream(baos.toByteArray()))) {
75+
return new MLBatchIngestionRequest(input);
76+
}
77+
} catch (IOException e) {
78+
throw new UncheckedIOException("failed to parse ActionRequest into MLBatchIngestionRequest", e);
79+
}
80+
81+
}
82+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
/*
2+
* Copyright OpenSearch Contributors
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
6+
package org.opensearch.ml.common.transport.batch;
7+
8+
import java.io.ByteArrayInputStream;
9+
import java.io.ByteArrayOutputStream;
10+
import java.io.IOException;
11+
import java.io.UncheckedIOException;
12+
13+
import org.opensearch.core.action.ActionResponse;
14+
import org.opensearch.core.common.io.stream.InputStreamStreamInput;
15+
import org.opensearch.core.common.io.stream.OutputStreamStreamOutput;
16+
import org.opensearch.core.common.io.stream.StreamInput;
17+
import org.opensearch.core.common.io.stream.StreamOutput;
18+
import org.opensearch.core.xcontent.ToXContent;
19+
import org.opensearch.core.xcontent.ToXContentObject;
20+
import org.opensearch.core.xcontent.XContentBuilder;
21+
import org.opensearch.ml.common.MLTaskType;
22+
23+
import lombok.Getter;
24+
25+
@Getter
26+
public class MLBatchIngestionResponse extends ActionResponse implements ToXContentObject {
27+
public static final String TASK_ID_FIELD = "task_id";
28+
public static final String TASK_TYPE_FIELD = "task_type";
29+
public static final String STATUS_FIELD = "status";
30+
31+
private String taskId;
32+
private MLTaskType taskType;
33+
private String status;
34+
35+
public MLBatchIngestionResponse(StreamInput in) throws IOException {
36+
super(in);
37+
this.taskId = in.readString();
38+
this.taskType = in.readEnum(MLTaskType.class);
39+
this.status = in.readString();
40+
}
41+
42+
public MLBatchIngestionResponse(String taskId, MLTaskType mlTaskType, String status) {
43+
this.taskId = taskId;
44+
this.taskType = mlTaskType;
45+
this.status = status;
46+
}
47+
48+
@Override
49+
public void writeTo(StreamOutput out) throws IOException {
50+
out.writeString(taskId);
51+
out.writeEnum(taskType);
52+
out.writeString(status);
53+
}
54+
55+
@Override
56+
public XContentBuilder toXContent(XContentBuilder builder, ToXContent.Params params) throws IOException {
57+
builder.startObject();
58+
builder.field(TASK_ID_FIELD, taskId);
59+
if (taskType != null) {
60+
builder.field(TASK_TYPE_FIELD, taskType);
61+
}
62+
builder.field(STATUS_FIELD, status);
63+
builder.endObject();
64+
return builder;
65+
}
66+
67+
public static MLBatchIngestionResponse fromActionResponse(ActionResponse actionResponse) {
68+
if (actionResponse instanceof MLBatchIngestionResponse) {
69+
return (MLBatchIngestionResponse) actionResponse;
70+
}
71+
72+
try (ByteArrayOutputStream baos = new ByteArrayOutputStream(); OutputStreamStreamOutput osso = new OutputStreamStreamOutput(baos)) {
73+
actionResponse.writeTo(osso);
74+
try (StreamInput input = new InputStreamStreamInput(new ByteArrayInputStream(baos.toByteArray()))) {
75+
return new MLBatchIngestionResponse(input);
76+
}
77+
} catch (IOException e) {
78+
throw new UncheckedIOException("failed to parse ActionResponse into MLBatchIngestionResponse", e);
79+
}
80+
}
81+
}

common/src/main/java/org/opensearch/ml/common/utils/StringUtils.java

+14
Original file line numberDiff line numberDiff line change
@@ -279,4 +279,18 @@ public static Map<String, String> parseParameters(Map<String, String> parameters
279279
return parameters;
280280
}
281281

282+
public static String obtainFieldNameFromJsonPath(String jsonPath) {
283+
String[] parts = jsonPath.split("\\.");
284+
285+
// Get the last part which is the field name
286+
return parts[parts.length - 1];
287+
}
288+
289+
public static String getJsonPath(String jsonPathWithSource) {
290+
// Find the index of the first occurrence of "$."
291+
int startIndex = jsonPathWithSource.indexOf("$.");
292+
293+
// Extract the substring from the startIndex to the end of the input string
294+
return (startIndex != -1) ? jsonPathWithSource.substring(startIndex) : jsonPathWithSource;
295+
}
282296
}

0 commit comments

Comments
 (0)