Skip to content

Commit a484199

Browse files
authored
Change ingestion input to JSON lines format (#639)
Signed-off-by: Tyler Ohlsen <ohltyler@amazon.com>
1 parent 501ac28 commit a484199

16 files changed

+285
-32
lines changed

common/constants.ts

+1
Original file line numberDiff line numberDiff line change
@@ -266,6 +266,7 @@ export const ML_RESPONSE_PROCESSOR_EXAMPLE_DOCS_LINK =
266266
'https://opensearch.org/docs/latest/search-plugins/search-pipelines/ml-inference-search-response/#example-externally-hosted-text-embedding-model';
267267
export const UPDATE_MODEL_DOCS_LINK =
268268
'https://opensearch.org/docs/latest/ml-commons-plugin/api/model-apis/update-model/';
269+
export const JSONLINES_LINK = 'https://jsonlines.org/';
269270

270271
// Large Language Models Documentation Links
271272
export const BEDROCK_CLAUDE_3_SONNET_DOCS_LINK =

common/interfaces.ts

+1
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ export type ConfigFieldType =
3030
| 'json'
3131
| 'jsonArray'
3232
| 'jsonString'
33+
| 'jsonLines'
3334
| 'select'
3435
| 'model'
3536
| 'map'

common/utils.ts

+4
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,10 @@ export function customStringify(jsonObj: {} | []): string {
4040
return JSON.stringify(jsonObj, undefined, 2);
4141
}
4242

43+
/**
 * Serializes a value as compact, single-line JSON (no indentation and no
 * literal line breaks; newlines inside string values are escaped by
 * JSON.stringify), which makes the result usable as one JSON Lines record.
 *
 * @param jsonObj the value to serialize
 * @returns the compact JSON text, or an empty string when the value has no
 *          JSON representation
 */
export function customStringifySingleLine(jsonObj: {}): string {
  // JSON.stringify is typed as returning string, but at runtime it yields
  // undefined for inputs with no JSON form (undefined, functions, symbols).
  // Coalesce so callers that append '\n' never emit a literal "undefined" line.
  const serialized: string | undefined = JSON.stringify(jsonObj);
  return serialized ?? '';
}
46+
4347
export function isVectorSearchUseCase(workflow: Workflow | undefined): boolean {
4448
return (
4549
workflow?.ui_metadata?.type !== undefined &&

public/pages/workflow_detail/workflow_inputs/ingest_inputs/source_data.tsx

+2-1
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,8 @@ export function SourceData(props: SourceDataProps) {
4444
// empty/populated docs state
4545
let docs = [];
4646
try {
47-
docs = JSON.parse(getIn(values, 'ingest.docs', []));
47+
const lines = getIn(values, 'ingest.docs', '').split('\n') as string[];
48+
lines.forEach((line) => docs.push(JSON.parse(line)));
4849
} catch {}
4950
const docsPopulated = docs.length > 0;
5051

public/pages/workflow_detail/workflow_inputs/ingest_inputs/source_data_modal.tsx

+26-9
Original file line numberDiff line numberDiff line change
@@ -21,15 +21,17 @@ import {
2121
EuiSmallButtonEmpty,
2222
EuiButtonGroup,
2323
EuiCompressedComboBox,
24+
EuiLink,
2425
} from '@elastic/eui';
25-
import { JsonField } from '../input_fields';
26+
import { JsonLinesField } from '../input_fields';
2627
import {
27-
customStringify,
28+
customStringifySingleLine,
2829
FETCH_ALL_QUERY_LARGE,
2930
IConfigField,
3031
IndexMappings,
3132
IngestDocsFormValues,
3233
isVectorSearchUseCase,
34+
JSONLINES_LINK,
3335
MAX_BYTES_FORMATTED,
3436
MAX_DOCS_TO_IMPORT,
3537
SearchHit,
@@ -72,11 +74,11 @@ export function SourceDataModal(props: SourceDataProps) {
7274

7375
// sub-form values/schema
7476
const docsFormValues = {
75-
docs: getInitialValue('jsonArray'),
77+
docs: getInitialValue('jsonLines'),
7678
} as IngestDocsFormValues;
7779
const docsFormSchema = yup.object({
7880
docs: getFieldSchema({
79-
type: 'jsonArray',
81+
type: 'jsonLines',
8082
} as IConfigField),
8183
}) as yup.Schema;
8284

@@ -177,8 +179,14 @@ export function SourceDataModal(props: SourceDataProps) {
177179
.then((resp) => {
178180
const docObjs = resp?.hits?.hits
179181
?.slice(0, MAX_DOCS_TO_IMPORT)
180-
?.map((hit: SearchHit) => hit?._source);
181-
formikProps.setFieldValue('docs', customStringify(docObjs));
182+
?.map((hit: SearchHit) => hit?._source) as {}[];
183+
let jsonLinesStr = '';
184+
try {
185+
docObjs.forEach((docObj) => {
186+
jsonLinesStr += customStringifySingleLine(docObj) + '\n';
187+
});
188+
} catch {}
189+
formikProps.setFieldValue('docs', jsonLinesStr);
182190
});
183191
}
184192
}, [selectedIndex]);
@@ -234,7 +242,7 @@ export function SourceDataModal(props: SourceDataProps) {
234242
{props.selectedOption === SOURCE_OPTIONS.UPLOAD && (
235243
<>
236244
<EuiCompressedFilePicker
237-
accept="application/json"
245+
accept=".jsonl"
238246
multiple={false}
239247
initialPromptText="Upload file"
240248
onChange={(files) => {
@@ -247,6 +255,7 @@ export function SourceDataModal(props: SourceDataProps) {
247255
'docs',
248256
e.target.result as string
249257
);
258+
formikProps.setFieldTouched('docs');
250259
}
251260
};
252261
fileReader.readAsText(files[0]);
@@ -286,12 +295,20 @@ export function SourceDataModal(props: SourceDataProps) {
286295
<EuiSpacer size="xs" />
287296
</>
288297
)}
289-
<JsonField
298+
<JsonLinesField
290299
label="Documents to be imported"
291300
fieldPath={'docs'}
292-
helpText="Documents must be in a JSON array format."
301+
helpText={
302+
<EuiText size="s">
303+
Documents must be in JSON lines format.{' '}
304+
<EuiLink href={JSONLINES_LINK} target="_blank">
305+
Learn more
306+
</EuiLink>
307+
</EuiText>
308+
}
293309
editorHeight="40vh"
294310
readOnly={false}
311+
validate={true}
295312
/>
296313
</>
297314
</EuiModalBody>

public/pages/workflow_detail/workflow_inputs/input_fields/index.ts

+1
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
export { TextField } from './text_field';
77
export { JsonField } from './json_field';
8+
export { JsonLinesField } from './json_lines_field';
89
export { ModelField } from './model_field';
910
export { MapField } from './map_field';
1011
export { MapArrayField } from './map_array_field';

public/pages/workflow_detail/workflow_inputs/input_fields/json_field.tsx

+1-1
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ interface JsonFieldProps {
2929
* in some custom JSON
3030
*/
3131
export function JsonField(props: JsonFieldProps) {
32-
const validate = props.validate !== undefined ? props.validate : true;
32+
const validate = props.validate ?? true;
3333

3434
const { errors, touched, values } = useFormikContext<WorkflowFormValues>();
3535

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,181 @@
1+
/*
2+
* Copyright OpenSearch Contributors
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
6+
import React, { ReactNode, useEffect, useState } from 'react';
7+
import { Field, FieldProps, getIn, useFormikContext } from 'formik';
8+
import { isEmpty } from 'lodash';
9+
import {
10+
EuiCodeEditor,
11+
EuiCompressedFormRow,
12+
EuiLink,
13+
EuiText,
14+
} from '@elastic/eui';
15+
import {
16+
customStringifySingleLine,
17+
WorkflowFormValues,
18+
} from '../../../../../common';
19+
import { camelCaseToTitleString } from '../../../../utils';
20+
21+
/**
 * Props accepted by the JsonLinesField component defined in this file.
 */
interface JsonLinesFieldProps {
22+
fieldPath: string; // the full path in string-form to the field (e.g., 'ingest.enrich.processors.text_embedding_processor.inputField')
23+
// whether validation errors are surfaced on the form row; treated as true when omitted
validate?: boolean;
24+
// label for the form row; when omitted, a title-cased version of the field name is used
label?: string;
25+
// when set, a "Learn more" link to this URL is appended to the label
helpLink?: string;
26+
// help content passed through to the form row
helpText?: string | ReactNode;
27+
// height of the code editor; '15vh' is used when omitted
editorHeight?: string;
28+
// makes the editor read-only (and turns off its highlight options); false when omitted
readOnly?: boolean;
29+
}
30+
31+
/**
 * An input field for a component where users input data in JSON Lines format.
 * https://jsonlines.org/
 *
 * The editor keeps the raw text in local state while the user types and
 * mirrors it into the formik field on every change. When the editor loses
 * focus, each non-blank line is parsed: if every line is valid JSON the field
 * value is replaced with the compact, normalized text; otherwise the
 * per-line parse errors are shown and the field value is left untouched.
 */
export function JsonLinesField(props: JsonLinesFieldProps) {
  const validate = props.validate ?? true;

  const { errors, touched, values } = useFormikContext<WorkflowFormValues>();

  // temp input state. only format when users click out of the code editor
  const [jsonStr, setJsonStr] = useState<string>('{}');
  const [customErrMsg, setCustomErrMsg] = useState<string | undefined>(
    undefined
  );

  // initializing the text to be the stringified form value
  useEffect(() => {
    if (props.fieldPath && values) {
      const formValue = getIn(values, props.fieldPath) as string;
      if (formValue) {
        setJsonStr(formValue);
      }
    }
  }, [props.fieldPath, values]);

  return (
    <Field name={props.fieldPath}>
      {({ field, form }: FieldProps) => {
        return (
          <EuiCompressedFormRow
            fullWidth={true}
            key={props.fieldPath}
            label={props.label || camelCaseToTitleString(field.name)}
            labelAppend={
              props.helpLink ? (
                <EuiText size="xs">
                  <EuiLink href={props.helpLink} target="_blank">
                    Learn more
                  </EuiLink>
                </EuiText>
              ) : undefined
            }
            helpText={props.helpText || undefined}
            error={
              validate ? (
                <>
                  {customErrMsg?.split('\n')?.map((errMsg, idx) => {
                    return (
                      <EuiText key={idx} color="danger" size="s">
                        {errMsg}
                      </EuiText>
                    );
                  })}
                </>
              ) : undefined
            }
            isInvalid={
              validate
                ? getIn(errors, field.name) && getIn(touched, field.name)
                : false
            }
          >
            <EuiCodeEditor
              mode="hjson"
              theme="textmate"
              width="100%"
              height={props.editorHeight || '15vh'}
              value={jsonStr}
              onChange={(input) => {
                setJsonStr(input);
                form.setFieldValue(field.name, input);
                // stale line errors no longer describe the edited text
                setCustomErrMsg(undefined);
              }}
              onBlur={() => {
                form.setFieldTouched(field.name);
                try {
                  const { formatted, lineErrors } = normalizeJsonLines(jsonStr);
                  if (lineErrors.length > 0) {
                    // keep the user's text as-is so they can fix the bad lines
                    setCustomErrMsg(getFormattedErrorMsgList(lineErrors));
                  } else {
                    form.setFieldValue(field.name, formatted);
                    setCustomErrMsg(undefined);
                  }
                } catch {
                  // best-effort formatting: an unexpected failure (e.g. a
                  // non-string editor value) must not break the blur handler
                }
              }}
              readOnly={props.readOnly || false}
              setOptions={{
                fontSize: '14px',
                useWorker: false,
                highlightActiveLine: !props.readOnly,
                highlightSelectedWord: !props.readOnly,
                highlightGutterLine: !props.readOnly,
                wrap: true,
              }}
              aria-label="Code Editor"
            />
          </EuiCompressedFormRow>
        );
      }}
    </Field>
  );
}

// Parse JSON Lines text once per line, returning the compact normalized text
// plus one formatted error message per line that failed to parse.
function normalizeJsonLines(
  input: string
): { formatted: string; lineErrors: string[] } {
  const formattedLines: string[] = [];
  const lineErrors: string[] = [];
  input.split('\n').forEach((line, idx) => {
    // blank lines are ignored entirely
    if (line.trim() === '') {
      return;
    }
    let parsedLine: unknown;
    try {
      parsedLine = JSON.parse(line);
    } catch (error) {
      // line numbers shown to the user are 1-based
      lineErrors.push(getFormattedErrorMsg(error as Error, idx + 1));
      return;
    }
    // lines whose parsed value lodash isEmpty() treats as empty (e.g. `{}`
    // or `[]`) are dropped from the normalized output
    if (!isEmpty(parsedLine)) {
      formattedLines.push(customStringifySingleLine(parsedLine as {}));
    }
  });
  return { formatted: formattedLines.join('\n'), lineErrors };
}
156+
157+
/**
 * Parse out the useful information from an error triggered during JSON
 * parsing failure.
 *
 * Trailing " in JSON ..." / " after JSON ..." suffixes are stripped from the
 * error message so only the leading description remains.
 *
 * @param error the error thrown while parsing a line
 * @param idx the line number to report in the message
 * @returns a message of the form "Error on line <idx>: <description>"
 */
function getFormattedErrorMsg(error: Error, idx: number): string {
  // The value comes from a catch clause, so do not trust it to be a real Error
  // with a string message; fall back to a generic description otherwise.
  const rawMsg: unknown = (error as { message?: unknown } | null | undefined)
    ?.message;
  const msg = typeof rawMsg === 'string' ? rawMsg : 'Invalid JSON';
  const description = msg
    .replace(/^(.*?)\s+in JSON.*/, '$1')
    .replace(/^(.*?)\s+after JSON.*/, '$1');
  return `Error on line ${idx}: ${description}`;
}
163+
164+
/**
 * Verbosely display a few error messages, list the count of remaining ones.
 *
 * @param errors the individual error messages, in display order
 * @param maxVerbose how many messages to show in full (defaults to 3)
 * @returns the shown messages separated by newlines, followed by an
 *          "N more error(s)" line when some messages were left out
 */
function getFormattedErrorMsgList(
  errors: string[],
  maxVerbose: number = 3
): string {
  // guard against negative/fractional limits, which slice() would misread
  const limit = Math.max(0, Math.floor(maxVerbose));
  const shownLines = errors.slice(0, limit);
  const hiddenCount = errors.length - shownLines.length;
  if (hiddenCount > 0) {
    shownLines.push(`${hiddenCount} more error${hiddenCount > 1 ? 's' : ''}`);
  }
  return shownLines.join('\n');
}

public/pages/workflow_detail/workflow_inputs/processor_inputs/ml_processor_inputs/modals/configure_expression_modal.tsx

+9-4
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,8 @@ export function ConfigureExpressionModal(props: ConfigureExpressionModalProps) {
123123
const docs = getIn(values, 'ingest.docs');
124124
let docObjs = [] as {}[] | undefined;
125125
try {
126-
docObjs = JSON.parse(docs);
126+
const lines = docs?.split('\n') as string[];
127+
lines.forEach((line) => docObjs?.push(JSON.parse(line)));
127128
} catch {}
128129
const query = getIn(values, 'search.request');
129130
let queryObj = {} as {} | undefined;
@@ -465,9 +466,13 @@ export function ConfigureExpressionModal(props: ConfigureExpressionModalProps) {
465466
});
466467
} else {
467468
try {
468-
const docObjs = JSON.parse(
469-
values.ingest.docs
470-
) as {}[];
469+
const docObjs = [] as {}[];
470+
const lines = values?.ingest?.docs?.split(
471+
'\n'
472+
) as string[];
473+
lines.forEach((line) =>
474+
docObjs?.push(JSON.parse(line))
475+
);
471476
if (docObjs.length > 0) {
472477
setSourceInput(
473478
customStringify(docObjs[0])

public/pages/workflow_detail/workflow_inputs/processor_inputs/ml_processor_inputs/modals/configure_multi_expression_modal.tsx

+2-1
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,8 @@ export function ConfigureMultiExpressionModal(
129129
const docs = getIn(values, 'ingest.docs');
130130
let docObjs = [] as {}[] | undefined;
131131
try {
132-
docObjs = JSON.parse(docs);
132+
const lines = docs?.split('\n') as string[];
133+
lines.forEach((line) => docObjs?.push(JSON.parse(line)));
133134
} catch {}
134135
const query = getIn(values, 'search.request');
135136
let queryObj = {} as {} | undefined;

public/pages/workflow_detail/workflow_inputs/processor_inputs/ml_processor_inputs/modals/configure_template_modal.tsx

+2-1
Original file line numberDiff line numberDiff line change
@@ -149,7 +149,8 @@ export function ConfigureTemplateModal(props: ConfigureTemplateModalProps) {
149149
const docs = getIn(values, 'ingest.docs');
150150
let docObjs = [] as {}[] | undefined;
151151
try {
152-
docObjs = JSON.parse(docs);
152+
const lines = docs?.split('\n') as string[];
153+
lines.forEach((line) => docObjs?.push(JSON.parse(line)));
153154
} catch {}
154155
const query = getIn(values, 'search.request');
155156
let queryObj = {} as {} | undefined;

public/pages/workflow_detail/workflow_inputs/workflow_inputs.tsx

+5-3
Original file line numberDiff line numberDiff line change
@@ -275,8 +275,9 @@ export function WorkflowInputs(props: WorkflowInputsProps) {
275275
useEffect(() => {
276276
let parsedDocsObjs = [] as {}[];
277277
try {
278-
parsedDocsObjs = JSON.parse(props.ingestDocs);
279-
} catch (e) {}
278+
const lines = props.ingestDocs?.split('\n') as string[];
279+
lines.forEach((line) => parsedDocsObjs.push(JSON.parse(line)));
280+
} catch {}
280281
setDocsPopulated(parsedDocsObjs.length > 0 && !isEmpty(parsedDocsObjs[0]));
281282
}, [props.ingestDocs]);
282283

@@ -607,7 +608,8 @@ export function WorkflowInputs(props: WorkflowInputsProps) {
607608
try {
608609
let ingestDocsObjs = [] as {}[];
609610
try {
610-
ingestDocsObjs = JSON.parse(props.ingestDocs);
611+
const lines = props.ingestDocs?.split('\n') as string[];
612+
lines.forEach((line) => ingestDocsObjs.push(JSON.parse(line)));
611613
} catch (e) {}
612614
if (ingestDocsObjs.length > 0 && !isEmpty(ingestDocsObjs[0])) {
613615
success = await validateAndUpdateWorkflow(false, true, false);

0 commit comments

Comments
 (0)