Skip to content

Commit 4d5f50c

Browse files
authored
Onboard neural sparse search (#141)
Signed-off-by: Tyler Ohlsen <ohltyler@amazon.com>
1 parent e18bb11 commit 4d5f50c

23 files changed

+576
-119
lines changed

common/constants.ts

+34-1
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ import {
77
MODEL_ALGORITHM,
88
PRETRAINED_MODEL_FORMAT,
99
PretrainedSentenceTransformer,
10+
PretrainedSparseEncodingModel,
1011
WORKFLOW_STATE,
1112
} from './interfaces';
1213

@@ -61,11 +62,15 @@ export const CREATE_INGEST_PIPELINE_STEP_TYPE = 'create_ingest_pipeline';
6162
// Step type identifier strings. Each constant holds the literal string
// written on its right-hand side.
// NOTE(review): presumably matched against the `type` of template nodes —
// confirm against the code that builds/parses templates.

// Step that creates an index.
export const CREATE_INDEX_STEP_TYPE = 'create_index';

// Step that registers a local pretrained model.
export const REGISTER_LOCAL_PRETRAINED_MODEL_STEP_TYPE =
  'register_local_pretrained_model';

// Step that registers a local sparse encoding model.
export const REGISTER_LOCAL_SPARSE_ENCODING_MODEL_STEP_TYPE =
  'register_local_sparse_encoding_model';
6467

6568
/**
6669
* ML PLUGIN PRETRAINED MODELS
67-
* (based off of https://opensearch.org/docs/latest/ml-commons-plugin/pretrained-models/#sentence-transformers)
70+
* (based off of https://opensearch.org/docs/latest/ml-commons-plugin/pretrained-models)
6871
*/
72+
73+
// ---- SENTENCE TRANSFORMERS ----
6974
export const ROBERTA_SENTENCE_TRANSFORMER = {
7075
name: 'huggingface/sentence-transformers/all-distilroberta-v1',
7176
shortenedName: 'all-distilroberta-v1',
@@ -96,6 +101,34 @@ export const BERT_SENTENCE_TRANSFORMER = {
96101
vectorDimensions: 768,
97102
} as PretrainedSentenceTransformer;
98103

104+
// ---- SPARSE ENCODERS ----
105+
/**
 * Pretrained sparse encoding model entry for
 * `amazon/neural-sparse/opensearch-neural-sparse-encoding-v1`.
 *
 * `name` is the full model path; `shortenedName` is its last path segment.
 * The literal is cast with `as` rather than annotated, so the compiler does
 * not report fields of PretrainedSparseEncodingModel that are missing here.
 */
export const NEURAL_SPARSE_TRANSFORMER = {
  name: 'amazon/neural-sparse/opensearch-neural-sparse-encoding-v1',
  shortenedName: 'opensearch-neural-sparse-encoding-v1',
  description: 'A general neural sparse encoding model',
  format: PRETRAINED_MODEL_FORMAT.TORCH_SCRIPT,
  algorithm: MODEL_ALGORITHM.SPARSE_ENCODING,
  version: '1.0.1',
} as PretrainedSparseEncodingModel;
113+
114+
/**
 * Pretrained sparse encoding model entry for
 * `amazon/neural-sparse/opensearch-neural-sparse-encoding-doc-v1`.
 *
 * `name` is the full model path; `shortenedName` is its last path segment.
 * NOTE(review): `description` is the same text as on
 * NEURAL_SPARSE_TRANSFORMER — confirm whether the "doc" variant should be
 * described differently wherever descriptions are shown.
 */
export const NEURAL_SPARSE_DOC_TRANSFORMER = {
  name: 'amazon/neural-sparse/opensearch-neural-sparse-encoding-doc-v1',
  shortenedName: 'opensearch-neural-sparse-encoding-doc-v1',
  description: 'A general neural sparse encoding model',
  format: PRETRAINED_MODEL_FORMAT.TORCH_SCRIPT,
  algorithm: MODEL_ALGORITHM.SPARSE_ENCODING,
  version: '1.0.1',
} as PretrainedSparseEncodingModel;
122+
123+
/**
 * Pretrained sparse tokenizer model entry for
 * `amazon/neural-sparse/opensearch-neural-sparse-tokenizer-v1`.
 *
 * `name` is the full model path; `shortenedName` is its last path segment.
 * NOTE(review): `algorithm` is SPARSE_ENCODING, the same value used by the
 * two encoding models. The ML Commons pretrained-model docs appear to
 * register the tokenizer under a separate SPARSE_TOKENIZE function name —
 * confirm which value is expected before relying on this entry.
 */
export const NEURAL_SPARSE_TOKENIZER_TRANSFORMER = {
  name: 'amazon/neural-sparse/opensearch-neural-sparse-tokenizer-v1',
  shortenedName: 'opensearch-neural-sparse-tokenizer-v1',
  description: 'A neural sparse tokenizer model',
  format: PRETRAINED_MODEL_FORMAT.TORCH_SCRIPT,
  algorithm: MODEL_ALGORITHM.SPARSE_ENCODING,
  version: '1.0.1',
} as PretrainedSparseEncodingModel;
131+
99132
/**
100133
* MISCELLANEOUS
101134
*/

common/interfaces.ts

+25-1
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
import { Node, Edge } from 'reactflow';
77
import { IComponentData } from '../public/component_types';
8+
import { COMPONENT_CLASS } from '../public/utils';
89

910
export type Index = {
1011
name: string;
@@ -16,7 +17,11 @@ export type Index = {
1617
*/
1718

1819
export type ReactFlowComponent = Node<IComponentData>;
19-
export type ReactFlowEdge = Edge<{}> & {};
20+
/**
 * Edge type used alongside ReactFlowComponent: reactflow's `Edge<{}>`
 * extended with three required fields.
 *
 * - `key`: string key for the edge.
 * - `sourceClasses` / `targetClasses`: COMPONENT_CLASS lists.
 *   NOTE(review): presumably the classes of the handles on the source and
 *   target side of the connection — confirm against the edge-creation code.
 */
export type ReactFlowEdge = Edge<{}> & {
  key: string;
  sourceClasses: COMPONENT_CLASS[];
  targetClasses: COMPONENT_CLASS[];
};
2025

2126
type ReactFlowViewport = {
2227
x: number;
@@ -49,6 +54,22 @@ export type TextEmbeddingProcessor = IngestProcessor & {
4954
};
5055
};
5156

57+
/**
 * An IngestProcessor that carries a `sparse_encoding` section.
 *
 * - `model_id`: ID of the model the processor uses.
 * - `field_map`: typed as `{}`, so its keys and values are not checked.
 *   NOTE(review): presumably maps an input text field to the field that
 *   receives the sparse vector — confirm against the processor docs.
 */
export type SparseEncodingProcessor = IngestProcessor & {
  sparse_encoding: {
    model_id: string;
    field_map: {};
  };
};
63+
64+
/**
 * Configuration of an index: a `settings` object (typed as `{}`, so its
 * contents are not checked) together with the index `mappings`.
 */
export type IndexConfiguration = {
  settings: {};
  mappings: IndexMappings;
};
68+
69+
/**
 * Mappings section of an index configuration. `properties` is typed as
 * `{}`, so individual field mappings are not checked by the compiler.
 */
export type IndexMappings = {
  properties: {};
};
72+
5273
export type TemplateNode = {
5374
id: string;
5475
type: string;
@@ -135,6 +156,7 @@ export type Workflow = WorkflowTemplate & {
135156

136157
/**
 * Supported use cases. This is a string enum: each member's value is the
 * same upper-case text as its name.
 */
export enum USE_CASE {
  SEMANTIC_SEARCH = 'SEMANTIC_SEARCH',
  NEURAL_SPARSE_SEARCH = 'NEURAL_SPARSE_SEARCH',
}
139161

140162
/**
@@ -196,6 +218,8 @@ export type PretrainedSentenceTransformer = PretrainedModel & {
196218
vectorDimensions: number;
197219
};
198220

221+
/**
 * A pretrained sparse encoding model. It currently adds no fields on top
 * of PretrainedModel; the empty `& {}` mirrors the shape of the
 * PretrainedSentenceTransformer declaration.
 */
export type PretrainedSparseEncodingModel = PretrainedModel & {};
222+
199223
export type ModelConfig = {
200224
modelType?: string;
201225
embeddingDimension?: number;

public/component_types/index.ts

+1
Original file line numberDiff line numberDiff line change
@@ -6,3 +6,4 @@
66
export * from './interfaces';
77
export * from './transformer';
88
export * from './indexer';
9+
export * from './other';

public/component_types/indexer/indexer.ts

+3-3
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,9 @@ export class Indexer extends BaseComponent {
2020
this.baseClasses = [this.type];
2121
this.inputs = [
2222
{
23-
id: 'transformer',
24-
label: 'Transformer',
25-
baseClass: COMPONENT_CLASS.TRANSFORMER,
23+
id: 'document',
24+
label: 'Document',
25+
baseClass: COMPONENT_CLASS.DOCUMENT,
2626
acceptMultiple: false,
2727
},
2828
];

public/component_types/indexer/knn_indexer.ts

+1-1
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ export class KnnIndexer extends Indexer {
1313
constructor() {
1414
super();
1515
this.type = COMPONENT_CLASS.KNN_INDEXER;
16-
this.label = 'K-NN Indexer';
16+
this.label = 'K-NN Index';
1717
this.description = 'A specialized indexer for K-NN indices';
1818
this.baseClasses = [...this.baseClasses, this.type];
1919
this.createFields = [
+30
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
/*
2+
* Copyright OpenSearch Contributors
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
6+
import { COMPONENT_CATEGORY, COMPONENT_CLASS } from '../../utils';
7+
import { BaseComponent } from '../base_component';
8+
9+
/**
 * Placeholder UI component that stands for a document.
 * It carries no functionality of its own: it has no inputs, sets
 * `allowsCreation` to false, and exposes one output that reuses the
 * component's own label and base classes.
 */
export class Document extends BaseComponent {
  constructor() {
    super();

    // Identity and display text.
    this.type = COMPONENT_CLASS.DOCUMENT;
    this.label = 'Document';
    this.description = 'A document to be ingested';

    // Categorization and creation flag.
    this.categories = [COMPONENT_CATEGORY.INGEST];
    this.allowsCreation = false;
    this.baseClasses = [this.type];

    // Nothing feeds into a document.
    this.inputs = [];

    // The single output shares this component's label and the same
    // baseClasses array instance.
    const documentOutput = {
      label: this.label,
      baseClasses: this.baseClasses,
    };
    this.outputs = [documentOutput];
  }
}

public/component_types/other/index.ts

+6
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
/*
2+
* Copyright OpenSearch Contributors
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
6+
export * from './document';

public/component_types/transformer/index.ts

+1
Original file line numberDiff line numberDiff line change
@@ -5,3 +5,4 @@
55

66
export * from './ml_transformer';
77
export * from './text_embedding_transformer';
8+
export * from './sparse_encoder_transformer';
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
/*
2+
* Copyright OpenSearch Contributors
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
6+
import { COMPONENT_CATEGORY, COMPONENT_CLASS } from '../../../common';
7+
import { MLTransformer } from '.';
8+
9+
/**
 * ML transformer UI component specialized for sparse encoding.
 * It accepts a single Document input, asks for a model plus an input field
 * and a vector field, and outputs a transformed Document.
 */
export class SparseEncoderTransformer extends MLTransformer {
  constructor() {
    super();

    // Both field-name inputs link to the same section of the processor docs.
    const processorParamsLink =
      'https://opensearch.org/docs/latest/ingest-pipelines/processors/sparse-encoding/#configuration-parameters';

    this.type = COMPONENT_CLASS.SPARSE_ENCODER_TRANSFORMER;
    this.label = 'Sparse Encoder';
    this.description =
      'A specialized ML transformer to perform sparse encoding';
    this.categories = [COMPONENT_CATEGORY.INGEST];
    // Keep whatever base classes the parent constructor set, then add this type.
    this.baseClasses = [...this.baseClasses, this.type];

    // Exactly one Document may be connected as input.
    this.inputs = [
      {
        id: 'document',
        label: 'Document',
        baseClass: COMPONENT_CLASS.DOCUMENT,
        acceptMultiple: false,
      },
    ];

    // Fields to fill in: the model, then the source and destination field
    // names.
    this.createFields = [
      {
        label: 'Sparse Encoding Model',
        id: 'model',
        type: 'model',
        helpText:
          'A sparse encoding model to be used for generating sparse vectors.',
        helpLink:
          'https://opensearch.org/docs/latest/ml-commons-plugin/integrating-ml-models/#choosing-a-model',
      },
      {
        label: 'Input Field',
        id: 'inputField',
        type: 'string',
        helpText:
          'The name of the document field from which to obtain text for generating sparse embeddings.',
        helpLink: processorParamsLink,
      },
      {
        label: 'Vector Field',
        id: 'vectorField',
        type: 'string',
        helpText: `The name of the document's vector field in which to store the generated sparse embeddings.`,
        helpLink: processorParamsLink,
      },
    ];

    // The output is typed as a Document rather than as this transformer.
    this.outputs = [
      {
        label: 'Transformed Document',
        baseClasses: [COMPONENT_CLASS.DOCUMENT],
      },
    ];
  }
}

public/component_types/transformer/text_embedding_transformer.ts

+15-8
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
* SPDX-License-Identifier: Apache-2.0
44
*/
55

6-
import { COMPONENT_CLASS } from '../../../common';
6+
import { COMPONENT_CATEGORY, COMPONENT_CLASS } from '../../../common';
77
import { MLTransformer } from '.';
88

99
/**
@@ -13,10 +13,18 @@ export class TextEmbeddingTransformer extends MLTransformer {
1313
constructor() {
1414
super();
1515
this.type = COMPONENT_CLASS.TEXT_EMBEDDING_TRANSFORMER;
16-
this.label = 'Text Embedding Transformer';
16+
this.label = 'Text Embedder';
1717
this.description = 'A specialized ML transformer for embedding text';
18+
this.categories = [COMPONENT_CATEGORY.INGEST];
1819
this.baseClasses = [...this.baseClasses, this.type];
19-
this.inputs = [];
20+
this.inputs = [
21+
{
22+
id: 'document',
23+
label: 'Document',
24+
baseClass: COMPONENT_CLASS.DOCUMENT,
25+
acceptMultiple: false,
26+
},
27+
];
2028
this.createFields = [
2129
{
2230
label: 'Text Embedding Model',
@@ -31,24 +39,23 @@ export class TextEmbeddingTransformer extends MLTransformer {
3139
id: 'inputField',
3240
type: 'string',
3341
helpText:
34-
'The name of the field from which to obtain text for generating text embeddings.',
42+
'The name of the document field from which to obtain text for generating text embeddings.',
3543
helpLink:
3644
'https://opensearch.org/docs/latest/ingest-pipelines/processors/text-embedding/',
3745
},
3846
{
3947
label: 'Vector Field',
4048
id: 'vectorField',
4149
type: 'string',
42-
helpText:
43-
' The name of the vector field in which to store the generated text embeddings.',
50+
helpText: `The name of the document's vector field in which to store the generated text embeddings.`,
4451
helpLink:
4552
'https://opensearch.org/docs/latest/ingest-pipelines/processors/text-embedding/',
4653
},
4754
];
4855
this.outputs = [
4956
{
50-
label: this.label,
51-
baseClasses: this.baseClasses,
57+
label: 'Transformed Document',
58+
baseClasses: [COMPONENT_CLASS.DOCUMENT],
5259
},
5360
];
5461
}

public/pages/workflow_detail/component_details/component_inputs.tsx

+5-4
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
*/
55

66
import React, { useState } from 'react';
7-
import { EuiSpacer, EuiText, EuiTitle } from '@elastic/eui';
7+
import { EuiHorizontalRule, EuiSpacer, EuiText, EuiTitle } from '@elastic/eui';
88
import { InputFieldList } from './input_field_list';
99
import { NODE_CATEGORY, ReactFlowComponent } from '../../../../common';
1010
import { NewOrExistingTabs } from '../workspace/workspace_components/new_or_existing_tabs';
@@ -58,11 +58,12 @@ export function ComponentInputs(props: ComponentInputsProps) {
5858
<EuiText color="subdued">
5959
{props.selectedComponent.data.description}
6060
</EuiText>
61-
<NewOrExistingTabs
61+
{/* TODO: Add tabs back once it is finalized how much flexibility we want */}
62+
{/* <NewOrExistingTabs
6263
selectedTabId={selectedTabId}
6364
setSelectedTabId={setSelectedTabId}
64-
/>
65-
<EuiSpacer size="s" />
65+
/> */}
66+
<EuiHorizontalRule size="full" />
6667

6768
<InputFieldList
6869
componentId={props.selectedComponent.id}

public/pages/workflow_detail/component_details/input_fields/model_field.tsx

+23
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,9 @@ import {
2626
ModelFormValue,
2727
MODEL_CATEGORY,
2828
MPNET_SENTENCE_TRANSFORMER,
29+
NEURAL_SPARSE_TRANSFORMER,
30+
NEURAL_SPARSE_DOC_TRANSFORMER,
31+
NEURAL_SPARSE_TOKENIZER_TRANSFORMER,
2932
} from '../../../../../common';
3033
import { AppState } from '../../../../store';
3134

@@ -113,6 +116,24 @@ export function ModelField(props: ModelFieldProps) {
113116
category: MODEL_CATEGORY.PRETRAINED,
114117
algorithm: BERT_SENTENCE_TRANSFORMER.algorithm,
115118
},
119+
{
120+
id: NEURAL_SPARSE_TRANSFORMER.name,
121+
name: NEURAL_SPARSE_TRANSFORMER.shortenedName,
122+
category: MODEL_CATEGORY.PRETRAINED,
123+
algorithm: NEURAL_SPARSE_TRANSFORMER.algorithm,
124+
},
125+
{
126+
id: NEURAL_SPARSE_DOC_TRANSFORMER.name,
127+
name: NEURAL_SPARSE_DOC_TRANSFORMER.shortenedName,
128+
category: MODEL_CATEGORY.PRETRAINED,
129+
algorithm: NEURAL_SPARSE_DOC_TRANSFORMER.algorithm,
130+
},
131+
{
132+
id: NEURAL_SPARSE_TOKENIZER_TRANSFORMER.name,
133+
name: NEURAL_SPARSE_TOKENIZER_TRANSFORMER.shortenedName,
134+
category: MODEL_CATEGORY.PRETRAINED,
135+
algorithm: NEURAL_SPARSE_TOKENIZER_TRANSFORMER.algorithm,
136+
},
116137
];
117138
setPretrainedModels(modelItems);
118139
}, []);
@@ -121,6 +142,8 @@ export function ModelField(props: ModelFieldProps) {
121142
// e.g., only show deployed models when 'deployed' button is selected
122143
useEffect(() => {
123144
if (selectedRadioId !== undefined) {
145+
// TODO: add fine-grained filtering so only relevant pretrained and existing models
146+
// are visible based on the use case
124147
if (selectedRadioId === MODEL_CATEGORY.DEPLOYED) {
125148
setSelectableModels(deployedModels);
126149
} else {

0 commit comments

Comments
 (0)