
Commit a8d103f

Update OpenVINO Code (#721)

- Freeze Huggingface dependencies, downgrade to fix model outputs
- Add experimental support for code completion streaming
- Add DEVELOPER.md
- Update settings descriptions

1 parent 597662c commit a8d103f

15 files changed: +236 −71 lines

modules/openvino_code/DEVELOPER.md

+75 lines (new file)
# OpenVINO Code - VSCode extension for AI code completion with OpenVINO™

VSCode extension that helps developers write code with an AI code assistant. OpenVINO Code works with a Large Language Model for Code (Code LLM) deployed on a local or remote server.

## Installing Extension

The VSCode extension can be installed from a built `*.vsix` file:

1. Open the `Extensions` side bar in VSCode.
2. Click the menu icon (the three-dots "meatballs" icon) in the top right corner of the Extensions side panel.
3. Select the "Install from VSIX..." option and select the extension file.

For instructions on how to build the extension `vsix` file, please refer to the [Build Extension](#build-extension) section.

## Extension Configuration

To work with the extension, you should configure the endpoint of the server with the Code LLM where requests will be sent:

1. Open the extension settings.
2. Fill the `Server URL` parameter with the server endpoint URL.

For instructions on how to start the server locally, please refer to the [server README.md](./server/README.md).

Special tokens can also be configured in the extension settings.
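Once `Server URL` is set, you can quickly confirm the endpoint is reachable before triggering completions. The sketch below is a minimal check, assuming the server from this repository runs locally on uvicorn's default port 8000; adjust `SERVER_URL` to whatever you entered in the settings.

```python
# Minimal connectivity check for the configured Code LLM server.
# Assumption: the server runs locally on uvicorn's default port 8000;
# change SERVER_URL to match the `Server URL` extension setting.
import requests

SERVER_URL = "http://127.0.0.1:8000"

response = requests.get(SERVER_URL, timeout=5)
print(response.status_code)  # any HTTP response means the endpoint is reachable
```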
## Working with Extension

TBD

1. Create a new python file
2. Try typing `def main():`
3. Press shortcut buttons (TBD) for code completion; an illustrative example of a possible completion follows
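For a sense of what to expect, here is a purely illustrative completion of the kind the Code LLM might propose after `def main():`; the actual suggestion depends on the selected model and the sampling settings.

```python
# Purely illustrative: one completion the assistant might suggest after typing
# `def main():`. The real output depends on the model and generation settings.
def main():
    print("Hello from OpenVINO Code!")


if __name__ == "__main__":
    main()
```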
### Checking output

You can see the input to and output from the code generation API:

1. Open the VSCode `OUTPUT` panel
2. Select the extension output source from the dropdown menu

## Developing

> **Prerequisite:** You should have `Node.js` installed (v16 and above).

#### Install dependencies

To install dependencies, run the following command from the project root directory:

```
npm install
```

#### Run Extension from Source & Debugging

Open the `Run and Debug` side bar in VSCode and click `Launch Extension` (or press `F5`).

#### Build Extension

To build the extension and generate a `*.vsix` file for further installation in VSCode, run the following command:

```
npm run vsce:package
```

#### Linting

To perform linting with `ESLint`, execute the following command:

```
npm run lint
```

#### Testing

TBD

modules/openvino_code/README.md

+1 −1

```diff
@@ -13,7 +13,7 @@ OpenVINO Code provides the following features:
 
 1. Create a new python file
 2. Try typing `def main():`
-3. Press shortcut buttons (TBD) for code completion
+3. Press shortcut button `ctrl+alt+space` for code completion
 
 ### Checking output
 
```

Binary files not shown (2 files changed).

modules/openvino_code/package-lock.json

+2 −2

Some generated files are not rendered by default.

modules/openvino_code/package.json

+22 −10

```diff
@@ -1,7 +1,7 @@
 {
   "publisher": "OpenVINO",
   "name": "openvino-code-completion",
-  "version": "0.0.2",
+  "version": "0.0.3",
   "displayName": "OpenVINO Code Completion",
   "description": "VSCode extension for AI code completion with OpenVINO",
   "icon": "media/logo.png",
@@ -188,38 +188,44 @@
       "default": 30,
       "markdownDescription": "Server request timeout in seconds after which request will be aborted."
     },
-    "openvinoCode.fillInTheMiddleMode": {
+    "openvinoCode.streamInlineCompletion": {
       "order": 3,
       "type": "boolean",
+      "default": false,
+      "description": "When checked, inline completion will be generated in streaming mode."
+    },
+    "openvinoCode.fillInTheMiddleMode": {
+      "order": 4,
+      "type": "boolean",
       "default": false,
       "markdownDescription": "When checked, text before (above) and after (below) the cursor will be used for completion generation. When unchecked, only text before (above) the cursor will be used."
     },
     "openvinoCode.temperature": {
-      "order": 4,
+      "order": 5,
       "type": "number",
       "default": 0.2,
-      "description": "Sampling temperature."
+      "description": "Non-zero value. The higher the value, the more diverse the code suggestions, while a lower temperature emphasizes the most likely words."
     },
     "openvinoCode.topK": {
-      "order": 4,
+      "order": 5,
       "type": "integer",
       "default": 10,
-      "description": "Top K."
+      "description": "Select the next word during suggestion generation from the top K candidates. Improves diversity of generated suggestions."
     },
     "openvinoCode.topP": {
-      "order": 4,
+      "order": 5,
       "type": "number",
       "default": 1,
-      "description": "Top P."
+      "description": "A value between 0 and 1. Similar to Top K, it adjusts the number of candidate words based on their probability. Candidates are added for selection until the cumulative probability exceeds P."
     },
     "openvinoCode.minNewTokens": {
-      "order": 5,
+      "order": 6,
       "type": "number",
       "default": 1,
       "description": "Minimum number of new generated tokens."
     },
     "openvinoCode.maxNewTokens": {
-      "order": 5,
+      "order": 6,
       "type": "number",
       "default": 100,
       "description": "Maximum number of new generated tokens."
```
```diff
@@ -280,6 +286,12 @@
         "key": "ctrl+alt+space",
         "mac": "ctrl+alt+space",
         "when": "editorTextFocus"
+      },
+      {
+        "command": "openvinoCode.stopGeneration",
+        "key": "escape",
+        "mac": "escape",
+        "when": "openvinoCode.generating"
       }
     ]
   },
```

modules/openvino_code/server/pyproject.toml

+6 −7

```diff
@@ -4,22 +4,21 @@ version = "0.0.1"
 requires-python = ">=3.8"
 
 dependencies = [
-    'fastapi==0.101.0',
-    'uvicorn==0.23.1',
+    'fastapi==0.103.1',
+    'uvicorn==0.23.2',
     'torch @ https://download.pytorch.org/whl/cpu-cxx11-abi/torch-2.0.1%2Bcpu.cxx11.abi-cp38-cp38-linux_x86_64.whl ; sys_platform=="linux" and python_version == "3.8"',
     'torch @ https://download.pytorch.org/whl/cpu-cxx11-abi/torch-2.0.1%2Bcpu.cxx11.abi-cp39-cp39-linux_x86_64.whl ; sys_platform=="linux" and python_version == "3.9"',
     'torch @ https://download.pytorch.org/whl/cpu-cxx11-abi/torch-2.0.1%2Bcpu.cxx11.abi-cp310-cp310-linux_x86_64.whl ; sys_platform=="linux" and python_version == "3.10"',
     'torch @ https://download.pytorch.org/whl/cpu-cxx11-abi/torch-2.0.1%2Bcpu.cxx11.abi-cp311-cp311-linux_x86_64.whl ; sys_platform=="linux" and python_version == "3.11"',
     'torch ; sys_platform != "linux"',
     'openvino==2023.1.0.dev20230811',
-    'optimum-intel[openvino]==1.11.0',
+    'transformers==4.31.0',
+    'optimum==1.12.0',
+    'optimum-intel[openvino]==1.10.1',
 ]
 
 [project.optional-dependencies]
-dev = [
-    "black",
-    "ruff",
-]
+dev = ["black", "ruff"]
 
 [build-system]
 requires = ["setuptools>=43.0.0", "wheel"]
```

modules/openvino_code/server/src/app.py

+6 −5

```diff
@@ -1,9 +1,9 @@
 from time import perf_counter
 from typing import Dict, Union
 
-from fastapi import Depends, FastAPI
+from fastapi import Depends, FastAPI, Request
 from fastapi.responses import RedirectResponse, StreamingResponse
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, TypeAdapter
 
 from src.generators import GeneratorFunctor
 from src.utils import get_logger
@@ -105,11 +105,12 @@ async def generate(
 
 @app.post("/api/generate_stream", status_code=200)
 async def generate_stream(
-    request: GenerationRequest,
+    request: Request,
     generator: GeneratorFunctor = Depends(get_generator_dummy),
 ) -> StreamingResponse:
-    logger.info(request)
-    return StreamingResponse(generator.generate_stream(request.inputs, request.parameters.model_dump()))
+    generation_request = TypeAdapter(GenerationRequest).validate_python(await request.json())
+    logger.info(generation_request)
+    return StreamingResponse(generator.generate_stream(generation_request.inputs, generation_request.parameters.model_dump(), request))
 
 
 @app.post("/api/summarize", status_code=200, response_model=GenerationResponse)
```

modules/openvino_code/server/src/generators.py

+35 −5

```diff
@@ -1,3 +1,4 @@
+import asyncio
 import re
 from functools import lru_cache
 from io import StringIO
@@ -6,6 +7,7 @@
 from typing import Any, Callable, Container, Dict, Generator, List, Optional, Type, Union
 
 import torch
+from fastapi import Request
 from huggingface_hub.utils import EntryNotFoundError
 from optimum.intel import OVModelForCausalLM, OVModelForSeq2SeqLM
 from transformers import (
@@ -61,11 +63,15 @@ def get_model(checkpoint: str, device: str = "CPU") -> OVModel:
     return model
 
 
+# TODO: generator needs a running flag or cancellation on a new generation request;
+# the generator cannot handle concurrent requests - it fails and stalls the process with
+# RuntimeError: Exception from src/inference/src/infer_request.cpp:189:
+# [ REQUEST_BUSY ]
 class GeneratorFunctor:
     def __call__(self, input_text: str, parameters: Dict[str, Any]) -> str:
         raise NotImplementedError
 
-    async def generate_stream(self, input_text: str, parameters: Dict[str, Any]):
+    async def generate_stream(self, input_text: str, parameters: Dict[str, Any], request: Request):
         raise NotImplementedError
 
     def summarize(self, input_text: str, template: str, signature: str, style: str, parameters: Dict[str, Any]):
@@ -122,24 +128,45 @@ def __call__(
         logger.info(f"Number of input tokens: {prompt_len}; generated {len(output_ids)} tokens")
         return self.tokenizer.decode(output_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
 
-    async def generate_stream(
-        self, input_text: str, parameters: Dict[str, Any], stopping_criteria: Optional[StoppingCriteriaList] = None
-    ):
+    async def generate_stream(self, input_text: str, parameters: Dict[str, Any], request: Request = None):
         input_ids = self.tokenizer.encode(input_text, return_tensors="pt")
         streamer = TextIteratorStreamer(self.tokenizer, skip_prompt=True, skip_special_tokens=True)
         parameters["streamer"] = streamer
         config = GenerationConfig.from_dict({**self.generation_config.to_dict(), **parameters})
+
+        stop_on_tokens = StopOnTokens([])
+
         generation_kwargs = dict(
             input_ids=input_ids,
             streamer=streamer,
-            stopping_criteria=stopping_criteria,
+            stopping_criteria=StoppingCriteriaList([stop_on_tokens]),
             **config.to_dict(),
         )
+
+        # listen for the disconnect event so generation can be stopped
+        def listen_for_disconnect():
+            async def listen():
+                message = await request.receive()
+                if message.get("type") == "http.disconnect":
+                    stop_on_tokens.cancelled = True
+
+            asyncio.create_task(listen())
+
+        listen_thread = Thread(target=listen_for_disconnect)
+        # thread.run doesn't actually start a new thread;
+        # it runs the thread function in the current thread context.
+        # thread.start() doesn't work here
+        listen_thread.run()
+
         thread = Thread(target=self.model.generate, kwargs=generation_kwargs)
         thread.start()
+
         for token in streamer:
+            await asyncio.sleep(0.01)
             yield token
 
+        thread.join()
+
     def generate_between(
         self,
         input_parts: List[str],
@@ -243,7 +270,10 @@ def inner() -> GeneratorFunctor:
 
 class StopOnTokens(StoppingCriteria):
     def __init__(self, token_ids: List[int]) -> None:
+        self.cancelled = False
         self.token_ids = torch.tensor(token_ids, requires_grad=False)
 
     def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
+        if self.cancelled:
+            return True
         return torch.any(torch.eq(input_ids[0, -1], self.token_ids)).item()
```
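The cancellation mechanism hinges on the new `cancelled` flag: `StopOnTokens` is consulted after every generated token, so flipping the flag from the disconnect listener ends `model.generate` on the next step and lets the streaming loop finish. A standalone sketch of the same pattern follows; the `gpt2` checkpoint and the point at which cancellation is simulated are arbitrary placeholders, not what the extension server uses.

```python
# Standalone sketch of the flag-based cancellation pattern used above.
# Assumptions: "gpt2" stands in for the actual Code LLM checkpoint, and the
# cancellation trigger (after 10 streamed chunks) simulates a client disconnect.
from threading import Thread

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    StoppingCriteria,
    StoppingCriteriaList,
    TextIteratorStreamer,
)


class CancellableCriteria(StoppingCriteria):
    def __init__(self) -> None:
        self.cancelled = False

    def __call__(self, input_ids, scores, **kwargs) -> bool:
        # Returning True stops generation after the current token.
        return self.cancelled


tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

criteria = CancellableCriteria()
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
inputs = tokenizer("def main():", return_tensors="pt")

thread = Thread(
    target=model.generate,
    kwargs=dict(
        **inputs,
        max_new_tokens=100,
        streamer=streamer,
        stopping_criteria=StoppingCriteriaList([criteria]),
    ),
)
thread.start()

for i, chunk in enumerate(streamer):
    print(chunk, end="", flush=True)
    if i >= 10:  # simulate a client disconnect; checked on the next generation step
        criteria.cancelled = True

thread.join()
```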

modules/openvino_code/src/configuration.ts

+1

```diff
@@ -8,6 +8,7 @@ export type CustomConfiguration = {
   model: ModelName;
   serverUrl: string;
   serverRequestTimeout: number;
+  streamInlineCompletion: boolean;
   fillInTheMiddleMode: boolean;
   temperature: number;
   topK: number;
```

modules/openvino_code/src/constants.ts

+5

```diff
@@ -24,4 +24,9 @@ export const COMMANDS = {
   STOP_SERVER_NATIVE: 'openvinoCode.stopServerNative',
   SHOW_SERVER_LOG: 'openvinoCode.showServerLog',
   SHOW_EXTENSION_LOG: 'openvinoCode.showExtensionLog',
+  STOP_GENERATION: 'openvinoCode.stopGeneration',
+};
+
+export const EXTENSION_CONTEXT_STATE = {
+  GENERATING: 'openvinoCode.generating',
 };
```

modules/openvino_code/src/inline-completion/completion.service.ts

+37 −2

```diff
@@ -1,7 +1,7 @@
 import { InlineCompletionItem, Position, Range, TextDocument, window } from 'vscode';
-import { backendService } from '../services/backend.service';
-import { extensionState } from '../state';
 import { EXTENSION_DISPLAY_NAME } from '../constants';
+import { IGenerateRequest, backendService } from '../services/backend.service';
+import { extensionState } from '../state';
 
 const outputChannel = window.createOutputChannel(EXTENSION_DISPLAY_NAME, { log: true });
 const logCompletionInput = (input: string): void => outputChannel.append(`Completion input:\n${input}\n\n`);
@@ -67,6 +67,41 @@ class CompletionService {
     const completionItem = new InlineCompletionItem(generatedText, new Range(position, position.translate(0, 1)));
     return [completionItem];
   }
+
+  async getCompletionStream(
+    document: TextDocument,
+    position: Position,
+    onDataChunk: (chunk: string) => unknown,
+    signal?: AbortSignal
+  ) {
+    const textBeforeCursor = this._getTextBeforeCursor(document, position);
+    const textAfterCursor = this._getTextAfterCursor(document, position);
+    const completionInput = this._prepareCompletionInput(textBeforeCursor, textAfterCursor);
+    logCompletionInput(completionInput);
+
+    const { temperature, topK, topP, minNewTokens, maxNewTokens } = extensionState.config;
+
+    const request: IGenerateRequest = {
+      inputs: completionInput,
+      parameters: {
+        temperature,
+        top_k: topK,
+        top_p: topP,
+        min_new_tokens: minNewTokens,
+        max_new_tokens: maxNewTokens,
+      },
+    };
+
+    outputChannel.append(`Completion output:\n`);
+    return backendService.generateCompletionStream(
+      request,
+      (chunk) => {
+        outputChannel.append(chunk);
+        onDataChunk(chunk);
+      },
+      signal
+    );
+  }
 }
 
 export default new CompletionService();
```
