Skip to content

Commit e7ae5cf

Browse files
committed
Add matching test
1 parent 508767d commit e7ae5cf

File tree

3 files changed

+53
-23
lines changed

3 files changed

+53
-23
lines changed

.github/workflows/causal_lm_cpp.yml

+27-1
Original file line numberDiff line numberDiff line change
@@ -714,10 +714,36 @@ jobs:
714714
run: |
715715
source ./ov/setupvars.sh
716716
python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
717-
python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
717+
python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt opencv-python --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
718718
python -m pip install -U "optimum<1.23" --no-dependencies
719719
optimum-cli export openvino -m openbmb/MiniCPM-V-2_6 MiniCPM-V-2_6 --trust-remote-code
720720
wget https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11 --output-document cat.jpg
721+
- name: Generate reference
  shell: python
  # Produces ref.txt with the model's answer so the C++ sample output can be
  # compared against it in a later step.
  run: |
    from optimum.intel.openvino import OVModelForVisualCausalLM
    from transformers import AutoProcessor
    from PIL import Image
    import cv2
    import numpy as np
    # Deterministic synthetic test image (repeating 0..254 byte gradient,
    # 448x448x3) so the reference is reproducible on any runner.
    res = 448, 448
    im = np.arange(res[0] * res[1] * 3, dtype=np.uint8) % 255
    im = im.reshape([*res, 3])
    cv2.imwrite("lines.png", im)
    model_id = "openbmb/MiniCPM-V-2_6"
    processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
    prompt = processor.tokenizer.apply_chat_template([{"role": "user", "content": "(<image>./</image>)\nWhat is unusual on this image?"}], tokenize=False, add_generation_prompt=True)
    # Fix: open the synthetic image generated above. The previous revision
    # opened "/home/vzlobin/r/g/g.png", a developer-local path that does not
    # exist on CI runners and would make this step fail unconditionally.
    image = Image.open("lines.png").convert('RGB')
    model = OVModelForVisualCausalLM.from_pretrained("MiniCPM-V-2_6", trust_remote_code=True)
    inputs = processor([prompt], [image], return_tensors="pt")
    result = model.generate(**inputs, max_new_tokens=200)
    # Decode only the newly generated tokens (skip the prompt portion).
    decoded = processor.tokenizer.batch_decode(result[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True)[0]
    print(decoded)
    with open("ref.txt", "w") as f:
        f.write(decoded)
721747
- name: Run visual_language_chat sample - MiniCPM-V-2_6
722748
run: >
723749
source ./ov/setupvars.sh

miniCPM-V-2_6.py

+13-12
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,22 @@
11
from optimum.intel.openvino import OVModelForVisualCausalLM
from transformers import AutoProcessor
from PIL import Image
import cv2
import numpy as np

# Generate a deterministic synthetic test image: a repeating 0..254 byte
# gradient reshaped to 448x448x3 (HWC), written to disk so the same picture
# is used everywhere this reference script runs.
res = 448, 448
im = np.arange(res[0] * res[1] * 3, dtype=np.uint8) % 255
im = im.reshape([*res, 3])
cv2.imwrite("lines.png", im)

model_id = "openbmb/MiniCPM-V-2_6"
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
prompt = processor.tokenizer.apply_chat_template(
    [{"role": "user", "content": "(<image>./</image>)\nWhat is unusual on this image?"}],
    tokenize=False,
    add_generation_prompt=True,
)

# Fix: open the synthetic image generated above. The previous revision opened
# "/home/vzlobin/r/g/g.png" — a developer-local path that does not exist on
# other machines — which made the script crash with FileNotFoundError anywhere
# outside that one workstation. (The unused `requests` import that supported a
# commented-out download path was removed along with the dead code.)
image = Image.open("lines.png").convert('RGB')

# Load the locally exported OpenVINO model (produced by `optimum-cli export
# openvino ... MiniCPM-V-2_6` in the surrounding workflow).
model = OVModelForVisualCausalLM.from_pretrained("MiniCPM-V-2_6", trust_remote_code=True)
inputs = processor([prompt], [image], return_tensors="pt")
result = model.generate(**inputs, max_new_tokens=200)

# Decode only the newly generated tokens: slice off the prompt portion of the
# returned sequence before detokenizing.
decoded = processor.tokenizer.batch_decode(result[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True)[0]
print(decoded)
with open("ref.txt", "w") as f:
    f.write(decoded)

src/cpp/src/visual_language/pipeline.cpp

+13-10
Original file line numberDiff line numberDiff line change
@@ -413,6 +413,8 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
413413

414414
int64_t sequence_len = m_language.get_tensor("logits").get_shape().at(1) - 1;
415415
size_t vocab_size = m_language.get_tensor("logits").get_shape().back();
416+
float* logits = m_language.get_tensor("logits").data<float>() + sequence_len * vocab_size;
417+
int64_t out_token = std::max_element(logits, logits + vocab_size) - logits;
416418

417419
m_language.get_tensor("inputs_embeds").set_shape({BATCH_SIZE, 1, m_vlm_config.hidden_size});
418420
m_language.get_tensor("position_ids").set_shape({ BATCH_SIZE, 1 });
@@ -435,16 +437,6 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
435437
}, streamer);
436438
std::vector<int64_t> generated;
437439
while (true) { //(out_token != eos_token_id)
438-
float *logits = m_language.get_tensor("logits").data<float>();
439-
int64_t out_token = std::max_element(logits, logits + vocab_size) - logits;
440-
generated.push_back(out_token);
441-
// if (streamer_ptr && streamer_ptr->put(out_token)) {
442-
// break;
443-
// }
444-
std::cout << out_token << ", ";
445-
if (out_token == eos_token_id) {
446-
break;
447-
}
448440
m_embedding.get_input_tensor().data<int64_t>()[0] = out_token;
449441
m_embedding.infer();
450442
const ov::Tensor& embed_prompt_tensor = m_embedding.get_output_tensor();
@@ -459,6 +451,17 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
459451
m_language.get_tensor("position_ids").data<int64_t>()[0] = int64_t(m_language.get_tensor("attention_mask").get_size() - 1);
460452

461453
m_language.infer();
454+
455+
generated.push_back(out_token);
456+
if (streamer_ptr && streamer_ptr->put(out_token)) {
457+
break;
458+
}
459+
logits = m_language.get_tensor("logits").data<float>();
460+
461+
out_token = std::max_element(logits, logits + vocab_size) - logits;
462+
if (out_token == eos_token_id) {
463+
break;
464+
}
462465
}
463466

464467
if (streamer_ptr) {

0 commit comments

Comments
 (0)