diff --git a/.github/reusable-steps/categorize-projects/action.yml b/.github/reusable-steps/categorize-projects/action.yml index fa9767dd..cfbd8339 100644 --- a/.github/reusable-steps/categorize-projects/action.yml +++ b/.github/reusable-steps/categorize-projects/action.yml @@ -16,6 +16,8 @@ outputs: value: ${{ steps.group-subprojects.outputs.qt }} js: value: ${{ steps.group-subprojects.outputs.js }} + unittest: + value: ${{ steps.group-subprojects.outputs.unittest }} runs: using: 'composite' @@ -30,6 +32,7 @@ runs: webcam=() qt=() js=() + unittest=() for dir in ${{ inputs.subprojects }}; do if [ -f "$dir/package.json" ]; then @@ -42,6 +45,8 @@ runs: qt+=("$dir") elif [ -f "$dir/main.py" ] && grep -q -- "--stream" "$dir/main.py"; then webcam+=("$dir") + elif [ -d "$dir/test" ]; then + unittest+=("$dir/test") elif [ -f "$dir/main.py" ]; then python+=("$dir") fi @@ -53,6 +58,7 @@ runs: webcam_json=$(printf '%s\n' "${webcam[@]}" | jq -R -s -c 'split("\n") | map(select(length > 0))') qt_json=$(printf '%s\n' "${qt[@]}" | jq -R -s -c 'split("\n") | map(select(length > 0))') js_json=$(printf '%s\n' "${js[@]}" | jq -R -s -c 'split("\n") | map(select(length > 0))') + unittest_json=$(printf '%s\n' "${unittest[@]}" | jq -R -s -c 'split("\n") | map(select(length > 0))') echo "notebook=$notebook_json" >> $GITHUB_OUTPUT echo "python=$python_json" >> $GITHUB_OUTPUT @@ -60,6 +66,7 @@ runs: echo "webcam=$webcam_json" >> $GITHUB_OUTPUT echo "qt=$qt_json" >> $GITHUB_OUTPUT echo "js=$js_json" >> $GITHUB_OUTPUT + echo "unittest=$unittest_json" >> $GITHUB_OUTPUT - name: Print subprojects to test shell: bash run: | @@ -69,3 +76,4 @@ runs: echo "Webcam subprojects: ${{ steps.group-subprojects.outputs.webcam }}" echo "Qt subprojects: ${{ steps.group-subprojects.outputs.qt }}" echo "JS subprojects: ${{ steps.group-subprojects.outputs.js }}" + echo "Unit test subprojects: ${{ steps.group-subprojects.outputs.unittest }}" diff --git a/.github/workflows/sanity-check-demos.yml b/.github/workflows/sanity-check-demos.yml index d3fdf8d3..6422e3f7 100644 --- a/.github/workflows/sanity-check-demos.yml +++ b/.github/workflows/sanity-check-demos.yml @@ -119,3 +119,44 @@ jobs: command: npm start project: ${{ matrix.subproject }} timeout: 1m + + unittest: + needs: find-subprojects + if: ${{ needs.find-subprojects.outputs.unittest != '[]' }} + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest, windows-latest, macos-latest] + python: ["3.10", "3.12"] + subproject: ${{ fromJson(needs.find-subprojects.outputs.unittest) }} + steps: + - uses: actions/checkout@v4 + - uses: ./.github/reusable-steps/setup-os + - name: Set up Python ${{ matrix.python }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python }} + - uses: ./.github/reusable-steps/setup-python + with: + python: ${{ matrix.python }} + project: ${{ matrix.subproject }} + - name: Login to HF + shell: bash + run: | + huggingface-cli login --token ${{ secrets.HF_TOKEN }} + - name: Install ollama + shell: bash + run: | + curl -fsSL https://ollama.com/install.sh | sh + - name: Start ollama and setup deepeval to use ollama + shell: bash + run: | + ollama serve & + ollama pull deepseek-r1 & + deepeval set-ollama deepseek-r1 + - uses: ./.github/reusable-steps/timeouted-action + with: + script: python test.py + project: ${{ matrix.subproject }} + timeout: 5h diff --git a/demos/virtual_ai_assistant_demo/requirements.txt b/demos/virtual_ai_assistant_demo/requirements.txt index 0cf0b606..bd442ff6 100644 --- a/demos/virtual_ai_assistant_demo/requirements.txt +++ b/demos/virtual_ai_assistant_demo/requirements.txt @@ -20,5 +20,7 @@ torch==2.5.1 transformers==4.48.3 pymupdf==1.24.10 pyyaml==6.0.1 +selfcheckgpt==0.1.7 +deepeval==2.4.9 gradio==5.12.0 diff --git a/demos/virtual_ai_assistant_demo/test/README.md b/demos/virtual_ai_assistant_demo/test/README.md new file mode 100644 index 00000000..4a76ad32 --- /dev/null +++ b/demos/virtual_ai_assistant_demo/test/README.md @@ -0,0 +1,50 @@ +# How to select hallucination computation algorithm + +Currently, two methods are availble: [deepeval](#use-deepeval-to-compute-hallucination-score) and [selfcheckgpt](#use-selfcheckgpt-to-compute-hallucination-score). + +If you have an evaluation dataset (i.e. both question and correct answer), you can choose [deepeval](#use-deepeval-to-compute-hallucination-score). However, if you do not have a labeled dataset, you can choose [selfcheckgpt](#use-selfcheckgpt-to-compute-hallucination-score). It will compute hallucination score based on the output consistency. + +# Use deepeval to compute hallucination score +## Prerequisite libraries +1. [deepeval](https://github.com/confident-ai/deepeval) +2. [Ollama](https://github.com/ollama/ollama/blob/main/README.md) + +## How to set up +1. Install deepeval: + ``` + pip install -U deepeval + ``` +2. Install Ollama: + Please refer to [ollama](https://github.com/ollama/ollama/blob/main/README.md#ollama) + +3. Run Ollama, taking `deepseek-r1` as an example: + ``` + ollama run deepseek-r1 + ``` +4. Set deepeval to use Ollama for evaluation: + ``` + deepeval set-ollama deepseek-r1 + ``` + +## How to run the test +``` +python test.py --personality /path/to/personality.yaml --check_type deepeval +``` + +## More to read +[deepeval hallucination](https://docs.confident-ai.com/docs/metrics-hallucination) + +# Use selfcheckgpt to compute hallucination score +## Prerequisite libraries +1. [selfcheckgpt](https://github.com/potsawee/selfcheckgpt) + +## How to set up and run the test +1. Install deepeval: + ``` + pip install selfcheckgpt==0.1.7 + ``` + +2. Run test + ``` + python test.py --personality /path/to/personality.yaml --check_type selfcheckgpt + ``` \ No newline at end of file diff --git a/demos/virtual_ai_assistant_demo/test/bartender_questions.txt b/demos/virtual_ai_assistant_demo/test/bartender_questions.txt new file mode 100644 index 00000000..bf3d75f4 --- /dev/null +++ b/demos/virtual_ai_assistant_demo/test/bartender_questions.txt @@ -0,0 +1,20 @@ +Can you suggest some popular fruit-based drinks that are healthy and refreshing? +Can you suggest some recipes using your favorite juices or ingredients? +Can you suggest some refreshing drinks with watermelon or lime? +Can you suggest some tropical juices or smoothies with kiwi or banana? +What are the ingredients in a classic Martini? +What are some popular drinks that use pomegranate juice? +Can you suggest a cocktail that uses honey? +What are the ingredients in a classic Daiquiri? +Can you recommend a cocktail that uses apple cider? +What are some popular drinks that use cranberry juice? +Can you suggest a cocktail that uses chocolate? +What are the ingredients in a classic Negroni? +Can you recommend a cocktail that uses almond milk? +What are some popular drinks that use grapefruit juice? +Can you suggest a cocktail that uses lavender? +What are the ingredients in a classic Pina Colada? +Can you recommend a cocktail that uses maple syrup? +What are some popular drinks that use lemon or lime juice? +Can you suggest a cocktail that uses cinnamon? +What are the ingredients in a classic Bloody Mary? \ No newline at end of file diff --git a/demos/virtual_ai_assistant_demo/test/culinara_questions.txt b/demos/virtual_ai_assistant_demo/test/culinara_questions.txt new file mode 100644 index 00000000..152769ef --- /dev/null +++ b/demos/virtual_ai_assistant_demo/test/culinara_questions.txt @@ -0,0 +1,20 @@ +I'm planning to cook a classic spaghetti carbonara. What ingredients do I need? +Can I substitute pancetta with bacon in my carbonara? +I'm planning to make a vegan lasagna. What can I use instead of ricotta cheese? +How long should I bake my lasagna for the best results? +I'm making a chicken curry. What spices should I use for an authentic flavor? +Can I use coconut milk instead of cream in my chicken curry? +I'm planning to bake a chocolate cake. What type of cocoa powder is best? +Can I use almond flour instead of all-purpose flour in my cake? +I'm making a Caesar salad. What ingredients are essential for the dressing? +Can I use Greek yogurt instead of mayonnaise in my Caesar dressing? +I'm planning to cook a beef stew. What cut of beef is best for stewing? +Can I use red wine instead of beef broth in my stew? +I'm planning to cook a seafood paella. What types of seafood are best to use? +Can I use brown rice instead of white rice in my paella? +How do I achieve the perfect socarrat (crispy bottom) in my paella? +I'm making a vegetarian chili. What beans are best to use? +Can I add quinoa to my chili for extra protein? +I'm planning to bake a batch of cookies. What type of sugar should I use? +Can I substitute butter with coconut oil in my cookies? +I'm making a Greek salad. What ingredients are essential? \ No newline at end of file diff --git a/demos/virtual_ai_assistant_demo/test/test.py b/demos/virtual_ai_assistant_demo/test/test.py new file mode 100644 index 00000000..13ace11b --- /dev/null +++ b/demos/virtual_ai_assistant_demo/test/test.py @@ -0,0 +1,194 @@ +import argparse +import logging as log +import os + +from typing import List, Set +from pathlib import Path +from tqdm import tqdm + +import numpy as np +import openvino as ov +import yaml + +from datasets import load_dataset +from urllib.request import getproxies +from deepeval.metrics import HallucinationMetric +from deepeval.test_case import LLMTestCase +from llama_index.core.chat_engine import SimpleChatEngine +from llama_index.core.memory import ChatMemoryBuffer +from llama_index.llms.openvino import OpenVINOLLM +from selfcheckgpt.modeling_selfcheck import SelfCheckLLMPrompt +from transformers import AutoTokenizer + +import sys +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) +from main import load_chat_model + +proxies = getproxies() +os.environ["http_proxy"] = proxies["http"] +os.environ["https_proxy"] = proxies["https"] +os.environ["no_proxy"] = "localhost, 127.0.0.1/8, ::1" +from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig + + +DATASET_MAPPING = { + "agribot_personality.yaml": {"name": "KisanVaani/agriculture-qa-english-only", "split": "train", "col": "question"}, + "healthcare_personality.yaml": {"name": "medalpaca/medical_meadow_medical_flashcards", "split": "train", "col": "input"}, + "bartender_personality.yaml": {"name": str(Path(__file__).parent / "bartender_questions.txt"), "col": "text"}, + "culinara_personality.yaml": {"name": str(Path(__file__).parent / "culinara_questions.txt"), "col": "text"}, + "tutor_personality.yaml": {"name": str(Path(__file__).parent / "tutor_questions.txt"), "col": "text"} +} +MODEL_DIR = Path("model") + + +def get_available_devices() -> Set[str]: + core = ov.Core() + return {device.split(".")[0] for device in core.available_devices} + + +def compute_deepeval_hallucination(inputs, outputs, contexts) -> float: + avg_score = 0. + for input, output, context in zip(inputs, outputs, contexts): + test_case = LLMTestCase( + input=input, + actual_output=output, + context=context + ) + metric = HallucinationMetric(threshold=0.5) + metric.measure(test_case) + score = metric.score + # reason = metric.reason + avg_score += score / len(inputs) + return avg_score + + +def prepare_dataset_and_model(chat_model_name: str, personality_file_path: Path, auth_token: str): + dataset_info = DATASET_MAPPING.get(personality_file_path.name, "") + assert dataset_info != "" + log.info("Loading dataset") + if dataset_info["name"].endswith(".txt"): + dataset = load_dataset("text", data_files={"data": dataset_info["name"]})["data"] + else: + dataset = load_dataset(dataset_info["name"])[dataset_info["split"]] + log.info("Dataset loading is finished") + + with open(personality_file_path, "rb") as f: + chatbot_config = yaml.safe_load(f) + + ov_llm = load_chat_model(chat_model_name, auth_token) + ov_chat_engine = SimpleChatEngine.from_defaults(llm=ov_llm, system_prompt=chatbot_config["system_configuration"], + memory=ChatMemoryBuffer.from_defaults()) + return dataset[dataset_info["col"]], ov_chat_engine + + +def run_test_deepeval(chat_model_name: str, personality_file_path: Path, auth_token: str, selection_num: int = 10) -> float: + """ + Args: + chat_model_name (str): large language model path. + personality_file_path (Path): personality file path. + auth_token (str): auth token used for huggingface. + selection_num (int): maximum number of prompt are selected to compute hallucination score + + Returns: + hallucination score: the higher the score, the higher possibility of having hallucination issue. + """ + dataset_question, ov_chat_engine = prepare_dataset_and_model(chat_model_name, personality_file_path, auth_token) + inputs = dataset_question + # We use question as context because the dataset lacks context + contexts = dataset_question + contexts_res = [[context] for context in contexts] + + outputs = [] + for input in tqdm(inputs[:selection_num]): + output = ov_chat_engine.chat(input).response + outputs.append(output) + + final_score = compute_deepeval_hallucination(inputs[:selection_num], outputs[:selection_num], contexts_res[:selection_num]) + return final_score + + +class OVSelfCheckLLMPrompt(SelfCheckLLMPrompt): + def __init__(self, ov_chat_engine: SimpleChatEngine): + self.ov_chat_engine = ov_chat_engine + self.text_mapping = {'yes': 0.0, 'no': 1.0, 'n/a': 0.5} + self.prompt_template = "Context: {context}\n\nSentence: {sentence}\n\nIs the sentence supported by the context above? Answer Yes or No.\n\nAnswer: " + self.not_defined_text = set() + self.generate_num = 3 + + def generate_outputs(self, prompt_list: List[str]) -> List[str]: + response_list = [] + for prompt in tqdm(prompt_list, desc="generating responses"): + tmp_list = [] + for _ in range(self.generate_num): + response = self.ov_chat_engine.chat(prompt).response + # remove part + response = response[response.rfind("") + 8:].strip() + tmp_list.append(response) + response_list.append(tmp_list) + return response_list + + def predict( + self, + sampled_passages: List[str], + ) -> np.array: + num_samples = len(sampled_passages) + scores = np.zeros((num_samples, num_samples)) + + for sent_i in range(num_samples): + sentence = sampled_passages[sent_i].replace("\n", " ") + for sample_i, sample in enumerate(sampled_passages): + if sent_i == sample_i: + continue + + # this seems to improve performance when using the simple prompt template + sample = sample.replace("\n", " ") + prompt = self.prompt_template.format(context=sample, sentence=sentence) + generate_output = self.ov_chat_engine.chat(prompt).response + + # get text after + truncate_output = generate_output[generate_output.rfind("") + 8:].strip() + score_ = self.text_postprocessing(truncate_output) + scores[sent_i, sample_i] = score_ + + avg_score = np.sum(scores) / num_samples / (num_samples - 1) + return avg_score + + +def run_test_selfcheckgpt(chat_model_name: str, personality_file_path: Path, auth_token: str, selection_num: int = 10) -> float: + """ + Args: + chat_model_name (str): large language model path. + personality_file_path (Path): personality file path. + auth_token (str): auth token used for huggingface. + selection_num (int): maximum number of prompt are selected to compute hallucination score + + Returns: + hallucination score: the higher the score, the higher possibility of having hallucination issue. + """ + dataset_question, ov_chat_engine = prepare_dataset_and_model(chat_model_name, personality_file_path, auth_token) + check_eng = OVSelfCheckLLMPrompt(ov_chat_engine) + response_list = check_eng.generate_outputs(dataset_question[:selection_num]) + score_list = [] + for response_list_per_prompt in tqdm(response_list, desc="predict hallucination ratio"): + score_list.append(check_eng.predict(response_list_per_prompt)) + final_score = float(np.mean(score_list)) + return final_score + + +if __name__ == "__main__": + # set up logging + log.getLogger().setLevel(log.INFO) + + parser = argparse.ArgumentParser() + parser.add_argument("--chat_model", type=str, default="deepseek-ai/DeepSeek-R1-Distill-Qwen-7B", help="Path/name of the chat model") + parser.add_argument("--personality", type=str, default="../healthcare_personality.yaml", help="Path to the YAML file with chatbot personality") + parser.add_argument("--hf_token", type=str, help="HuggingFace access token to get Llama3") + parser.add_argument("--check_type", type=str, choices=["deepeval", "selfcheckgpt"], default="deepeval", help="Hallucination check type") + parser.add_argument("--selection_num", type=int, default=5, help="Maximum number of prompt are selected to compute hallucination score") + + args = parser.parse_args() + if args.check_type == "deepeval": + hallucination_score = run_test_deepeval(args.chat_model, Path(args.personality), args.hf_token, args.selection_num) + else: + hallucination_score = run_test_selfcheckgpt(args.chat_model, Path(args.personality), args.hf_token, args.selection_num) + print(f"hallucination_score for personality {args.personality}: {hallucination_score}") diff --git a/demos/virtual_ai_assistant_demo/test/tutor_questions.txt b/demos/virtual_ai_assistant_demo/test/tutor_questions.txt new file mode 100644 index 00000000..0142eb2b --- /dev/null +++ b/demos/virtual_ai_assistant_demo/test/tutor_questions.txt @@ -0,0 +1,20 @@ +I'm struggling with understanding the concept of supply and demand in economics. Can you help explain it? +How does the law of demand work in a real-world scenario? +Can you give an example of how the law of supply affects market prices? +What factors can cause a shift in the demand curve? +How do changes in consumer income affect demand? +What is the difference between a movement along the supply curve and a shift in the supply curve? +How do technological advancements impact supply? +Can you explain the concept of equilibrium price? +What happens when there is a surplus in the market? +How does a shortage affect market equilibrium? +What role do government regulations play in supply and demand? +How do taxes influence supply and demand? +Can you explain the concept of price elasticity of demand? +What factors determine the elasticity of a product? +How does the elasticity of supply differ from the elasticity of demand? +What is the significance of cross-price elasticity? +How do substitute goods affect demand? +Can you explain the concept of complementary goods? +How do expectations of future prices impact current demand? +What is the role of consumer preferences in determining demand? \ No newline at end of file