Merge branch 'stanford-crfm:main' into main

NoushNabi · web-flow · commit dfbce1f20444 · 2024-05-23T22:06:22.000-07:00
diff --git a/src/helm/benchmark/metrics/gpt4v_originality_critique_metrics.py b/src/helm/benchmark/metrics/gpt4v_originality_critique_metrics.py
@@ -90,8 +90,8 @@ def evaluate_generation(
         template = CritiqueTaskTemplate(
             name="vhelm_gpt4v_originality",
             # TODO: Add proper instructions
-            instructions="Answer the question given the text and image, remember to only answer "
-            "with a capital letter.\n\n{{prompt}}",
+            instructions="Answer the multiple choice question by just giving the letter of the correct "
+            "answer.\n\n{{prompt}}",
             num_respondents=self._num_respondents,
             questions=[
                 CritiqueQuestionTemplate(
diff --git a/src/helm/proxy/critique/model_critique_client.py b/src/helm/proxy/critique/model_critique_client.py
@@ -135,7 +135,13 @@ def _multiple_choice_completion_to_answer(
                 raise CritiqueParseError(
                     f"Invalid answer: {completion}. Multiple choice questions should have one answer."
                 )
-            return answers[0]
+            letter_answer = answers[0]
+            choice_rank = string.ascii_uppercase.index(letter_answer)
+            if choice_rank >= len(question.options):
+                raise CritiqueParseError(
+                    f"Invalid answer: {completion}. The answer is out of range of the options: {question.options}"
+                )
+            return letter_answer
         except CritiqueParseError as e:
             # If there was an error parsing the answer, we assume the user did not answer the question.
             hlog(f"Error parsing answer: {e}. Skipping question (and so the respondent entirely)")