Skip to content

Commit dfbce1f

Browse files
authored
Merge branch 'stanford-crfm:main' into main
2 parents f43477a + bfe9a4c commit dfbce1f

File tree

2 files changed

+9
-3
lines changed

2 files changed

+9
-3
lines changed

src/helm/benchmark/metrics/gpt4v_originality_critique_metrics.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -90,8 +90,8 @@ def evaluate_generation(
90 90
template = CritiqueTaskTemplate(
91 91
name="vhelm_gpt4v_originality",
92 92
# TODO: Add proper instructions
93-
instructions="Answer the question given the text and image, remember to only answer "
94-
"with a capital letter.\n\n{{prompt}}",
93+
instructions="Answer the multiple choice question by just giving the letter of the correct "
94+
"answer.\n\n{{prompt}}",
95 95
num_respondents=self._num_respondents,
96 96
questions=[
97 97
CritiqueQuestionTemplate(

src/helm/proxy/critique/model_critique_client.py

+7-1
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,13 @@ def _multiple_choice_completion_to_answer(
135 135
raise CritiqueParseError(
136 136
f"Invalid answer: {completion}. Multiple choice questions should have one answer."
137 137
)
138-
return answers[0]
138+
letter_answer = answers[0]
139+
choice_rank = string.ascii_uppercase.index(letter_answer)
140+
if choice_rank >= len(question.options):
141+
raise CritiqueParseError(
142+
f"Invalid answer: {completion}. The answer is out of range of the options: {question.options}"
143+
)
144+
return letter_answer
139 145
except CritiqueParseError as e:
140 146
# If there was an error parsing the answer, we assume the user did not answer the question.
141 147
hlog(f"Error parsing answer: {e}. Skipping question (and so the respondent entirely)")

0 commit comments

Comments
 (0)