Commit a512f00

yifanmai and ryokawajp authored
Add SUMO Web Claims Summarization scenario (#3112)
Co-authored-by: Ryo Kawahara <ryokawa@jp.ibm.com>
1 parent d8290e2 commit a512f00

File tree

4 files changed: +225 -14 lines changed

setup.cfg
src/helm/benchmark/run_specs/enterprise_run_specs.py
src/helm/benchmark/scenarios/sumosum_scenario.py
src/helm/benchmark/static/schema_enterprise.yaml

setup.cfg (+4)

```diff
@@ -172,6 +172,7 @@ models =
     crfm-helm[reka]
     crfm-helm[together]
     crfm-helm[yandex]
+    crfm-helm[ibm-enterprise-scenarios]
 
 reka =
     reka-api~=2.0.0
@@ -202,6 +203,9 @@ vlm =
     # For metrics
     pycocoevalcap~=1.2
 
+ibm-enterprise-scenarios =
+    openpyxl~=3.1
+
 image2struct =
     crfm-helm[images]
```

src/helm/benchmark/run_specs/enterprise_run_specs.py (+33)

```diff
@@ -89,3 +89,36 @@ def get_casehold_spec() -> RunSpec:
         metric_specs=metric_specs,
         groups=["casehold"],
     )
+
+
+# Climate
+
+
+@run_spec_function("sumosum")
+def get_sumosum_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.sumosum_scenario.SUMOSumScenario",
+        args={
+            # A too-short article could be garbage.
+            "test_filter_min_length": 100,
+            # A too-long article doesn't fit in a prompt.
+            "test_filter_max_length": 3700,
+        },
+    )
+
+    instructions = "Generate the title of the following article."
+    adapter_spec = get_generation_adapter_spec(
+        instructions=instructions,
+        output_noun="Title",
+        max_train_instances=0,
+        max_tokens=100,
+        stop_sequences=["\n\n"],
+    )
+
+    return RunSpec(
+        name="sumosum",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_basic_metric_specs(["rouge_1", "rouge_2", "rouge_l"]),
+        groups=["sumosum"],
+    )
```
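Called in isolation, the new run spec function only assembles configuration, so it can be inspected without any model access. A minimal sketch (not part of the commit; assumes `crfm-helm` is installed with the `ibm-enterprise-scenarios` extra):

```python
# Build the "sumosum" run spec and inspect it; no model calls are made.
from helm.benchmark.run_specs.enterprise_run_specs import get_sumosum_spec

spec = get_sumosum_spec()
print(spec.name)                      # sumosum
print(spec.scenario_spec.class_name)  # helm.benchmark.scenarios.sumosum_scenario.SUMOSumScenario
print(spec.adapter_spec.max_tokens)   # 100
```

An end-to-end evaluation would instead go through HELM's runner, along the lines of `helm-run --run-entries sumosum:model=<model> --suite <suite> --max-eval-instances 10` (exact flag spelling depends on the HELM version in use).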
src/helm/benchmark/scenarios/sumosum_scenario.py (new file, +157)

```python
import os
import re
from typing import Dict, List, Optional

import pandas as pd

from helm.common.general import ensure_file_downloaded, ensure_directory_exists
from helm.common.optional_dependencies import handle_module_not_found_error
from helm.benchmark.scenarios.scenario import (
    Scenario,
    Instance,
    Reference,
    TRAIN_SPLIT,
    TEST_SPLIT,
    CORRECT_TAG,
    Input,
    Output,
)

try:
    # Needed for pandas.read_excel
    import openpyxl  # noqa
except ModuleNotFoundError as e:
    handle_module_not_found_error(e, ["ibm-enterprise-scenarios"])


class SUMOSumScenario(Scenario):
    """SUMO Web Claims Summarization

    SUMO Web Claims Summarization is a summarization task over the climate subset of the SUMO dataset.
    The task is to write a title based on the article contents.

    Citation:
    @inproceedings{mishra-etal-2020-generating,
        title = "Generating Fact Checking Summaries for Web Claims",
        author = "Mishra, Rahul and
          Gupta, Dhruv and
          Leippold, Markus",
        editor = "Xu, Wei and
          Ritter, Alan and
          Baldwin, Tim and
          Rahimi, Afshin",
        booktitle = "Proceedings of the Sixth Workshop on Noisy User-generated Text (W-NUT 2020)",
        month = nov,
        year = "2020",
        address = "Online",
        publisher = "Association for Computational Linguistics",
        url = "https://aclanthology.org/2020.wnut-1.12",
        doi = "10.18653/v1/2020.wnut-1.12",
        pages = "81--90",
        abstract = "We present SUMO, a neural attention-based approach that learns to establish correctness of textual claims based on evidence in the form of text documents (e.g., news articles or web documents). SUMO further generates an extractive summary by presenting a diversified set of sentences from the documents that explain its decision on the correctness of the textual claim. Prior approaches to address the problem of fact checking and evidence extraction have relied on simple concatenation of claim and document word embeddings as an input to claim driven attention weight computation. This is done so as to extract salient words and sentences from the documents that help establish the correctness of the claim. However this design of claim-driven attention fails to capture the contextual information in documents properly. We improve on the prior art by using improved claim and title guided hierarchical attention to model effective contextual cues. We show the efficacy of our approach on political, healthcare, and environmental datasets.",
    }
    """  # noqa: E501

    name = "sumosum"
    description = "Text summarization with climate corpus"
    tags = ["summarization", "climate"]

    TRAIN_RATIO = 0.2
    TITLE_KEY = "Title"
    DOCUMENT_KEY = "Doc_text"

    def __init__(
        self,
        train_filter_min_length: Optional[int] = None,
        train_filter_max_length: Optional[int] = None,
        test_filter_min_length: Optional[int] = None,
        test_filter_max_length: Optional[int] = None,
        truncate_length: Optional[int] = None,
    ):
        """
        Initializes the scenario.

        Args:
            train_filter_min_length: Int indicating minimum length for training
                documents. Train examples smaller than
                train_filter_min_length tokens will be filtered out.
            train_filter_max_length: Int indicating maximum length for training
                documents. Train examples larger than
                train_filter_max_length tokens will be filtered out.
            test_filter_min_length: Int indicating minimum length for test
                documents. Test examples smaller than
                test_filter_min_length tokens will be filtered out.
            test_filter_max_length: Int indicating maximum length for test
                documents. Test examples larger than
                test_filter_max_length tokens will be filtered out.
            truncate_length: Int indicating the maximum length in tokens to
                truncate documents. Documents in all splits will be
                truncated to truncate_length tokens.
                NOTE: Whitespace tokenization is used to compute tokens.
        """
        super().__init__()
        self.train_filter_min_length = train_filter_min_length
        self.train_filter_max_length = train_filter_max_length
        self.test_filter_min_length = test_filter_min_length
        self.test_filter_max_length = test_filter_max_length
        self.truncate_length = truncate_length

    @staticmethod
    def _clean_and_truncate(text: str, max_length: Optional[int] = None) -> str:
        text = re.sub(r"\s+", " ", text)
        return " ".join(text.split()[:max_length])

    def _load_dataset(self, output_path: str) -> Dict[str, pd.DataFrame]:
        data_dir = os.path.join(output_path, "data")
        ensure_directory_exists(data_dir)

        source_url = "https://github.com/rahulOmishra/SUMO/raw/main/climate_claims_raw.xlsx"
        source_file = os.path.basename(source_url)
        target_path = os.path.join(data_dir, source_file)
        ensure_file_downloaded(
            source_url=source_url,
            target_path=target_path,
        )

        # Column headers: Claim_id(int), Claim, Title, Doc_text, Label(bool)
        target_df = pd.read_excel(target_path, skiprows=1)
        target_df = target_df.dropna(subset=[SUMOSumScenario.TITLE_KEY, SUMOSumScenario.DOCUMENT_KEY])
        # Remove the carriage-return marker _x000D_ that Excel embeds in strings
        target_df = target_df.replace({r"_x000D_": ""}, regex=True)
        # target_df = target_df.replace({r"_x([0-9a-fA-F]{4})_": ""}, regex=True)
        # Split randomly (works better than splitting by order)
        train_df = target_df.sample(frac=SUMOSumScenario.TRAIN_RATIO, random_state=0)
        test_df = target_df.drop(train_df.index).sample(frac=1, random_state=0)
        return {TRAIN_SPLIT: train_df, TEST_SPLIT: test_df}

    def get_instances(self, output_path: str) -> List[Instance]:
        dataset_dict = self._load_dataset(output_path)

        instances: List[Instance] = []

        for split, split_data in dataset_dict.items():
            for example in split_data.itertuples():
                document = getattr(example, SUMOSumScenario.DOCUMENT_KEY)
                title = getattr(example, SUMOSumScenario.TITLE_KEY)
                art_len = len(document.split())
                if split == TEST_SPLIT:
                    if self.test_filter_max_length and art_len > self.test_filter_max_length:
                        continue
                    if self.test_filter_min_length and art_len < self.test_filter_min_length:
                        continue
                if split == TRAIN_SPLIT:
                    if self.train_filter_max_length and art_len > self.train_filter_max_length:
                        continue
                    if self.train_filter_min_length and art_len < self.train_filter_min_length:
                        continue

                document = SUMOSumScenario._clean_and_truncate(document, self.truncate_length)
                title = SUMOSumScenario._clean_and_truncate(title)

                instance = Instance(
                    input=Input(text=document),
                    references=[Reference(output=Output(text=title), tags=[CORRECT_TAG])],
                    split=split,
                )
                instances.append(instance)

        return instances
```
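Because `get_instances` downloads the workbook and applies the split and length filters in one pass, the scenario can be smoke-tested standalone. A sketch (not part of the commit; the output path below is an arbitrary placeholder):

```python
# Materialize instances with the same filters the run spec uses,
# then inspect one test example (document plus gold title).
from helm.benchmark.scenarios.scenario import TEST_SPLIT
from helm.benchmark.scenarios.sumosum_scenario import SUMOSumScenario

scenario = SUMOSumScenario(test_filter_min_length=100, test_filter_max_length=3700)
instances = scenario.get_instances(output_path="sumosum_scratch")

test_instances = [x for x in instances if x.split == TEST_SPLIT]
print(len(test_instances))
print(test_instances[0].references[0].output.text)  # gold title
print(test_instances[0].input.text[:200])           # start of the article
```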

src/helm/benchmark/static/schema_enterprise.yaml (+31 -14)

```diff
@@ -110,6 +110,20 @@ run_groups:
     subgroups:
       - gold_commodity_news
 
+  - name: legal_scenarios
+    display_name: Legal Scenarios
+    description: Scenarios for the legal domain
+    category: All scenarios
+    subgroups:
+      - legal_contract_summarization
+
+  - name: climate_scenarios
+    display_name: Climate Scenarios
+    description: Scenarios for the climate domain
+    category: All scenarios
+    subgroups:
+      - sumosum
+
   - name: gold_commodity_news
     display_name: Gold Commodity News
     description: A classification benchmark based on a dataset of human-annotated gold commodity news headlines ([Sinha & Khandait, 2019](https://arxiv.org/abs/2009.04202)).
@@ -127,14 +141,6 @@ run_groups:
       when: 2000-2019
       language: English
 
-  - name: legal_scenarios
-    display_name: Legal Scenarios
-    description: Scenarios for the legal domain
-    category: All scenarios
-    subgroups:
-      - legal_contract_summarization
-      - casehold
-
   - name: legal_contract_summarization
     display_name: Legal Contract Summarization
     description: Plain English Summarization of Contracts [(Manor et al., 2019)](https://aclanthology.org/W19-2201.pdf).
@@ -152,15 +158,9 @@ run_groups:
       when: before 2019
       language: English
 
-
   - name: casehold
     display_name: CaseHOLD
     description: CaseHOLD (Case Holdings On Legal Decisions) is a multiple choice question answering scenario where the task is to identify the relevant holding of a cited case [(Zheng et al, 2021)](https://arxiv.org/pdf/2104.08671.pdf).
-    metric_groups:
-      - accuracy
-      - efficiency
-      - general_information
-    environment:
       main_name: exact_match
       main_split: test
     taxonomy:
@@ -169,3 +169,20 @@ run_groups:
       who: legal professionals
       when: before 2021
       language: English
+
+  - name: sumosum
+    display_name: SUMO Web Claims Summarization
+    description: A summarization benchmark based on the climate subset of the SUMO dataset ([Mishra et al., 2020](https://aclanthology.org/2020.wnut-1.12/)).
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: rouge_2
+      main_split: test
+    taxonomy:
+      task: summarization
+      what: Articles from climatefeedback.org
+      who: Writers of news articles and web documents
+      when: Before 2020
+      language: English
```
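The new run group reports ROUGE-2 as its headline metric (`main_name: rouge_2`), i.e. bigram overlap between the generated and gold titles. For intuition only, a toy F1-style computation follows; this is not HELM's implementation, which uses its standard ROUGE metric machinery:

```python
# Toy ROUGE-2 F1: clipped bigram overlap between candidate and reference titles.
from collections import Counter
from typing import List, Tuple


def bigrams(tokens: List[str]) -> "Counter[Tuple[str, str]]":
    return Counter(zip(tokens, tokens[1:]))


def rouge_2_f1(candidate: str, reference: str) -> float:
    cand, ref = bigrams(candidate.split()), bigrams(reference.split())
    overlap = sum((cand & ref).values())  # bigrams shared by both titles
    if overlap == 0:
        return 0.0
    precision = overlap / sum(cand.values())
    recall = overlap / sum(ref.values())
    return 2 * precision * recall / (precision + recall)


print(rouge_2_f1("scientists dispute climate claim", "climate claim disputed by scientists"))
```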
