Commit e6fa7eb

Use Anthropic tokenizer from Hugging Face (#3467)

Parent: 4ec117f

6 files changed (+14, -86 lines)

setup.cfg (+1, -1)

```diff
@@ -139,7 +139,7 @@ amazon =
     botocore~=1.34.131
 
 anthropic =
-    anthropic~=0.17,<0.39 # TODO(#3212): Limit anthropic to >=0.39 after resolving #3212.
+    anthropic~=0.39
     websocket-client~=1.3.2 # For legacy stanford-online-all-v4-s3
     httpx<0.28.0 # TODO(#3324): Remove this tepmorary workaround
 
```
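
A side note on the new pin (not part of the commit): `anthropic~=0.39` is a PEP 440 compatible-release specifier, equivalent to `>=0.39, ==0.*`. A minimal sketch of checking an installed version against it with the `packaging` library:

```python
# Sketch: verify the installed anthropic package against the new pin.
# "~=0.39" is PEP 440 shorthand for ">=0.39, ==0.*".
from importlib.metadata import version
from packaging.specifiers import SpecifierSet

pin = SpecifierSet("~=0.39")
installed = version("anthropic")
print(f"anthropic {installed} satisfies ~=0.39: {installed in pin}")
```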

src/helm/config/tokenizer_configs.yaml (+3, -1)

```diff
@@ -86,7 +86,9 @@ tokenizer_configs:
   # Anthropic
   - name: anthropic/claude
     tokenizer_spec:
-      class_name: "helm.tokenizers.anthropic_tokenizer.AnthropicTokenizer"
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+      args:
+        pretrained_model_name_or_path: Xenova/claude-tokenizer
     end_of_text_token: "<|endoftext|>"
     prefix_token: "<|endoftext|>"
 
```
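
In effect, HELM's `anthropic/claude` tokenizer is now backed by the community-hosted Hugging Face tokenizer `Xenova/claude-tokenizer` rather than Anthropic's client library. A hedged sketch (not HELM code, assuming the repo loads as a standard fast tokenizer) of using it directly with `transformers`:

```python
# Sketch: load the tokenizer the new YAML config points at.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Xenova/claude-tokenizer")
ids = tokenizer.encode("The quick brown fox")
print(ids)                                   # token ids
print(tokenizer.convert_ids_to_tokens(ids))  # raw byte-level BPE tokens
print(tokenizer.decode(ids))                 # should round-trip the input
```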

src/helm/tokenizers/anthropic_tokenizer.py (-52)

This file was deleted.

src/helm/tokenizers/caching_tokenizer.py (+1, -29)

```diff
@@ -1,6 +1,6 @@
 from abc import abstractmethod
 from dataclasses import asdict
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List
 
 from helm.common.cache import Cache, CacheConfig
 from helm.common.request import wrap_request_time
@@ -153,31 +153,3 @@ def decode(self, request: DecodeRequest) -> DecodeRequestResult:
         )
     except Exception as error:
         raise ValueError(f"Failed to decode tokens with {self.__class__.__name__} tokenizer: {error}") from error
-
-
-def cleanup_str(token: str, tokenizer_name: Optional[str] = None) -> str:
-    """
-    Certain tokenizers introduce special characters to represent spaces, such as
-    "Ġ" or "▁". This function removes those characters.
-    """
-    if tokenizer_name in [
-        "TsinghuaKEG/ice",
-        "bigscience/T0pp",
-        "google/t5-11b",
-        "google/flan-t5-xxl",
-        "google/ul2",
-        "Yandex/yalm",
-        "ai21/j1",
-        "together",
-    ]:
-        return token.replace("▁", " ")
-    elif tokenizer_name is not None and tokenizer_name.startswith("huggingface"):
-        return token.replace("Ġ", " ")
-    return token
-
-
-def cleanup_tokens(tokens: List[str], tokenizer_name: Optional[str] = None) -> List[str]:
-    """
-    Applies `cleanup_str` to each token in `tokens`.
-    """
-    return [cleanup_str(token, tokenizer_name) for token in tokens]
```
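
These helpers are not lost: `cleanup_str` also lives in `src/helm/tokenizers/tokenizer.py` (see the last diff below), so this hunk deletes a duplicate copy. A simplified standalone sketch of what they do:

```python
# Simplified sketch (the real helper switches on tokenizer_name): byte-level
# BPE tokenizers mark a leading space with "Ġ" and SentencePiece tokenizers
# with "▁"; cleanup maps both markers back to a plain space.
def cleanup_str(token: str) -> str:
    return token.replace("Ġ", " ").replace("▁", " ")

print([cleanup_str(t) for t in ["The", "Ġquick", "Ġbrown", "Ġfox"]])
# -> ['The', ' quick', ' brown', ' fox']
```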

src/helm/tokenizers/test_anthropic_tokenizer.py (+6, -2)

```diff
@@ -10,7 +10,7 @@
     TokenizationRequest,
     TokenizationRequestResult,
 )
-from helm.tokenizers.anthropic_tokenizer import AnthropicTokenizer
+from helm.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer
 
 
 class TestAnthropicTokenizer:
@@ -21,7 +21,11 @@ class TestAnthropicTokenizer:
     def setup_method(self, method):
         cache_file = tempfile.NamedTemporaryFile(delete=False)
         self.cache_path: str = cache_file.name
-        self.tokenizer = AnthropicTokenizer(SqliteCacheConfig(self.cache_path))
+        self.tokenizer = HuggingFaceTokenizer(
+            SqliteCacheConfig(self.cache_path),
+            tokenizer_name="anthropic/claude",
+            pretrained_model_name_or_path="Xenova/claude-tokenizer",
+        )
 
     def teardown_method(self, method):
         os.remove(self.cache_path)
```
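
For reference, a hedged sketch of constructing the reconfigured tokenizer outside of pytest, using only names visible in this diff plus an assumed import path for `TokenizationRequest`:

```python
import tempfile

from helm.common.cache import SqliteCacheConfig
from helm.common.tokenization_request import TokenizationRequest  # assumed path
from helm.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer

# Same construction as the updated setup_method above.
cache_file = tempfile.NamedTemporaryFile(delete=False)
tokenizer = HuggingFaceTokenizer(
    SqliteCacheConfig(cache_file.name),
    tokenizer_name="anthropic/claude",
    pretrained_model_name_or_path="Xenova/claude-tokenizer",
)
request = TokenizationRequest(text="Hello, world!", tokenizer="anthropic/claude")
print(tokenizer.tokenize(request).tokens)
```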

src/helm/tokenizers/tokenizer.py (+3, -1)

```diff
@@ -41,7 +41,9 @@ def cleanup_str(token: str, tokenizer_name: Optional[str] = None) -> str:
         "together",
     ]:
         return token.replace("▁", " ")
-    elif tokenizer_name is not None and tokenizer_name.startswith("huggingface"):
+    elif tokenizer_name is not None and (
+        tokenizer_name.startswith("huggingface") or tokenizer_name == "anthropic/claude"
+    ):
         return token.replace("Ġ", " ")
     return token
 
```
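
A quick illustration of the widened condition: `Xenova/claude-tokenizer` is byte-level BPE, so its tokens mark leading spaces with "Ġ", and `anthropic/claude` now gets the same cleanup as the `huggingface/...` tokenizers:

```python
from helm.tokenizers.tokenizer import cleanup_str  # import path assumed from the file location

print(cleanup_str("Ġhello", "anthropic/claude"))  # " hello" (new branch)
print(cleanup_str("Ġhello", "huggingface/gpt2"))  # " hello" (existing branch)
print(cleanup_str("Ġhello"))                      # "Ġhello" (no tokenizer name, unchanged)
```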
