Commit 73b3936

add AminoAcidTokenizerFast

This attempts to clarify/simplify the definition of tokenizer subclasses.

1 parent: 04bc1d5

File tree

6 files changed: +339 -0 lines changed

src/lobster/assets/amino_acid_tokenizer/__init__.py

Whitespace-only changes.
@@ -0,0 +1,7 @@
{
  "cls_token": "<cls>",
  "eos_token": "<eos>",
  "mask_token": "<mask>",
  "pad_token": "<pad>",
  "unk_token": "<unk>"
}
@@ -0,0 +1,176 @@
{
  "version": "1.0",
  "truncation": null,
  "padding": null,
  "added_tokens": [
    {
      "id": 0,
      "content": "<cls>",
      "single_word": false,
      "lstrip": false,
      "rstrip": false,
      "normalized": false,
      "special": true
    },
    {
      "id": 1,
      "content": "<pad>",
      "single_word": false,
      "lstrip": false,
      "rstrip": false,
      "normalized": false,
      "special": true
    },
    {
      "id": 2,
      "content": "<eos>",
      "single_word": false,
      "lstrip": false,
      "rstrip": false,
      "normalized": false,
      "special": true
    },
    {
      "id": 3,
      "content": "<unk>",
      "single_word": false,
      "lstrip": false,
      "rstrip": false,
      "normalized": false,
      "special": true
    },
    {
      "id": 32,
      "content": "<mask>",
      "single_word": false,
      "lstrip": false,
      "rstrip": false,
      "normalized": false,
      "special": true
    }
  ],
  "normalizer": null,
  "pre_tokenizer": null,
  "post_processor": {
    "type": "TemplateProcessing",
    "single": [
      {
        "SpecialToken": {
          "id": "<cls>",
          "type_id": 0
        }
      },
      {
        "Sequence": {
          "id": "A",
          "type_id": 0
        }
      },
      {
        "SpecialToken": {
          "id": "<eos>",
          "type_id": 0
        }
      }
    ],
    "pair": [
      {
        "SpecialToken": {
          "id": "<cls>",
          "type_id": 0
        }
      },
      {
        "Sequence": {
          "id": "A",
          "type_id": 0
        }
      },
      {
        "SpecialToken": {
          "id": "<eos>",
          "type_id": 0
        }
      },
      {
        "Sequence": {
          "id": "B",
          "type_id": 1
        }
      },
      {
        "SpecialToken": {
          "id": "<eos>",
          "type_id": 1
        }
      }
    ],
    "special_tokens": {
      "<cls>": {
        "id": "<cls>",
        "ids": [
          0
        ],
        "tokens": [
          "<cls>"
        ]
      },
      "<eos>": {
        "id": "<eos>",
        "ids": [
          2
        ],
        "tokens": [
          "<eos>"
        ]
      }
    }
  },
  "decoder": null,
  "model": {
    "type": "BPE",
    "dropout": null,
    "unk_token": "<unk>",
    "continuing_subword_prefix": null,
    "end_of_word_suffix": null,
    "fuse_unk": false,
    "byte_fallback": false,
    "ignore_merges": true,
    "vocab": {
      "<cls>": 0,
      "<pad>": 1,
      "<eos>": 2,
      "<unk>": 3,
      "L": 4,
      "A": 5,
      "G": 6,
      "V": 7,
      "S": 8,
      "E": 9,
      "R": 10,
      "T": 11,
      "I": 12,
      "D": 13,
      "P": 14,
      "K": 15,
      "Q": 16,
      "N": 17,
      "F": 18,
      "Y": 19,
      "M": 20,
      "H": 21,
      "W": 22,
      "C": 23,
      "X": 24,
      "B": 25,
      "U": 26,
      "Z": 27,
      "O": 28,
      ".": 29,
      "-": 30,
      "<null_1>": 31,
      "<mask>": 32
    },
    "merges": []
  }
}
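
As an aside (not part of the diff), here is a minimal sketch of what the TemplateProcessing post-processor above produces. It assumes the tokenizer.json shown here has been saved under src/lobster/assets/amino_acid_tokenizer, the location used by the docstring further down; since the BPE model has no merges and no pre-tokenizer, each character is simply looked up in the vocab.

```python
from tokenizers import Tokenizer

# Load the serialized tokenizer above (path assumed from the docstring below).
tok = Tokenizer.from_file("src/lobster/assets/amino_acid_tokenizer/tokenizer.json")

# Single sequence: template "<cls> $A <eos>"; characters map 1:1 via the merge-free BPE vocab.
enc = tok.encode("ACD")
print(enc.tokens)  # ['<cls>', 'A', 'C', 'D', '<eos>']
print(enc.ids)     # [0, 5, 23, 13, 2]

# Pair: template "<cls> $A <eos> $B:1 <eos>:1" gives the second segment type_id 1.
enc2 = tok.encode("AC", "DE")
print(enc2.type_ids)  # [0, 0, 0, 0, 1, 1, 1]
```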
@@ -0,0 +1,55 @@
{
  "added_tokens_decoder": {
    "0": {
      "content": "<cls>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "1": {
      "content": "<pad>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "2": {
      "content": "<eos>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "3": {
      "content": "<unk>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "32": {
      "content": "<mask>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "bos_token": null,
  "clean_up_tokenization_spaces": false,
  "cls_token": "<cls>",
  "eos_token": "<eos>",
  "extra_special_tokens": {},
  "mask_token": "<mask>",
  "model_max_length": 1000000000000000019884624838656,
  "pad_token": "<pad>",
  "sep_token": null,
  "tokenizer_class": "PreTrainedTokenizerFast",
  "unk_token": "<unk>"
}
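
Another hedged aside (not in the commit): with the config files above saved in src/lobster/assets/amino_acid_tokenizer, the directory can be loaded directly with transformers, which is the loading path the docstring below describes.

```python
from transformers import PreTrainedTokenizerFast

# Path assumed from the _make_amino_acid_tokenizer docstring below.
tokenizer = PreTrainedTokenizerFast.from_pretrained("src/lobster/assets/amino_acid_tokenizer")

print(tokenizer.cls_token_id, tokenizer.eos_token_id, tokenizer.mask_token_id)  # 0 2 32
print(tokenizer("GVL")["input_ids"])  # [0, 6, 7, 4, 2]
```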

src/lobster/tokenization/__init__.py

+2
@@ -14,3 +14,5 @@
     PT5TeacherForcingTransform,
     PT5TokenizerTransform,
 )
+
+from ._amino_acid import AminoAcidTokenizerFast
src/lobster/tokenization/_amino_acid.py

+99
@@ -0,0 +1,99 @@
import importlib.resources

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.processors import TemplateProcessing
from transformers import PreTrainedTokenizerFast

AA_VOCAB = {
    "<cls>": 0,
    "<pad>": 1,
    "<eos>": 2,
    "<unk>": 3,
    "L": 4,
    "A": 5,
    "G": 6,
    "V": 7,
    "S": 8,
    "E": 9,
    "R": 10,
    "T": 11,
    "I": 12,
    "D": 13,
    "P": 14,
    "K": 15,
    "Q": 16,
    "N": 17,
    "F": 18,
    "Y": 19,
    "M": 20,
    "H": 21,
    "W": 22,
    "C": 23,
    "X": 24,
    "B": 25,
    "U": 26,
    "Z": 27,
    "O": 28,
    ".": 29,
    "-": 30,
    "<null_1>": 31,
    "<mask>": 32,
}

PRETRAINED_TOKENIZER_PATH = importlib.resources.files("lobster") / "assets" / "amino_acid_tokenizer"


def _make_amino_acid_tokenizer() -> PreTrainedTokenizerFast:
    """Create a `PreTrainedTokenizerFast` object for tokenization of protein sequences.

    To create the tokenizer config stored under lobster/assets/amino_acid_tokenizer we run

    ```
    tokenizer = _make_amino_acid_tokenizer()
    tokenizer.save_pretrained("src/lobster/assets/amino_acid_tokenizer")
    ```

    This can now be loaded using
    `PreTrainedTokenizerFast.from_pretrained("src/lobster/assets/amino_acid_tokenizer")`
    """
    # BPE with no merges => just use input vocab
    tok = Tokenizer(BPE(AA_VOCAB, merges=[], unk_token="<unk>", ignore_merges=True))

    # bert style post processing
    tok.post_processor = TemplateProcessing(
        single="<cls> $A <eos>",
        pair="<cls> $A <eos> $B:1 <eos>:1",
        special_tokens=[("<cls>", 0), ("<eos>", 2)],  # NOTE must match ids from AA_VOCAB
    )

    tok = PreTrainedTokenizerFast(
        tokenizer_object=tok,
        bos_token=None,
        eos_token="<eos>",
        unk_token="<unk>",
        sep_token=None,
        pad_token="<pad>",
        cls_token="<cls>",
        mask_token="<mask>",
    )

    return tok


class AminoAcidTokenizerFast(PreTrainedTokenizerFast):
    padding_side = "right"
    truncation_side = "right"
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(self):
        super().__init__(
            tokenizer_file=str(PRETRAINED_TOKENIZER_PATH / "tokenizer.json"),
            bos_token=None,
            eos_token="<eos>",
            unk_token="<unk>",
            sep_token=None,
            pad_token="<pad>",
            cls_token="<cls>",
            mask_token="<mask>",
        )
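
A short usage sketch of the new class (not in the diff), assuming lobster is installed so that importlib.resources can resolve the bundled tokenizer assets:

```python
from lobster.tokenization import AminoAcidTokenizerFast

tokenizer = AminoAcidTokenizerFast()

# Single sequence: the post-processor wraps it as <cls> ... <eos>.
out = tokenizer("MKTAYIAK")
print(out["input_ids"])  # [0, 20, 15, 11, 5, 19, 12, 5, 15, 2]

# Batched encoding with padding/truncation for model input.
batch = tokenizer(
    ["MKT", "ACDEFG"],
    padding=True,
    truncation=True,
    max_length=8,
    return_tensors="pt",
)
print(batch["input_ids"].shape)  # torch.Size([2, 8])
```

Because the class hard-codes the bundled tokenizer.json and its special tokens, no arguments are needed at construction time, which is the simplification the commit message refers to.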
