Skip to content

Commit 35c691e

Browse files
Merge pull request #43 from prescient-design/structure_tokens
Structure tokens
2 parents 6cf2d1c + 06d87ce commit 35c691e

File tree

7 files changed

+800
-0
lines changed

7 files changed

+800
-0
lines changed
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,7 @@
1+
{
2+
"cls_token": "<cls>",
3+
"eos_token": "<eos>",
4+
"mask_token": "<mask>",
5+
"pad_token": "<pad>",
6+
"unk_token": "<unk>"
7+
}
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,359 @@
1+
{
2+
"version": "1.0",
3+
"truncation": null,
4+
"padding": null,
5+
"added_tokens": [],
6+
"normalizer": null,
7+
"pre_tokenizer": {
8+
"type": "Sequence",
9+
"pretokenizers": [
10+
{
11+
"type": "WhitespaceSplit"
12+
}
13+
]
14+
},
15+
"post_processor": {
16+
"type": "TemplateProcessing",
17+
"single": [
18+
{
19+
"SpecialToken": {
20+
"id": "<cls>",
21+
"type_id": 0
22+
}
23+
},
24+
{
25+
"Sequence": {
26+
"id": "A",
27+
"type_id": 0
28+
}
29+
},
30+
{
31+
"SpecialToken": {
32+
"id": "<eos>",
33+
"type_id": 0
34+
}
35+
}
36+
],
37+
"pair": [
38+
{
39+
"SpecialToken": {
40+
"id": "<cls>",
41+
"type_id": 0
42+
}
43+
},
44+
{
45+
"Sequence": {
46+
"id": "A",
47+
"type_id": 0
48+
}
49+
},
50+
{
51+
"SpecialToken": {
52+
"id": "<eos>",
53+
"type_id": 0
54+
}
55+
},
56+
{
57+
"Sequence": {
58+
"id": "B",
59+
"type_id": 1
60+
}
61+
},
62+
{
63+
"SpecialToken": {
64+
"id": "<eos>",
65+
"type_id": 1
66+
}
67+
}
68+
],
69+
"special_tokens": {
70+
"<cls>": {
71+
"id": "<cls>",
72+
"ids": [
73+
0
74+
],
75+
"tokens": [
76+
"<cls>"
77+
]
78+
},
79+
"<eos>": {
80+
"id": "<eos>",
81+
"ids": [
82+
2
83+
],
84+
"tokens": [
85+
"<eos>"
86+
]
87+
}
88+
}
89+
},
90+
"decoder": null,
91+
"model": {
92+
"type": "WordLevel",
93+
"vocab": {
94+
"<cls>": 0,
95+
"<pad>": 1,
96+
"<eos>": 2,
97+
"<unk>": 3,
98+
"<mask>": 4,
99+
".": 5,
100+
"a": 6,
101+
"b": 7,
102+
"c": 8,
103+
"d": 9,
104+
"e": 10,
105+
"f": 11,
106+
"g": 12,
107+
"h": 13,
108+
"i": 14,
109+
"j": 15,
110+
"k": 16,
111+
"l": 17,
112+
"m": 18,
113+
"n": 19,
114+
"o": 20,
115+
"p": 21,
116+
"q": 22,
117+
"r": 23,
118+
"s": 24,
119+
"t": 25,
120+
"u": 26,
121+
"v": 27,
122+
"w": 28,
123+
"x": 29,
124+
"y": 30,
125+
"z": 31,
126+
"aa": 32,
127+
"ab": 33,
128+
"ac": 34,
129+
"ad": 35,
130+
"ae": 36,
131+
"af": 37,
132+
"ag": 38,
133+
"ah": 39,
134+
"ai": 40,
135+
"aj": 41,
136+
"ak": 42,
137+
"al": 43,
138+
"am": 44,
139+
"an": 45,
140+
"ao": 46,
141+
"ap": 47,
142+
"aq": 48,
143+
"ar": 49,
144+
"as": 50,
145+
"at": 51,
146+
"au": 52,
147+
"av": 53,
148+
"aw": 54,
149+
"ax": 55,
150+
"ay": 56,
151+
"az": 57,
152+
"ba": 58,
153+
"bb": 59,
154+
"bc": 60,
155+
"bd": 61,
156+
"be": 62,
157+
"bf": 63,
158+
"bg": 64,
159+
"bh": 65,
160+
"bi": 66,
161+
"bj": 67,
162+
"bk": 68,
163+
"bl": 69,
164+
"bm": 70,
165+
"bn": 71,
166+
"bo": 72,
167+
"bp": 73,
168+
"bq": 74,
169+
"br": 75,
170+
"bs": 76,
171+
"bt": 77,
172+
"bu": 78,
173+
"bv": 79,
174+
"bw": 80,
175+
"bx": 81,
176+
"by": 82,
177+
"bz": 83,
178+
"ca": 84,
179+
"cb": 85,
180+
"cc": 86,
181+
"cd": 87,
182+
"ce": 88,
183+
"cf": 89,
184+
"cg": 90,
185+
"ch": 91,
186+
"ci": 92,
187+
"cj": 93,
188+
"ck": 94,
189+
"cl": 95,
190+
"cm": 96,
191+
"cn": 97,
192+
"co": 98,
193+
"cp": 99,
194+
"cq": 100,
195+
"cr": 101,
196+
"cs": 102,
197+
"ct": 103,
198+
"cu": 104,
199+
"cv": 105,
200+
"cw": 106,
201+
"cx": 107,
202+
"cy": 108,
203+
"cz": 109,
204+
"da": 110,
205+
"db": 111,
206+
"dc": 112,
207+
"dd": 113,
208+
"de": 114,
209+
"df": 115,
210+
"dg": 116,
211+
"dh": 117,
212+
"di": 118,
213+
"dj": 119,
214+
"dk": 120,
215+
"dl": 121,
216+
"dm": 122,
217+
"dn": 123,
218+
"do": 124,
219+
"dp": 125,
220+
"dq": 126,
221+
"dr": 127,
222+
"ds": 128,
223+
"dt": 129,
224+
"du": 130,
225+
"dv": 131,
226+
"dw": 132,
227+
"dx": 133,
228+
"dy": 134,
229+
"dz": 135,
230+
"ea": 136,
231+
"eb": 137,
232+
"ec": 138,
233+
"ed": 139,
234+
"ee": 140,
235+
"ef": 141,
236+
"eg": 142,
237+
"eh": 143,
238+
"ei": 144,
239+
"ej": 145,
240+
"ek": 146,
241+
"el": 147,
242+
"em": 148,
243+
"en": 149,
244+
"eo": 150,
245+
"ep": 151,
246+
"eq": 152,
247+
"er": 153,
248+
"es": 154,
249+
"et": 155,
250+
"eu": 156,
251+
"ev": 157,
252+
"ew": 158,
253+
"ex": 159,
254+
"ey": 160,
255+
"ez": 161,
256+
"fa": 162,
257+
"fb": 163,
258+
"fc": 164,
259+
"fd": 165,
260+
"fe": 166,
261+
"ff": 167,
262+
"fg": 168,
263+
"fh": 169,
264+
"fi": 170,
265+
"fj": 171,
266+
"fk": 172,
267+
"fl": 173,
268+
"fm": 174,
269+
"fn": 175,
270+
"fo": 176,
271+
"fp": 177,
272+
"fq": 178,
273+
"fr": 179,
274+
"fs": 180,
275+
"ft": 181,
276+
"fu": 182,
277+
"fv": 183,
278+
"fw": 184,
279+
"fx": 185,
280+
"fy": 186,
281+
"fz": 187,
282+
"ga": 188,
283+
"gb": 189,
284+
"gc": 190,
285+
"gd": 191,
286+
"ge": 192,
287+
"gf": 193,
288+
"gg": 194,
289+
"gh": 195,
290+
"gi": 196,
291+
"gj": 197,
292+
"gk": 198,
293+
"gl": 199,
294+
"gm": 200,
295+
"gn": 201,
296+
"go": 202,
297+
"gp": 203,
298+
"gq": 204,
299+
"gr": 205,
300+
"gs": 206,
301+
"gt": 207,
302+
"gu": 208,
303+
"gv": 209,
304+
"gw": 210,
305+
"gx": 211,
306+
"gy": 212,
307+
"gz": 213,
308+
"ha": 214,
309+
"hb": 215,
310+
"hc": 216,
311+
"hd": 217,
312+
"he": 218,
313+
"hf": 219,
314+
"hg": 220,
315+
"hh": 221,
316+
"hi": 222,
317+
"hj": 223,
318+
"hk": 224,
319+
"hl": 225,
320+
"hm": 226,
321+
"hn": 227,
322+
"ho": 228,
323+
"hp": 229,
324+
"hq": 230,
325+
"hr": 231,
326+
"hs": 232,
327+
"ht": 233,
328+
"hu": 234,
329+
"hv": 235,
330+
"hw": 236,
331+
"hx": 237,
332+
"hy": 238,
333+
"hz": 239,
334+
"ia": 240,
335+
"ib": 241,
336+
"ic": 242,
337+
"id": 243,
338+
"ie": 244,
339+
"if": 245,
340+
"ig": 246,
341+
"ih": 247,
342+
"ii": 248,
343+
"ij": 249,
344+
"ik": 250,
345+
"il": 251,
346+
"im": 252,
347+
"in": 253,
348+
"io": 254,
349+
"ip": 255,
350+
"iq": 256,
351+
"ir": 257,
352+
"is": 258,
353+
"it": 259,
354+
"iu": 260,
355+
"iv": 261
356+
},
357+
"unk_token": "<unk>"
358+
}
359+
}
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,12 @@
1+
{
2+
"clean_up_tokenization_spaces": true,
3+
"cls_token": "<cls>",
4+
"eos_token": "<eos>",
5+
"mask_token": "<mask>",
6+
"model_max_length": 1000000000000000019884624838656,
7+
"pad_token": "<pad>",
8+
"padding_side": "right",
9+
"tokenizer_class": "PreTrainedTokenizerFast",
10+
"truncation_side": "left",
11+
"unk_token": "<unk>"
12+
}

0 commit comments

Comments (0)