diff --git a/src/lobster/assets/latent_generator_tokenizer/special_tokens_map.json b/src/lobster/assets/latent_generator_tokenizer/special_tokens_map.json
new file mode 100644
index 0000000..ba0f9b5
--- /dev/null
+++ b/src/lobster/assets/latent_generator_tokenizer/special_tokens_map.json
@@ -0,0 +1,7 @@
+{
+  "cls_token": "<cls>",
+  "eos_token": "<eos>",
+  "mask_token": "<mask>",
+  "pad_token": "<pad>",
+  "unk_token": "<unk>"
+}
diff --git a/src/lobster/assets/latent_generator_tokenizer/tokenizer.json b/src/lobster/assets/latent_generator_tokenizer/tokenizer.json
new file mode 100644
index 0000000..5bc71de
--- /dev/null
+++ b/src/lobster/assets/latent_generator_tokenizer/tokenizer.json
@@ -0,0 +1,359 @@
+{
+  "version": "1.0",
+  "truncation": null,
+  "padding": null,
+  "added_tokens": [],
+  "normalizer": null,
+  "pre_tokenizer": {
+    "type": "Sequence",
+    "pretokenizers": [
+      {
+        "type": "WhitespaceSplit"
+      }
+    ]
+  },
+  "post_processor": {
+    "type": "TemplateProcessing",
+    "single": [
+      {
+        "SpecialToken": {
+          "id": "<cls>",
+          "type_id": 0
+        }
+      },
+      {
+        "Sequence": {
+          "id": "A",
+          "type_id": 0
+        }
+      },
+      {
+        "SpecialToken": {
+          "id": "<eos>",
+          "type_id": 0
+        }
+      }
+    ],
+    "pair": [
+      {
+        "SpecialToken": {
+          "id": "<cls>",
+          "type_id": 0
+        }
+      },
+      {
+        "Sequence": {
+          "id": "A",
+          "type_id": 0
+        }
+      },
+      {
+        "SpecialToken": {
+          "id": "<eos>",
+          "type_id": 0
+        }
+      },
+      {
+        "Sequence": {
+          "id": "B",
+          "type_id": 1
+        }
+      },
+      {
+        "SpecialToken": {
+          "id": "<eos>",
+          "type_id": 1
+        }
+      }
+    ],
+    "special_tokens": {
+      "<cls>": {
+        "id": "<cls>",
+        "ids": [
+          0
+        ],
+        "tokens": [
+          "<cls>"
+        ]
+      },
+      "<eos>": {
+        "id": "<eos>",
+        "ids": [
+          2
+        ],
+        "tokens": [
+          "<eos>"
+        ]
+      }
+    }
+  },
+  "decoder": null,
+  "model": {
+    "type": "WordLevel",
+    "vocab": {
+      "<cls>": 0,
+      "<pad>": 1,
+      "<eos>": 2,
+      "<unk>": 3,
+      "<mask>": 4,
+      ".": 5,
+      "a": 6,
+      "b": 7,
+      "c": 8,
+      "d": 9,
+      "e": 10,
+      "f": 11,
+      "g": 12,
+      "h": 13,
+      "i": 14,
+      "j": 15,
+      "k": 16,
+      "l": 17,
+      "m": 18,
+      "n": 19,
+      "o": 20,
+      "p": 21,
+      "q": 22,
+      "r": 23,
+      "s": 24,
+      "t": 25,
+      "u": 26,
+      "v": 27,
+      "w": 28,
+      "x": 29,
+      "y": 30,
+      "z": 31,
+      "aa": 32,
+      "ab": 33,
+      "ac": 34,
+      "ad": 35,
+      "ae": 36,
+      "af": 37,
+      "ag": 38,
+      "ah": 39,
+      "ai": 40,
+      "aj": 41,
+      "ak": 42,
+      "al": 43,
+      "am": 44,
+      "an": 45,
+      "ao": 46,
+      "ap": 47,
+      "aq": 48,
+      "ar": 49,
+      "as": 50,
+      "at": 51,
+      "au": 52,
+      "av": 53,
+      "aw": 54,
+      "ax": 55,
+      "ay": 56,
+      "az": 57,
+      "ba": 58,
+      "bb": 59,
+      "bc": 60,
+      "bd": 61,
+      "be": 62,
+      "bf": 63,
+      "bg": 64,
+      "bh": 65,
+      "bi": 66,
+      "bj": 67,
+      "bk": 68,
+      "bl": 69,
+      "bm": 70,
+      "bn": 71,
+      "bo": 72,
+      "bp": 73,
+      "bq": 74,
+      "br": 75,
+      "bs": 76,
+      "bt": 77,
+      "bu": 78,
+      "bv": 79,
+      "bw": 80,
+      "bx": 81,
+      "by": 82,
+      "bz": 83,
+      "ca": 84,
+      "cb": 85,
+      "cc": 86,
+      "cd": 87,
+      "ce": 88,
+      "cf": 89,
+      "cg": 90,
+      "ch": 91,
+      "ci": 92,
+      "cj": 93,
+      "ck": 94,
+      "cl": 95,
+      "cm": 96,
+      "cn": 97,
+      "co": 98,
+      "cp": 99,
+      "cq": 100,
+      "cr": 101,
+      "cs": 102,
+      "ct": 103,
+      "cu": 104,
+      "cv": 105,
+      "cw": 106,
+      "cx": 107,
+      "cy": 108,
+      "cz": 109,
+      "da": 110,
+      "db": 111,
+      "dc": 112,
+      "dd": 113,
+      "de": 114,
+      "df": 115,
+      "dg": 116,
+      "dh": 117,
+      "di": 118,
+      "dj": 119,
+      "dk": 120,
+      "dl": 121,
+      "dm": 122,
+      "dn": 123,
+      "do": 124,
+      "dp": 125,
+      "dq": 126,
+      "dr": 127,
+      "ds": 128,
+      "dt": 129,
+      "du": 130,
+      "dv": 131,
+      "dw": 132,
+      "dx": 133,
+      "dy": 134,
+      "dz": 135,
+      "ea": 136,
+      "eb": 137,
+      "ec": 138,
+      "ed": 139,
+      "ee": 140,
+      "ef": 141,
+      "eg": 142,
+      "eh": 143,
+      "ei": 144,
+      "ej": 145,
+      "ek": 146,
+      "el": 147,
+      "em": 148,
+      "en": 149,
+      "eo": 150,
+      "ep": 151,
+      "eq": 152,
+      "er": 153,
"es": 154, + "et": 155, + "eu": 156, + "ev": 157, + "ew": 158, + "ex": 159, + "ey": 160, + "ez": 161, + "fa": 162, + "fb": 163, + "fc": 164, + "fd": 165, + "fe": 166, + "ff": 167, + "fg": 168, + "fh": 169, + "fi": 170, + "fj": 171, + "fk": 172, + "fl": 173, + "fm": 174, + "fn": 175, + "fo": 176, + "fp": 177, + "fq": 178, + "fr": 179, + "fs": 180, + "ft": 181, + "fu": 182, + "fv": 183, + "fw": 184, + "fx": 185, + "fy": 186, + "fz": 187, + "ga": 188, + "gb": 189, + "gc": 190, + "gd": 191, + "ge": 192, + "gf": 193, + "gg": 194, + "gh": 195, + "gi": 196, + "gj": 197, + "gk": 198, + "gl": 199, + "gm": 200, + "gn": 201, + "go": 202, + "gp": 203, + "gq": 204, + "gr": 205, + "gs": 206, + "gt": 207, + "gu": 208, + "gv": 209, + "gw": 210, + "gx": 211, + "gy": 212, + "gz": 213, + "ha": 214, + "hb": 215, + "hc": 216, + "hd": 217, + "he": 218, + "hf": 219, + "hg": 220, + "hh": 221, + "hi": 222, + "hj": 223, + "hk": 224, + "hl": 225, + "hm": 226, + "hn": 227, + "ho": 228, + "hp": 229, + "hq": 230, + "hr": 231, + "hs": 232, + "ht": 233, + "hu": 234, + "hv": 235, + "hw": 236, + "hx": 237, + "hy": 238, + "hz": 239, + "ia": 240, + "ib": 241, + "ic": 242, + "id": 243, + "ie": 244, + "if": 245, + "ig": 246, + "ih": 247, + "ii": 248, + "ij": 249, + "ik": 250, + "il": 251, + "im": 252, + "in": 253, + "io": 254, + "ip": 255, + "iq": 256, + "ir": 257, + "is": 258, + "it": 259, + "iu": 260, + "iv": 261 + }, + "unk_token": "" + } +} \ No newline at end of file diff --git a/src/lobster/assets/latent_generator_tokenizer/tokenizer_config.json b/src/lobster/assets/latent_generator_tokenizer/tokenizer_config.json new file mode 100644 index 0000000..c331c4e --- /dev/null +++ b/src/lobster/assets/latent_generator_tokenizer/tokenizer_config.json @@ -0,0 +1,12 @@ +{ + "clean_up_tokenization_spaces": true, + "cls_token": "", + "eos_token": "", + "mask_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "padding_side": "right", + "tokenizer_class": "PreTrainedTokenizerFast", + "truncation_side": "left", + "unk_token": "" +} diff --git a/src/lobster/assets/latent_generator_tokenizer/vocab.txt b/src/lobster/assets/latent_generator_tokenizer/vocab.txt new file mode 100644 index 0000000..d8fdf49 --- /dev/null +++ b/src/lobster/assets/latent_generator_tokenizer/vocab.txt @@ -0,0 +1,262 @@ + + + + + +. 
+a
+b
+c
+d
+e
+f
+g
+h
+i
+j
+k
+l
+m
+n
+o
+p
+q
+r
+s
+t
+u
+v
+w
+x
+y
+z
+aa
+ab
+ac
+ad
+ae
+af
+ag
+ah
+ai
+aj
+ak
+al
+am
+an
+ao
+ap
+aq
+ar
+as
+at
+au
+av
+aw
+ax
+ay
+az
+ba
+bb
+bc
+bd
+be
+bf
+bg
+bh
+bi
+bj
+bk
+bl
+bm
+bn
+bo
+bp
+bq
+br
+bs
+bt
+bu
+bv
+bw
+bx
+by
+bz
+ca
+cb
+cc
+cd
+ce
+cf
+cg
+ch
+ci
+cj
+ck
+cl
+cm
+cn
+co
+cp
+cq
+cr
+cs
+ct
+cu
+cv
+cw
+cx
+cy
+cz
+da
+db
+dc
+dd
+de
+df
+dg
+dh
+di
+dj
+dk
+dl
+dm
+dn
+do
+dp
+dq
+dr
+ds
+dt
+du
+dv
+dw
+dx
+dy
+dz
+ea
+eb
+ec
+ed
+ee
+ef
+eg
+eh
+ei
+ej
+ek
+el
+em
+en
+eo
+ep
+eq
+er
+es
+et
+eu
+ev
+ew
+ex
+ey
+ez
+fa
+fb
+fc
+fd
+fe
+ff
+fg
+fh
+fi
+fj
+fk
+fl
+fm
+fn
+fo
+fp
+fq
+fr
+fs
+ft
+fu
+fv
+fw
+fx
+fy
+fz
+ga
+gb
+gc
+gd
+ge
+gf
+gg
+gh
+gi
+gj
+gk
+gl
+gm
+gn
+go
+gp
+gq
+gr
+gs
+gt
+gu
+gv
+gw
+gx
+gy
+gz
+ha
+hb
+hc
+hd
+he
+hf
+hg
+hh
+hi
+hj
+hk
+hl
+hm
+hn
+ho
+hp
+hq
+hr
+hs
+ht
+hu
+hv
+hw
+hx
+hy
+hz
+ia
+ib
+ic
+id
+ie
+if
+ig
+ih
+ii
+ij
+ik
+il
+im
+in
+io
+ip
+iq
+ir
+is
+it
+iu
+iv
\ No newline at end of file
diff --git a/src/lobster/tokenization/__init__.py b/src/lobster/tokenization/__init__.py
index 938bdca..ba9e3b1 100644
--- a/src/lobster/tokenization/__init__.py
+++ b/src/lobster/tokenization/__init__.py
@@ -1,6 +1,7 @@
 from ._amino_acid import AminoAcidTokenizerFast
 from ._hyena_tokenizer import HyenaTokenizer
 from ._hyena_tokenizer_transform import HyenaTokenizerTransform
+from ._latent_generator_3d_coord_tokenizer import LatentGenerator3DCoordTokenizerFast
 from ._mgm_tokenizer import MgmTokenizer
 from ._mgm_tokenizer_transform import MgmTokenizerTransform
 from ._nucleotide_tokenizer import NucleotideTokenizerFast
diff --git a/src/lobster/tokenization/_latent_generator_3d_coord_tokenizer.py b/src/lobster/tokenization/_latent_generator_3d_coord_tokenizer.py
new file mode 100644
index 0000000..def466e
--- /dev/null
+++ b/src/lobster/tokenization/_latent_generator_3d_coord_tokenizer.py
@@ -0,0 +1,72 @@
+import importlib.resources
+
+from tokenizers import pre_tokenizers
+from tokenizers.models import WordLevel
+from tokenizers.pre_tokenizers import WhitespaceSplit
+from tokenizers.processors import TemplateProcessing
+from transformers import PreTrainedTokenizerFast
+
+from ._load_vocab_file import load_vocab_file
+from ._make_pretrained_tokenizer_fast import make_pretrained_tokenizer_fast
+
+PRETRAINED_TOKENIZER_PATH = importlib.resources.files("lobster") / "assets" / "latent_generator_tokenizer"
+VOCAB_PATH = PRETRAINED_TOKENIZER_PATH / "vocab.txt"
+vocab = load_vocab_file(VOCAB_PATH)
+LG_VOCAB = {v: k for k, v in enumerate(vocab)}
+
+
+def _make_latent_generator_3d_coord_tokenizer() -> PreTrainedTokenizerFast:
+    """Create a `PreTrainedTokenizerFast` for tokenizing protein structure 3D coordinates into Latent Generator tokens.
+
+    To create the tokenizer config stored under lobster/assets/latent_generator_tokenizer we run
+
+    ```
+    tokenizer = _make_latent_generator_3d_coord_tokenizer()
+    tokenizer.save_pretrained("src/lobster/assets/latent_generator_tokenizer")
+    ```
+
+    This can now be loaded using
+    `PreTrainedTokenizerFast.from_pretrained("src/lobster/assets/latent_generator_tokenizer")`
+    """
+
+    # WordLevel tokenizer
+    tokenizer_model = WordLevel(LG_VOCAB, unk_token="<unk>")
+
+    # pretokenizers
+    pre_tokenizer = pre_tokenizers.Sequence([WhitespaceSplit()])
+
+    # bert style post processing
+    post_processor = TemplateProcessing(
+        single="<cls> $A <eos>",
+        pair="<cls> $A <eos> $B:1 <eos>:1",
+        special_tokens=[("<cls>", 0), ("<eos>", 2)],  # NOTE must match ids from LG_VOCAB
+    )
+
+    return make_pretrained_tokenizer_fast(
+        tokenizer_model=tokenizer_model,
+        post_processor=post_processor,
+        pre_tokenizer=pre_tokenizer,
+        eos_token="<eos>",
+        unk_token="<unk>",
+        pad_token="<pad>",
+        cls_token="<cls>",
+        mask_token="<mask>",
+    )
+
+
+class LatentGenerator3DCoordTokenizerFast(PreTrainedTokenizerFast):
+    padding_side = "right"
+    truncation_side = "right"
+    model_input_names = ["input_ids", "attention_mask"]
+
+    def __init__(self):
+        super().__init__(
+            tokenizer_file=str(PRETRAINED_TOKENIZER_PATH / "tokenizer.json"),
+            bos_token=None,
+            eos_token="<eos>",
+            unk_token="<unk>",
+            sep_token=None,
+            pad_token="<pad>",
+            cls_token="<cls>",
+            mask_token="<mask>",
+        )
diff --git a/tests/lobster/tokenization/test__latent_generator_tokenizer.py b/tests/lobster/tokenization/test__latent_generator_tokenizer.py
new file mode 100644
index 0000000..142f3f1
--- /dev/null
+++ b/tests/lobster/tokenization/test__latent_generator_tokenizer.py
@@ -0,0 +1,87 @@
+from lobster.tokenization._latent_generator_3d_coord_tokenizer import (
+    LatentGenerator3DCoordTokenizerFast,
+    _make_latent_generator_3d_coord_tokenizer,
+)
+from transformers import PreTrainedTokenizerFast
+
+
+def test__make_latent_generator_3d_coord_tokenizer():
+    tokenizer = _make_latent_generator_3d_coord_tokenizer()
+
+    assert isinstance(tokenizer, PreTrainedTokenizerFast)
+
+    assert tokenizer.cls_token == "<cls>"
+    assert tokenizer.eos_token == "<eos>"
+    assert tokenizer.unk_token == "<unk>"
+    assert tokenizer.pad_token == "<pad>"
+    assert tokenizer.mask_token == "<mask>"
+
+    assert tokenizer.vocab_size == 262
+
+    assert tokenizer.special_tokens_map == {
+        "eos_token": "<eos>",
+        "unk_token": "<unk>",
+        "pad_token": "<pad>",
+        "cls_token": "<cls>",
+        "mask_token": "<mask>",
+    }
+
+    tokenized_output = tokenizer("gd fh ds fh ad gf fe cz ek ds cq")
+
+    assert tokenized_output.input_ids == [0, 191, 169, 128, 169, 35, 193, 166, 109, 146, 128, 100, 2]
+    assert tokenizer.decode(tokenized_output.input_ids) == "<cls> gd fh ds fh ad gf fe cz ek ds cq <eos>"
+
+    tokenized_output = tokenizer("GD FH DS FH AD GF FE CZ EK DS CQ")
+
+    assert tokenized_output.input_ids == [0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2]
+    assert (
+        tokenizer.decode(tokenized_output.input_ids)
+        == "<cls> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <eos>"
+    )
+
+    tokenized_output = tokenizer("R A gd fh ds")
+    assert tokenized_output.input_ids == [0, 3, 3, 191, 169, 128, 2]
+    assert tokenizer.decode(tokenized_output.input_ids) == "<cls> <unk> <unk> gd fh ds <eos>"
+
+
+class TestLatentGenerator3DCoordTokenizerFast:
+    def test__init__(self):
+        tokenizer = LatentGenerator3DCoordTokenizerFast()
+
+        assert isinstance(tokenizer, PreTrainedTokenizerFast)
+
+        assert tokenizer.vocab_size == 262
+
+        assert tokenizer.cls_token == "<cls>"
+        assert tokenizer.eos_token == "<eos>"
+        assert tokenizer.unk_token == "<unk>"
+        assert tokenizer.pad_token == "<pad>"
+        assert tokenizer.mask_token == "<mask>"
+
+        tokenized_output = tokenizer("gd fh ds fh ad gf fe cz ek ds cq")
+        assert tokenized_output.input_ids == [0, 191, 169, 128, 169, 35, 193, 166, 109, 146, 128, 100, 2]
+        assert tokenizer.decode(tokenized_output.input_ids) == "<cls> gd fh ds fh ad gf fe cz ek ds cq <eos>"
+
+        tokenized_output = tokenizer("GD FH DS FH AD GF FE CZ EK DS CQ")
+        assert tokenized_output.input_ids == [0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2]
+        assert (
+            tokenizer.decode(tokenized_output.input_ids)
+            == "<cls> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <eos>"
+        )
+
+        tokenized_output = tokenizer("R A gd fh ds")
+        assert tokenized_output.input_ids == [0, 3, 3, 191, 169, 128, 2]
+        assert tokenizer.decode(tokenized_output.input_ids) == "<cls> <unk> <unk> gd fh ds <eos>"
+
+        assert tokenizer.special_tokens_map == {
+            "eos_token": "<eos>",
+            "unk_token": "<unk>",
+            "pad_token": "<pad>",
+            "cls_token": "<cls>",
+            "mask_token": "<mask>",
+        }
+
+
+if __name__ == "__main__":
+    test__make_latent_generator_3d_coord_tokenizer()
+    TestLatentGenerator3DCoordTokenizerFast().test__init__()
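Usage sketch (illustrative, not part of the diff above). It assumes `lobster` is installed so the assets bundled under `lobster/assets/latent_generator_tokenizer` resolve via `importlib.resources`, and that `torch` is available for `return_tensors="pt"`:

```python
from lobster.tokenization import LatentGenerator3DCoordTokenizerFast

tokenizer = LatentGenerator3DCoordTokenizerFast()

# Latent Generator structure tokens are whitespace-separated entries from vocab.txt;
# anything not in the vocab (e.g. uppercase codes) maps to <unk> (id 3).
batch = tokenizer(
    ["gd fh ds fh ad", "gf fe cz"],
    padding=True,          # right-pads the shorter sequence with <pad> (id 1)
    return_tensors="pt",
)

print(batch["input_ids"])        # each row is <cls> ... <eos> plus padding
print(batch["attention_mask"])
print(tokenizer.batch_decode(batch["input_ids"], skip_special_tokens=True))
```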