From 26e57469168edd32f0216aa1e0bd4e95be7aaa02 Mon Sep 17 00:00:00 2001 From: Sidney Lisanza Date: Thu, 6 Mar 2025 15:41:51 -0500 Subject: [PATCH 01/10] lg tokenizer --- .../tokenization/_latent_generator_tokens.py | 91 +++++++++++++++++++ 1 file changed, 91 insertions(+) create mode 100644 src/lobster/tokenization/_latent_generator_tokens.py diff --git a/src/lobster/tokenization/_latent_generator_tokens.py b/src/lobster/tokenization/_latent_generator_tokens.py new file mode 100644 index 0000000..a769b77 --- /dev/null +++ b/src/lobster/tokenization/_latent_generator_tokens.py @@ -0,0 +1,91 @@ +import importlib.resources + +from tokenizers.models import BPE +from tokenizers.processors import TemplateProcessing +from transformers import PreTrainedTokenizerFast + +from ._make_pretrained_tokenizer_fast import make_pretrained_tokenizer_fast + +LG_VOCAB = {'': 0, '': 1, '': 2, '': 3, '': 4, '.': 5, 'a': 6, 'b': 7, 'c': 8, + 'd': 9, 'e': 10, 'f': 11, 'g': 12, 'h': 13, 'i': 14, 'j': 15, 'k': 16, 'l': 17, 'm': 18, 'n': 19, + 'o': 20, 'p': 21, 'q': 22, 'r': 23, 's': 24, 't': 25, 'u': 26, 'v': 27, 'w': 28, 'x': 29, 'y': 30, + 'z': 31, 'aa': 32, 'ab': 33, 'ac': 34, 'ad': 35, 'ae': 36, 'af': 37, 'ag': 38, 'ah': 39, 'ai': 40, + 'aj': 41, 'ak': 42, 'al': 43, 'am': 44, 'an': 45, 'ao': 46, 'ap': 47, 'aq': 48, 'ar': 49, 'as': 50, + 'at': 51, 'au': 52, 'av': 53, 'aw': 54, 'ax': 55, 'ay': 56, 'az': 57, 'ba': 58, 'bb': 59, 'bc': 60, + 'bd': 61, 'be': 62, 'bf': 63, 'bg': 64, 'bh': 65, 'bi': 66, 'bj': 67, 'bk': 68, 'bl': 69, 'bm': 70, + 'bn': 71, 'bo': 72, 'bp': 73, 'bq': 74, 'br': 75, 'bs': 76, 'bt': 77, 'bu': 78, 'bv': 79, 'bw': 80, + 'bx': 81, 'by': 82, 'bz': 83, 'ca': 84, 'cb': 85, 'cc': 86, 'cd': 87, 'ce': 88, 'cf': 89, 'cg': 90, + 'ch': 91, 'ci': 92, 'cj': 93, 'ck': 94, 'cl': 95, 'cm': 96, 'cn': 97, 'co': 98, 'cp': 99, 'cq': 100, + 'cr': 101, 'cs': 102, 'ct': 103, 'cu': 104, 'cv': 105, 'cw': 106, 'cx': 107, 'cy': 108, 'cz': 109, + 'da': 110, 'db': 111, 'dc': 112, 'dd': 113, 'de': 114, 'df': 115, 'dg': 116, 'dh': 117, 'di': 118, + 'dj': 119, 'dk': 120, 'dl': 121, 'dm': 122, 'dn': 123, 'do': 124, 'dp': 125, 'dq': 126, 'dr': 127, + 'ds': 128, 'dt': 129, 'du': 130, 'dv': 131, 'dw': 132, 'dx': 133, 'dy': 134, 'dz': 135, 'ea': 136, + 'eb': 137, 'ec': 138, 'ed': 139, 'ee': 140, 'ef': 141, 'eg': 142, 'eh': 143, 'ei': 144, 'ej': 145, + 'ek': 146, 'el': 147, 'em': 148, 'en': 149, 'eo': 150, 'ep': 151, 'eq': 152, 'er': 153, 'es': 154, + 'et': 155, 'eu': 156, 'ev': 157, 'ew': 158, 'ex': 159, 'ey': 160, 'ez': 161, 'fa': 162, 'fb': 163, + 'fc': 164, 'fd': 165, 'fe': 166, 'ff': 167, 'fg': 168, 'fh': 169, 'fi': 170, 'fj': 171, 'fk': 172, + 'fl': 173, 'fm': 174, 'fn': 175, 'fo': 176, 'fp': 177, 'fq': 178, 'fr': 179, 'fs': 180, 'ft': 181, + 'fu': 182, 'fv': 183, 'fw': 184, 'fx': 185, 'fy': 186, 'fz': 187, 'ga': 188, 'gb': 189, 'gc': 190, + 'gd': 191, 'ge': 192, 'gf': 193, 'gg': 194, 'gh': 195, 'gi': 196, 'gj': 197, 'gk': 198, 'gl': 199, + 'gm': 200, 'gn': 201, 'go': 202, 'gp': 203, 'gq': 204, 'gr': 205, 'gs': 206, 'gt': 207, 'gu': 208, + 'gv': 209, 'gw': 210, 'gx': 211, 'gy': 212, 'gz': 213, 'ha': 214, 'hb': 215, 'hc': 216, 'hd': 217, + 'he': 218, 'hf': 219, 'hg': 220, 'hh': 221, 'hi': 222, 'hj': 223, 'hk': 224, 'hl': 225, 'hm': 226, + 'hn': 227, 'ho': 228, 'hp': 229, 'hq': 230, 'hr': 231, 'hs': 232, 'ht': 233, 'hu': 234, 'hv': 235, + 'hw': 236, 'hx': 237, 'hy': 238, 'hz': 239, 'ia': 240, 'ib': 241, 'ic': 242, 'id': 243, 'ie': 244, + 'if': 245, 'ig': 246, 'ih': 247, 'ii': 248, 'ij': 249, 'ik': 250, 'il': 251, 'im': 252, 
'in': 253, + 'io': 254, 'ip': 255, 'iq': 256, 'ir': 257, 'is': 258, 'it': 259, 'iu': 260, 'iv': 261} + +PRETRAINED_TOKENIZER_PATH = importlib.resources.files("lobster") / "assets" / "latent_generator_tokenizer" + + +def _make_latent_generator_tokenizer() -> PreTrainedTokenizerFast: + """Create a `PreTrainedTokenizerFast` object for tokenization of protein structure latent generator sequences. + + To create the tokenizer config stored under lobster/assets/latent_generator_tokenizer we run + + ``` + tokenizer = _make_latent_generator_tokenizer() + tokenizer.save_pretrained("src/lobster/assets/latent_generator_tokenizer") + ``` + + This can now be loaded using + `PreTrainedTokenizerFast.from_pretrained("src/lobster/assets/latent_generator_tokenizer")` + """ + + # BPE with no merges => just use input vocab + tokenizer_model = BPE(LG_VOCAB, merges=[], unk_token="", ignore_merges=True) + + # bert style post processing + post_processor = TemplateProcessing( + single=" $A ", + pair=" $A $B:1 :1", + special_tokens=[("", 0), ("", 2)], # NOTE must match ids from AA_VOCAB + ) + + return make_pretrained_tokenizer_fast( + tokenizer_model=tokenizer_model, + post_processor=post_processor, + eos_token="", + unk_token="", + pad_token="", + cls_token="", + mask_token="", + ) + + +class LatentGeneratorTokenizerFast(PreTrainedTokenizerFast): + padding_side = "right" + truncation_side = "right" + model_input_names = ["input_ids", "attention_mask"] + + def __init__(self): + super().__init__( + tokenizer_file=str(PRETRAINED_TOKENIZER_PATH / "tokenizer.json"), + bos_token=None, + eos_token="", + unk_token="", + sep_token=None, + pad_token="", + cls_token="", + mask_token="", + ) From 8eaa2acd1014f8b958560cc3958b212650a3e53f Mon Sep 17 00:00:00 2001 From: Sidney Lisanza Date: Thu, 6 Mar 2025 16:39:59 -0500 Subject: [PATCH 02/10] lg tokenizer assets --- .../special_tokens_map.json | 7 + .../latent_generator_tokenizer/tokenizer.json | 358 ++++++++++++++++++ .../tokenizer_config.json | 12 + src/lobster/tokenization/__init__.py | 1 + 4 files changed, 378 insertions(+) create mode 100644 src/lobster/assets/latent_generator_tokenizer/special_tokens_map.json create mode 100644 src/lobster/assets/latent_generator_tokenizer/tokenizer.json create mode 100644 src/lobster/assets/latent_generator_tokenizer/tokenizer_config.json diff --git a/src/lobster/assets/latent_generator_tokenizer/special_tokens_map.json b/src/lobster/assets/latent_generator_tokenizer/special_tokens_map.json new file mode 100644 index 0000000..ba0f9b5 --- /dev/null +++ b/src/lobster/assets/latent_generator_tokenizer/special_tokens_map.json @@ -0,0 +1,7 @@ +{ + "cls_token": "", + "eos_token": "", + "mask_token": "", + "pad_token": "", + "unk_token": "" +} diff --git a/src/lobster/assets/latent_generator_tokenizer/tokenizer.json b/src/lobster/assets/latent_generator_tokenizer/tokenizer.json new file mode 100644 index 0000000..5d93118 --- /dev/null +++ b/src/lobster/assets/latent_generator_tokenizer/tokenizer.json @@ -0,0 +1,358 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [], + "normalizer": null, + "pre_tokenizer": null, + "post_processor": { + "type": "TemplateProcessing", + "single": [ + { + "SpecialToken": { + "id": "", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + }, + { + "SpecialToken": { + "id": "", + "type_id": 0 + } + } + ], + "pair": [ + { + "SpecialToken": { + "id": "", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + }, + { + "SpecialToken": { + 
"id": "", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "B", + "type_id": 1 + } + }, + { + "SpecialToken": { + "id": "", + "type_id": 1 + } + } + ], + "special_tokens": { + "": { + "id": "", + "ids": [ + 0 + ], + "tokens": [ + "" + ] + }, + "": { + "id": "", + "ids": [ + 2 + ], + "tokens": [ + "" + ] + } + } + }, + "decoder": null, + "model": { + "type": "BPE", + "dropout": null, + "unk_token": "", + "continuing_subword_prefix": null, + "end_of_word_suffix": null, + "fuse_unk": false, + "byte_fallback": false, + "vocab": { + "": 0, + "": 1, + "": 2, + "": 3, + "": 4, + ".": 5, + "a": 6, + "b": 7, + "c": 8, + "d": 9, + "e": 10, + "f": 11, + "g": 12, + "h": 13, + "i": 14, + "j": 15, + "k": 16, + "l": 17, + "m": 18, + "n": 19, + "o": 20, + "p": 21, + "q": 22, + "r": 23, + "s": 24, + "t": 25, + "u": 26, + "v": 27, + "w": 28, + "x": 29, + "y": 30, + "z": 31, + "aa": 32, + "ab": 33, + "ac": 34, + "ad": 35, + "ae": 36, + "af": 37, + "ag": 38, + "ah": 39, + "ai": 40, + "aj": 41, + "ak": 42, + "al": 43, + "am": 44, + "an": 45, + "ao": 46, + "ap": 47, + "aq": 48, + "ar": 49, + "as": 50, + "at": 51, + "au": 52, + "av": 53, + "aw": 54, + "ax": 55, + "ay": 56, + "az": 57, + "ba": 58, + "bb": 59, + "bc": 60, + "bd": 61, + "be": 62, + "bf": 63, + "bg": 64, + "bh": 65, + "bi": 66, + "bj": 67, + "bk": 68, + "bl": 69, + "bm": 70, + "bn": 71, + "bo": 72, + "bp": 73, + "bq": 74, + "br": 75, + "bs": 76, + "bt": 77, + "bu": 78, + "bv": 79, + "bw": 80, + "bx": 81, + "by": 82, + "bz": 83, + "ca": 84, + "cb": 85, + "cc": 86, + "cd": 87, + "ce": 88, + "cf": 89, + "cg": 90, + "ch": 91, + "ci": 92, + "cj": 93, + "ck": 94, + "cl": 95, + "cm": 96, + "cn": 97, + "co": 98, + "cp": 99, + "cq": 100, + "cr": 101, + "cs": 102, + "ct": 103, + "cu": 104, + "cv": 105, + "cw": 106, + "cx": 107, + "cy": 108, + "cz": 109, + "da": 110, + "db": 111, + "dc": 112, + "dd": 113, + "de": 114, + "df": 115, + "dg": 116, + "dh": 117, + "di": 118, + "dj": 119, + "dk": 120, + "dl": 121, + "dm": 122, + "dn": 123, + "do": 124, + "dp": 125, + "dq": 126, + "dr": 127, + "ds": 128, + "dt": 129, + "du": 130, + "dv": 131, + "dw": 132, + "dx": 133, + "dy": 134, + "dz": 135, + "ea": 136, + "eb": 137, + "ec": 138, + "ed": 139, + "ee": 140, + "ef": 141, + "eg": 142, + "eh": 143, + "ei": 144, + "ej": 145, + "ek": 146, + "el": 147, + "em": 148, + "en": 149, + "eo": 150, + "ep": 151, + "eq": 152, + "er": 153, + "es": 154, + "et": 155, + "eu": 156, + "ev": 157, + "ew": 158, + "ex": 159, + "ey": 160, + "ez": 161, + "fa": 162, + "fb": 163, + "fc": 164, + "fd": 165, + "fe": 166, + "ff": 167, + "fg": 168, + "fh": 169, + "fi": 170, + "fj": 171, + "fk": 172, + "fl": 173, + "fm": 174, + "fn": 175, + "fo": 176, + "fp": 177, + "fq": 178, + "fr": 179, + "fs": 180, + "ft": 181, + "fu": 182, + "fv": 183, + "fw": 184, + "fx": 185, + "fy": 186, + "fz": 187, + "ga": 188, + "gb": 189, + "gc": 190, + "gd": 191, + "ge": 192, + "gf": 193, + "gg": 194, + "gh": 195, + "gi": 196, + "gj": 197, + "gk": 198, + "gl": 199, + "gm": 200, + "gn": 201, + "go": 202, + "gp": 203, + "gq": 204, + "gr": 205, + "gs": 206, + "gt": 207, + "gu": 208, + "gv": 209, + "gw": 210, + "gx": 211, + "gy": 212, + "gz": 213, + "ha": 214, + "hb": 215, + "hc": 216, + "hd": 217, + "he": 218, + "hf": 219, + "hg": 220, + "hh": 221, + "hi": 222, + "hj": 223, + "hk": 224, + "hl": 225, + "hm": 226, + "hn": 227, + "ho": 228, + "hp": 229, + "hq": 230, + "hr": 231, + "hs": 232, + "ht": 233, + "hu": 234, + "hv": 235, + "hw": 236, + "hx": 237, + "hy": 238, + "hz": 239, + "ia": 240, + "ib": 241, + "ic": 242, + "id": 
243, + "ie": 244, + "if": 245, + "ig": 246, + "ih": 247, + "ii": 248, + "ij": 249, + "ik": 250, + "il": 251, + "im": 252, + "in": 253, + "io": 254, + "ip": 255, + "iq": 256, + "ir": 257, + "is": 258, + "it": 259, + "iu": 260, + "iv": 261 + }, + "merges": [] + } +} \ No newline at end of file diff --git a/src/lobster/assets/latent_generator_tokenizer/tokenizer_config.json b/src/lobster/assets/latent_generator_tokenizer/tokenizer_config.json new file mode 100644 index 0000000..c331c4e --- /dev/null +++ b/src/lobster/assets/latent_generator_tokenizer/tokenizer_config.json @@ -0,0 +1,12 @@ +{ + "clean_up_tokenization_spaces": true, + "cls_token": "", + "eos_token": "", + "mask_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "padding_side": "right", + "tokenizer_class": "PreTrainedTokenizerFast", + "truncation_side": "left", + "unk_token": "" +} diff --git a/src/lobster/tokenization/__init__.py b/src/lobster/tokenization/__init__.py index 938bdca..64d9975 100644 --- a/src/lobster/tokenization/__init__.py +++ b/src/lobster/tokenization/__init__.py @@ -4,6 +4,7 @@ from ._mgm_tokenizer import MgmTokenizer from ._mgm_tokenizer_transform import MgmTokenizerTransform from ._nucleotide_tokenizer import NucleotideTokenizerFast +from ._latent_generator_tokens import LatentGeneratorTokenizerFast from ._pmlm_custom_concept_tokenizer_transform import ( CUSTOM_TOKENIZER, UnirefDescriptorTransform, From a9968643eeb91e284dca0ed255a8b73a0faa5214 Mon Sep 17 00:00:00 2001 From: Sidney Lisanza Date: Fri, 7 Mar 2025 16:05:28 -0500 Subject: [PATCH 03/10] lint --- src/lobster/tokenization/__init__.py | 2 +- .../tokenization/_latent_generator_tokens.py | 54 +++++++++---------- 2 files changed, 28 insertions(+), 28 deletions(-) diff --git a/src/lobster/tokenization/__init__.py b/src/lobster/tokenization/__init__.py index 64d9975..cda0d42 100644 --- a/src/lobster/tokenization/__init__.py +++ b/src/lobster/tokenization/__init__.py @@ -1,10 +1,10 @@ from ._amino_acid import AminoAcidTokenizerFast from ._hyena_tokenizer import HyenaTokenizer from ._hyena_tokenizer_transform import HyenaTokenizerTransform +from ._latent_generator_tokens import LatentGeneratorTokenizerFast from ._mgm_tokenizer import MgmTokenizer from ._mgm_tokenizer_transform import MgmTokenizerTransform from ._nucleotide_tokenizer import NucleotideTokenizerFast -from ._latent_generator_tokens import LatentGeneratorTokenizerFast from ._pmlm_custom_concept_tokenizer_transform import ( CUSTOM_TOKENIZER, UnirefDescriptorTransform, diff --git a/src/lobster/tokenization/_latent_generator_tokens.py b/src/lobster/tokenization/_latent_generator_tokens.py index a769b77..77eab20 100644 --- a/src/lobster/tokenization/_latent_generator_tokens.py +++ b/src/lobster/tokenization/_latent_generator_tokens.py @@ -6,33 +6,33 @@ from ._make_pretrained_tokenizer_fast import make_pretrained_tokenizer_fast -LG_VOCAB = {'': 0, '': 1, '': 2, '': 3, '': 4, '.': 5, 'a': 6, 'b': 7, 'c': 8, - 'd': 9, 'e': 10, 'f': 11, 'g': 12, 'h': 13, 'i': 14, 'j': 15, 'k': 16, 'l': 17, 'm': 18, 'n': 19, - 'o': 20, 'p': 21, 'q': 22, 'r': 23, 's': 24, 't': 25, 'u': 26, 'v': 27, 'w': 28, 'x': 29, 'y': 30, - 'z': 31, 'aa': 32, 'ab': 33, 'ac': 34, 'ad': 35, 'ae': 36, 'af': 37, 'ag': 38, 'ah': 39, 'ai': 40, - 'aj': 41, 'ak': 42, 'al': 43, 'am': 44, 'an': 45, 'ao': 46, 'ap': 47, 'aq': 48, 'ar': 49, 'as': 50, - 'at': 51, 'au': 52, 'av': 53, 'aw': 54, 'ax': 55, 'ay': 56, 'az': 57, 'ba': 58, 'bb': 59, 'bc': 60, - 'bd': 61, 'be': 62, 'bf': 63, 'bg': 64, 'bh': 65, 'bi': 
66, 'bj': 67, 'bk': 68, 'bl': 69, 'bm': 70, - 'bn': 71, 'bo': 72, 'bp': 73, 'bq': 74, 'br': 75, 'bs': 76, 'bt': 77, 'bu': 78, 'bv': 79, 'bw': 80, - 'bx': 81, 'by': 82, 'bz': 83, 'ca': 84, 'cb': 85, 'cc': 86, 'cd': 87, 'ce': 88, 'cf': 89, 'cg': 90, - 'ch': 91, 'ci': 92, 'cj': 93, 'ck': 94, 'cl': 95, 'cm': 96, 'cn': 97, 'co': 98, 'cp': 99, 'cq': 100, - 'cr': 101, 'cs': 102, 'ct': 103, 'cu': 104, 'cv': 105, 'cw': 106, 'cx': 107, 'cy': 108, 'cz': 109, - 'da': 110, 'db': 111, 'dc': 112, 'dd': 113, 'de': 114, 'df': 115, 'dg': 116, 'dh': 117, 'di': 118, - 'dj': 119, 'dk': 120, 'dl': 121, 'dm': 122, 'dn': 123, 'do': 124, 'dp': 125, 'dq': 126, 'dr': 127, - 'ds': 128, 'dt': 129, 'du': 130, 'dv': 131, 'dw': 132, 'dx': 133, 'dy': 134, 'dz': 135, 'ea': 136, - 'eb': 137, 'ec': 138, 'ed': 139, 'ee': 140, 'ef': 141, 'eg': 142, 'eh': 143, 'ei': 144, 'ej': 145, - 'ek': 146, 'el': 147, 'em': 148, 'en': 149, 'eo': 150, 'ep': 151, 'eq': 152, 'er': 153, 'es': 154, - 'et': 155, 'eu': 156, 'ev': 157, 'ew': 158, 'ex': 159, 'ey': 160, 'ez': 161, 'fa': 162, 'fb': 163, - 'fc': 164, 'fd': 165, 'fe': 166, 'ff': 167, 'fg': 168, 'fh': 169, 'fi': 170, 'fj': 171, 'fk': 172, - 'fl': 173, 'fm': 174, 'fn': 175, 'fo': 176, 'fp': 177, 'fq': 178, 'fr': 179, 'fs': 180, 'ft': 181, - 'fu': 182, 'fv': 183, 'fw': 184, 'fx': 185, 'fy': 186, 'fz': 187, 'ga': 188, 'gb': 189, 'gc': 190, - 'gd': 191, 'ge': 192, 'gf': 193, 'gg': 194, 'gh': 195, 'gi': 196, 'gj': 197, 'gk': 198, 'gl': 199, - 'gm': 200, 'gn': 201, 'go': 202, 'gp': 203, 'gq': 204, 'gr': 205, 'gs': 206, 'gt': 207, 'gu': 208, - 'gv': 209, 'gw': 210, 'gx': 211, 'gy': 212, 'gz': 213, 'ha': 214, 'hb': 215, 'hc': 216, 'hd': 217, - 'he': 218, 'hf': 219, 'hg': 220, 'hh': 221, 'hi': 222, 'hj': 223, 'hk': 224, 'hl': 225, 'hm': 226, - 'hn': 227, 'ho': 228, 'hp': 229, 'hq': 230, 'hr': 231, 'hs': 232, 'ht': 233, 'hu': 234, 'hv': 235, - 'hw': 236, 'hx': 237, 'hy': 238, 'hz': 239, 'ia': 240, 'ib': 241, 'ic': 242, 'id': 243, 'ie': 244, - 'if': 245, 'ig': 246, 'ih': 247, 'ii': 248, 'ij': 249, 'ik': 250, 'il': 251, 'im': 252, 'in': 253, +LG_VOCAB = {'': 0, '': 1, '': 2, '': 3, '': 4, '.': 5, 'a': 6, 'b': 7, 'c': 8, + 'd': 9, 'e': 10, 'f': 11, 'g': 12, 'h': 13, 'i': 14, 'j': 15, 'k': 16, 'l': 17, 'm': 18, 'n': 19, + 'o': 20, 'p': 21, 'q': 22, 'r': 23, 's': 24, 't': 25, 'u': 26, 'v': 27, 'w': 28, 'x': 29, 'y': 30, + 'z': 31, 'aa': 32, 'ab': 33, 'ac': 34, 'ad': 35, 'ae': 36, 'af': 37, 'ag': 38, 'ah': 39, 'ai': 40, + 'aj': 41, 'ak': 42, 'al': 43, 'am': 44, 'an': 45, 'ao': 46, 'ap': 47, 'aq': 48, 'ar': 49, 'as': 50, + 'at': 51, 'au': 52, 'av': 53, 'aw': 54, 'ax': 55, 'ay': 56, 'az': 57, 'ba': 58, 'bb': 59, 'bc': 60, + 'bd': 61, 'be': 62, 'bf': 63, 'bg': 64, 'bh': 65, 'bi': 66, 'bj': 67, 'bk': 68, 'bl': 69, 'bm': 70, + 'bn': 71, 'bo': 72, 'bp': 73, 'bq': 74, 'br': 75, 'bs': 76, 'bt': 77, 'bu': 78, 'bv': 79, 'bw': 80, + 'bx': 81, 'by': 82, 'bz': 83, 'ca': 84, 'cb': 85, 'cc': 86, 'cd': 87, 'ce': 88, 'cf': 89, 'cg': 90, + 'ch': 91, 'ci': 92, 'cj': 93, 'ck': 94, 'cl': 95, 'cm': 96, 'cn': 97, 'co': 98, 'cp': 99, 'cq': 100, + 'cr': 101, 'cs': 102, 'ct': 103, 'cu': 104, 'cv': 105, 'cw': 106, 'cx': 107, 'cy': 108, 'cz': 109, + 'da': 110, 'db': 111, 'dc': 112, 'dd': 113, 'de': 114, 'df': 115, 'dg': 116, 'dh': 117, 'di': 118, + 'dj': 119, 'dk': 120, 'dl': 121, 'dm': 122, 'dn': 123, 'do': 124, 'dp': 125, 'dq': 126, 'dr': 127, + 'ds': 128, 'dt': 129, 'du': 130, 'dv': 131, 'dw': 132, 'dx': 133, 'dy': 134, 'dz': 135, 'ea': 136, + 'eb': 137, 'ec': 138, 'ed': 139, 'ee': 140, 'ef': 141, 'eg': 142, 'eh': 143, 'ei': 
144, 'ej': 145, + 'ek': 146, 'el': 147, 'em': 148, 'en': 149, 'eo': 150, 'ep': 151, 'eq': 152, 'er': 153, 'es': 154, + 'et': 155, 'eu': 156, 'ev': 157, 'ew': 158, 'ex': 159, 'ey': 160, 'ez': 161, 'fa': 162, 'fb': 163, + 'fc': 164, 'fd': 165, 'fe': 166, 'ff': 167, 'fg': 168, 'fh': 169, 'fi': 170, 'fj': 171, 'fk': 172, + 'fl': 173, 'fm': 174, 'fn': 175, 'fo': 176, 'fp': 177, 'fq': 178, 'fr': 179, 'fs': 180, 'ft': 181, + 'fu': 182, 'fv': 183, 'fw': 184, 'fx': 185, 'fy': 186, 'fz': 187, 'ga': 188, 'gb': 189, 'gc': 190, + 'gd': 191, 'ge': 192, 'gf': 193, 'gg': 194, 'gh': 195, 'gi': 196, 'gj': 197, 'gk': 198, 'gl': 199, + 'gm': 200, 'gn': 201, 'go': 202, 'gp': 203, 'gq': 204, 'gr': 205, 'gs': 206, 'gt': 207, 'gu': 208, + 'gv': 209, 'gw': 210, 'gx': 211, 'gy': 212, 'gz': 213, 'ha': 214, 'hb': 215, 'hc': 216, 'hd': 217, + 'he': 218, 'hf': 219, 'hg': 220, 'hh': 221, 'hi': 222, 'hj': 223, 'hk': 224, 'hl': 225, 'hm': 226, + 'hn': 227, 'ho': 228, 'hp': 229, 'hq': 230, 'hr': 231, 'hs': 232, 'ht': 233, 'hu': 234, 'hv': 235, + 'hw': 236, 'hx': 237, 'hy': 238, 'hz': 239, 'ia': 240, 'ib': 241, 'ic': 242, 'id': 243, 'ie': 244, + 'if': 245, 'ig': 246, 'ih': 247, 'ii': 248, 'ij': 249, 'ik': 250, 'il': 251, 'im': 252, 'in': 253, 'io': 254, 'ip': 255, 'iq': 256, 'ir': 257, 'is': 258, 'it': 259, 'iu': 260, 'iv': 261} PRETRAINED_TOKENIZER_PATH = importlib.resources.files("lobster") / "assets" / "latent_generator_tokenizer" From 542b3ac9df181770263a327231197f3dd758224a Mon Sep 17 00:00:00 2001 From: Sidney Lisanza Date: Fri, 7 Mar 2025 16:08:58 -0500 Subject: [PATCH 04/10] vocab.txt --- .../latent_generator_tokenizer/vocab.txt | 262 ++++++++++++++++++ 1 file changed, 262 insertions(+) create mode 100644 src/lobster/assets/latent_generator_tokenizer/vocab.txt diff --git a/src/lobster/assets/latent_generator_tokenizer/vocab.txt b/src/lobster/assets/latent_generator_tokenizer/vocab.txt new file mode 100644 index 0000000..d8fdf49 --- /dev/null +++ b/src/lobster/assets/latent_generator_tokenizer/vocab.txt @@ -0,0 +1,262 @@ + + + + + +. 
+a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +aa +ab +ac +ad +ae +af +ag +ah +ai +aj +ak +al +am +an +ao +ap +aq +ar +as +at +au +av +aw +ax +ay +az +ba +bb +bc +bd +be +bf +bg +bh +bi +bj +bk +bl +bm +bn +bo +bp +bq +br +bs +bt +bu +bv +bw +bx +by +bz +ca +cb +cc +cd +ce +cf +cg +ch +ci +cj +ck +cl +cm +cn +co +cp +cq +cr +cs +ct +cu +cv +cw +cx +cy +cz +da +db +dc +dd +de +df +dg +dh +di +dj +dk +dl +dm +dn +do +dp +dq +dr +ds +dt +du +dv +dw +dx +dy +dz +ea +eb +ec +ed +ee +ef +eg +eh +ei +ej +ek +el +em +en +eo +ep +eq +er +es +et +eu +ev +ew +ex +ey +ez +fa +fb +fc +fd +fe +ff +fg +fh +fi +fj +fk +fl +fm +fn +fo +fp +fq +fr +fs +ft +fu +fv +fw +fx +fy +fz +ga +gb +gc +gd +ge +gf +gg +gh +gi +gj +gk +gl +gm +gn +go +gp +gq +gr +gs +gt +gu +gv +gw +gx +gy +gz +ha +hb +hc +hd +he +hf +hg +hh +hi +hj +hk +hl +hm +hn +ho +hp +hq +hr +hs +ht +hu +hv +hw +hx +hy +hz +ia +ib +ic +id +ie +if +ig +ih +ii +ij +ik +il +im +in +io +ip +iq +ir +is +it +iu +iv \ No newline at end of file From ae9fc1dc03463e19558563e59cd0b28f5766d00f Mon Sep 17 00:00:00 2001 From: Sidney Lisanza Date: Sat, 8 Mar 2025 09:40:49 -0500 Subject: [PATCH 05/10] added test and new wor level model v bpe --- .../latent_generator_tokenizer/tokenizer.json | 19 ++--- src/lobster/tokenization/__init__.py | 2 +- ...kens.py => _latent_generator_tokenizer.py} | 18 ++++- .../test__latent_generator_tokenizer.py | 78 +++++++++++++++++++ 4 files changed, 103 insertions(+), 14 deletions(-) rename src/lobster/tokenization/{_latent_generator_tokens.py => _latent_generator_tokenizer.py} (89%) create mode 100644 tests/lobster/tokenization/test__latent_generator_tokenizer.py diff --git a/src/lobster/assets/latent_generator_tokenizer/tokenizer.json b/src/lobster/assets/latent_generator_tokenizer/tokenizer.json index 5d93118..5bc71de 100644 --- a/src/lobster/assets/latent_generator_tokenizer/tokenizer.json +++ b/src/lobster/assets/latent_generator_tokenizer/tokenizer.json @@ -4,7 +4,14 @@ "padding": null, "added_tokens": [], "normalizer": null, - "pre_tokenizer": null, + "pre_tokenizer": { + "type": "Sequence", + "pretokenizers": [ + { + "type": "WhitespaceSplit" + } + ] + }, "post_processor": { "type": "TemplateProcessing", "single": [ @@ -82,13 +89,7 @@ }, "decoder": null, "model": { - "type": "BPE", - "dropout": null, - "unk_token": "", - "continuing_subword_prefix": null, - "end_of_word_suffix": null, - "fuse_unk": false, - "byte_fallback": false, + "type": "WordLevel", "vocab": { "": 0, "": 1, @@ -353,6 +354,6 @@ "iu": 260, "iv": 261 }, - "merges": [] + "unk_token": "" } } \ No newline at end of file diff --git a/src/lobster/tokenization/__init__.py b/src/lobster/tokenization/__init__.py index cda0d42..cdf0dad 100644 --- a/src/lobster/tokenization/__init__.py +++ b/src/lobster/tokenization/__init__.py @@ -1,7 +1,7 @@ from ._amino_acid import AminoAcidTokenizerFast from ._hyena_tokenizer import HyenaTokenizer from ._hyena_tokenizer_transform import HyenaTokenizerTransform -from ._latent_generator_tokens import LatentGeneratorTokenizerFast +from ._latent_generator_tokenizer import LatentGeneratorTokenizerFast from ._mgm_tokenizer import MgmTokenizer from ._mgm_tokenizer_transform import MgmTokenizerTransform from ._nucleotide_tokenizer import NucleotideTokenizerFast diff --git a/src/lobster/tokenization/_latent_generator_tokens.py b/src/lobster/tokenization/_latent_generator_tokenizer.py similarity index 89% rename from src/lobster/tokenization/_latent_generator_tokens.py rename to 
src/lobster/tokenization/_latent_generator_tokenizer.py index 77eab20..3599312 100644 --- a/src/lobster/tokenization/_latent_generator_tokens.py +++ b/src/lobster/tokenization/_latent_generator_tokenizer.py @@ -1,6 +1,8 @@ import importlib.resources -from tokenizers.models import BPE +from tokenizers import pre_tokenizers +from tokenizers.models import WordLevel +from tokenizers.pre_tokenizers import WhitespaceSplit from tokenizers.processors import TemplateProcessing from transformers import PreTrainedTokenizerFast @@ -39,7 +41,7 @@ def _make_latent_generator_tokenizer() -> PreTrainedTokenizerFast: - """Create a `PreTrainedTokenizerFast` object for tokenization of protein structure latent generator sequences. + """Create a `PreTrainedTokenizerFast` object for tokenization of protein structure 3d coordinate to tokens. To create the tokenizer config stored under lobster/assets/latent_generator_tokenizer we run @@ -52,8 +54,12 @@ def _make_latent_generator_tokenizer() -> PreTrainedTokenizerFast: `PreTrainedTokenizerFast.from_pretrained("src/lobster/assets/latent_generator_tokenizer")` """ - # BPE with no merges => just use input vocab - tokenizer_model = BPE(LG_VOCAB, merges=[], unk_token="", ignore_merges=True) + # WordLevel tokenizer + tokenizer_model = WordLevel(LG_VOCAB, unk_token="") + + #pretokenizers + pre_tokenizer = pre_tokenizers.Sequence([WhitespaceSplit()]) + # bert style post processing post_processor = TemplateProcessing( @@ -65,6 +71,7 @@ def _make_latent_generator_tokenizer() -> PreTrainedTokenizerFast: return make_pretrained_tokenizer_fast( tokenizer_model=tokenizer_model, post_processor=post_processor, + pre_tokenizer = pre_tokenizer, eos_token="", unk_token="", pad_token="", @@ -89,3 +96,6 @@ def __init__(self): cls_token="", mask_token="", ) +if __name__ == "__main__": + tokenizer = _make_latent_generator_tokenizer() + tokenizer.save_pretrained("/Users/lisanzas/Research/Develop/lobster/src/lobster/assets/latent_generator_tokenizer") diff --git a/tests/lobster/tokenization/test__latent_generator_tokenizer.py b/tests/lobster/tokenization/test__latent_generator_tokenizer.py new file mode 100644 index 0000000..78d8c78 --- /dev/null +++ b/tests/lobster/tokenization/test__latent_generator_tokenizer.py @@ -0,0 +1,78 @@ +from transformers import PreTrainedTokenizerFast + +from lobster.tokenization._latent_generator_tokenizer import LatentGeneratorTokenizerFast, _make_latent_generator_tokenizer + + +def test__make_latent_generator_tokenizer(): + tokenizer = _make_latent_generator_tokenizer() + + assert isinstance(tokenizer, PreTrainedTokenizerFast) + + assert tokenizer.cls_token == "" + assert tokenizer.eos_token == "" + assert tokenizer.unk_token == "" + assert tokenizer.pad_token == "" + assert tokenizer.mask_token == "" + + assert tokenizer.vocab_size == 262 + + assert tokenizer.special_tokens_map == { + "eos_token": "", + "unk_token": "", + "pad_token": "", + "cls_token": "", + "mask_token": "", + } + + tokenized_output = tokenizer("gd fh ds fh ad gf fe cz ek ds cq") + + assert tokenized_output.input_ids == [0, 191, 169, 128, 169, 35, 193, 166, 109, 146, 128, 100, 2] + assert tokenizer.decode(tokenized_output.input_ids) == " gd fh ds fh ad gf fe cz ek ds cq " + + tokenized_output = tokenizer("GD FH DS FH AD GF FE CZ EK DS CQ") + + assert tokenized_output.input_ids == [0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2] + assert tokenizer.decode(tokenized_output.input_ids) == " " + + tokenized_output = tokenizer("R A gd fh ds") + assert tokenized_output.input_ids == [0, 3, 3, 191, 169, 128, 
2] + assert tokenizer.decode(tokenized_output.input_ids) == " gd fh ds " + + +class TestLatentGeneratorTokenizerFast: + def test__init__(self): + tokenizer = LatentGeneratorTokenizerFast() + + assert isinstance(tokenizer, PreTrainedTokenizerFast) + + assert tokenizer.vocab_size == 262 + + assert tokenizer.cls_token == "" + assert tokenizer.eos_token == "" + assert tokenizer.unk_token == "" + assert tokenizer.pad_token == "" + assert tokenizer.mask_token == "" + + tokenized_output = tokenizer("gd fh ds fh ad gf fe cz ek ds cq") + assert tokenized_output.input_ids == [0, 191, 169, 128, 169, 35, 193, 166, 109, 146, 128, 100, 2] + assert tokenizer.decode(tokenized_output.input_ids) == " gd fh ds fh ad gf fe cz ek ds cq " + + tokenized_output = tokenizer("GD FH DS FH AD GF FE CZ EK DS CQ") + assert tokenized_output.input_ids == [0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2] + assert tokenizer.decode(tokenized_output.input_ids) == " " + + tokenized_output = tokenizer("R A gd fh ds") + assert tokenized_output.input_ids == [0, 3, 3, 191, 169, 128, 2] + assert tokenizer.decode(tokenized_output.input_ids) == " gd fh ds " + + assert tokenizer.special_tokens_map == { + "eos_token": "", + "unk_token": "", + "pad_token": "", + "cls_token": "", + "mask_token": "", + } + +if __name__ == "__main__": + test__make_latent_generator_tokenizer() + TestLatentGeneratorTokenizerFast().test__init__() \ No newline at end of file From f0bc0ba1b2dbf0c1cc3339e0b46974f2fc8ae7f9 Mon Sep 17 00:00:00 2001 From: Sidney Lisanza Date: Mon, 10 Mar 2025 08:27:56 -0400 Subject: [PATCH 06/10] rename to include coord tokenization explicity --- src/lobster/tokenization/__init__.py | 2 +- ....py => _latent_generator_3d_coord_tokenizer.py} | 10 +++++----- .../test__latent_generator_tokenizer.py | 14 +++++++------- 3 files changed, 13 insertions(+), 13 deletions(-) rename src/lobster/tokenization/{_latent_generator_tokenizer.py => _latent_generator_3d_coord_tokenizer.py} (94%) diff --git a/src/lobster/tokenization/__init__.py b/src/lobster/tokenization/__init__.py index cdf0dad..ba9e3b1 100644 --- a/src/lobster/tokenization/__init__.py +++ b/src/lobster/tokenization/__init__.py @@ -1,7 +1,7 @@ from ._amino_acid import AminoAcidTokenizerFast from ._hyena_tokenizer import HyenaTokenizer from ._hyena_tokenizer_transform import HyenaTokenizerTransform -from ._latent_generator_tokenizer import LatentGeneratorTokenizerFast +from ._latent_generator_3d_coord_tokenizer import LatentGenerator3DCoordTokenizerFast from ._mgm_tokenizer import MgmTokenizer from ._mgm_tokenizer_transform import MgmTokenizerTransform from ._nucleotide_tokenizer import NucleotideTokenizerFast diff --git a/src/lobster/tokenization/_latent_generator_tokenizer.py b/src/lobster/tokenization/_latent_generator_3d_coord_tokenizer.py similarity index 94% rename from src/lobster/tokenization/_latent_generator_tokenizer.py rename to src/lobster/tokenization/_latent_generator_3d_coord_tokenizer.py index 3599312..e7973c9 100644 --- a/src/lobster/tokenization/_latent_generator_tokenizer.py +++ b/src/lobster/tokenization/_latent_generator_3d_coord_tokenizer.py @@ -40,13 +40,13 @@ PRETRAINED_TOKENIZER_PATH = importlib.resources.files("lobster") / "assets" / "latent_generator_tokenizer" -def _make_latent_generator_tokenizer() -> PreTrainedTokenizerFast: - """Create a `PreTrainedTokenizerFast` object for tokenization of protein structure 3d coordinate to tokens. 
+def _make_latent_generator_3d_coord_tokenizer() -> PreTrainedTokenizerFast: + """Create a `PreTrainedTokenizerFast` object for tokenization of protein structure 3d coordinate to tokens via Latent Generator. To create the tokenizer config stored under lobster/assets/latent_generator_tokenizer we run ``` - tokenizer = _make_latent_generator_tokenizer() + tokenizer = _make_latent_generator_3d_coord_tokenizer() tokenizer.save_pretrained("src/lobster/assets/latent_generator_tokenizer") ``` @@ -80,7 +80,7 @@ def _make_latent_generator_tokenizer() -> PreTrainedTokenizerFast: ) -class LatentGeneratorTokenizerFast(PreTrainedTokenizerFast): +class LatentGenerator3DCoordTokenizerFast(PreTrainedTokenizerFast): padding_side = "right" truncation_side = "right" model_input_names = ["input_ids", "attention_mask"] @@ -97,5 +97,5 @@ def __init__(self): mask_token="", ) if __name__ == "__main__": - tokenizer = _make_latent_generator_tokenizer() + tokenizer = _make_latent_generator_3d_coord_tokenizer() tokenizer.save_pretrained("/Users/lisanzas/Research/Develop/lobster/src/lobster/assets/latent_generator_tokenizer") diff --git a/tests/lobster/tokenization/test__latent_generator_tokenizer.py b/tests/lobster/tokenization/test__latent_generator_tokenizer.py index 78d8c78..4deacad 100644 --- a/tests/lobster/tokenization/test__latent_generator_tokenizer.py +++ b/tests/lobster/tokenization/test__latent_generator_tokenizer.py @@ -1,10 +1,10 @@ from transformers import PreTrainedTokenizerFast -from lobster.tokenization._latent_generator_tokenizer import LatentGeneratorTokenizerFast, _make_latent_generator_tokenizer +from lobster.tokenization._latent_generator_3d_coord_tokenizer import LatentGenerator3DCoordTokenizerFast, _make_latent_generator_3d_coord_tokenizer -def test__make_latent_generator_tokenizer(): - tokenizer = _make_latent_generator_tokenizer() +def test__make_latent_generator_3d_coord_tokenizer(): + tokenizer = _make_latent_generator_3d_coord_tokenizer() assert isinstance(tokenizer, PreTrainedTokenizerFast) @@ -39,9 +39,9 @@ def test__make_latent_generator_tokenizer(): assert tokenizer.decode(tokenized_output.input_ids) == " gd fh ds " -class TestLatentGeneratorTokenizerFast: +class TestLatentGenerator3DCoordTokenizerFast: def test__init__(self): - tokenizer = LatentGeneratorTokenizerFast() + tokenizer = LatentGenerator3DCoordTokenizerFast() assert isinstance(tokenizer, PreTrainedTokenizerFast) @@ -74,5 +74,5 @@ def test__init__(self): } if __name__ == "__main__": - test__make_latent_generator_tokenizer() - TestLatentGeneratorTokenizerFast().test__init__() \ No newline at end of file + test__make_latent_generator_3d_coord_tokenizer() + TestLatentGenerator3DCoordTokenizerFast().test__init__() \ No newline at end of file From 42a27124897e52e201e865d719d95daf015eae43 Mon Sep 17 00:00:00 2001 From: karinazad Date: Mon, 10 Mar 2025 09:52:54 -0400 Subject: [PATCH 07/10] ruff --- .../_latent_generator_3d_coord_tokenizer.py | 299 ++++++++++++++++-- 1 file changed, 268 insertions(+), 31 deletions(-) diff --git a/src/lobster/tokenization/_latent_generator_3d_coord_tokenizer.py b/src/lobster/tokenization/_latent_generator_3d_coord_tokenizer.py index e7973c9..833c1d8 100644 --- a/src/lobster/tokenization/_latent_generator_3d_coord_tokenizer.py +++ b/src/lobster/tokenization/_latent_generator_3d_coord_tokenizer.py @@ -8,34 +8,270 @@ from ._make_pretrained_tokenizer_fast import make_pretrained_tokenizer_fast -LG_VOCAB = {'': 0, '': 1, '': 2, '': 3, '': 4, '.': 5, 'a': 6, 'b': 7, 'c': 8, - 'd': 9, 'e': 10, 'f': 11, 
'g': 12, 'h': 13, 'i': 14, 'j': 15, 'k': 16, 'l': 17, 'm': 18, 'n': 19, - 'o': 20, 'p': 21, 'q': 22, 'r': 23, 's': 24, 't': 25, 'u': 26, 'v': 27, 'w': 28, 'x': 29, 'y': 30, - 'z': 31, 'aa': 32, 'ab': 33, 'ac': 34, 'ad': 35, 'ae': 36, 'af': 37, 'ag': 38, 'ah': 39, 'ai': 40, - 'aj': 41, 'ak': 42, 'al': 43, 'am': 44, 'an': 45, 'ao': 46, 'ap': 47, 'aq': 48, 'ar': 49, 'as': 50, - 'at': 51, 'au': 52, 'av': 53, 'aw': 54, 'ax': 55, 'ay': 56, 'az': 57, 'ba': 58, 'bb': 59, 'bc': 60, - 'bd': 61, 'be': 62, 'bf': 63, 'bg': 64, 'bh': 65, 'bi': 66, 'bj': 67, 'bk': 68, 'bl': 69, 'bm': 70, - 'bn': 71, 'bo': 72, 'bp': 73, 'bq': 74, 'br': 75, 'bs': 76, 'bt': 77, 'bu': 78, 'bv': 79, 'bw': 80, - 'bx': 81, 'by': 82, 'bz': 83, 'ca': 84, 'cb': 85, 'cc': 86, 'cd': 87, 'ce': 88, 'cf': 89, 'cg': 90, - 'ch': 91, 'ci': 92, 'cj': 93, 'ck': 94, 'cl': 95, 'cm': 96, 'cn': 97, 'co': 98, 'cp': 99, 'cq': 100, - 'cr': 101, 'cs': 102, 'ct': 103, 'cu': 104, 'cv': 105, 'cw': 106, 'cx': 107, 'cy': 108, 'cz': 109, - 'da': 110, 'db': 111, 'dc': 112, 'dd': 113, 'de': 114, 'df': 115, 'dg': 116, 'dh': 117, 'di': 118, - 'dj': 119, 'dk': 120, 'dl': 121, 'dm': 122, 'dn': 123, 'do': 124, 'dp': 125, 'dq': 126, 'dr': 127, - 'ds': 128, 'dt': 129, 'du': 130, 'dv': 131, 'dw': 132, 'dx': 133, 'dy': 134, 'dz': 135, 'ea': 136, - 'eb': 137, 'ec': 138, 'ed': 139, 'ee': 140, 'ef': 141, 'eg': 142, 'eh': 143, 'ei': 144, 'ej': 145, - 'ek': 146, 'el': 147, 'em': 148, 'en': 149, 'eo': 150, 'ep': 151, 'eq': 152, 'er': 153, 'es': 154, - 'et': 155, 'eu': 156, 'ev': 157, 'ew': 158, 'ex': 159, 'ey': 160, 'ez': 161, 'fa': 162, 'fb': 163, - 'fc': 164, 'fd': 165, 'fe': 166, 'ff': 167, 'fg': 168, 'fh': 169, 'fi': 170, 'fj': 171, 'fk': 172, - 'fl': 173, 'fm': 174, 'fn': 175, 'fo': 176, 'fp': 177, 'fq': 178, 'fr': 179, 'fs': 180, 'ft': 181, - 'fu': 182, 'fv': 183, 'fw': 184, 'fx': 185, 'fy': 186, 'fz': 187, 'ga': 188, 'gb': 189, 'gc': 190, - 'gd': 191, 'ge': 192, 'gf': 193, 'gg': 194, 'gh': 195, 'gi': 196, 'gj': 197, 'gk': 198, 'gl': 199, - 'gm': 200, 'gn': 201, 'go': 202, 'gp': 203, 'gq': 204, 'gr': 205, 'gs': 206, 'gt': 207, 'gu': 208, - 'gv': 209, 'gw': 210, 'gx': 211, 'gy': 212, 'gz': 213, 'ha': 214, 'hb': 215, 'hc': 216, 'hd': 217, - 'he': 218, 'hf': 219, 'hg': 220, 'hh': 221, 'hi': 222, 'hj': 223, 'hk': 224, 'hl': 225, 'hm': 226, - 'hn': 227, 'ho': 228, 'hp': 229, 'hq': 230, 'hr': 231, 'hs': 232, 'ht': 233, 'hu': 234, 'hv': 235, - 'hw': 236, 'hx': 237, 'hy': 238, 'hz': 239, 'ia': 240, 'ib': 241, 'ic': 242, 'id': 243, 'ie': 244, - 'if': 245, 'ig': 246, 'ih': 247, 'ii': 248, 'ij': 249, 'ik': 250, 'il': 251, 'im': 252, 'in': 253, - 'io': 254, 'ip': 255, 'iq': 256, 'ir': 257, 'is': 258, 'it': 259, 'iu': 260, 'iv': 261} +LG_VOCAB = { + "": 0, + "": 1, + "": 2, + "": 3, + "": 4, + ".": 5, + "a": 6, + "b": 7, + "c": 8, + "d": 9, + "e": 10, + "f": 11, + "g": 12, + "h": 13, + "i": 14, + "j": 15, + "k": 16, + "l": 17, + "m": 18, + "n": 19, + "o": 20, + "p": 21, + "q": 22, + "r": 23, + "s": 24, + "t": 25, + "u": 26, + "v": 27, + "w": 28, + "x": 29, + "y": 30, + "z": 31, + "aa": 32, + "ab": 33, + "ac": 34, + "ad": 35, + "ae": 36, + "af": 37, + "ag": 38, + "ah": 39, + "ai": 40, + "aj": 41, + "ak": 42, + "al": 43, + "am": 44, + "an": 45, + "ao": 46, + "ap": 47, + "aq": 48, + "ar": 49, + "as": 50, + "at": 51, + "au": 52, + "av": 53, + "aw": 54, + "ax": 55, + "ay": 56, + "az": 57, + "ba": 58, + "bb": 59, + "bc": 60, + "bd": 61, + "be": 62, + "bf": 63, + "bg": 64, + "bh": 65, + "bi": 66, + "bj": 67, + "bk": 68, + "bl": 69, + "bm": 70, + "bn": 71, + "bo": 72, + "bp": 73, + 
"bq": 74, + "br": 75, + "bs": 76, + "bt": 77, + "bu": 78, + "bv": 79, + "bw": 80, + "bx": 81, + "by": 82, + "bz": 83, + "ca": 84, + "cb": 85, + "cc": 86, + "cd": 87, + "ce": 88, + "cf": 89, + "cg": 90, + "ch": 91, + "ci": 92, + "cj": 93, + "ck": 94, + "cl": 95, + "cm": 96, + "cn": 97, + "co": 98, + "cp": 99, + "cq": 100, + "cr": 101, + "cs": 102, + "ct": 103, + "cu": 104, + "cv": 105, + "cw": 106, + "cx": 107, + "cy": 108, + "cz": 109, + "da": 110, + "db": 111, + "dc": 112, + "dd": 113, + "de": 114, + "df": 115, + "dg": 116, + "dh": 117, + "di": 118, + "dj": 119, + "dk": 120, + "dl": 121, + "dm": 122, + "dn": 123, + "do": 124, + "dp": 125, + "dq": 126, + "dr": 127, + "ds": 128, + "dt": 129, + "du": 130, + "dv": 131, + "dw": 132, + "dx": 133, + "dy": 134, + "dz": 135, + "ea": 136, + "eb": 137, + "ec": 138, + "ed": 139, + "ee": 140, + "ef": 141, + "eg": 142, + "eh": 143, + "ei": 144, + "ej": 145, + "ek": 146, + "el": 147, + "em": 148, + "en": 149, + "eo": 150, + "ep": 151, + "eq": 152, + "er": 153, + "es": 154, + "et": 155, + "eu": 156, + "ev": 157, + "ew": 158, + "ex": 159, + "ey": 160, + "ez": 161, + "fa": 162, + "fb": 163, + "fc": 164, + "fd": 165, + "fe": 166, + "ff": 167, + "fg": 168, + "fh": 169, + "fi": 170, + "fj": 171, + "fk": 172, + "fl": 173, + "fm": 174, + "fn": 175, + "fo": 176, + "fp": 177, + "fq": 178, + "fr": 179, + "fs": 180, + "ft": 181, + "fu": 182, + "fv": 183, + "fw": 184, + "fx": 185, + "fy": 186, + "fz": 187, + "ga": 188, + "gb": 189, + "gc": 190, + "gd": 191, + "ge": 192, + "gf": 193, + "gg": 194, + "gh": 195, + "gi": 196, + "gj": 197, + "gk": 198, + "gl": 199, + "gm": 200, + "gn": 201, + "go": 202, + "gp": 203, + "gq": 204, + "gr": 205, + "gs": 206, + "gt": 207, + "gu": 208, + "gv": 209, + "gw": 210, + "gx": 211, + "gy": 212, + "gz": 213, + "ha": 214, + "hb": 215, + "hc": 216, + "hd": 217, + "he": 218, + "hf": 219, + "hg": 220, + "hh": 221, + "hi": 222, + "hj": 223, + "hk": 224, + "hl": 225, + "hm": 226, + "hn": 227, + "ho": 228, + "hp": 229, + "hq": 230, + "hr": 231, + "hs": 232, + "ht": 233, + "hu": 234, + "hv": 235, + "hw": 236, + "hx": 237, + "hy": 238, + "hz": 239, + "ia": 240, + "ib": 241, + "ic": 242, + "id": 243, + "ie": 244, + "if": 245, + "ig": 246, + "ih": 247, + "ii": 248, + "ij": 249, + "ik": 250, + "il": 251, + "im": 252, + "in": 253, + "io": 254, + "ip": 255, + "iq": 256, + "ir": 257, + "is": 258, + "it": 259, + "iu": 260, + "iv": 261, +} PRETRAINED_TOKENIZER_PATH = importlib.resources.files("lobster") / "assets" / "latent_generator_tokenizer" @@ -57,10 +293,9 @@ def _make_latent_generator_3d_coord_tokenizer() -> PreTrainedTokenizerFast: # WordLevel tokenizer tokenizer_model = WordLevel(LG_VOCAB, unk_token="") - #pretokenizers + # pretokenizers pre_tokenizer = pre_tokenizers.Sequence([WhitespaceSplit()]) - # bert style post processing post_processor = TemplateProcessing( single=" $A ", @@ -71,7 +306,7 @@ def _make_latent_generator_3d_coord_tokenizer() -> PreTrainedTokenizerFast: return make_pretrained_tokenizer_fast( tokenizer_model=tokenizer_model, post_processor=post_processor, - pre_tokenizer = pre_tokenizer, + pre_tokenizer=pre_tokenizer, eos_token="", unk_token="", pad_token="", @@ -96,6 +331,8 @@ def __init__(self): cls_token="", mask_token="", ) + + if __name__ == "__main__": tokenizer = _make_latent_generator_3d_coord_tokenizer() tokenizer.save_pretrained("/Users/lisanzas/Research/Develop/lobster/src/lobster/assets/latent_generator_tokenizer") From 5cc5f70f535cd7d8e2a7e32dac1408143ae6522f Mon Sep 17 00:00:00 2001 From: karinazad Date: Mon, 10 
Mar 2025 09:56:01 -0400 Subject: [PATCH 08/10] ruff tests --- .../test__latent_generator_tokenizer.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/tests/lobster/tokenization/test__latent_generator_tokenizer.py b/tests/lobster/tokenization/test__latent_generator_tokenizer.py index 4deacad..142f3f1 100644 --- a/tests/lobster/tokenization/test__latent_generator_tokenizer.py +++ b/tests/lobster/tokenization/test__latent_generator_tokenizer.py @@ -1,7 +1,9 @@ +from lobster.tokenization._latent_generator_3d_coord_tokenizer import ( + LatentGenerator3DCoordTokenizerFast, + _make_latent_generator_3d_coord_tokenizer, +) from transformers import PreTrainedTokenizerFast -from lobster.tokenization._latent_generator_3d_coord_tokenizer import LatentGenerator3DCoordTokenizerFast, _make_latent_generator_3d_coord_tokenizer - def test__make_latent_generator_3d_coord_tokenizer(): tokenizer = _make_latent_generator_3d_coord_tokenizer() @@ -32,7 +34,10 @@ def test__make_latent_generator_3d_coord_tokenizer(): tokenized_output = tokenizer("GD FH DS FH AD GF FE CZ EK DS CQ") assert tokenized_output.input_ids == [0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2] - assert tokenizer.decode(tokenized_output.input_ids) == " " + assert ( + tokenizer.decode(tokenized_output.input_ids) + == " " + ) tokenized_output = tokenizer("R A gd fh ds") assert tokenized_output.input_ids == [0, 3, 3, 191, 169, 128, 2] @@ -59,7 +64,10 @@ def test__init__(self): tokenized_output = tokenizer("GD FH DS FH AD GF FE CZ EK DS CQ") assert tokenized_output.input_ids == [0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2] - assert tokenizer.decode(tokenized_output.input_ids) == " " + assert ( + tokenizer.decode(tokenized_output.input_ids) + == " " + ) tokenized_output = tokenizer("R A gd fh ds") assert tokenized_output.input_ids == [0, 3, 3, 191, 169, 128, 2] @@ -73,6 +81,7 @@ def test__init__(self): "mask_token": "", } + if __name__ == "__main__": test__make_latent_generator_3d_coord_tokenizer() - TestLatentGenerator3DCoordTokenizerFast().test__init__() \ No newline at end of file + TestLatentGenerator3DCoordTokenizerFast().test__init__() From 2d20c8f13d6452ae800a3e7960e80762df95f4ee Mon Sep 17 00:00:00 2001 From: karinazad Date: Mon, 10 Mar 2025 09:57:18 -0400 Subject: [PATCH 09/10] remove __name__==main --- .../tokenization/_latent_generator_3d_coord_tokenizer.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/lobster/tokenization/_latent_generator_3d_coord_tokenizer.py b/src/lobster/tokenization/_latent_generator_3d_coord_tokenizer.py index 833c1d8..025eb56 100644 --- a/src/lobster/tokenization/_latent_generator_3d_coord_tokenizer.py +++ b/src/lobster/tokenization/_latent_generator_3d_coord_tokenizer.py @@ -331,8 +331,3 @@ def __init__(self): cls_token="", mask_token="", ) - - -if __name__ == "__main__": - tokenizer = _make_latent_generator_3d_coord_tokenizer() - tokenizer.save_pretrained("/Users/lisanzas/Research/Develop/lobster/src/lobster/assets/latent_generator_tokenizer") From 28b8eed6a916c02a2639ec335334926ea32e6564 Mon Sep 17 00:00:00 2001 From: Sidney Lisanza Date: Mon, 10 Mar 2025 13:23:01 -0400 Subject: [PATCH 10/10] use vocab.txt instead of ductionary --- .../_latent_generator_3d_coord_tokenizer.py | 33 +++---------------- 1 file changed, 4 insertions(+), 29 deletions(-) diff --git a/src/lobster/tokenization/_latent_generator_3d_coord_tokenizer.py b/src/lobster/tokenization/_latent_generator_3d_coord_tokenizer.py index e7973c9..8bca74e 100644 --- 
a/src/lobster/tokenization/_latent_generator_3d_coord_tokenizer.py +++ b/src/lobster/tokenization/_latent_generator_3d_coord_tokenizer.py @@ -6,38 +6,13 @@ from tokenizers.processors import TemplateProcessing from transformers import PreTrainedTokenizerFast +from ._load_vocab_file import load_vocab_file from ._make_pretrained_tokenizer_fast import make_pretrained_tokenizer_fast -LG_VOCAB = {'': 0, '': 1, '': 2, '': 3, '': 4, '.': 5, 'a': 6, 'b': 7, 'c': 8, - 'd': 9, 'e': 10, 'f': 11, 'g': 12, 'h': 13, 'i': 14, 'j': 15, 'k': 16, 'l': 17, 'm': 18, 'n': 19, - 'o': 20, 'p': 21, 'q': 22, 'r': 23, 's': 24, 't': 25, 'u': 26, 'v': 27, 'w': 28, 'x': 29, 'y': 30, - 'z': 31, 'aa': 32, 'ab': 33, 'ac': 34, 'ad': 35, 'ae': 36, 'af': 37, 'ag': 38, 'ah': 39, 'ai': 40, - 'aj': 41, 'ak': 42, 'al': 43, 'am': 44, 'an': 45, 'ao': 46, 'ap': 47, 'aq': 48, 'ar': 49, 'as': 50, - 'at': 51, 'au': 52, 'av': 53, 'aw': 54, 'ax': 55, 'ay': 56, 'az': 57, 'ba': 58, 'bb': 59, 'bc': 60, - 'bd': 61, 'be': 62, 'bf': 63, 'bg': 64, 'bh': 65, 'bi': 66, 'bj': 67, 'bk': 68, 'bl': 69, 'bm': 70, - 'bn': 71, 'bo': 72, 'bp': 73, 'bq': 74, 'br': 75, 'bs': 76, 'bt': 77, 'bu': 78, 'bv': 79, 'bw': 80, - 'bx': 81, 'by': 82, 'bz': 83, 'ca': 84, 'cb': 85, 'cc': 86, 'cd': 87, 'ce': 88, 'cf': 89, 'cg': 90, - 'ch': 91, 'ci': 92, 'cj': 93, 'ck': 94, 'cl': 95, 'cm': 96, 'cn': 97, 'co': 98, 'cp': 99, 'cq': 100, - 'cr': 101, 'cs': 102, 'ct': 103, 'cu': 104, 'cv': 105, 'cw': 106, 'cx': 107, 'cy': 108, 'cz': 109, - 'da': 110, 'db': 111, 'dc': 112, 'dd': 113, 'de': 114, 'df': 115, 'dg': 116, 'dh': 117, 'di': 118, - 'dj': 119, 'dk': 120, 'dl': 121, 'dm': 122, 'dn': 123, 'do': 124, 'dp': 125, 'dq': 126, 'dr': 127, - 'ds': 128, 'dt': 129, 'du': 130, 'dv': 131, 'dw': 132, 'dx': 133, 'dy': 134, 'dz': 135, 'ea': 136, - 'eb': 137, 'ec': 138, 'ed': 139, 'ee': 140, 'ef': 141, 'eg': 142, 'eh': 143, 'ei': 144, 'ej': 145, - 'ek': 146, 'el': 147, 'em': 148, 'en': 149, 'eo': 150, 'ep': 151, 'eq': 152, 'er': 153, 'es': 154, - 'et': 155, 'eu': 156, 'ev': 157, 'ew': 158, 'ex': 159, 'ey': 160, 'ez': 161, 'fa': 162, 'fb': 163, - 'fc': 164, 'fd': 165, 'fe': 166, 'ff': 167, 'fg': 168, 'fh': 169, 'fi': 170, 'fj': 171, 'fk': 172, - 'fl': 173, 'fm': 174, 'fn': 175, 'fo': 176, 'fp': 177, 'fq': 178, 'fr': 179, 'fs': 180, 'ft': 181, - 'fu': 182, 'fv': 183, 'fw': 184, 'fx': 185, 'fy': 186, 'fz': 187, 'ga': 188, 'gb': 189, 'gc': 190, - 'gd': 191, 'ge': 192, 'gf': 193, 'gg': 194, 'gh': 195, 'gi': 196, 'gj': 197, 'gk': 198, 'gl': 199, - 'gm': 200, 'gn': 201, 'go': 202, 'gp': 203, 'gq': 204, 'gr': 205, 'gs': 206, 'gt': 207, 'gu': 208, - 'gv': 209, 'gw': 210, 'gx': 211, 'gy': 212, 'gz': 213, 'ha': 214, 'hb': 215, 'hc': 216, 'hd': 217, - 'he': 218, 'hf': 219, 'hg': 220, 'hh': 221, 'hi': 222, 'hj': 223, 'hk': 224, 'hl': 225, 'hm': 226, - 'hn': 227, 'ho': 228, 'hp': 229, 'hq': 230, 'hr': 231, 'hs': 232, 'ht': 233, 'hu': 234, 'hv': 235, - 'hw': 236, 'hx': 237, 'hy': 238, 'hz': 239, 'ia': 240, 'ib': 241, 'ic': 242, 'id': 243, 'ie': 244, - 'if': 245, 'ig': 246, 'ih': 247, 'ii': 248, 'ij': 249, 'ik': 250, 'il': 251, 'im': 252, 'in': 253, - 'io': 254, 'ip': 255, 'iq': 256, 'ir': 257, 'is': 258, 'it': 259, 'iu': 260, 'iv': 261} - PRETRAINED_TOKENIZER_PATH = importlib.resources.files("lobster") / "assets" / "latent_generator_tokenizer" +VOCAB_PATH = PRETRAINED_TOKENIZER_PATH / "vocab.txt" +vocab = load_vocab_file(VOCAB_PATH) +LG_VOCAB = {v: k for k, v in enumerate(vocab)} def _make_latent_generator_3d_coord_tokenizer() -> PreTrainedTokenizerFast:
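
For reference, a minimal usage sketch of the tokenizer added in this patch series. The expected ids mirror the assertions in `tests/lobster/tokenization/test__latent_generator_tokenizer.py` above; it assumes `lobster` is installed so that the package exports `LatentGenerator3DCoordTokenizerFast` and ships the bundled `latent_generator_tokenizer` assets.

```python
from lobster.tokenization import LatentGenerator3DCoordTokenizerFast

# Structure tokens are space-separated codes from the 262-entry vocabulary;
# the WordLevel model splits on whitespace and the BERT-style template
# post-processor wraps the sequence with the cls (id 0) and eos (id 2) tokens.
tokenizer = LatentGenerator3DCoordTokenizerFast()

encoded = tokenizer("gd fh ds fh ad gf fe cz ek ds cq")
print(encoded["input_ids"])
# [0, 191, 169, 128, 169, 35, 193, 166, 109, 146, 128, 100, 2]

# Codes outside the vocabulary (e.g. uppercase input) map to the unk token, id 3.
encoded = tokenizer("R A gd fh ds")
print(encoded["input_ids"])
# [0, 3, 3, 191, 169, 128, 2]
```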