prescient-design
diff --git a/‎.gitattributes
+8 b/‎.gitattributes
+8
diff --git a/‎.github/workflows/push.yml
+53-49 b/‎.github/workflows/push.yml
+53-49
diff --git a/‎README.md
+23-20 b/‎README.md
+23-20
diff --git a/‎docs/CONTRIBUTORS.md
-1 b/‎docs/CONTRIBUTORS.md
-1
diff --git a/‎model_testing/inference.py
+5-3 b/‎model_testing/inference.py
+5-3
diff --git a/‎model_testing/intervene.py
+6-6 b/‎model_testing/intervene.py
+6-6
diff --git a/‎pyproject.toml
+4-3 b/‎pyproject.toml
+4-3
diff --git a/‎requirements-mgm.in
+2 b/‎requirements-mgm.in
+2
diff --git a/‎requirements.in
+7-2 b/‎requirements.in
+7-2
diff --git a/‎src/lobster/.DS_Store
-6 KB b/‎src/lobster/.DS_Store
-6 KB
diff --git a/‎src/lobster/_imports.py
-1 b/‎src/lobster/_imports.py
-1
diff --git a/‎src/lobster/data/__init__.py
-4 b/‎src/lobster/data/__init__.py
-4
diff --git a/‎src/lobster/data/_calm_datamodule.py
+1-1 b/‎src/lobster/data/_calm_datamodule.py
+1-1
@@ -1 +1,9 @@
 *.ckpt filter=lfs diff=lfs merge=lfs -text
+
+# Set default behavior to automatically normalize line endings.
+* text=auto
+
+# Explicitly declare text files you want to always be normalized and converted
+# to native line endings on checkout.
+*.py text eol=lf
+*.toml text
@@ -10,33 +10,41 @@ jobs:
           python-version: "3.x"
       - run: "python -m pip install --upgrade build"
       - run: "python -m build ."
-      - uses: "actions/upload-artifact@v3"
+      - uses: "actions/upload-artifact@v4"
         with:
           name: "python-package-distributions"
           path: "dist/"
-  # pytest:
-  #   strategy:
-  #     matrix:
-  #       platform:
-  #         - "macos-latest"
-  #         - "ubuntu-latest"
-  #         # - "windows-latest"
-  #       python:
-  #         - "3.10"
-  #         - "3.11"
-  #   runs-on: ${{ matrix.platform }}
-  #   steps:
-  #     - uses: "actions/checkout@v4"
-  #     - uses: "actions/setup-python@v5"
-  #       with:
-  #         python-version: ${{ matrix.python }}
-  #     - run: "python -m pip install -r requirements.in"
-  #     - run: "python -m pip install -r requirements-dev.in"
-  #     - run: "python -m pip install --editable ."
-  #     - run: "python -m pytest"
-  #     - env:
-  #         CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
-  #       uses: "codecov/codecov-action@v3"
+  pytest:
+    strategy:
+      matrix:
+        platform:
+          - "macos-latest"
+          - "ubuntu-latest"
+          # - "windows-latest"
+        python:
+          - "3.10"
+    runs-on: ${{ matrix.platform }}
+    steps:
+      - uses: "actions/checkout@v4"
+      - uses: "actions/setup-python@v5"
+        with:
+          python-version: ${{ matrix.python }}
+      - run: "python -m pip install -r requirements.in"
+      - run: "python -m pip install -r requirements-dev.in"
+      - run: "python -m pip install -r requirements-mgm.in"
+      - run: "python -m pip install --editable ."
+      - run: "python -m pytest"
+      - env:
+          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
+        uses: "codecov/codecov-action@v3"
+  ruff:
+    runs-on: "ubuntu-latest"
+    steps:
+      - uses: "actions/checkout@v4"
+      - uses: "astral-sh/ruff-action@v1"
+        with:
+          args: "format --check"
+          version: 0.7.3
   # pypi:
   #   environment:
   #     name: "pypi.org"
@@ -74,29 +82,25 @@ jobs:
   #     - env:
   #         GITHUB_TOKEN: "${{ github.token }}"
   #       run: "gh release upload '${{ github.ref_name }}' dist/** --repo '${{ github.repository }}'"
-  # ruff:
-  #   runs-on: "ubuntu-latest"
-  #   steps:
-  #     - uses: "actions/checkout@v4"
-  #     - uses: "chartboost/ruff-action@v1"
-  #       with:
-  #         args: "format --check"
-#   testpypi:
-#     environment:
-#       name: "test.pypi.org"
-#       url: "https://test.pypi.org/project/lbster"
-#     needs:
-#       - "build"
-#     permissions:
-#       id-token: "write"
-#     runs-on: "ubuntu-latest"
-#     steps:
-#       - uses: "actions/download-artifact@v3"
-#         with:
-#           name: "python-package-distributions"
-#           path: "dist/"
-#       - uses: "pypa/gh-action-pypi-publish@release/v1"
-#         with:
-#           repository-url: "https://test.pypi.org/legacy/"
-#           skip-existing: true
+  testpypi:
+    environment:
+      name: "test.pypi.org"
+      url: "https://test.pypi.org/project/lbster"
+    needs:
+      - "build"
+    permissions:
+      id-token: "write"
+    runs-on: "ubuntu-latest"
+    steps:
+      - uses: "actions/download-artifact@v4"
+        with:
+          name: "python-package-distributions"
+          path: "dist/"
+      - uses: "pypa/gh-action-pypi-publish@release/v1"
+        with:
+          user: __token__
+          password: ${{ secrets.TEST_PYPI_API_TOKEN }}
+          repository-url: "https://test.pypi.org/legacy/"
+          skip-existing: true
+          verbose: true
 on: "push"
@@ -2,15 +2,18 @@
 **L**anguage models for **B**iological **S**equence **T**ransformation and **E**volutionary **R**epresentation
 
 
-`lobster` is a "batteries included" language model library for proteins and other biological sequences. Led by [Nathan Frey](https://github.com/ncfrey), [Taylor Joren](https://github.com/taylormjs), [Aya Abdlesalam Ismail](https://github.com/ayaabdelsalam91), and [Allen Goodman](https://github.com/0x00b1), with many valuable contributions from [Contributors](docs/CONTRIBUTORS.md) across [Prescient Design, Genentech](https://www.gene.com/scientists/our-scientists/prescient-design).
+`lobster` is a "batteries included" language model library for proteins and other biological sequences. Led by [Nathan Frey](https://github.com/ncfrey), [Taylor Joren](https://github.com/taylormjs), [Aya Abdelsalam Ismail](https://github.com/ayaabdelsalam91), [Joseph Kleinhenz](https://github.com/kleinhenz), and [Allen Goodman](https://github.com/0x00b1), with many valuable contributions from [Contributors](docs/CONTRIBUTORS.md) across [Prescient Design, Genentech](https://www.gene.com/scientists/our-scientists/prescient-design).
+
+This repository contains training code and access to pre-trained language models for biological sequence data.
+
+## Usage
 
-This repository contains code and access to pre-trained language models for biological sequence data.
 
 <!---
 image credit: Amy Wang
 -->
 <p align="center">
-<img src="assets/lobster.png" width=200px>
+<img src="https://raw.githubusercontent.com/prescient-design/lobster/refs/heads/main/assets/lobster.png" width=200px>
 </p>
 
 
@@ -21,17 +24,19 @@ image credit: Amy Wang
 - [Install instructions](#install)
 - [Models](#main-models)
 - [Notebooks](#notebooks)
-- [Usage](#usage)
+- [Training and inference](#training)
+- [Contributing](#contributing)
 </details>
 
 ## Why you should use LBSTER <a name="why-use"></a>
 * LBSTER is built for pre-training models quickly from scratch. It is "batteries included." This is most useful if you need to control the pre-training data mixture and embedding space, or want to experiment with novel pre-training objectives and fine-tuning strategies.
 * LBSTER is a living, open-source library that will be periodically updated with new code and pre-trained models from the [Frey Lab](https://ncfrey.github.io/) at [Prescient Design, Genentech](https://www.gene.com/scientists/our-scientists/prescient-design). The Frey Lab works on real therapeutic molecule design problems and LBSTER models and capabilities reflect the demands of real-world drug discovery campaigns.
 * LBSTER is built with [beignet](https://github.com/Genentech/beignet/tree/main), a standard library for biological research, and integrated with [cortex](https://github.com/prescient-design/cortex/tree/main), a modular framework for multitask modeling, guided generation, and multi-modal models.
 * LBSTER supports concepts; we have a concept-bottleneck protein language model we refer to as CB-LBSTER, which supports 718 concepts.
+
 ## Citations <a name="citations"></a>
 If you use the code and/or models, please cite the relevant papers.
-For the `lbster` code base cite: [Cramming Protein Language Model Training in 24 GPU Hours](https://www.biorxiv.org/content/early/2024/05/15/2024.05.14.594108})
+For the `lbster` code base cite: [Cramming Protein Language Model Training in 24 GPU Hours](https://www.biorxiv.org/content/early/2024/05/15/2024.05.14.594108)
 ```bibtex
 @article{Frey2024.05.14.594108,
 	author = {Frey, Nathan C. and Joren, Taylor and Ismail, Aya Abdelsalam and Goodman, Allen and Bonneau, Richard and Cho, Kyunghyun and Gligorijevi{\'c}, Vladimir},
@@ -48,21 +53,19 @@ For the `lbster` code base cite: [Cramming Protein Language Model Training in 24
 ```
 
 
-<!-- For the `cb-lbster` code base cite: [Concept bottleneck Protien Language](https://www.biorxiv.org/content/early/2024/05/15/2024.05.14.594108})
+For the `cb-lbster` code base cite: [Concept Bottleneck Language Models for Protein Design](https://arxiv.org/abs/2411.06090)
 ```bibtex
-@article{Frey2024.05.14.594108,
-	author = {Frey, Nathan C. and Joren, Taylor and Ismail, Aya Abdelsalam and Goodman, Allen and Bonneau, Richard and Cho, Kyunghyun and Gligorijevi{\'c}, Vladimir},
-	title = {Cramming Protein Language Model Training in 24 GPU Hours},
-	elocation-id = {2024.05.14.594108},
-	year = {2024},
-	doi = {10.1101/2024.05.14.594108},
-	publisher = {Cold Spring Harbor Laboratory},
-	URL = {https://www.biorxiv.org/content/early/2024/05/15/2024.05.14.594108},
-	eprint = {https://www.biorxiv.org/content/early/2024/05/15/2024.05.14.594108.full.pdf},
-	journal = {bioRxiv}
+@article{ismail2024conceptbottlenecklanguagemodels,
+      title={Concept Bottleneck Language Models For protein design}, 
+      author={Aya Abdelsalam Ismail and Tuomas Oikarinen and Amy Wang and Julius Adebayo and Samuel Stanton and Taylor Joren and Joseph Kleinhenz and Allen Goodman and Héctor Corrada Bravo and Kyunghyun Cho and Nathan C. Frey},
+      year={2024},
+      eprint={2411.06090},
+      archivePrefix={arXiv},
+      primaryClass={cs.LG},
+      url={https://arxiv.org/abs/2411.06090}, 
 }
 
-``` -->
+```
 
 ## Install <a name="install"></a>
 Clone the repo, `cd` into it, and run `mamba env create -f env.yml`.
@@ -118,7 +121,7 @@ Check out [jupyter notebook tutorial](notebooks/01-inference.ipynb) for example
 Check out the [Jupyter notebook tutorial](notebooks/02-intervention.ipynb) for an example of how to intervene on different concepts with our concept-bottleneck model class.
 
 
-## Usage <a name="usage"></a>
+## Training and inference <a name="training"></a>
 
 ### Embedding
 The entrypoint `lobster_embed` is the main driver for embedding sequences and accepts parameters using Hydra syntax. The available configuration parameters can be found by running `lobster_embed --help` or by looking in the `src/lobster/hydra_config` directory.
@@ -141,15 +144,15 @@ model.naturalness(sequences)
 model.likelihood(sequences)
 ```
 
-## Training from scratch
+### Training from scratch
 The entrypoint `lobster_train` is the main driver for training and accepts parameters using Hydra syntax. The available configuration parameters can be found by running `lobster_train --help` or by looking in the `src/lobster/hydra_config` directory.
 
 To train an MLM on a fasta file of sequences on an interactive GPU node, cd into the root dir of this repo and do
 ```bash
 lobster_train data.path_to_fasta="test_data/query.fasta" logger=csv paths.root_dir="."
 ```
 
-## Contributing
+## Contributing <a name="contributing"></a>
 Contributions are welcome! We ask that all users and contributors remember that the LBSTER team are all full-time drug hunters, and our open-source efforts are a labor of love because we care deeply about open science and scientific progress.
 
 ### Install dev requirements and pre-commit hooks
 
@@ -1,5 +1,4 @@
 * Karina Zadorozhny
-* Joseph Kleinhenz
 * Matthieu Kirchmeyer
 * Sai Pooja Mahajan
 * Amy Wang
@@ -1,13 +1,13 @@
 # Lobster Model Inference
 
 import torch
-from lobster.model import LobsterPMLM, LobsterCBMPMLM
+from lobster.model import LobsterCBMPMLM, LobsterPMLM
 
 # Define the test protein sequence
 test_protein = "MGAGASAEEKHSRELEKKLKEDAEKDARTVKLLLLGAGESGKSTIVKQMKIIHQDGYSLEECLEFIAIIYGNTLQSILAIVRAMTTLNIQYGDSARQDDARKLMHMADTIEEGTMPKEMSDIIQRLWKDSGIQACFERASEYQLNDSAGYYLSDLERLVTPGYVPTEQDVLRSRVKTTGIIETQFSFKDLNFRMFDVGGQRSERKKWIHCFEGVTCIIFIAALSAYDMVLVEDDEVNRMHESLHLFNSICNHRYFATTSIVLFLNKKDVFFEKIKKAHLSICFPDYDGPNTYEDAGNYIKVQFLELNMRRDVKEIYSHMTCATDTQNVKFVFDAVTDIIIKENLKDCGLF"
 
 # Determine the device
-device = 'cuda' if torch.cuda.is_available() else 'cpu'
+device = "cuda" if torch.cuda.is_available() else "cpu"
 
 # Load the LobsterPMLM model
 lobster = LobsterPMLM("asalam91/lobster_24M").to(device)
@@ -29,7 +29,9 @@
 
 # Get protein concepts
 test_protein_concepts = cb_lobster.sequences_to_concepts([test_protein])[-1]
-test_protein_concepts_emb = cb_lobster.sequences_to_concepts_emb([test_protein])[-1][0]  # All of the known concepts are the same for all tokens...
+test_protein_concepts_emb = cb_lobster.sequences_to_concepts_emb([test_protein])[-1][
+    0
+]  # All of the known concepts are the same for all tokens...
 test_protein_concepts_unknown_emb = cb_lobster.sequences_to_concepts_emb([test_protein])[-1]
 
 # Print results
 
@@ -1,19 +1,19 @@
+import Levenshtein
 import torch
 from lobster.model import LobsterCBMPMLM
-import Levenshtein
 
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
 # Load the LobsterCBMPMLM model
 cb_lobster = LobsterCBMPMLM("asalam91/cb_lobster_24M").to(device)
 cb_lobster.eval()
-print (cb_lobster.list_supported_concept())
+print(cb_lobster.list_supported_concept())
 
-concept ="gravy"
-test_protein ="MGAGASAEEKHSRELEKKLKEDAEKDARTVKLLLLGAGESGKSTIVKQMKIIHQDGYSLEECLEFIAIIYGNTLQSILAIVRAMTTLNIQYGDSARQDDARKLMHMADTIEEGTMPKEMSDIIQRLWKDSGIQACFERASEYQLNDSAGYYLSDLERLVTPGYVPTEQDVLRSRVKTTGIIETQFSFKDLNFRMFDVGGQRSERKKWIHCFEGVTCIIFIAALSAYDMVLVEDDEVNRMHESLHLFNSICNHRYFATTSIVLFLNKKDVFFEKIKKAHLSICFPDYDGPNTYEDAGNYIKVQFLELNMRRDVKEIYSHMTCATDTQNVKFVFDAVTDIIIKENLKDCGLF"
+concept = "gravy"
+test_protein = "MGAGASAEEKHSRELEKKLKEDAEKDARTVKLLLLGAGESGKSTIVKQMKIIHQDGYSLEECLEFIAIIYGNTLQSILAIVRAMTTLNIQYGDSARQDDARKLMHMADTIEEGTMPKEMSDIIQRLWKDSGIQACFERASEYQLNDSAGYYLSDLERLVTPGYVPTEQDVLRSRVKTTGIIETQFSFKDLNFRMFDVGGQRSERKKWIHCFEGVTCIIFIAALSAYDMVLVEDDEVNRMHESLHLFNSICNHRYFATTSIVLFLNKKDVFFEKIKKAHLSICFPDYDGPNTYEDAGNYIKVQFLELNMRRDVKEIYSHMTCATDTQNVKFVFDAVTDIIIKENLKDCGLF"
 
-[new_protien] = cb_lobster.intervene_on_sequences([test_protein],concept,edits=5,intervention_type="negative")
+[new_protien] = cb_lobster.intervene_on_sequences([test_protein], concept, edits=5, intervention_type="negative")
 
 
 print(new_protien)
-print(Levenshtein.distance(test_protein, new_protien))
+print(Levenshtein.distance(test_protein, new_protien))
@@ -1,8 +1,9 @@
 [project]
 name = "lbster"
+readme = "README.md"
 description = "Language models for Biological Sequence Transformation and Evolutionary Representation."
 authors = [{name = "Nathan C. Frey", email = "frey.nathan.nf1@gene.com"}]
-dynamic = ["version", "readme", "dependencies", "optional-dependencies"]
+dynamic = ["version", "dependencies", "optional-dependencies"]
 requires-python = ">=3.10"
 
 [build-system]
@@ -20,10 +21,10 @@ lobster_eval = "lobster.cmdline:eval_embed"
 
 [tool.setuptools.dynamic]
 dependencies = {file = ["requirements.in"]}
-readme = {file = "README.md"}
 
 [tool.setuptools.dynamic.optional-dependencies]
 dev = {file = ["requirements-dev.in"]}
+mgm = {file = ["requirements-mgm.in"]}
 
 [tool.setuptools.packages.find]
 where = ["src"]
@@ -36,8 +37,8 @@ lobster = ["*.txt", "*.json", "*.yaml"]
 [tool.setuptools_scm]
 search_parent_directories = true
 version_scheme = "no-guess-dev"
-local_scheme = "node-and-date"
 fallback_version = "0.0.0"
+local_scheme = "no-local-version" # see https://github.com/pypa/setuptools-scm/issues/455
 
 [tool.ruff]
 line-length = 120
 
@@ -0,0 +1,2 @@
+selfies
+rdkit
@@ -24,5 +24,10 @@ fastparquet
 datasketch
 peft
 icecream
-selfies
-rdkit
+captum
+pooch
+edlib
+onnx
+onnxscript
+beignet[all]
+fair-esm
@@ -3,7 +3,6 @@
 from ._constants import (  # nopycln: import
     ESM_MODEL_NAMES,
 )
-from ._cyno_pk_datamodule import CynoPKClearanceLightningDataModule
 from ._dataframe_dataset_in_memory import (  # nopycln: import
     DataFrameDatasetInMemory,
     DataFrameLightningDataModule,
@@ -16,15 +15,12 @@
 )
 from ._minhasher import LobsterMinHasher
 from ._mmseqs import MMSeqsRunner
-from ._neglog_datamodule import NegLogDataModule
 from ._structure_datamodule import PDBDataModule
 from ._utils import (  # nopycln: import
     load_pickle,
 )
 
 __all__ = [
-    "ContactMapDataModule",
-    "NegLogDataModule",
     "PDBDataModule",
     "DataFrameDatasetInMemory",
 ]
@@ -4,13 +4,13 @@
 from typing import Any, Callable, Iterable, Optional, Sequence, TypeVar, Union
 
 import torch.utils.data
-from lobster.transforms import Transform
 from lightning import LightningDataModule
 from torch import Generator
 from torch.utils.data import DataLoader, Sampler
 
 from lobster.datasets._calm_dataset import CalmDataset
 from lobster.tokenization import PmlmTokenizerTransform
+from lobster.transforms import Transform
 
 T = TypeVar("T")
Original file line number	Diff line number	Diff line change
`@@ -3,7 +3,6 @@`
`3`	`3`	`from ._constants import ( # nopycln: import`
`4`	`4`	`ESM_MODEL_NAMES,`
`5`	`5`	`)`
`6`		`-from ._cyno_pk_datamodule import CynoPKClearanceLightningDataModule`
`7`	`6`	`from ._dataframe_dataset_in_memory import ( # nopycln: import`
`8`	`7`	`DataFrameDatasetInMemory,`
`9`	`8`	`DataFrameLightningDataModule,`
`@@ -16,15 +15,12 @@`
`16`	`15`	`)`
`17`	`16`	`from ._minhasher import LobsterMinHasher`
`18`	`17`	`from ._mmseqs import MMSeqsRunner`
`19`		`-from ._neglog_datamodule import NegLogDataModule`
`20`	`18`	`from ._structure_datamodule import PDBDataModule`
`21`	`19`	`from ._utils import ( # nopycln: import`
`22`	`20`	`load_pickle,`
`23`	`21`	`)`
`24`	`22`
`25`	`23`	`__all__ = [`
`26`		`- "ContactMapDataModule",`
`27`		`- "NegLogDataModule",`
`28`	`24`	`"PDBDataModule",`
`29`	`25`	`"DataFrameDatasetInMemory",`
`30`	`26`	`]`