diff --git a/.github/workflows/linters.yml b/.github/workflows/linters.yml index 10585bb..e0fc811 100644 --- a/.github/workflows/linters.yml +++ b/.github/workflows/linters.yml @@ -26,7 +26,7 @@ jobs: run: | python -m pip install --upgrade pip python -m pip install tomli - export ruff_version=$(python -c 'import tomli; print([line for line in tomli.load(open("pyproject.toml","rb"))["project"]["optional-dependencies"]["dev"] if "ruff" in line][0])') + export ruff_version=$(python -c 'import tomli; print([line for line in tomli.load(open("pyproject.toml","rb"))["project"]["optional-dependencies"]["test"] if "ruff" in line][0])') echo "Click version: ruff_version" python -m pip install $ruff_version - name: Format code using ruff diff --git a/.github/workflows/testing_develop.yml b/.github/workflows/testing_develop.yml index 6cccd46..bd9b0c1 100644 --- a/.github/workflows/testing_develop.yml +++ b/.github/workflows/testing_develop.yml @@ -16,7 +16,7 @@ jobs: strategy: matrix: os: [ubuntu-latest, macos-latest, windows-latest] - python-version: ["3.10", "3.11", "3.12"] + python-version: ["3.10", "3.11", "3.12", "3.13"] steps: - uses: "actions/checkout@v4" @@ -28,9 +28,9 @@ jobs: uses: actions/cache@v3 with: path: tests/data - key: ${{ runner.os }}-data-v1-${{ hashFiles('tests/data/small-113.zip') }} + key: ${{ runner.os }}-data-v2-${{ hashFiles('tests/data/small-113.zip') }} restore-keys: | - ${{ runner.os }}-data-v1- + ${{ runner.os }}-data-v2- - name: Check cache status run: | @@ -39,7 +39,7 @@ jobs: - name: Download data file if: steps.cache-data.outputs.cache-hit != 'true' run: | - curl -o tests/data/small-113.zip https://zenodo.org/records/14625203/files/small-113.zip + curl -L -o tests/data/small-113.zip "https://www.dropbox.com/scl/fi/pfmwzz96gusdeqi0a9wax/small-113.zip?rlkey=r60l1eq9jk6p440tkqslmqihi&st=ud49fits&dl=1" - name: Unzip data file run: | diff --git a/noxfile.py b/noxfile.py index 896bfcf..89fd775 100644 --- a/noxfile.py +++ b/noxfile.py @@ -1,6 +1,6 @@ import nox -_py_versions = range(10, 13) +_py_versions = range(10, 14) @nox.session(python=[f"3.{v}" for v in _py_versions]) diff --git a/pyproject.toml b/pyproject.toml index 6fa539b..88c63d5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,7 +17,7 @@ maintainers = [ keywords = ["biology", "genomics", "evolution", "bioinformatics"] readme = "README.md" license = { file = "LICENSE" } -requires-python = ">=3.10,<3.13" +requires-python = ">=3.10,<3.14" dependencies = ["blosc2", "click", "cogent3>=2024.12.19a2", @@ -44,6 +44,7 @@ classifiers = [ "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", ] dynamic = ["version", "description"] @@ -63,45 +64,38 @@ test = [ "pytest-cov", "pytest-timeout", "pytest-xdist", - "ruff==0.9.1", + "ruff==0.9.7", "nox"] doc = ["click", "sphinx", "sphinx-autobuild", - "sphinx>=1.6", "sphinx_book_theme", "sphinx_design", - "sphinxcontrib-bibtex"] + "sphinxcontrib-bibtex", + "ipykernel", + "ipython", + "ipywidgets", + "jupyter-sphinx", + "jupyter_client", + "jupyterlab", + "jupytext", + "kaleido", + "nbconvert>5.4", + "nbformat", + "nbsphinx", + "pillow", + "plotly", + ] dev = ["click", "cogapp", "flit", - "ipykernel", - "ipython", - "ipywidgets", - "jupyter-sphinx", - "jupyter_client", - "jupyterlab", - "jupytext", - "kaleido", - "nbconvert>5.4", - "nbformat", - "nbsphinx", "nox", "numpydoc", - "pandas", - "pillow", - "plotly", "psutil", - "pytest", - "pytest-cov", - "pytest-xdist", - "ruff==0.9.1", "scriv", - "sphinx", - "sphinx-autobuild", - "sphinx_book_theme", - "sphinx_design", - "sphinxcontrib-bibtex"] + "ensembl_tui[doc]", + "ensembl_tui[test]", + ] [tool.flit.sdist] include = ["src/*", "tests/*", "pyproject.toml"] diff --git a/src/ensembl_tui/_annotation.py b/src/ensembl_tui/_annotation.py index 0dbccb4..f5a67d4 100644 --- a/src/ensembl_tui/_annotation.py +++ b/src/ensembl_tui/_annotation.py @@ -286,7 +286,7 @@ def get_features_matching( symbol: OptStr = None, description: OptStr = None, **kwargs, # noqa: ANN003 - ) -> typing.Iterator[FeatureDataBase]: + ) -> typing.Iterator[GeneData]: # add supoport for querying by symbol and description stable_id = stable_id or kwargs.pop("name", None) limit = kwargs.pop("limit", None) diff --git a/src/ensembl_tui/_mysql_core_attr.py b/src/ensembl_tui/_mysql_core_attr.py index 45e0ede..f93ea2a 100644 --- a/src/ensembl_tui/_mysql_core_attr.py +++ b/src/ensembl_tui/_mysql_core_attr.py @@ -133,6 +133,10 @@ class LimitExons: strand: int transcript_id: int + @property + def single_exon(self) -> bool: + return self.start_rank == self.stop_rank + def get_all_limit_exons( conn: duckdb.DuckDBPyConnection, @@ -182,8 +186,7 @@ def get_limit_exons(records: list[tuple[int, ...]]) -> LimitExons: return LimitExons( start_rank=start_rank, stop_rank=end_rank, - # subtract 1 to convert to 0-based - rel_start=rel_start - 1, + rel_start=rel_start, rel_stop=rel_end, strand=strand, transcript_id=transcript_id, @@ -232,6 +235,13 @@ def to_record(self, columns: tuple[str]) -> tuple: return tuple(mapping[c] for c in columns) +def _adjust_single_exon(lex: LimitExons, cds_span: tuple[int, int]) -> tuple[int, int]: + ex_start = cds_span[0] if lex.strand == 1 else cds_span[1] + if lex.strand == 1: + return ex_start + lex.rel_start, ex_start + lex.rel_stop + return ex_start - lex.rel_stop, ex_start - lex.rel_start + + def get_transcript_attr_records( conn: duckdb.DuckDBPyConnection, ) -> typing.Iterator[TranscriptAttrRecord]: @@ -269,6 +279,8 @@ def get_transcript_attr_records( for i, rank in enumerate(ranks): transcript_spans[rank - 1] = (starts[i], stops[i]) + cds_spans = transcript_spans.copy() + transcript_spans = transcript_spans[numpy.lexsort(transcript_spans.T), :] if transcript_id not in limit_exons: # no translated exons yield TranscriptAttrRecord( @@ -291,7 +303,21 @@ def get_transcript_attr_records( # 5' end of an exon # so the start_exon coords become (exon_start + rel_start, exon_end) # the end_exon coords become (exon_start, exon_start + rel_stop) - cds_spans = transcript_spans.copy() + if lex.single_exon: + cds_spans[0, :] = _adjust_single_exon(lex, cds_spans[0]) + + yield TranscriptAttrRecord( + seqid=seqid, + transcript_id=transcript_id, + gene_id=gene_id, + strand=strand, + transcript_spans=transcript_spans, + cds_spans=cds_spans, + transcript_stable_id=transcript_stable_id, + cds_stable_id=cds_stable_id, + ) + continue + start_exon_coords = cds_spans[start_index] stop_exon_coords = cds_spans[stop_index] if lex.strand == 1: @@ -321,7 +347,6 @@ def get_transcript_attr_records( # sort all spans in ascending numerical order # note that the lexsort returns the sorted indices cds_spans = cds_spans[numpy.lexsort(cds_spans.T), :] - transcript_spans = transcript_spans[numpy.lexsort(transcript_spans.T), :] yield TranscriptAttrRecord( seqid=seqid, diff --git a/tests/conftest.py b/tests/conftest.py index d98c6f7..a880ff6 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -50,7 +50,7 @@ def namer(): return name_as_seqid -TEST_DATA_URL = "https://zenodo.org/records/14625203/files/small-113.zip" +TEST_DATA_URL = "https://www.dropbox.com/scl/fi/pfmwzz96gusdeqi0a9wax/small-113.zip?rlkey=r60l1eq9jk6p440tkqslmqihi&st=ud49fits&dl=1" SMALL_DATA_DIRNAME = "small-113" diff --git a/tests/test_cli.py b/tests/test_cli.py index 18ad1f0..0798d58 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -79,6 +79,70 @@ def test_installed(installed): assert len(list(path.glob("*attr.parquet"))) == 2 +@pytest.mark.slow +def test_check_one_cds_seq(installed): + # checking a single exon sequence with a rel_start > 0 + from ensembl_tui import _genome as eti_genome + + config = eti_config.read_installed_cfg(installed) + genome = eti_genome.load_genome( + config=config, + species="saccharomyces_cerevisiae", + ) + cds = next(iter(genome.get_cds(stable_id="YMR242C"))) + seq = cds.get_slice() + expect = ( + "GCTCACTTTAAAGAATACCAAGTTATTGGCCGTCGTTTGCCAACTGAATCTGTTCCAGAA" + "CCAAAGTTGTTCAGAATGAGAATCTTTGCTTCAAATGAAGTTATTGCCAAGTCTCGTTAC" + "TGGTATTTCTTGCAAAAGTTGCACAAGGTTAAGAAGGCTTCTGGTGAAATTGTTTCCATC" + "AACCAAATCAACGAAGCTCATCCAACCAAGGTCAAGAACTTCGGTGTCTGGGTTAGATAC" + "GACTCCAGATCTGGTACTCACAATATGTACAAGGAAATCAGAGACGTCTCCAGAGTTGCT" + "GCCGTCGAAACCTTATACCAAGACATGGCTGCCAGACACAGAGCTAGATTTAGATCTATT" + "CACATCTTGAAGGTTGCTGAAATTGAAAAGACTGCTGACGTCAAGAGACAATACGTTAAG" + "CAATTTTTGACCAAGGACTTGAAATTCCCATTGCCTCACAGAGTCCAAAAATCCACCAAG" + "ACTTTCTCCTACAAGAGACCTTCCACTTTCTACTGA" + ) + assert str(cds.get_slice()) == expect + + +@pytest.mark.slow +def test_check_multi_exon_cds_seq_plus_strand(installed): + # checking a multi exon sequence with a rel_start > 0 + # and rel_end != exon length + from ensembl_tui import _genome as eti_genome + + config = eti_config.read_installed_cfg(installed) + genome = eti_genome.load_genome( + config=config, + species="caenorhabditis_elegans", + ) + cds = next(iter(genome.get_cds(stable_id="WBGene00185002"))) + aa = str(cds.get_slice().get_translation()) + # seq expected values from ensembl + assert aa.startswith("MEMEDIDDDITVFYTDDRGTVQGPYGASTVLDWYQKGYFSDNHQMRFTDNGQRIGNLFTY") + assert aa.endswith("IEKVKTNCRDAPSPLPPAMDPVAPYHVRDKCTQS") + assert len(aa) == 274 + + +@pytest.mark.slow +def test_check_two_exon_cds_seq_rev_strand(installed): + # checking a two exon sequence with a rel_start > 0 + # and rel_end != exon length + from ensembl_tui import _genome as eti_genome + + config = eti_config.read_installed_cfg(installed) + genome = eti_genome.load_genome( + config=config, + species="caenorhabditis_elegans", + ) + cds = next(iter(genome.get_cds(stable_id="WBGene00184990"))) + aa = str(cds.get_slice().get_translation()) + # seq expected values from ensembl + assert aa.startswith("MSGVYNNSGSRMRSKNFEKHQVPSDMAFFQKFRKQSHSNETVDCKKKQEE") + assert aa.endswith("DGHYSDETVEEKHNREHRNKTKADNRTRRIAEIRRKHNINA") + assert len(aa) == 161 + + @pytest.mark.slow def test_species_summary(installed): r = RUNNER.invoke( diff --git a/tests/test_genome.py b/tests/test_genome.py index a1b3ce5..f2da3d5 100644 --- a/tests/test_genome.py +++ b/tests/test_genome.py @@ -198,3 +198,10 @@ def test_get_ids_for_biotype_seqid(yeast): r.seqid for stable_id in stable_ids for r in yeast.get_features(name=stable_id) } assert got == seqids + + +def test_get_celegans_cds(worm): + cds = next(iter(worm.get_cds(stable_id="WBGene00021347"))) + seq = cds.get_slice() + aa = seq.get_translation() + assert aa == "MIIPIRCFTCGKVIGDKWETYLGFLQSEYSEGDALDALGLRRYCCRRMLLAHVDLIEKLLNYHPLEK" diff --git a/tests/test_install.py b/tests/test_install.py index 1c6394b..036e61a 100644 --- a/tests/test_install.py +++ b/tests/test_install.py @@ -222,7 +222,7 @@ def test_get_limiting_exons_one_exon(one_exon): all_exons = eti_tables.get_all_limit_exons(one_exon) lex = eti_tables.get_limit_exons(all_exons[1664]) assert lex.start_rank == lex.stop_rank == 1 - assert lex.rel_start == 0 + assert lex.rel_start == 1 assert lex.rel_stop == 1 @@ -249,7 +249,7 @@ def test_get_limiting_exons_two_exons(two_exon): lex = eti_tables.get_limit_exons(all_exons[269944]) assert lex.start_rank == 4 assert lex.stop_rank == 12 - assert lex.rel_start == 28 + assert lex.rel_start == 29 assert lex.rel_stop == 54 @@ -343,7 +343,7 @@ def same_tr_cds(four_exons): { "transcript_id": 11, "start_exon_id": 1, - "seq_start": 1, + "seq_start": 0, "end_exon_id": 4, "seq_end": 100, "stable_id": "a1", @@ -371,7 +371,7 @@ def diff_tr_cds(four_exons): { "transcript_id": 11, "start_exon_id": 2, - "seq_start": 1, + "seq_start": 0, "end_exon_id": 3, "seq_end": 100, }, @@ -399,7 +399,7 @@ def tr_cds_rel_pos(four_exons): { "transcript_id": 11, "start_exon_id": 1, - "seq_start": 2, + "seq_start": 1, "end_exon_id": 4, "seq_end": 2, }, @@ -486,7 +486,7 @@ def test_rel_start_ends_2(tr_cds_rel_pos_minus): assert tr.start == 100 assert tr.stop == 400 assert numpy.array_equal(tr.transcript_spans, [(100, 200), (300, 400)]) - assert numpy.array_equal(tr.cds_spans, [(200 - 10, 200), (300, 400 - 4)]) + assert numpy.array_equal(tr.cds_spans, [(200 - 10, 200), (300, 400 - 5)]) def test_no_cds_spans(four_exons): @@ -609,7 +609,7 @@ def mixed_data(): { "transcript_id": 12, "start_exon_id": 7, - "seq_start": 1, + "seq_start": 0, "end_exon_id": 5, "seq_end": 100, "stable_id": "pr-01",