cogent3 · GavinHuttley · Feb 27, 2025 · Feb 27, 2025 · Feb 27, 2025 · Feb 27, 2025
diff --git a/.github/workflows/testing_develop.yml b/.github/workflows/testing_develop.yml
@@ -16,7 +16,7 @@ jobs:
     strategy:
       matrix:
         os: [ubuntu-latest, macos-latest, windows-latest]
-        python-version: ["3.10", "3.11", "3.12"]
+        python-version: ["3.10", "3.11", "3.12", "3.13"]
 
     steps:
       - uses: "actions/checkout@v4"
@@ -28,9 +28,9 @@ jobs:
         uses: actions/cache@v3
         with:
           path: tests/data
-          key: ${{ runner.os }}-data-v1-${{ hashFiles('tests/data/small-113.zip') }}
+          key: ${{ runner.os }}-data-v2-${{ hashFiles('tests/data/small-113.zip') }}
           restore-keys: |
-            ${{ runner.os }}-data-v1-
+            ${{ runner.os }}-data-v2-
 
       - name: Check cache status
         run: |
@@ -39,7 +39,7 @@ jobs:
       - name: Download data file
         if: steps.cache-data.outputs.cache-hit != 'true'
         run: |
-          curl -o tests/data/small-113.zip https://zenodo.org/records/14625203/files/small-113.zip
+          curl -L -o tests/data/small-113.zip "https://www.dropbox.com/scl/fi/pfmwzz96gusdeqi0a9wax/small-113.zip?rlkey=r60l1eq9jk6p440tkqslmqihi&st=ud49fits&dl=1"
 
       - name: Unzip data file
         run: |

diff --git a/pyproject.toml b/pyproject.toml
@@ -17,7 +17,7 @@ maintainers = [
 keywords = ["biology", "genomics", "evolution", "bioinformatics"]
 readme = "README.md"
 license = { file = "LICENSE" }
-requires-python = ">=3.10,<3.13"
+requires-python = ">=3.10,<3.14"
 dependencies = ["blosc2",
         "click",
         "cogent3>=2024.12.19a2",
@@ -44,6 +44,7 @@ classifiers = [
         "Programming Language :: Python :: 3.10",
         "Programming Language :: Python :: 3.11",
         "Programming Language :: Python :: 3.12",
+        "Programming Language :: Python :: 3.13",
     ]
 dynamic = ["version", "description"]
 
@@ -68,40 +69,33 @@ test = [
 doc  = ["click",
         "sphinx",
         "sphinx-autobuild",
-        "sphinx>=1.6",
         "sphinx_book_theme",
         "sphinx_design",
-        "sphinxcontrib-bibtex"]
+        "sphinxcontrib-bibtex",
+        "ipykernel",
+        "ipython",
+        "ipywidgets",
+        "jupyter-sphinx",
+        "jupyter_client",
+        "jupyterlab",
+        "jupytext",
+        "kaleido",
+        "nbconvert>5.4",
+        "nbformat",
+        "nbsphinx",
+        "pillow",
+        "plotly",
+        ]
 dev = ["click",
        "cogapp",
        "flit",
-       "ipykernel",
-       "ipython",
-       "ipywidgets",
-       "jupyter-sphinx",
-       "jupyter_client",
-       "jupyterlab",
-       "jupytext",
-       "kaleido",
-       "nbconvert>5.4",
-       "nbformat",
-       "nbsphinx",
        "nox",
        "numpydoc",
-       "pandas",
-       "pillow",
-       "plotly",
        "psutil",
-       "pytest",
-       "pytest-cov",
-       "pytest-xdist",
-       "ruff==0.9.1",
        "scriv",
-       "sphinx",
-       "sphinx-autobuild",
-       "sphinx_book_theme",
-       "sphinx_design",
-       "sphinxcontrib-bibtex"]
+       "ensembl_tui[doc]",
+       "ensembl_tui[test]",
+       ]
 
 [tool.flit.sdist]
 include = ["src/*", "tests/*", "pyproject.toml"]

diff --git a/src/ensembl_tui/_annotation.py b/src/ensembl_tui/_annotation.py
@@ -286,7 +286,7 @@ def get_features_matching(
         symbol: OptStr = None,
         description: OptStr = None,
         **kwargs,  # noqa: ANN003
-    ) -> typing.Iterator[FeatureDataBase]:
+    ) -> typing.Iterator[GeneData]:
         # add supoport for querying by symbol and description
         stable_id = stable_id or kwargs.pop("name", None)
         limit = kwargs.pop("limit", None)

diff --git a/src/ensembl_tui/_mysql_core_attr.py b/src/ensembl_tui/_mysql_core_attr.py
@@ -133,6 +133,10 @@ class LimitExons:
     strand: int
     transcript_id: int
 
+    @property
+    def single_exon(self) -> bool:
+        return self.start_rank == self.stop_rank
+
 
 def get_all_limit_exons(
     conn: duckdb.DuckDBPyConnection,
@@ -182,8 +186,7 @@ def get_limit_exons(records: list[tuple[int, ...]]) -> LimitExons:
     return LimitExons(
         start_rank=start_rank,
         stop_rank=end_rank,
-        # subtract 1 to convert to 0-based
-        rel_start=rel_start - 1,
+        rel_start=rel_start,
         rel_stop=rel_end,
         strand=strand,
         transcript_id=transcript_id,
@@ -269,6 +272,8 @@ def get_transcript_attr_records(
         for i, rank in enumerate(ranks):
             transcript_spans[rank - 1] = (starts[i], stops[i])
 
+        cds_spans = transcript_spans.copy()
+        transcript_spans = transcript_spans[numpy.lexsort(transcript_spans.T), :]
         if transcript_id not in limit_exons:
             # no translated exons
             yield TranscriptAttrRecord(
@@ -291,7 +296,27 @@ def get_transcript_attr_records(
         # 5' end of an exon
         # so the start_exon coords become (exon_start + rel_start, exon_end)
         # the end_exon coords become (exon_start, exon_start + rel_stop)
-        cds_spans = transcript_spans.copy()
+        if lex.single_exon:
+            ex_start = cds_spans[0][0] if lex.strand == 1 else cds_spans[0][1]
+            if lex.strand == 1:
+                start, stop = ex_start + lex.rel_start, ex_start + lex.rel_stop
+            else:
+                start, stop = ex_start - lex.rel_stop, ex_start - lex.rel_start
+
+            cds_spans[0, :] = start, stop
+
+            yield TranscriptAttrRecord(
+                seqid=seqid,
+                transcript_id=transcript_id,
+                gene_id=gene_id,
+                strand=strand,
+                transcript_spans=transcript_spans,
+                cds_spans=cds_spans,
+                transcript_stable_id=transcript_stable_id,
+                cds_stable_id=cds_stable_id,
+            )
+            continue
+
         start_exon_coords = cds_spans[start_index]
         stop_exon_coords = cds_spans[stop_index]
         if lex.strand == 1:
@@ -321,7 +346,6 @@ def get_transcript_attr_records(
         # sort all spans in ascending numerical order
         # note that the lexsort returns the sorted indices
         cds_spans = cds_spans[numpy.lexsort(cds_spans.T), :]
-        transcript_spans = transcript_spans[numpy.lexsort(transcript_spans.T), :]
 
         yield TranscriptAttrRecord(
             seqid=seqid,

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -50,7 +50,7 @@ def namer():
     return name_as_seqid
 
 
-TEST_DATA_URL = "https://zenodo.org/records/14625203/files/small-113.zip"
+TEST_DATA_URL = "https://www.dropbox.com/scl/fi/pfmwzz96gusdeqi0a9wax/small-113.zip?rlkey=r60l1eq9jk6p440tkqslmqihi&st=ud49fits&dl=1"
 SMALL_DATA_DIRNAME = "small-113"
 
 

diff --git a/tests/test_cli.py b/tests/test_cli.py
@@ -79,6 +79,70 @@ def test_installed(installed):
     assert len(list(path.glob("*attr.parquet"))) == 2
 
 
+@pytest.mark.slow
+def test_check_one_cds_seq(installed):
+    # checking a single exon sequence with a rel_start > 0
+    from ensembl_tui import _genome as eti_genome
+
+    config = eti_config.read_installed_cfg(installed)
+    genome = eti_genome.load_genome(
+        config=config,
+        species="saccharomyces_cerevisiae",
+    )
+    cds = next(iter(genome.get_cds(stable_id="YMR242C")))
+    seq = cds.get_slice()
+    expect = (
+        "GCTCACTTTAAAGAATACCAAGTTATTGGCCGTCGTTTGCCAACTGAATCTGTTCCAGAA"
+        "CCAAAGTTGTTCAGAATGAGAATCTTTGCTTCAAATGAAGTTATTGCCAAGTCTCGTTAC"
+        "TGGTATTTCTTGCAAAAGTTGCACAAGGTTAAGAAGGCTTCTGGTGAAATTGTTTCCATC"
+        "AACCAAATCAACGAAGCTCATCCAACCAAGGTCAAGAACTTCGGTGTCTGGGTTAGATAC"
+        "GACTCCAGATCTGGTACTCACAATATGTACAAGGAAATCAGAGACGTCTCCAGAGTTGCT"
+        "GCCGTCGAAACCTTATACCAAGACATGGCTGCCAGACACAGAGCTAGATTTAGATCTATT"
+        "CACATCTTGAAGGTTGCTGAAATTGAAAAGACTGCTGACGTCAAGAGACAATACGTTAAG"
+        "CAATTTTTGACCAAGGACTTGAAATTCCCATTGCCTCACAGAGTCCAAAAATCCACCAAG"
+        "ACTTTCTCCTACAAGAGACCTTCCACTTTCTACTGA"
+    )
+    assert str(cds.get_slice()) == expect
+
+
+@pytest.mark.slow
+def test_check_multi_exon_cds_seq_plus_strand(installed):
+    # checking a multi exon sequence with a rel_start > 0
+    # and rel_end != exon length
+    from ensembl_tui import _genome as eti_genome
+
+    config = eti_config.read_installed_cfg(installed)
+    genome = eti_genome.load_genome(
+        config=config,
+        species="caenorhabditis_elegans",
+    )
+    cds = next(iter(genome.get_cds(stable_id="WBGene00185002")))
+    aa = str(cds.get_slice().get_translation())
+    # seq expected values from ensembl
+    assert aa.startswith("MEMEDIDDDITVFYTDDRGTVQGPYGASTVLDWYQKGYFSDNHQMRFTDNGQRIGNLFTY")
+    assert aa.endswith("IEKVKTNCRDAPSPLPPAMDPVAPYHVRDKCTQS")
+    assert len(aa) == 274
+
+
+@pytest.mark.slow
+def test_check_two_exon_cds_seq_rev_strand(installed):
+    # checking a two exon sequence with a rel_start > 0
+    # and rel_end != exon length
+    from ensembl_tui import _genome as eti_genome
+
+    config = eti_config.read_installed_cfg(installed)
+    genome = eti_genome.load_genome(
+        config=config,
+        species="caenorhabditis_elegans",
+    )
+    cds = next(iter(genome.get_cds(stable_id="WBGene00184990")))
+    aa = str(cds.get_slice().get_translation())
+    # seq expected values from ensembl
+    assert aa.startswith("MSGVYNNSGSRMRSKNFEKHQVPSDMAFFQKFRKQSHSNETVDCKKKQEE")
+    assert aa.endswith("DGHYSDETVEEKHNREHRNKTKADNRTRRIAEIRRKHNINA")
+    assert len(aa) == 161
+
+
 @pytest.mark.slow
 def test_species_summary(installed):
     r = RUNNER.invoke(

diff --git a/tests/test_genome.py b/tests/test_genome.py
@@ -198,3 +198,10 @@ def test_get_ids_for_biotype_seqid(yeast):
         r.seqid for stable_id in stable_ids for r in yeast.get_features(name=stable_id)
     }
     assert got == seqids
+
+
+def test_get_celegans_cds(worm):
+    cds = next(iter(worm.get_cds(stable_id="WBGene00021347")))
+    seq = cds.get_slice()
+    aa = seq.get_translation()
+    assert aa == "MIIPIRCFTCGKVIGDKWETYLGFLQSEYSEGDALDALGLRRYCCRRMLLAHVDLIEKLLNYHPLEK"
diff --git a/tests/test_install.py b/tests/test_install.py
@@ -222,7 +222,7 @@ def test_get_limiting_exons_one_exon(one_exon):
     all_exons = eti_tables.get_all_limit_exons(one_exon)
     lex = eti_tables.get_limit_exons(all_exons[1664])
     assert lex.start_rank == lex.stop_rank == 1
-    assert lex.rel_start == 0
+    assert lex.rel_start == 1
     assert lex.rel_stop == 1
 
 
@@ -249,7 +249,7 @@ def test_get_limiting_exons_two_exons(two_exon):
     lex = eti_tables.get_limit_exons(all_exons[269944])
     assert lex.start_rank == 4
     assert lex.stop_rank == 12
-    assert lex.rel_start == 28
+    assert lex.rel_start == 29
     assert lex.rel_stop == 54
 
 
@@ -343,7 +343,7 @@ def same_tr_cds(four_exons):
         {
             "transcript_id": 11,
             "start_exon_id": 1,
-            "seq_start": 1,
+            "seq_start": 0,
             "end_exon_id": 4,
             "seq_end": 100,
             "stable_id": "a1",
@@ -371,7 +371,7 @@ def diff_tr_cds(four_exons):
         {
             "transcript_id": 11,
             "start_exon_id": 2,
-            "seq_start": 1,
+            "seq_start": 0,
             "end_exon_id": 3,
             "seq_end": 100,
         },
@@ -399,7 +399,7 @@ def tr_cds_rel_pos(four_exons):
         {
             "transcript_id": 11,
             "start_exon_id": 1,
-            "seq_start": 2,
+            "seq_start": 1,
             "end_exon_id": 4,
             "seq_end": 2,
         },
@@ -486,7 +486,7 @@ def test_rel_start_ends_2(tr_cds_rel_pos_minus):
     assert tr.start == 100
     assert tr.stop == 400
     assert numpy.array_equal(tr.transcript_spans, [(100, 200), (300, 400)])
-    assert numpy.array_equal(tr.cds_spans, [(200 - 10, 200), (300, 400 - 4)])
+    assert numpy.array_equal(tr.cds_spans, [(200 - 10, 200), (300, 400 - 5)])
 
 
 def test_no_cds_spans(four_exons):
@@ -609,7 +609,7 @@ def mixed_data():
         {
             "transcript_id": 12,
             "start_exon_id": 7,
-            "seq_start": 1,
+            "seq_start": 0,
             "end_exon_id": 5,
             "seq_end": 100,
             "stable_id": "pr-01",