Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: fix CDS span calculations #196

Merged
merged 10 commits into from
Feb 27, 2025
8 changes: 4 additions & 4 deletions .github/workflows/testing_develop.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ jobs:
strategy:
matrix:
os: [ubuntu-latest, macos-latest, windows-latest]
python-version: ["3.10", "3.11", "3.12"]
python-version: ["3.10", "3.11", "3.12", "3.13"]

steps:
- uses: "actions/checkout@v4"
Expand All @@ -28,9 +28,9 @@ jobs:
uses: actions/cache@v3
with:
path: tests/data
key: ${{ runner.os }}-data-v1-${{ hashFiles('tests/data/small-113.zip') }}
key: ${{ runner.os }}-data-v2-${{ hashFiles('tests/data/small-113.zip') }}
restore-keys: |
${{ runner.os }}-data-v1-
${{ runner.os }}-data-v2-

- name: Check cache status
run: |
Expand All @@ -39,7 +39,7 @@ jobs:
- name: Download data file
if: steps.cache-data.outputs.cache-hit != 'true'
run: |
curl -o tests/data/small-113.zip https://zenodo.org/records/14625203/files/small-113.zip
curl -L -o tests/data/small-113.zip "https://www.dropbox.com/scl/fi/pfmwzz96gusdeqi0a9wax/small-113.zip?rlkey=r60l1eq9jk6p440tkqslmqihi&st=ud49fits&dl=1"

- name: Unzip data file
run: |
Expand Down
46 changes: 20 additions & 26 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ maintainers = [
keywords = ["biology", "genomics", "evolution", "bioinformatics"]
readme = "README.md"
license = { file = "LICENSE" }
requires-python = ">=3.10,<3.13"
requires-python = ">=3.10,<3.14"
dependencies = ["blosc2",
"click",
"cogent3>=2024.12.19a2",
Expand All @@ -44,6 +44,7 @@ classifiers = [
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: 3.13",
]
dynamic = ["version", "description"]

Expand All @@ -68,40 +69,33 @@ test = [
doc = ["click",
"sphinx",
"sphinx-autobuild",
"sphinx>=1.6",
"sphinx_book_theme",
"sphinx_design",
"sphinxcontrib-bibtex"]
"sphinxcontrib-bibtex",
"ipykernel",
"ipython",
"ipywidgets",
"jupyter-sphinx",
"jupyter_client",
"jupyterlab",
"jupytext",
"kaleido",
"nbconvert>5.4",
"nbformat",
"nbsphinx",
"pillow",
"plotly",
]
dev = ["click",
"cogapp",
"flit",
"ipykernel",
"ipython",
"ipywidgets",
"jupyter-sphinx",
"jupyter_client",
"jupyterlab",
"jupytext",
"kaleido",
"nbconvert>5.4",
"nbformat",
"nbsphinx",
"nox",
"numpydoc",
"pandas",
"pillow",
"plotly",
"psutil",
"pytest",
"pytest-cov",
"pytest-xdist",
"ruff==0.9.1",
"scriv",
"sphinx",
"sphinx-autobuild",
"sphinx_book_theme",
"sphinx_design",
"sphinxcontrib-bibtex"]
"ensembl_tui[doc]",
"ensembl_tui[test]",
]

[tool.flit.sdist]
include = ["src/*", "tests/*", "pyproject.toml"]
Expand Down
2 changes: 1 addition & 1 deletion src/ensembl_tui/_annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -286,7 +286,7 @@ def get_features_matching(
symbol: OptStr = None,
description: OptStr = None,
**kwargs, # noqa: ANN003
) -> typing.Iterator[FeatureDataBase]:
) -> typing.Iterator[GeneData]:
# add supoport for querying by symbol and description
stable_id = stable_id or kwargs.pop("name", None)
limit = kwargs.pop("limit", None)
Expand Down
32 changes: 28 additions & 4 deletions src/ensembl_tui/_mysql_core_attr.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,10 @@ class LimitExons:
strand: int
transcript_id: int

@property
def single_exon(self) -> bool:
return self.start_rank == self.stop_rank


def get_all_limit_exons(
conn: duckdb.DuckDBPyConnection,
Expand Down Expand Up @@ -182,8 +186,7 @@ def get_limit_exons(records: list[tuple[int, ...]]) -> LimitExons:
return LimitExons(
start_rank=start_rank,
stop_rank=end_rank,
# subtract 1 to convert to 0-based
rel_start=rel_start - 1,
rel_start=rel_start,
rel_stop=rel_end,
strand=strand,
transcript_id=transcript_id,
Expand Down Expand Up @@ -269,6 +272,8 @@ def get_transcript_attr_records(
for i, rank in enumerate(ranks):
transcript_spans[rank - 1] = (starts[i], stops[i])

cds_spans = transcript_spans.copy()
transcript_spans = transcript_spans[numpy.lexsort(transcript_spans.T), :]
if transcript_id not in limit_exons:
# no translated exons
yield TranscriptAttrRecord(
Expand All @@ -291,7 +296,27 @@ def get_transcript_attr_records(
# 5' end of an exon
# so the start_exon coords become (exon_start + rel_start, exon_end)
# the end_exon coords become (exon_start, exon_start + rel_stop)
cds_spans = transcript_spans.copy()
if lex.single_exon:
ex_start = cds_spans[0][0] if lex.strand == 1 else cds_spans[0][1]
if lex.strand == 1:
start, stop = ex_start + lex.rel_start, ex_start + lex.rel_stop
else:
start, stop = ex_start - lex.rel_stop, ex_start - lex.rel_start

cds_spans[0, :] = start, stop

yield TranscriptAttrRecord(
seqid=seqid,
transcript_id=transcript_id,
gene_id=gene_id,
strand=strand,
transcript_spans=transcript_spans,
cds_spans=cds_spans,
transcript_stable_id=transcript_stable_id,
cds_stable_id=cds_stable_id,
)
continue

start_exon_coords = cds_spans[start_index]
stop_exon_coords = cds_spans[stop_index]
if lex.strand == 1:
Expand Down Expand Up @@ -321,7 +346,6 @@ def get_transcript_attr_records(
# sort all spans in ascending numerical order
# note that the lexsort returns the sorted indices
cds_spans = cds_spans[numpy.lexsort(cds_spans.T), :]
transcript_spans = transcript_spans[numpy.lexsort(transcript_spans.T), :]

yield TranscriptAttrRecord(
seqid=seqid,
Expand Down
2 changes: 1 addition & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ def namer():
return name_as_seqid


TEST_DATA_URL = "https://zenodo.org/records/14625203/files/small-113.zip"
TEST_DATA_URL = "https://www.dropbox.com/scl/fi/pfmwzz96gusdeqi0a9wax/small-113.zip?rlkey=r60l1eq9jk6p440tkqslmqihi&st=ud49fits&dl=1"
SMALL_DATA_DIRNAME = "small-113"


Expand Down
64 changes: 64 additions & 0 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,70 @@ def test_installed(installed):
assert len(list(path.glob("*attr.parquet"))) == 2


@pytest.mark.slow
def test_check_one_cds_seq(installed):
# checking a single exon sequence with a rel_start > 0
from ensembl_tui import _genome as eti_genome

config = eti_config.read_installed_cfg(installed)
genome = eti_genome.load_genome(
config=config,
species="saccharomyces_cerevisiae",
)
cds = next(iter(genome.get_cds(stable_id="YMR242C")))
seq = cds.get_slice()
expect = (
"GCTCACTTTAAAGAATACCAAGTTATTGGCCGTCGTTTGCCAACTGAATCTGTTCCAGAA"
"CCAAAGTTGTTCAGAATGAGAATCTTTGCTTCAAATGAAGTTATTGCCAAGTCTCGTTAC"
"TGGTATTTCTTGCAAAAGTTGCACAAGGTTAAGAAGGCTTCTGGTGAAATTGTTTCCATC"
"AACCAAATCAACGAAGCTCATCCAACCAAGGTCAAGAACTTCGGTGTCTGGGTTAGATAC"
"GACTCCAGATCTGGTACTCACAATATGTACAAGGAAATCAGAGACGTCTCCAGAGTTGCT"
"GCCGTCGAAACCTTATACCAAGACATGGCTGCCAGACACAGAGCTAGATTTAGATCTATT"
"CACATCTTGAAGGTTGCTGAAATTGAAAAGACTGCTGACGTCAAGAGACAATACGTTAAG"
"CAATTTTTGACCAAGGACTTGAAATTCCCATTGCCTCACAGAGTCCAAAAATCCACCAAG"
"ACTTTCTCCTACAAGAGACCTTCCACTTTCTACTGA"
)
assert str(cds.get_slice()) == expect


@pytest.mark.slow
def test_check_multi_exon_cds_seq_plus_strand(installed):
# checking a multi exon sequence with a rel_start > 0
# and rel_end != exon length
from ensembl_tui import _genome as eti_genome

config = eti_config.read_installed_cfg(installed)
genome = eti_genome.load_genome(
config=config,
species="caenorhabditis_elegans",
)
cds = next(iter(genome.get_cds(stable_id="WBGene00185002")))
aa = str(cds.get_slice().get_translation())
# seq expected values from ensembl
assert aa.startswith("MEMEDIDDDITVFYTDDRGTVQGPYGASTVLDWYQKGYFSDNHQMRFTDNGQRIGNLFTY")
assert aa.endswith("IEKVKTNCRDAPSPLPPAMDPVAPYHVRDKCTQS")
assert len(aa) == 274


@pytest.mark.slow
def test_check_two_exon_cds_seq_rev_strand(installed):
# checking a two exon sequence with a rel_start > 0
# and rel_end != exon length
from ensembl_tui import _genome as eti_genome

config = eti_config.read_installed_cfg(installed)
genome = eti_genome.load_genome(
config=config,
species="caenorhabditis_elegans",
)
cds = next(iter(genome.get_cds(stable_id="WBGene00184990")))
aa = str(cds.get_slice().get_translation())
# seq expected values from ensembl
assert aa.startswith("MSGVYNNSGSRMRSKNFEKHQVPSDMAFFQKFRKQSHSNETVDCKKKQEE")
assert aa.endswith("DGHYSDETVEEKHNREHRNKTKADNRTRRIAEIRRKHNINA")
assert len(aa) == 161


@pytest.mark.slow
def test_species_summary(installed):
r = RUNNER.invoke(
Expand Down
7 changes: 7 additions & 0 deletions tests/test_genome.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,3 +198,10 @@ def test_get_ids_for_biotype_seqid(yeast):
r.seqid for stable_id in stable_ids for r in yeast.get_features(name=stable_id)
}
assert got == seqids


def test_get_celegans_cds(worm):
cds = next(iter(worm.get_cds(stable_id="WBGene00021347")))
seq = cds.get_slice()
aa = seq.get_translation()
assert aa == "MIIPIRCFTCGKVIGDKWETYLGFLQSEYSEGDALDALGLRRYCCRRMLLAHVDLIEKLLNYHPLEK"
14 changes: 7 additions & 7 deletions tests/test_install.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,7 @@ def test_get_limiting_exons_one_exon(one_exon):
all_exons = eti_tables.get_all_limit_exons(one_exon)
lex = eti_tables.get_limit_exons(all_exons[1664])
assert lex.start_rank == lex.stop_rank == 1
assert lex.rel_start == 0
assert lex.rel_start == 1
assert lex.rel_stop == 1


Expand All @@ -249,7 +249,7 @@ def test_get_limiting_exons_two_exons(two_exon):
lex = eti_tables.get_limit_exons(all_exons[269944])
assert lex.start_rank == 4
assert lex.stop_rank == 12
assert lex.rel_start == 28
assert lex.rel_start == 29
assert lex.rel_stop == 54


Expand Down Expand Up @@ -343,7 +343,7 @@ def same_tr_cds(four_exons):
{
"transcript_id": 11,
"start_exon_id": 1,
"seq_start": 1,
"seq_start": 0,
"end_exon_id": 4,
"seq_end": 100,
"stable_id": "a1",
Expand Down Expand Up @@ -371,7 +371,7 @@ def diff_tr_cds(four_exons):
{
"transcript_id": 11,
"start_exon_id": 2,
"seq_start": 1,
"seq_start": 0,
"end_exon_id": 3,
"seq_end": 100,
},
Expand Down Expand Up @@ -399,7 +399,7 @@ def tr_cds_rel_pos(four_exons):
{
"transcript_id": 11,
"start_exon_id": 1,
"seq_start": 2,
"seq_start": 1,
"end_exon_id": 4,
"seq_end": 2,
},
Expand Down Expand Up @@ -486,7 +486,7 @@ def test_rel_start_ends_2(tr_cds_rel_pos_minus):
assert tr.start == 100
assert tr.stop == 400
assert numpy.array_equal(tr.transcript_spans, [(100, 200), (300, 400)])
assert numpy.array_equal(tr.cds_spans, [(200 - 10, 200), (300, 400 - 4)])
assert numpy.array_equal(tr.cds_spans, [(200 - 10, 200), (300, 400 - 5)])


def test_no_cds_spans(four_exons):
Expand Down Expand Up @@ -609,7 +609,7 @@ def mixed_data():
{
"transcript_id": 12,
"start_exon_id": 7,
"seq_start": 1,
"seq_start": 0,
"end_exon_id": 5,
"seq_end": 100,
"stable_id": "pr-01",
Expand Down
Loading