From 0e330c0febc238cea4ffc6f5d7a7b2c34d502f9d Mon Sep 17 00:00:00 2001 From: Gavin Huttley Date: Fri, 17 Jan 2025 12:57:42 +1100 Subject: [PATCH 1/5] MAINT: avoid exceptions closing Hdf5 due to garbage collection --- src/ensembl_tui/_storage_mixin.py | 40 +++++++++++++++++-------------- 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/src/ensembl_tui/_storage_mixin.py b/src/ensembl_tui/_storage_mixin.py index fcb9072..2ad43e9 100644 --- a/src/ensembl_tui/_storage_mixin.py +++ b/src/ensembl_tui/_storage_mixin.py @@ -8,7 +8,7 @@ import duckdb import numpy import typing_extensions - +import h5py from ensembl_tui import _util as eti_util ReturnType = tuple[str, tuple] # the sql statement and corresponding values @@ -69,15 +69,16 @@ def _make_table_sql( class Hdf5Mixin(eti_util.SerialisableMixin): """HDF5 sequence data storage""" - _file = None - _is_open = False + _file: h5py.File | None = None + _is_open: bool = False + mode: str = "r" - def __getstate__(self): + def __getstate__(self) -> dict: if set(self.mode) & {"w", "a"}: raise NotImplementedError(f"pickling not supported for mode={self.mode!r}") - return self._init_vals.copy() + return self._init_vals.copy() # type: ignore - def __setstate__(self, state): + def __setstate__(self, state: dict) -> None: obj = self.__class__(**state) self.__dict__.update(obj.__dict__) # because we have a __del__ method, and self attributes point to @@ -86,23 +87,26 @@ def __setstate__(self, state): obj._is_open = False obj._file = None - def __del__(self): + def __del__(self) -> None: self.close() - def close(self): + def close(self) -> None: """closes the hdf5 file""" # hdf5 dumps content to stdout if resource already closed, so # we trap that here, and capture expected exceptions raised in the process - with open(os.devnull, "w") as devnull: - with ( - contextlib.redirect_stderr(devnull), - contextlib.redirect_stdout(devnull), - ): - with contextlib.suppress(ValueError, AttributeError): - if self._is_open: - self._file.flush() - - with contextlib.suppress(AttributeError): + if "open" not in locals(): + return + with ( + open(os.devnull, "w") as devnull, # noqa: PTH123 + contextlib.redirect_stderr(devnull), + contextlib.redirect_stdout(devnull), + ): + with contextlib.suppress(ValueError, AttributeError): + if self._is_open and self._file: + self._file.flush() + + with contextlib.suppress(AttributeError): + if self._file: self._file.close() self._is_open = False From bd010f191eb02227b6a38178ad7b16dbe5a57e80 Mon Sep 17 00:00:00 2001 From: Gavin Huttley Date: Fri, 17 Jan 2025 12:58:01 +1100 Subject: [PATCH 2/5] MAINT: migrate to cogent3 new_types --- src/ensembl_tui/_align.py | 45 +++++++++++------- src/ensembl_tui/_faster_fasta.py | 60 ----------------------- src/ensembl_tui/_genome.py | 81 +++++++++++++++----------------- src/ensembl_tui/_homology.py | 3 +- src/ensembl_tui/_ingest_align.py | 2 +- tests/test_align.py | 17 ++----- tests/test_genome.py | 23 --------- tests/test_installed.py | 2 +- 8 files changed, 72 insertions(+), 161 deletions(-) delete mode 100644 src/ensembl_tui/_faster_fasta.py diff --git a/src/ensembl_tui/_align.py b/src/ensembl_tui/_align.py index 58bbd4a..c57f98c 100644 --- a/src/ensembl_tui/_align.py +++ b/src/ensembl_tui/_align.py @@ -4,17 +4,20 @@ from collections import defaultdict from dataclasses import dataclass +import cogent3 import h5py import numpy import typing_extensions from cogent3.app.composable import define_app -from cogent3.core.alignment import Aligned, Alignment +from cogent3.core import new_alignment as c3_align from cogent3.core.location import _DEFAULT_GAP_DTYPE, IndelMap from ensembl_tui import _genome as eti_genome from ensembl_tui import _storage_mixin as eti_storage from ensembl_tui import _util as eti_util +DNA = cogent3.get_moltype("dna", new_type=True) + _no_gaps = numpy.array([], dtype=_DEFAULT_GAP_DTYPE) GAP_STORE_SUFFIX = "indels-hdf5_blosc2" @@ -288,18 +291,19 @@ def num_records(self) -> int: def get_alignment( align_db: AlignDb, - genomes: dict, + genomes: dict[str, eti_genome.Genome], ref_species: str, seqid: str, ref_start: int | None = None, ref_end: int | None = None, namer: typing.Callable | None = None, mask_features: list[str] | None = None, -) -> typing.Iterable[Alignment]: +) -> typing.Iterable[c3_align.Alignment]: """yields cogent3 Alignments""" if ref_species not in genomes: - raise ValueError(f"unknown species {ref_species!r}") + msg = f"unknown species {ref_species!r}" + raise ValueError(msg) align_records = align_db.get_records_matching( species=ref_species, @@ -322,7 +326,7 @@ def get_alignment( genome_start = align_record.start genome_end = align_record.stop gap_pos, gap_lengths = align_record.gap_data - gaps = IndelMap( + imap = IndelMap( gap_pos=gap_pos, gap_lengths=gap_lengths, parent_length=genome_end - genome_start, @@ -346,14 +350,15 @@ def get_alignment( seq_start = seq_start - genome_start seq_end = seq_end - genome_start - align_start = gaps.get_align_index(seq_start) - align_end = gaps.get_align_index(seq_end) + align_start = imap.get_align_index(seq_start) + align_end = imap.get_align_index(seq_end) break else: msg = f"no matching alignment record for {ref_species!r}" raise ValueError(msg) seqs = {} + gaps = {} for align_record in block: record_species = align_record.species genome = genomes[record_species] @@ -362,7 +367,7 @@ def get_alignment( genome_start = align_record.start genome_end = align_record.stop gap_pos, gap_lengths = align_record.gap_data - gaps = IndelMap( + imap = IndelMap( gap_pos=gap_pos, gap_lengths=gap_lengths, parent_length=genome_end - genome_start, @@ -370,12 +375,12 @@ def get_alignment( # We use the alignment indices derived for the reference sequence # above - seq_start = gaps.get_seq_index(align_start) - seq_end = gaps.get_seq_index(align_end) + seq_start = imap.get_seq_index(align_start) + seq_end = imap.get_seq_index(align_end) seq_length = seq_end - seq_start if align_record.strand == "-": # if it's neg strand, the alignment start is the genome stop - seq_start = gaps.parent_length - seq_end + seq_start = imap.parent_length - seq_end s = genome.get_seq( seqid=align_record.seqid, @@ -385,7 +390,7 @@ def get_alignment( with_annotations=False, ) # we now trim the gaps for this sequence to the sub-alignment - gaps = gaps[align_start:align_end] + imap = imap[align_start:align_end] if align_record.strand == "-": s = s.rc() @@ -394,13 +399,17 @@ def get_alignment( strand_symbol = -1 if align_record.strand == "-" else 1 s.name = f"{s.name}:{strand_symbol}" - aligned = Aligned(gaps, s) - if aligned.name not in seqs: - seqs[aligned.name] = aligned - elif str(aligned) == str(seqs[aligned.name]): + if s.name in seqs: print(f"duplicated {s.name}") + seqs[s.name] = numpy.array(s) + gaps[s.name] = imap.array - aln = Alignment(list(seqs.values())) + aln_data = c3_align.AlignedSeqsData.from_seqs_and_gaps( + seqs=seqs, + gaps=gaps, + alphabet=DNA.most_degen_alphabet(), + ) + aln = c3_align.Alignment(seqs_data=aln_data, moltype=DNA) aln.annotation_db = genome.annotation_db if mask_features: aln = aln.with_masked_annotations(biotypes=mask_features) @@ -428,7 +437,7 @@ def __init__( self._mask_features = mask_features self._sep = sep - def main(self, segment: eti_genome.genome_segment) -> list[Alignment]: + def main(self, segment: eti_genome.genome_segment) -> list[c3_align.Alignment]: results = [] for aln in get_alignment( self._align_db, diff --git a/src/ensembl_tui/_faster_fasta.py b/src/ensembl_tui/_faster_fasta.py deleted file mode 100644 index 8944994..0000000 --- a/src/ensembl_tui/_faster_fasta.py +++ /dev/null @@ -1,60 +0,0 @@ -import pathlib -import typing - -import numpy -from cogent3 import get_moltype, open_ - -# old style moltype -alphabet = get_moltype("dna").alphabets.degen_gapped - - -class converter: - """Defines a linear mapping from provided characters to uint8. - The resulting object is callable, taking a bytes object and returning a - numpy array.""" - - def __init__(self, dtype=numpy.uint8): - self._tr = b"".maketrans( - "".join(alphabet).encode("utf8"), - bytes(bytearray(range(len(alphabet)))), - ) - self.dtype = dtype - - def __call__(self, seq: bytes) -> numpy.ndarray: - b = seq.translate(self._tr, delete=b" \n\r") - return numpy.array(memoryview(b), dtype=self.dtype) - - -bytes_to_array = converter() - - -def quicka_parser( - path: pathlib.Path, - converter: typing.Callable[[bytes], bytes] = bytes_to_array, -): - """generator returning sequence labels and sequences converted bytes from a fasta file - - Parameters - ---------- - path - location of the fasta file - converter - a callable that uses converts sequence characters into nominated bytes, - deleting unwanted characters. Must handle newlines. Whatever type this - callable returns will be the type of the sequence returned. - - Returns - ------- - the sequence label as a string and the sequence as transformed by converter - """ - with open_(path, mode="rb") as infile: - data: bytes = infile.read() - - records = data.split(b">") - for record in records: - eol = record.find(b"\n") - if eol == -1: - continue - label = record[:eol].strip().decode("utf8") - seq = converter(record[eol + 1 :]) - yield label, seq diff --git a/src/ensembl_tui/_genome.py b/src/ensembl_tui/_genome.py index df10d85..eb477d8 100644 --- a/src/ensembl_tui/_genome.py +++ b/src/ensembl_tui/_genome.py @@ -1,23 +1,25 @@ import dataclasses import functools import pathlib -import re import sys import typing import uuid from abc import ABC, abstractmethod +from collections.abc import Callable from typing import Any +import cogent3 import h5py import numpy -from cogent3 import get_moltype, make_seq from cogent3.app.composable import define_app +from cogent3.core import new_alphabet from cogent3.core.annotation import Feature from cogent3.core.annotation_db import ( OptionalInt, OptionalStr, ) -from cogent3.core.sequence import Sequence +from cogent3.core.new_sequence import Sequence +from cogent3.parse.fasta import iter_fasta_records from cogent3.util.table import Table from numpy.typing import NDArray @@ -26,19 +28,16 @@ from ensembl_tui import _species as eti_species from ensembl_tui import _storage_mixin as eti_storage from ensembl_tui import _util as eti_util -from ensembl_tui._faster_fasta import quicka_parser SEQ_STORE_NAME = "genome.seqs-hdf5_blosc2" -_typed_id = re.compile( - r"\b[a-z]+:", - flags=re.IGNORECASE, -) # ensembl stableid's prefixed by the type -_feature_id = re.compile(r"(?<=\bID=)[^;]+") -_exon_id = re.compile(r"(?<=\bexon_id=)[^;]+") -_parent_id = re.compile(r"(?<=\bParent=)[^;]+") -_symbol = re.compile(r"(?<=\bName=)[^;]+") -_description = re.compile(r"(?<=\bdescription=)[^;]+") +DNA = cogent3.get_moltype("dna", new_type=True) +alphabet = DNA.most_degen_alphabet() +bytes_to_array = new_alphabet.bytes_to_array( + chars=alphabet.as_bytes(), + dtype=numpy.uint8, + delete=b" \n\r\t", +) def _rename(label: str) -> str: @@ -47,7 +46,11 @@ def _rename(label: str) -> str: @define_app class fasta_to_hdf5: # noqa: N801 - def __init__(self, config: eti_config.Config, label_to_name=_rename) -> None: + def __init__( + self, + config: eti_config.Config, + label_to_name: Callable[[str], str] = _rename, + ) -> None: self.config = config self.label_to_name = label_to_name @@ -63,8 +66,11 @@ def main(self, db_name: str) -> bool: src_dir = src_dir / "fasta" for path in src_dir.glob("*.fa.gz"): - for label, seq in quicka_parser(path): - seqid = self.label_to_name(label) + for seqid, seq in iter_fasta_records( + path, + converter=bytes_to_array, + label_to_name=self.label_to_name, + ): seq_store.add_record(seq, seqid) del seq @@ -125,21 +131,15 @@ class str2arr: # noqa: N801 """convert string to array of uint8""" def __init__(self, moltype: str = "dna", max_length: int | None = None) -> None: - moltype = get_moltype(moltype) - self.canonical = "".join(moltype) + mt = cogent3.get_moltype(moltype, new_type=True) + self.alphabet = mt.most_degen_alphabet() self.max_length = max_length - extended = "".join(list(moltype.alphabets.degen)) - self.translation = b"".maketrans( - extended.encode("utf8"), - "".join(chr(i) for i in range(len(extended))).encode("utf8"), - ) def main(self, data: str) -> numpy.ndarray: if self.max_length: data = data[: self.max_length] - b = data.encode("utf8").translate(self.translation) - return numpy.array(memoryview(b), dtype=numpy.uint8) + return self.alphabet.to_indices(data) @define_app @@ -147,21 +147,14 @@ class arr2str: # noqa: N801 """convert array of uint8 to str""" def __init__(self, moltype: str = "dna", max_length: int | None = None) -> None: - moltype = get_moltype(moltype) - self.canonical = "".join(moltype) + mt = cogent3.get_moltype(moltype, new_type=True) + self.alphabet = mt.most_degen_alphabet() self.max_length = max_length - extended = "".join(list(moltype.alphabets.degen)) - self.translation = b"".maketrans( - "".join(chr(i) for i in range(len(extended))).encode("utf8"), - extended.encode("utf8"), - ) def main(self, data: numpy.ndarray) -> str: if self.max_length: data = data[: self.max_length] - - b = data.tobytes().translate(self.translation) - return bytearray(b).decode("utf8") + return self.alphabet.from_indices(data) @dataclasses.dataclass @@ -326,7 +319,7 @@ def get_seq( stop: int | None = None, namer: typing.Callable | None = None, with_annotations: bool = True, - ) -> str: + ) -> Sequence: """returns annotated sequence Parameters @@ -350,20 +343,20 @@ def get_seq( ----- Full annotations are bound to the instance. """ - seq = self._seqs.get_seq_str(seqid=seqid, start=start, stop=stop) + seq = self._seqs.get_seq_arr(seqid=seqid, start=start, stop=stop) if namer: name = namer(self.species, seqid, start, stop) else: name = f"{self.species}:{seqid}:{start}-{stop}" # we use seqid to make the sequence here because that identifies the # parent seq identity, required for querying annotations - try: - seq = make_seq(seq, name=seqid, moltype="dna", annotation_offset=start or 0) - except TypeError: - # older version of cogent3 - seq = make_seq(seq, name=seqid, moltype="dna") - seq.annotation_offset = start or 0 - + seq = cogent3.make_seq( + seq, + name=seqid, + moltype="dna", + annotation_offset=start or 0, + new_type=True, + ) seq.name = name seq.annotation_db = self.annotation_db if with_annotations else None return seq diff --git a/src/ensembl_tui/_homology.py b/src/ensembl_tui/_homology.py index f46c158..e748810 100644 --- a/src/ensembl_tui/_homology.py +++ b/src/ensembl_tui/_homology.py @@ -228,5 +228,6 @@ def main(self, homologs: homolog_group) -> SeqsCollectionType: return make_unaligned_seqs( data=seqs, moltype="dna", - info=dict(source=homologs.source), + info={"source": homologs.source}, + new_type=True, ) diff --git a/src/ensembl_tui/_ingest_align.py b/src/ensembl_tui/_ingest_align.py index 62c53b7..76e9218 100644 --- a/src/ensembl_tui/_ingest_align.py +++ b/src/ensembl_tui/_ingest_align.py @@ -18,7 +18,7 @@ def seq2gaps(record: dict) -> eti_align.AlignRecord: - seq = make_seq(record.pop("seq")) + seq = make_seq(record.pop("seq"), new_type=True, moltype="dna") indel_map, _ = seq.parse_out_gaps() if indel_map.num_gaps: record["gap_spans"] = numpy.array( diff --git a/tests/test_align.py b/tests/test_align.py index 7bec464..0b0a05c 100644 --- a/tests/test_align.py +++ b/tests/test_align.py @@ -88,6 +88,7 @@ def small_seqs(): moltype="dna", array_align=False, info={"species": {"s1": "human", "s2": "mouse", "s3": "dog"}}, + new_type=True, ) @@ -96,8 +97,8 @@ def make_records(start, end, block_id): records = [] species = aln.info.species for seq in aln.seqs: - seqid, seq_start, seq_end, seq_strand = seq.data.parent_coordinates() - gs = seq.get_gapped_seq() + seqid, seq_start, seq_end, seq_strand = seq.parent_coordinates(seq_coords=True) + gs = seq.gapped_seq imap, s = gs.parse_out_gaps() if imap.num_gaps: gap_spans = numpy.array( @@ -156,16 +157,6 @@ def test_aligndb_records_skip_duplicated_block_ids(small_records): assert agg.sql(sql).fetchone()[0] == count -def _find_nth_gap_index(data: str, n: int) -> int: - num = -1 - for i, c in enumerate(data): - if c == "-": - num += 1 - if num == n: - return i - raise ValueError(f"{data=}, {n=}") - - # fixture to make synthetic GenomeSeqsDb and alignment db # based on a given alignment @pytest.fixture @@ -233,7 +224,7 @@ def make_sample(two_aligns=False): genomes = {} for seq in aln.seqs: name = seq.name - seq = seq.data.degap() + seq = seq.seq if seq.name == "s2": seq = seq.rc() s2_genome = str(seq) diff --git a/tests/test_genome.py b/tests/test_genome.py index de583df..98bc1f6 100644 --- a/tests/test_genome.py +++ b/tests/test_genome.py @@ -1,5 +1,3 @@ -import pathlib - import pytest from ensembl_tui import _genome as eti_genome @@ -21,7 +19,6 @@ def test_get_seq_feature_seq_correct(yeast): assert raw_seq.endswith("ATAAGAAATTCCATAAATAG") -@pytest.mark.xfail(reason="Need to fix cogent3 to apply feature name to new sequence") def test_get_seq_feature_seq_correct_name(yeast): # need to modify cogent3 so it applies the feature name # to the new sequence @@ -124,26 +121,6 @@ def test_get_feature_child_parent(worm_db): assert got.stable_id == gene.stable_id -@pytest.fixture(params=("\n", "\r\n")) -def fasta_data(DATA_DIR, tmp_path, request): - data = pathlib.Path(DATA_DIR / "c_elegans_WS199_shortened.fasta").read_text() - outpath = tmp_path / "demo.fasta" - outpath.write_text(data, newline=request.param) - return outpath - - -def test_faster_fasta(fasta_data): - from cogent3.parse.fasta import MinimalFastaParser - - from ensembl_tui._faster_fasta import bytes_to_array, quicka_parser - - expect = { - n: bytes_to_array(s.encode("utf8")) for n, s in MinimalFastaParser(fasta_data) - } - got = dict(quicka_parser(fasta_data)) - assert (got["I"] == expect["I"]).all() - - def test_genome_segment(): segment = eti_genome.genome_segment( species="abcd_efg", diff --git a/tests/test_installed.py b/tests/test_installed.py index ce8bc27..1d55151 100644 --- a/tests/test_installed.py +++ b/tests/test_installed.py @@ -36,7 +36,7 @@ def test_get_genes(small_install_path): ), ) # we check we can make a aa seq which has the correct start and end - aa = str(gene.trim_stop_codon().get_translation(incomplete_ok=True)) + aa = str(gene.get_translation(trim_stop=True, incomplete_ok=True)) # expected values from ensembl.org assert aa.startswith("MTNSSEFTDVLQS") assert aa.endswith("TIMNRINYKLQ") From 9e88c1f26499ddcda4eec0ea687e0b3ae4b15340 Mon Sep 17 00:00:00 2001 From: Gavin Huttley Date: Fri, 17 Jan 2025 13:03:26 +1100 Subject: [PATCH 3/5] DEV: updated cogent3 minimum version --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index c3397cd..9f52cd0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,7 @@ license = { file = "LICENSE" } requires-python = ">=3.10,<3.13" dependencies = ["blosc2", "click", - "cogent3", + "cogent3>=2024.12.19a2", "duckdb", "h5py", "hdf5plugin", From 19a81283bc5de09cf7145bc117740ff9b2ff4c54 Mon Sep 17 00:00:00 2001 From: GavinHuttley Date: Fri, 17 Jan 2025 02:04:29 +0000 Subject: [PATCH 4/5] STY: pre-commit linting with ruff --- src/ensembl_tui/_storage_mixin.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/ensembl_tui/_storage_mixin.py b/src/ensembl_tui/_storage_mixin.py index 2ad43e9..fd5e89d 100644 --- a/src/ensembl_tui/_storage_mixin.py +++ b/src/ensembl_tui/_storage_mixin.py @@ -6,9 +6,10 @@ import pathlib import duckdb +import h5py import numpy import typing_extensions -import h5py + from ensembl_tui import _util as eti_util ReturnType = tuple[str, tuple] # the sql statement and corresponding values @@ -76,7 +77,7 @@ class Hdf5Mixin(eti_util.SerialisableMixin): def __getstate__(self) -> dict: if set(self.mode) & {"w", "a"}: raise NotImplementedError(f"pickling not supported for mode={self.mode!r}") - return self._init_vals.copy() # type: ignore + return self._init_vals.copy() # type: ignore def __setstate__(self, state: dict) -> None: obj = self.__class__(**state) From 3818b4a0e20ea0b6fb920cce338d4f590cbe1707 Mon Sep 17 00:00:00 2001 From: Gavin Huttley Date: Fri, 17 Jan 2025 13:38:46 +1100 Subject: [PATCH 5/5] TST: added test fo closing AlignDb [CHANGED] remove effort to better handle garbage collection error as it was invalid --- src/ensembl_tui/_align.py | 6 ++++++ src/ensembl_tui/_storage_mixin.py | 2 -- tests/test_align.py | 6 ++++++ 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/src/ensembl_tui/_align.py b/src/ensembl_tui/_align.py index c57f98c..e4f397f 100644 --- a/src/ensembl_tui/_align.py +++ b/src/ensembl_tui/_align.py @@ -288,6 +288,12 @@ def get_distinct(self, field: str) -> list[str]: def num_records(self) -> int: return self.conn.sql(f"SELECT COUNT(*) from {self._tables[0]}").fetchone()[0] + def close(self) -> None: + """closes duckdb and h5py storage""" + if self.gap_store: + self.gap_store.close() + self.conn.close() + def get_alignment( align_db: AlignDb, diff --git a/src/ensembl_tui/_storage_mixin.py b/src/ensembl_tui/_storage_mixin.py index fd5e89d..7952060 100644 --- a/src/ensembl_tui/_storage_mixin.py +++ b/src/ensembl_tui/_storage_mixin.py @@ -95,8 +95,6 @@ def close(self) -> None: """closes the hdf5 file""" # hdf5 dumps content to stdout if resource already closed, so # we trap that here, and capture expected exceptions raised in the process - if "open" not in locals(): - return with ( open(os.devnull, "w") as devnull, # noqa: PTH123 contextlib.redirect_stderr(devnull), diff --git a/tests/test_align.py b/tests/test_align.py index 0b0a05c..9ef03a8 100644 --- a/tests/test_align.py +++ b/tests/test_align.py @@ -590,3 +590,9 @@ def test_aligndb_post_init_failure(tmp_path): outfile.write_text("blah") with pytest.raises(OSError): eti_align.AlignDb(source="/non/existent/directory") + + +def test_aligndb_close(db_align): + db_align.close() + with pytest.raises(duckdb.duckdb.ConnectionException): + db_align.num_records()