From 0e330c0febc238cea4ffc6f5d7a7b2c34d502f9d Mon Sep 17 00:00:00 2001
From: Gavin Huttley <Gavin.Huttley@anu.edu.au>
Date: Fri, 17 Jan 2025 12:57:42 +1100
Subject: [PATCH 1/5] MAINT: avoid exceptions closing Hdf5 due to garbage
 collection

---
 src/ensembl_tui/_storage_mixin.py | 40 +++++++++++++++++--------------
 1 file changed, 22 insertions(+), 18 deletions(-)

diff --git a/src/ensembl_tui/_storage_mixin.py b/src/ensembl_tui/_storage_mixin.py
index fcb9072..2ad43e9 100644
--- a/src/ensembl_tui/_storage_mixin.py
+++ b/src/ensembl_tui/_storage_mixin.py
@@ -8,7 +8,7 @@
 import duckdb
 import numpy
 import typing_extensions
-
+import h5py
 from ensembl_tui import _util as eti_util
 
 ReturnType = tuple[str, tuple]  # the sql statement and corresponding values
@@ -69,15 +69,16 @@ def _make_table_sql(
 class Hdf5Mixin(eti_util.SerialisableMixin):
     """HDF5 sequence data storage"""
 
-    _file = None
-    _is_open = False
+    _file: h5py.File | None = None
+    _is_open: bool = False
+    mode: str = "r"
 
-    def __getstate__(self):
+    def __getstate__(self) -> dict:
         if set(self.mode) & {"w", "a"}:
             raise NotImplementedError(f"pickling not supported for mode={self.mode!r}")
-        return self._init_vals.copy()
+        return self._init_vals.copy() # type: ignore
 
-    def __setstate__(self, state):
+    def __setstate__(self, state: dict) -> None:
         obj = self.__class__(**state)
         self.__dict__.update(obj.__dict__)
         # because we have a __del__ method, and self attributes point to
@@ -86,23 +87,26 @@ def __setstate__(self, state):
         obj._is_open = False
         obj._file = None
 
-    def __del__(self):
+    def __del__(self) -> None:
         self.close()
 
-    def close(self):
+    def close(self) -> None:
         """closes the hdf5 file"""
         # hdf5 dumps content to stdout if resource already closed, so
         # we trap that here, and capture expected exceptions raised in the process
-        with open(os.devnull, "w") as devnull:
-            with (
-                contextlib.redirect_stderr(devnull),
-                contextlib.redirect_stdout(devnull),
-            ):
-                with contextlib.suppress(ValueError, AttributeError):
-                    if self._is_open:
-                        self._file.flush()
-
-                with contextlib.suppress(AttributeError):
+        if "open" not in locals():
+            return
+        with (
+            open(os.devnull, "w") as devnull,  # noqa: PTH123
+            contextlib.redirect_stderr(devnull),
+            contextlib.redirect_stdout(devnull),
+        ):
+            with contextlib.suppress(ValueError, AttributeError):
+                if self._is_open and self._file:
+                    self._file.flush()
+
+            with contextlib.suppress(AttributeError):
+                if self._file:
                     self._file.close()
 
         self._is_open = False

From bd010f191eb02227b6a38178ad7b16dbe5a57e80 Mon Sep 17 00:00:00 2001
From: Gavin Huttley <Gavin.Huttley@anu.edu.au>
Date: Fri, 17 Jan 2025 12:58:01 +1100
Subject: [PATCH 2/5] MAINT: migrate to cogent3 new_types

---
 src/ensembl_tui/_align.py        | 45 +++++++++++-------
 src/ensembl_tui/_faster_fasta.py | 60 -----------------------
 src/ensembl_tui/_genome.py       | 81 +++++++++++++++-----------------
 src/ensembl_tui/_homology.py     |  3 +-
 src/ensembl_tui/_ingest_align.py |  2 +-
 tests/test_align.py              | 17 ++-----
 tests/test_genome.py             | 23 ---------
 tests/test_installed.py          |  2 +-
 8 files changed, 72 insertions(+), 161 deletions(-)
 delete mode 100644 src/ensembl_tui/_faster_fasta.py

diff --git a/src/ensembl_tui/_align.py b/src/ensembl_tui/_align.py
index 58bbd4a..c57f98c 100644
--- a/src/ensembl_tui/_align.py
+++ b/src/ensembl_tui/_align.py
@@ -4,17 +4,20 @@
 from collections import defaultdict
 from dataclasses import dataclass
 
+import cogent3
 import h5py
 import numpy
 import typing_extensions
 from cogent3.app.composable import define_app
-from cogent3.core.alignment import Aligned, Alignment
+from cogent3.core import new_alignment as c3_align
 from cogent3.core.location import _DEFAULT_GAP_DTYPE, IndelMap
 
 from ensembl_tui import _genome as eti_genome
 from ensembl_tui import _storage_mixin as eti_storage
 from ensembl_tui import _util as eti_util
 
+DNA = cogent3.get_moltype("dna", new_type=True)
+
 _no_gaps = numpy.array([], dtype=_DEFAULT_GAP_DTYPE)
 
 GAP_STORE_SUFFIX = "indels-hdf5_blosc2"
@@ -288,18 +291,19 @@ def num_records(self) -> int:
 
 def get_alignment(
     align_db: AlignDb,
-    genomes: dict,
+    genomes: dict[str, eti_genome.Genome],
     ref_species: str,
     seqid: str,
     ref_start: int | None = None,
     ref_end: int | None = None,
     namer: typing.Callable | None = None,
     mask_features: list[str] | None = None,
-) -> typing.Iterable[Alignment]:
+) -> typing.Iterable[c3_align.Alignment]:
     """yields cogent3 Alignments"""
 
     if ref_species not in genomes:
-        raise ValueError(f"unknown species {ref_species!r}")
+        msg = f"unknown species {ref_species!r}"
+        raise ValueError(msg)
 
     align_records = align_db.get_records_matching(
         species=ref_species,
@@ -322,7 +326,7 @@ def get_alignment(
                 genome_start = align_record.start
                 genome_end = align_record.stop
                 gap_pos, gap_lengths = align_record.gap_data
-                gaps = IndelMap(
+                imap = IndelMap(
                     gap_pos=gap_pos,
                     gap_lengths=gap_lengths,
                     parent_length=genome_end - genome_start,
@@ -346,14 +350,15 @@ def get_alignment(
                     seq_start = seq_start - genome_start
                     seq_end = seq_end - genome_start
 
-                align_start = gaps.get_align_index(seq_start)
-                align_end = gaps.get_align_index(seq_end)
+                align_start = imap.get_align_index(seq_start)
+                align_end = imap.get_align_index(seq_end)
                 break
         else:
             msg = f"no matching alignment record for {ref_species!r}"
             raise ValueError(msg)
 
         seqs = {}
+        gaps = {}
         for align_record in block:
             record_species = align_record.species
             genome = genomes[record_species]
@@ -362,7 +367,7 @@ def get_alignment(
             genome_start = align_record.start
             genome_end = align_record.stop
             gap_pos, gap_lengths = align_record.gap_data
-            gaps = IndelMap(
+            imap = IndelMap(
                 gap_pos=gap_pos,
                 gap_lengths=gap_lengths,
                 parent_length=genome_end - genome_start,
@@ -370,12 +375,12 @@ def get_alignment(
 
             # We use the alignment indices derived for the reference sequence
             # above
-            seq_start = gaps.get_seq_index(align_start)
-            seq_end = gaps.get_seq_index(align_end)
+            seq_start = imap.get_seq_index(align_start)
+            seq_end = imap.get_seq_index(align_end)
             seq_length = seq_end - seq_start
             if align_record.strand == "-":
                 # if it's neg strand, the alignment start is the genome stop
-                seq_start = gaps.parent_length - seq_end
+                seq_start = imap.parent_length - seq_end
 
             s = genome.get_seq(
                 seqid=align_record.seqid,
@@ -385,7 +390,7 @@ def get_alignment(
                 with_annotations=False,
             )
             # we now trim the gaps for this sequence to the sub-alignment
-            gaps = gaps[align_start:align_end]
+            imap = imap[align_start:align_end]
 
             if align_record.strand == "-":
                 s = s.rc()
@@ -394,13 +399,17 @@ def get_alignment(
                 strand_symbol = -1 if align_record.strand == "-" else 1
                 s.name = f"{s.name}:{strand_symbol}"
 
-            aligned = Aligned(gaps, s)
-            if aligned.name not in seqs:
-                seqs[aligned.name] = aligned
-            elif str(aligned) == str(seqs[aligned.name]):
+            if s.name in seqs:
                 print(f"duplicated {s.name}")
+            seqs[s.name] = numpy.array(s)
+            gaps[s.name] = imap.array
 
-        aln = Alignment(list(seqs.values()))
+        aln_data = c3_align.AlignedSeqsData.from_seqs_and_gaps(
+            seqs=seqs,
+            gaps=gaps,
+            alphabet=DNA.most_degen_alphabet(),
+        )
+        aln = c3_align.Alignment(seqs_data=aln_data, moltype=DNA)
         aln.annotation_db = genome.annotation_db
         if mask_features:
             aln = aln.with_masked_annotations(biotypes=mask_features)
@@ -428,7 +437,7 @@ def __init__(
         self._mask_features = mask_features
         self._sep = sep
 
-    def main(self, segment: eti_genome.genome_segment) -> list[Alignment]:
+    def main(self, segment: eti_genome.genome_segment) -> list[c3_align.Alignment]:
         results = []
         for aln in get_alignment(
             self._align_db,
diff --git a/src/ensembl_tui/_faster_fasta.py b/src/ensembl_tui/_faster_fasta.py
deleted file mode 100644
index 8944994..0000000
--- a/src/ensembl_tui/_faster_fasta.py
+++ /dev/null
@@ -1,60 +0,0 @@
-import pathlib
-import typing
-
-import numpy
-from cogent3 import get_moltype, open_
-
-# old style moltype
-alphabet = get_moltype("dna").alphabets.degen_gapped
-
-
-class converter:
-    """Defines a linear mapping from provided characters to uint8.
-    The resulting object is callable, taking a bytes object and returning a
-    numpy array."""
-
-    def __init__(self, dtype=numpy.uint8):
-        self._tr = b"".maketrans(
-            "".join(alphabet).encode("utf8"),
-            bytes(bytearray(range(len(alphabet)))),
-        )
-        self.dtype = dtype
-
-    def __call__(self, seq: bytes) -> numpy.ndarray:
-        b = seq.translate(self._tr, delete=b" \n\r")
-        return numpy.array(memoryview(b), dtype=self.dtype)
-
-
-bytes_to_array = converter()
-
-
-def quicka_parser(
-    path: pathlib.Path,
-    converter: typing.Callable[[bytes], bytes] = bytes_to_array,
-):
-    """generator returning sequence labels and sequences converted bytes from a fasta file
-
-    Parameters
-    ----------
-    path
-        location of the fasta file
-    converter
-        a callable that uses converts sequence characters into nominated bytes,
-        deleting unwanted characters. Must handle newlines. Whatever type this
-        callable returns will be the type of the sequence returned.
-
-    Returns
-    -------
-    the sequence label as a string and the sequence as transformed by converter
-    """
-    with open_(path, mode="rb") as infile:
-        data: bytes = infile.read()
-
-    records = data.split(b">")
-    for record in records:
-        eol = record.find(b"\n")
-        if eol == -1:
-            continue
-        label = record[:eol].strip().decode("utf8")
-        seq = converter(record[eol + 1 :])
-        yield label, seq
diff --git a/src/ensembl_tui/_genome.py b/src/ensembl_tui/_genome.py
index df10d85..eb477d8 100644
--- a/src/ensembl_tui/_genome.py
+++ b/src/ensembl_tui/_genome.py
@@ -1,23 +1,25 @@
 import dataclasses
 import functools
 import pathlib
-import re
 import sys
 import typing
 import uuid
 from abc import ABC, abstractmethod
+from collections.abc import Callable
 from typing import Any
 
+import cogent3
 import h5py
 import numpy
-from cogent3 import get_moltype, make_seq
 from cogent3.app.composable import define_app
+from cogent3.core import new_alphabet
 from cogent3.core.annotation import Feature
 from cogent3.core.annotation_db import (
     OptionalInt,
     OptionalStr,
 )
-from cogent3.core.sequence import Sequence
+from cogent3.core.new_sequence import Sequence
+from cogent3.parse.fasta import iter_fasta_records
 from cogent3.util.table import Table
 from numpy.typing import NDArray
 
@@ -26,19 +28,16 @@
 from ensembl_tui import _species as eti_species
 from ensembl_tui import _storage_mixin as eti_storage
 from ensembl_tui import _util as eti_util
-from ensembl_tui._faster_fasta import quicka_parser
 
 SEQ_STORE_NAME = "genome.seqs-hdf5_blosc2"
 
-_typed_id = re.compile(
-    r"\b[a-z]+:",
-    flags=re.IGNORECASE,
-)  # ensembl stableid's prefixed by the type
-_feature_id = re.compile(r"(?<=\bID=)[^;]+")
-_exon_id = re.compile(r"(?<=\bexon_id=)[^;]+")
-_parent_id = re.compile(r"(?<=\bParent=)[^;]+")
-_symbol = re.compile(r"(?<=\bName=)[^;]+")
-_description = re.compile(r"(?<=\bdescription=)[^;]+")
+DNA = cogent3.get_moltype("dna", new_type=True)
+alphabet = DNA.most_degen_alphabet()
+bytes_to_array = new_alphabet.bytes_to_array(
+    chars=alphabet.as_bytes(),
+    dtype=numpy.uint8,
+    delete=b" \n\r\t",
+)
 
 
 def _rename(label: str) -> str:
@@ -47,7 +46,11 @@ def _rename(label: str) -> str:
 
 @define_app
 class fasta_to_hdf5:  # noqa: N801
-    def __init__(self, config: eti_config.Config, label_to_name=_rename) -> None:
+    def __init__(
+        self,
+        config: eti_config.Config,
+        label_to_name: Callable[[str], str] = _rename,
+    ) -> None:
         self.config = config
         self.label_to_name = label_to_name
 
@@ -63,8 +66,11 @@ def main(self, db_name: str) -> bool:
 
         src_dir = src_dir / "fasta"
         for path in src_dir.glob("*.fa.gz"):
-            for label, seq in quicka_parser(path):
-                seqid = self.label_to_name(label)
+            for seqid, seq in iter_fasta_records(
+                path,
+                converter=bytes_to_array,
+                label_to_name=self.label_to_name,
+            ):
                 seq_store.add_record(seq, seqid)
                 del seq
 
@@ -125,21 +131,15 @@ class str2arr:  # noqa: N801
     """convert string to array of uint8"""
 
     def __init__(self, moltype: str = "dna", max_length: int | None = None) -> None:
-        moltype = get_moltype(moltype)
-        self.canonical = "".join(moltype)
+        mt = cogent3.get_moltype(moltype, new_type=True)
+        self.alphabet = mt.most_degen_alphabet()
         self.max_length = max_length
-        extended = "".join(list(moltype.alphabets.degen))
-        self.translation = b"".maketrans(
-            extended.encode("utf8"),
-            "".join(chr(i) for i in range(len(extended))).encode("utf8"),
-        )
 
     def main(self, data: str) -> numpy.ndarray:
         if self.max_length:
             data = data[: self.max_length]
 
-        b = data.encode("utf8").translate(self.translation)
-        return numpy.array(memoryview(b), dtype=numpy.uint8)
+        return self.alphabet.to_indices(data)
 
 
 @define_app
@@ -147,21 +147,14 @@ class arr2str:  # noqa: N801
     """convert array of uint8 to str"""
 
     def __init__(self, moltype: str = "dna", max_length: int | None = None) -> None:
-        moltype = get_moltype(moltype)
-        self.canonical = "".join(moltype)
+        mt = cogent3.get_moltype(moltype, new_type=True)
+        self.alphabet = mt.most_degen_alphabet()
         self.max_length = max_length
-        extended = "".join(list(moltype.alphabets.degen))
-        self.translation = b"".maketrans(
-            "".join(chr(i) for i in range(len(extended))).encode("utf8"),
-            extended.encode("utf8"),
-        )
 
     def main(self, data: numpy.ndarray) -> str:
         if self.max_length:
             data = data[: self.max_length]
-
-        b = data.tobytes().translate(self.translation)
-        return bytearray(b).decode("utf8")
+        return self.alphabet.from_indices(data)
 
 
 @dataclasses.dataclass
@@ -326,7 +319,7 @@ def get_seq(
         stop: int | None = None,
         namer: typing.Callable | None = None,
         with_annotations: bool = True,
-    ) -> str:
+    ) -> Sequence:
         """returns annotated sequence
 
         Parameters
@@ -350,20 +343,20 @@ def get_seq(
         -----
         Full annotations are bound to the instance.
         """
-        seq = self._seqs.get_seq_str(seqid=seqid, start=start, stop=stop)
+        seq = self._seqs.get_seq_arr(seqid=seqid, start=start, stop=stop)
         if namer:
             name = namer(self.species, seqid, start, stop)
         else:
             name = f"{self.species}:{seqid}:{start}-{stop}"
         # we use seqid to make the sequence here because that identifies the
         # parent seq identity, required for querying annotations
-        try:
-            seq = make_seq(seq, name=seqid, moltype="dna", annotation_offset=start or 0)
-        except TypeError:
-            # older version of cogent3
-            seq = make_seq(seq, name=seqid, moltype="dna")
-            seq.annotation_offset = start or 0
-
+        seq = cogent3.make_seq(
+            seq,
+            name=seqid,
+            moltype="dna",
+            annotation_offset=start or 0,
+            new_type=True,
+        )
         seq.name = name
         seq.annotation_db = self.annotation_db if with_annotations else None
         return seq
diff --git a/src/ensembl_tui/_homology.py b/src/ensembl_tui/_homology.py
index f46c158..e748810 100644
--- a/src/ensembl_tui/_homology.py
+++ b/src/ensembl_tui/_homology.py
@@ -228,5 +228,6 @@ def main(self, homologs: homolog_group) -> SeqsCollectionType:
         return make_unaligned_seqs(
             data=seqs,
             moltype="dna",
-            info=dict(source=homologs.source),
+            info={"source": homologs.source},
+            new_type=True,
         )
diff --git a/src/ensembl_tui/_ingest_align.py b/src/ensembl_tui/_ingest_align.py
index 62c53b7..76e9218 100644
--- a/src/ensembl_tui/_ingest_align.py
+++ b/src/ensembl_tui/_ingest_align.py
@@ -18,7 +18,7 @@
 
 
 def seq2gaps(record: dict) -> eti_align.AlignRecord:
-    seq = make_seq(record.pop("seq"))
+    seq = make_seq(record.pop("seq"), new_type=True, moltype="dna")
     indel_map, _ = seq.parse_out_gaps()
     if indel_map.num_gaps:
         record["gap_spans"] = numpy.array(
diff --git a/tests/test_align.py b/tests/test_align.py
index 7bec464..0b0a05c 100644
--- a/tests/test_align.py
+++ b/tests/test_align.py
@@ -88,6 +88,7 @@ def small_seqs():
         moltype="dna",
         array_align=False,
         info={"species": {"s1": "human", "s2": "mouse", "s3": "dog"}},
+        new_type=True,
     )
 
 
@@ -96,8 +97,8 @@ def make_records(start, end, block_id):
     records = []
     species = aln.info.species
     for seq in aln.seqs:
-        seqid, seq_start, seq_end, seq_strand = seq.data.parent_coordinates()
-        gs = seq.get_gapped_seq()
+        seqid, seq_start, seq_end, seq_strand = seq.parent_coordinates(seq_coords=True)
+        gs = seq.gapped_seq
         imap, s = gs.parse_out_gaps()
         if imap.num_gaps:
             gap_spans = numpy.array(
@@ -156,16 +157,6 @@ def test_aligndb_records_skip_duplicated_block_ids(small_records):
     assert agg.sql(sql).fetchone()[0] == count
 
 
-def _find_nth_gap_index(data: str, n: int) -> int:
-    num = -1
-    for i, c in enumerate(data):
-        if c == "-":
-            num += 1
-        if num == n:
-            return i
-    raise ValueError(f"{data=}, {n=}")
-
-
 # fixture to make synthetic GenomeSeqsDb and alignment db
 # based on a given alignment
 @pytest.fixture
@@ -233,7 +224,7 @@ def make_sample(two_aligns=False):
     genomes = {}
     for seq in aln.seqs:
         name = seq.name
-        seq = seq.data.degap()
+        seq = seq.seq
         if seq.name == "s2":
             seq = seq.rc()
             s2_genome = str(seq)
diff --git a/tests/test_genome.py b/tests/test_genome.py
index de583df..98bc1f6 100644
--- a/tests/test_genome.py
+++ b/tests/test_genome.py
@@ -1,5 +1,3 @@
-import pathlib
-
 import pytest
 
 from ensembl_tui import _genome as eti_genome
@@ -21,7 +19,6 @@ def test_get_seq_feature_seq_correct(yeast):
     assert raw_seq.endswith("ATAAGAAATTCCATAAATAG")
 
 
-@pytest.mark.xfail(reason="Need to fix cogent3 to apply feature name to new sequence")
 def test_get_seq_feature_seq_correct_name(yeast):
     # need to modify cogent3 so it applies the feature name
     # to the new sequence
@@ -124,26 +121,6 @@ def test_get_feature_child_parent(worm_db):
     assert got.stable_id == gene.stable_id
 
 
-@pytest.fixture(params=("\n", "\r\n"))
-def fasta_data(DATA_DIR, tmp_path, request):
-    data = pathlib.Path(DATA_DIR / "c_elegans_WS199_shortened.fasta").read_text()
-    outpath = tmp_path / "demo.fasta"
-    outpath.write_text(data, newline=request.param)
-    return outpath
-
-
-def test_faster_fasta(fasta_data):
-    from cogent3.parse.fasta import MinimalFastaParser
-
-    from ensembl_tui._faster_fasta import bytes_to_array, quicka_parser
-
-    expect = {
-        n: bytes_to_array(s.encode("utf8")) for n, s in MinimalFastaParser(fasta_data)
-    }
-    got = dict(quicka_parser(fasta_data))
-    assert (got["I"] == expect["I"]).all()
-
-
 def test_genome_segment():
     segment = eti_genome.genome_segment(
         species="abcd_efg",
diff --git a/tests/test_installed.py b/tests/test_installed.py
index ce8bc27..1d55151 100644
--- a/tests/test_installed.py
+++ b/tests/test_installed.py
@@ -36,7 +36,7 @@ def test_get_genes(small_install_path):
         ),
     )
     # we check we can make a aa seq which has the correct start and end
-    aa = str(gene.trim_stop_codon().get_translation(incomplete_ok=True))
+    aa = str(gene.get_translation(trim_stop=True, incomplete_ok=True))
     # expected values from ensembl.org
     assert aa.startswith("MTNSSEFTDVLQS")
     assert aa.endswith("TIMNRINYKLQ")

From 9e88c1f26499ddcda4eec0ea687e0b3ae4b15340 Mon Sep 17 00:00:00 2001
From: Gavin Huttley <Gavin.Huttley@anu.edu.au>
Date: Fri, 17 Jan 2025 13:03:26 +1100
Subject: [PATCH 3/5] DEV: updated cogent3 minimum version

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index c3397cd..9f52cd0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -20,7 +20,7 @@ license = { file = "LICENSE" }
 requires-python = ">=3.10,<3.13"
 dependencies = ["blosc2",
         "click",
-        "cogent3",
+        "cogent3>=2024.12.19a2",
         "duckdb",
         "h5py",
         "hdf5plugin",

From 19a81283bc5de09cf7145bc117740ff9b2ff4c54 Mon Sep 17 00:00:00 2001
From: GavinHuttley <GavinHuttley@users.noreply.github.com>
Date: Fri, 17 Jan 2025 02:04:29 +0000
Subject: [PATCH 4/5] STY: pre-commit linting with ruff

---
 src/ensembl_tui/_storage_mixin.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/ensembl_tui/_storage_mixin.py b/src/ensembl_tui/_storage_mixin.py
index 2ad43e9..fd5e89d 100644
--- a/src/ensembl_tui/_storage_mixin.py
+++ b/src/ensembl_tui/_storage_mixin.py
@@ -6,9 +6,10 @@
 import pathlib
 
 import duckdb
+import h5py
 import numpy
 import typing_extensions
-import h5py
+
 from ensembl_tui import _util as eti_util
 
 ReturnType = tuple[str, tuple]  # the sql statement and corresponding values
@@ -76,7 +77,7 @@ class Hdf5Mixin(eti_util.SerialisableMixin):
     def __getstate__(self) -> dict:
         if set(self.mode) & {"w", "a"}:
             raise NotImplementedError(f"pickling not supported for mode={self.mode!r}")
-        return self._init_vals.copy() # type: ignore
+        return self._init_vals.copy()  # type: ignore
 
     def __setstate__(self, state: dict) -> None:
         obj = self.__class__(**state)

From 3818b4a0e20ea0b6fb920cce338d4f590cbe1707 Mon Sep 17 00:00:00 2001
From: Gavin Huttley <Gavin.Huttley@anu.edu.au>
Date: Fri, 17 Jan 2025 13:38:46 +1100
Subject: [PATCH 5/5] TST: added test fo closing AlignDb

[CHANGED] remove effort to better handle garbage collection
    error as it was invalid
---
 src/ensembl_tui/_align.py         | 6 ++++++
 src/ensembl_tui/_storage_mixin.py | 2 --
 tests/test_align.py               | 6 ++++++
 3 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/src/ensembl_tui/_align.py b/src/ensembl_tui/_align.py
index c57f98c..e4f397f 100644
--- a/src/ensembl_tui/_align.py
+++ b/src/ensembl_tui/_align.py
@@ -288,6 +288,12 @@ def get_distinct(self, field: str) -> list[str]:
     def num_records(self) -> int:
         return self.conn.sql(f"SELECT COUNT(*) from {self._tables[0]}").fetchone()[0]
 
+    def close(self) -> None:
+        """closes duckdb and h5py storage"""
+        if self.gap_store:
+            self.gap_store.close()
+        self.conn.close()
+
 
 def get_alignment(
     align_db: AlignDb,
diff --git a/src/ensembl_tui/_storage_mixin.py b/src/ensembl_tui/_storage_mixin.py
index fd5e89d..7952060 100644
--- a/src/ensembl_tui/_storage_mixin.py
+++ b/src/ensembl_tui/_storage_mixin.py
@@ -95,8 +95,6 @@ def close(self) -> None:
         """closes the hdf5 file"""
         # hdf5 dumps content to stdout if resource already closed, so
         # we trap that here, and capture expected exceptions raised in the process
-        if "open" not in locals():
-            return
         with (
             open(os.devnull, "w") as devnull,  # noqa: PTH123
             contextlib.redirect_stderr(devnull),
diff --git a/tests/test_align.py b/tests/test_align.py
index 0b0a05c..9ef03a8 100644
--- a/tests/test_align.py
+++ b/tests/test_align.py
@@ -590,3 +590,9 @@ def test_aligndb_post_init_failure(tmp_path):
     outfile.write_text("blah")
     with pytest.raises(OSError):
         eti_align.AlignDb(source="/non/existent/directory")
+
+
+def test_aligndb_close(db_align):
+    db_align.close()
+    with pytest.raises(duckdb.duckdb.ConnectionException):
+        db_align.num_records()