Merge pull request #121 from GavinHuttley/develop

Refactor for performance querying for homologs
cogent3 · Jun 14, 2024 · f822015
2 parents 7424e3b + e03c98b
commit f822015
Show file tree

Hide file tree

Showing 10 changed files with 335 additions and 131 deletions.
diff --git a/src/ensembl_lite/_config.py b/src/ensembl_lite/_config.py
@@ -215,6 +215,7 @@ def write_installed_cfg(config: Config) -> PathType:
 
 def read_installed_cfg(path: PathType) -> InstalledConfig:
     """reads an ini file under config.installed_path"""
+    path = pathlib.Path(path).expanduser()
     parser = configparser.ConfigParser()
     path = (
         path if path.name == INSTALLED_CONFIG_NAME else (path / INSTALLED_CONFIG_NAME)
@@ -228,7 +229,7 @@ def read_installed_cfg(path: PathType) -> InstalledConfig:
     return InstalledConfig(release=release, install_path=path.parent)
 
 
-def _standardise_path(path: str, config_path: pathlib.Path) -> pathlib.Path:
+def _standardise_path(path: PathType, config_path: pathlib.Path) -> pathlib.Path:
     path = pathlib.Path(path).expanduser()
     return path if path.is_absolute() else (config_path / path).resolve()
 

diff --git a/src/ensembl_lite/_db_base.py b/src/ensembl_lite/_db_base.py
@@ -1,3 +1,4 @@
+import contextlib
 import dataclasses
 import sqlite3
 
@@ -146,10 +147,12 @@ def __setstate__(self, state):
         obj._file = None
 
     def __del__(self):
-        if self._is_open and self._file is not None:
+        with contextlib.suppress(ValueError, AttributeError):
             self._file.flush()
-        if self._file is not None:
+
+        with contextlib.suppress(AttributeError):
             self._file.close()
+
         self._is_open = False
 
     def close(self):

diff --git a/src/ensembl_lite/_genomedb.py b/src/ensembl_lite/_genomedb.py
@@ -139,6 +139,10 @@ def custom_gff_parser(
         reduced[record].start = min(reduced[record].start, record.start)
         reduced[record].stop = max(reduced[record].stop, record.stop)
 
+    # make sure feature location data is sorted
+    for record in reduced.values():
+        record.spans = sorted([sorted(span) for span in record.spans])
+
     return reduced, num_fake_ids
 
 
@@ -562,7 +566,6 @@ def main(self, db_name: str) -> bool:
 
         src_dir = src_dir / "fasta"
         for path in src_dir.glob("*.fa.gz"):
-            # for label, seq in quicka_parser(path, one_seq=False):
             for label, seq in quicka_parser(path):
                 seqid = self.label_to_name(label)
                 seq_store.add_record(seq, seqid)
@@ -847,7 +850,10 @@ def get_gene_cds(self, name: str, is_canonical: bool = True):
             stop = cds["spans"].max()
             seq = self.get_seq(seqid=seqid, start=start, stop=stop)
             cds["spans"] = cds["spans"] - start
-            yield seq.make_feature(feature=cds)
+            try:
+                yield seq.make_feature(feature=cds)
+            except ValueError:
+                raise ValueError(f"invalid location data for {cds!r}")
 
     def get_ids_for_biotype(self, biotype: str, limit: OptionalInt = None):
         annot_db = self.annotation_db