Merge pull request #122 from GavinHuttley/develop

Tidy cli for exporting alignments
cogent3 · Jun 15, 2024 · 0393854
2 parents f822015 + a06031b
commit 0393854
Show file tree

Hide file tree

Showing 5 changed files with 54 additions and 65 deletions.
diff --git a/README.md b/README.md
@@ -72,7 +72,7 @@ Usage: elt exportrc [OPTIONS]
   contents to override the default ensembl_lite settings
 
 Options:
-  -o, --outpath PATH  path to directory to export all rc contents
+  -o, --outpath PATH  Path to directory to export all rc contents.
   --help              Show this message and exit.
 
 ```

diff --git a/src/ensembl_lite/_config.py b/src/ensembl_lite/_config.py
@@ -187,14 +187,16 @@ def path_to_alignment(self, pattern: str) -> pathlib.Path | None:
             glob pattern for the Ensembl alignment name
         """
         align_dirs = [
-            d for d in self.aligns_path.glob("*") if fnmatch.fnmatch(d.name, pattern)
+            d
+            for d in self.aligns_path.glob("*")
+            if fnmatch.fnmatch(d.name, pattern) and d.name.endswith(".sqlitedb")
         ]
         if not align_dirs:
             return None
 
         if len(align_dirs) > 1:
             raise ValueError(
-                f"{pattern!r} matches too many directories in {self.aligns_path}"
+                f"{pattern!r} matches too many directories in {self.aligns_path} {align_dirs}"
             )
 
         return align_dirs[0]

diff --git a/src/ensembl_lite/_homologydb.py b/src/ensembl_lite/_homologydb.py
@@ -1,8 +1,6 @@
 import dataclasses
 import typing
 
-from collections import defaultdict
-
 import blosc2
 
 from cogent3 import make_unaligned_seqs
@@ -44,15 +42,15 @@ def __eq__(self, other):
         return self.species == other.species and self.gene_ids == other.gene_ids
 
     def __post_init__(self):
-        self.gene_ids = [] if not self.gene_ids else list(self.gene_ids)
+        self.gene_ids = list(self.gene_ids) if self.gene_ids else []
 
-    def __getstate__(self) -> tuple[str, tuple[str, ...]]:
-        return self.species, tuple(self.gene_ids)
+    def __getstate__(self) -> tuple[str, list[str]]:
+        return self.species, self.gene_ids
 
     def __setstate__(self, args):
         species, gene_ids = args
         self.species = species
-        self.gene_ids = list(gene_ids)
+        self.gene_ids = gene_ids
 
 
 @dataclasses.dataclass
@@ -65,7 +63,7 @@ class homolog_group:
     source: str | None = None
 
     def __post_init__(self):
-        self.gene_ids = self.gene_ids if self.gene_ids else {}
+        self.gene_ids = self.gene_ids or {}
         if self.source is None:
             self.source = next(iter(self.gene_ids), None)
 
@@ -157,7 +155,7 @@ def grouped_related(
     return reduced
 
 
-def _gene_id_to_group(series: tuple[homolog_group, ...]) -> dict[str:homolog_group]:
+def _gene_id_to_group(series: tuple[homolog_group, ...]) -> dict[str, homolog_group]:
     """converts series of homolog_group instances to {geneid: groupl, ..}"""
     result = {}
     for group in series:
@@ -219,14 +217,18 @@ class HomologyDb(SqliteDbMixin):
     _relationship_schema = {  # e.g. ortholog_one2one
         "homology_type": "TEXT",
     }
+
     _homology_schema = {  # e.g. an individual homolog group of homology_type
         "relationship_id": "INTEGER",
     }
+
     _species_schema = {"species_db": "TEXT"}
+
     _stableid_schema = {
         "stableid": "TEXT PRIMARY KEY",
         "species_id": "INTEGER",
     }
+
     _member_schema = {  # gene membership of a specific homolog group
         "stableid_id": "INTEGER",  # from stableid table
         "homology_id": "INTEGER",  # from homology table
@@ -250,23 +252,6 @@ def __init__(self, source: PathType = ":memory:"):
 
     def _create_views(self):
         """define views to simplify queries"""
-        # we want to be able to query for all ortholog groups of a
-        # particular type. For example, get all groups of IDs of
-        # type one-to-one orthologs
-        sql = """
-        CREATE VIEW IF NOT EXISTS related_groups AS
-        SELECT r.homology_type as homology_type,
-                r.rowid as relationship_id,
-                h.rowid as homology_id,
-                st.stableid as gene_id,
-                st.rowid as stableid_id,
-                sp.species_db as species_db
-        FROM homology h JOIN relationship r ON h.relationship_id = r.rowid 
-        JOIN member as m ON m.homology_id = h.rowid
-        JOIN stableid as st ON st.rowid = m.stableid_id
-        JOIN species as sp ON sp.rowid = st.species_id
-        """
-        self._execute_sql(sql)
         sql = """CREATE VIEW IF NOT EXISTS homology_member AS
         SELECT h.rowid as homology_id,
                h.relationship_id as relationship_id,
@@ -287,6 +272,16 @@ def _create_views(self):
         JOIN stableid st ON st.species_id = sp.rowid
         """
         self._execute_sql(sql)
+        sql = """
+        CREATE VIEW IF NOT EXISTS related_groups AS
+        SELECT gs.stableid as stableid,
+               gs.species_db as species_db,
+               hm.homology_id as homology_id,
+               hm.homology_type as homology_type
+        FROM gene_species gs
+        JOIN homology_member hm ON hm.stableid_id = gs.stableid_id
+        """
+        self._execute_sql(sql)
 
     def _make_species_id(self, species: str) -> int:
         """returns the species.id value for species"""
@@ -416,27 +411,21 @@ def get_related_to(self, *, gene_id: str, relationship_type: str) -> homolog_gro
 
         return result
 
-    def get_related_groups(
-        self, relationship_type: str
-    ) -> typing.Sequence[homolog_group]:
+    def get_related_groups(self, relationship_type: str) -> list[homolog_group]:
         """returns all groups of relationship type"""
         sql = """
-        SELECT 
-        r.homology_id as homology_id, 
-        r.gene_id as gene_id,
-        r.species_db as species_db
-        FROM related_groups r
-        WHERE r.homology_type = ?
+        SELECT stableid, species_db, homology_id
+        FROM related_groups
+        WHERE homology_type = ?
         """
-        results = defaultdict(list)
+        results = {}
         for result in self._execute_sql(sql, (relationship_type,)).fetchall():
-            results[result["homology_id"]].append(
-                (result["gene_id"], result["species_db"])
+            record = results.get(
+                result["homology_id"], homolog_group(relationship=relationship_type)
             )
-        return [
-            homolog_group(relationship=relationship_type, gene_ids=dict(gene_ids))
-            for gene_ids in results.values()
-        ]
+            record.gene_ids |= {result["stableid"]: result["species_db"]}
+            results[result["homology_id"]] = record
+        return list(results.values())
 
     def num_records(self):
         return list(

diff --git a/src/ensembl_lite/cli.py b/src/ensembl_lite/cli.py
@@ -81,21 +81,21 @@ def _species_names_from_csv(ctx, param, species) -> list[str] | None:
     "--configpath",
     default=elt_download._cfg,
     type=pathlib.Path,
-    help="path to config file specifying databases, only "
-    "species or compara at present",
+    help="Path to config file specifying databases, (only "
+    "species or compara at present).",
 )
 _download = click.option(
     "-d",
     "--download",
     type=pathlib.Path,
-    help="path to local download directory, contains a cfg file",
+    help="Path to local download directory containing a cfg file.",
 )
 _installed = click.option(
     "-i",
     "--installed",
     required=True,
     callback=_get_installed_config_path,
-    help="string pointing to installation",
+    help="Path to root directory of an installation.",
 )
 _outpath = click.option(
     "-o", "--outpath", required=True, type=pathlib.Path, help="path to write json file"
@@ -106,14 +106,14 @@ def _species_names_from_csv(ctx, param, species) -> list[str] | None:
 _align_name = click.option(
     "--align_name",
     default=None,
-    help="Ensembl name of the alignment or a glob pattern, e.g. '*primates*'",
+    help="Ensembl alignment name or a glob pattern, e.g. '*primates*'.",
 )
 _ref = click.option("--ref", default=None, help="Reference species.")
 _ref_genes_file = click.option(
     "--ref_genes_file",
     default=None,
     type=click.Path(resolve_path=True, exists=True),
-    help=".csv or .tsv file with a header containing a stableid column",
+    help=".csv or .tsv file with a header containing a stableid column.",
 )
 _limit = click.option(
     "--limit",
@@ -122,7 +122,6 @@ def _species_names_from_csv(ctx, param, species) -> list[str] | None:
     help="Limit to this number of genes.",
     show_default=True,
 )
-
 _verbose = click.option(
     "-v",
     "--verbose",
@@ -132,56 +131,53 @@ def _species_names_from_csv(ctx, param, species) -> list[str] | None:
     "-f",
     "--force_overwrite",
     is_flag=True,
-    help="drop existing database if it exists prior to " "installing",
+    help="Overwrite existing data.",
 )
 _debug = click.option(
     "-d",
     "--debug",
     is_flag=True,
-    help="maximum verbosity, and reduces number of downloads",
+    help="Maximum verbosity, and reduces number of downloads, etc...",
 )
 _dbrc_out = click.option(
     "-o",
     "--outpath",
     type=pathlib.Path,
-    help="path to directory to export all rc contents",
+    help="Path to directory to export all rc contents.",
 )
 _nprocs = click.option(
     "-np",
     "--num_procs",
     type=int,
     default=1,
-    help="number of procs to use, defaults to 1",
+    help="Number of procs to use.",
+    show_default=True,
 )
-
-
 _outdir = click.option(
     "--outdir",
     type=pathlib.Path,
     default=".",
     help="Output directory name.",
     show_default=True,
 )
-
 _species = click.option(
     "--species",
     required=True,
     callback=_species_names_from_csv,
-    help="Single species name, or multiple (comma separated).",
+    help="Single species name or multiple (comma separated).",
 )
-
 _mask_features = click.option(
     "--mask_features",
     callback=_values_from_csv,
-    help="biotypes to mask (comma separated).",
+    help="Biotypes to mask (comma separated).",
 )
 
 
 @tui()
 @click.group()
 @click.version_option(__version__)
 def main():
-    """tools for obtaining and interrogating subsets of https://ensembl.org genomic data"""
+    """Tools for obtaining and interrogating subsets of https://ensembl.org genomic data."""
     pass
 
 
@@ -404,7 +400,6 @@ def alignments(
     from cogent3 import load_table
 
     from ensembl_lite._aligndb import AlignDb, write_alignments
-    from ensembl_lite._genomedb import load_genome, update_stableid_prefixes
     from ensembl_lite._species import Species
 
     # todo support genomic coordinates, e.g. coord_name:start-stop:strand, for
@@ -423,8 +418,6 @@ def alignments(
     outdir.mkdir(parents=True, exist_ok=True)
 
     config = elt_config.read_installed_cfg(installed)
-    # update the prefixes
-    update_stableid_prefixes(config)
     align_path = config.path_to_alignment(align_name)
     if align_path is None:
         click.secho(

diff --git a/tests/test_config.py b/tests/test_config.py
@@ -2,6 +2,7 @@
 
 import pytest
 
+from ensembl_lite._aligndb import _GAP_STORE_SUFFIX
 from ensembl_lite._config import (
     _ALIGNS_NAME,
     _COMPARA_NAME,
@@ -50,6 +51,10 @@ def installed_aligns(tmp_path):
     # make two alignment paths with similar names
     (align_dir / "10_primates.epo.sqlitedb").open(mode="w")
     (align_dir / "24_primates.epo_extended.sqlitedb").open(mode="w")
+    # and their associated HDF5 seqs
+    (align_dir / f"10_primates.epo.{_GAP_STORE_SUFFIX}").open(mode="w")
+    (align_dir / f"24_primates.epo_extended.{_GAP_STORE_SUFFIX}").open(mode="w")
+
     return InstalledConfig(release="11", install_path=tmp_path)