Skip to content

Commit

Permalink
Merge pull request #122 from GavinHuttley/develop
Browse files (browse the repository at this point in the history)
Tidy cli for exporting alignments
  • Loading branch information
GavinHuttley authored Jun 15, 2024
2 parents f822015 + a06031b commit 0393854
Show file tree
Hide file tree
Showing 5 changed files with 54 additions and 65 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ Usage: elt exportrc [OPTIONS]
contents to override the default ensembl_lite settings
Options:
-o, --outpath PATH path to directory to export all rc contents
-o, --outpath PATH Path to directory to export all rc contents.
--help Show this message and exit.
```
Expand Down
6 changes: 4 additions & 2 deletions src/ensembl_lite/_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,14 +187,16 @@ def path_to_alignment(self, pattern: str) -> pathlib.Path | None:
glob pattern for the Ensembl alignment name
"""
align_dirs = [
d for d in self.aligns_path.glob("*") if fnmatch.fnmatch(d.name, pattern)
d
for d in self.aligns_path.glob("*")
if fnmatch.fnmatch(d.name, pattern) and d.name.endswith(".sqlitedb")
]
if not align_dirs:
return None

if len(align_dirs) > 1:
raise ValueError(
f"{pattern!r} matches too many directories in {self.aligns_path}"
f"{pattern!r} matches too many directories in {self.aligns_path} {align_dirs}"
)

return align_dirs[0]
Expand Down
71 changes: 30 additions & 41 deletions src/ensembl_lite/_homologydb.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
import dataclasses
import typing

from collections import defaultdict

import blosc2

from cogent3 import make_unaligned_seqs
Expand Down Expand Up @@ -44,15 +42,15 @@ def __eq__(self, other):
return self.species == other.species and self.gene_ids == other.gene_ids

def __post_init__(self):
self.gene_ids = [] if not self.gene_ids else list(self.gene_ids)
self.gene_ids = list(self.gene_ids) if self.gene_ids else []

def __getstate__(self) -> tuple[str, tuple[str, ...]]:
return self.species, tuple(self.gene_ids)
def __getstate__(self) -> tuple[str, list[str]]:
return self.species, self.gene_ids

def __setstate__(self, args):
species, gene_ids = args
self.species = species
self.gene_ids = list(gene_ids)
self.gene_ids = gene_ids


@dataclasses.dataclass
Expand All @@ -65,7 +63,7 @@ class homolog_group:
source: str | None = None

def __post_init__(self):
self.gene_ids = self.gene_ids if self.gene_ids else {}
self.gene_ids = self.gene_ids or {}
if self.source is None:
self.source = next(iter(self.gene_ids), None)

Expand Down Expand Up @@ -157,7 +155,7 @@ def grouped_related(
return reduced


def _gene_id_to_group(series: tuple[homolog_group, ...]) -> dict[str:homolog_group]:
def _gene_id_to_group(series: tuple[homolog_group, ...]) -> dict[str, homolog_group]:
"""converts series of homolog_group instances to {geneid: group, ..}"""
result = {}
for group in series:
Expand Down Expand Up @@ -219,14 +217,18 @@ class HomologyDb(SqliteDbMixin):
_relationship_schema = { # e.g. ortholog_one2one
"homology_type": "TEXT",
}

_homology_schema = { # e.g. an individual homolog group of homology_type
"relationship_id": "INTEGER",
}

_species_schema = {"species_db": "TEXT"}

_stableid_schema = {
"stableid": "TEXT PRIMARY KEY",
"species_id": "INTEGER",
}

_member_schema = { # gene membership of a specific homolog group
"stableid_id": "INTEGER", # from stableid table
"homology_id": "INTEGER", # from homology table
Expand All @@ -250,23 +252,6 @@ def __init__(self, source: PathType = ":memory:"):

def _create_views(self):
"""define views to simplify queries"""
# we want to be able to query for all ortholog groups of a
# particular type. For example, get all groups of IDs of
# type one-to-one orthologs
sql = """
CREATE VIEW IF NOT EXISTS related_groups AS
SELECT r.homology_type as homology_type,
r.rowid as relationship_id,
h.rowid as homology_id,
st.stableid as gene_id,
st.rowid as stableid_id,
sp.species_db as species_db
FROM homology h JOIN relationship r ON h.relationship_id = r.rowid
JOIN member as m ON m.homology_id = h.rowid
JOIN stableid as st ON st.rowid = m.stableid_id
JOIN species as sp ON sp.rowid = st.species_id
"""
self._execute_sql(sql)
sql = """CREATE VIEW IF NOT EXISTS homology_member AS
SELECT h.rowid as homology_id,
h.relationship_id as relationship_id,
Expand All @@ -287,6 +272,16 @@ def _create_views(self):
JOIN stableid st ON st.species_id = sp.rowid
"""
self._execute_sql(sql)
sql = """
CREATE VIEW IF NOT EXISTS related_groups AS
SELECT gs.stableid as stableid,
gs.species_db as species_db,
hm.homology_id as homology_id,
hm.homology_type as homology_type
FROM gene_species gs
JOIN homology_member hm ON hm.stableid_id = gs.stableid_id
"""
self._execute_sql(sql)

def _make_species_id(self, species: str) -> int:
"""returns the species.id value for species"""
Expand Down Expand Up @@ -416,27 +411,21 @@ def get_related_to(self, *, gene_id: str, relationship_type: str) -> homolog_gro

return result

def get_related_groups(
self, relationship_type: str
) -> typing.Sequence[homolog_group]:
def get_related_groups(self, relationship_type: str) -> list[homolog_group]:
"""returns all groups of relationship type"""
sql = """
SELECT
r.homology_id as homology_id,
r.gene_id as gene_id,
r.species_db as species_db
FROM related_groups r
WHERE r.homology_type = ?
SELECT stableid, species_db, homology_id
FROM related_groups
WHERE homology_type = ?
"""
results = defaultdict(list)
results = {}
for result in self._execute_sql(sql, (relationship_type,)).fetchall():
results[result["homology_id"]].append(
(result["gene_id"], result["species_db"])
record = results.get(
result["homology_id"], homolog_group(relationship=relationship_type)
)
return [
homolog_group(relationship=relationship_type, gene_ids=dict(gene_ids))
for gene_ids in results.values()
]
record.gene_ids |= {result["stableid"]: result["species_db"]}
results[result["homology_id"]] = record
return list(results.values())

def num_records(self):
return list(
Expand Down
35 changes: 14 additions & 21 deletions src/ensembl_lite/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,21 +81,21 @@ def _species_names_from_csv(ctx, param, species) -> list[str] | None:
"--configpath",
default=elt_download._cfg,
type=pathlib.Path,
help="path to config file specifying databases, only "
"species or compara at present",
help="Path to config file specifying databases, (only "
"species or compara at present).",
)
_download = click.option(
"-d",
"--download",
type=pathlib.Path,
help="path to local download directory, contains a cfg file",
help="Path to local download directory containing a cfg file.",
)
_installed = click.option(
"-i",
"--installed",
required=True,
callback=_get_installed_config_path,
help="string pointing to installation",
help="Path to root directory of an installation.",
)
_outpath = click.option(
"-o", "--outpath", required=True, type=pathlib.Path, help="path to write json file"
Expand All @@ -106,14 +106,14 @@ def _species_names_from_csv(ctx, param, species) -> list[str] | None:
_align_name = click.option(
"--align_name",
default=None,
help="Ensembl name of the alignment or a glob pattern, e.g. '*primates*'",
help="Ensembl alignment name or a glob pattern, e.g. '*primates*'.",
)
_ref = click.option("--ref", default=None, help="Reference species.")
_ref_genes_file = click.option(
"--ref_genes_file",
default=None,
type=click.Path(resolve_path=True, exists=True),
help=".csv or .tsv file with a header containing a stableid column",
help=".csv or .tsv file with a header containing a stableid column.",
)
_limit = click.option(
"--limit",
Expand All @@ -122,7 +122,6 @@ def _species_names_from_csv(ctx, param, species) -> list[str] | None:
help="Limit to this number of genes.",
show_default=True,
)

_verbose = click.option(
"-v",
"--verbose",
Expand All @@ -132,56 +131,53 @@ def _species_names_from_csv(ctx, param, species) -> list[str] | None:
"-f",
"--force_overwrite",
is_flag=True,
help="drop existing database if it exists prior to " "installing",
help="Overwrite existing data.",
)
_debug = click.option(
"-d",
"--debug",
is_flag=True,
help="maximum verbosity, and reduces number of downloads",
help="Maximum verbosity, and reduces number of downloads, etc...",
)
_dbrc_out = click.option(
"-o",
"--outpath",
type=pathlib.Path,
help="path to directory to export all rc contents",
help="Path to directory to export all rc contents.",
)
_nprocs = click.option(
"-np",
"--num_procs",
type=int,
default=1,
help="number of procs to use, defaults to 1",
help="Number of procs to use.",
show_default=True,
)


_outdir = click.option(
"--outdir",
type=pathlib.Path,
default=".",
help="Output directory name.",
show_default=True,
)

_species = click.option(
"--species",
required=True,
callback=_species_names_from_csv,
help="Single species name, or multiple (comma separated).",
help="Single species name or multiple (comma separated).",
)

_mask_features = click.option(
"--mask_features",
callback=_values_from_csv,
help="biotypes to mask (comma separated).",
help="Biotypes to mask (comma separated).",
)


@tui()
@click.group()
@click.version_option(__version__)
def main():
"""tools for obtaining and interrogating subsets of https://ensembl.org genomic data"""
"""Tools for obtaining and interrogating subsets of https://ensembl.org genomic data."""
pass


Expand Down Expand Up @@ -404,7 +400,6 @@ def alignments(
from cogent3 import load_table

from ensembl_lite._aligndb import AlignDb, write_alignments
from ensembl_lite._genomedb import load_genome, update_stableid_prefixes
from ensembl_lite._species import Species

# todo support genomic coordinates, e.g. coord_name:start-stop:strand, for
Expand All @@ -423,8 +418,6 @@ def alignments(
outdir.mkdir(parents=True, exist_ok=True)

config = elt_config.read_installed_cfg(installed)
# update the prefixes
update_stableid_prefixes(config)
align_path = config.path_to_alignment(align_name)
if align_path is None:
click.secho(
Expand Down
5 changes: 5 additions & 0 deletions tests/test_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import pytest

from ensembl_lite._aligndb import _GAP_STORE_SUFFIX
from ensembl_lite._config import (
_ALIGNS_NAME,
_COMPARA_NAME,
Expand Down Expand Up @@ -50,6 +51,10 @@ def installed_aligns(tmp_path):
# make two alignment paths with similar names
(align_dir / "10_primates.epo.sqlitedb").open(mode="w")
(align_dir / "24_primates.epo_extended.sqlitedb").open(mode="w")
# and their associated HDF5 seqs
(align_dir / f"10_primates.epo.{_GAP_STORE_SUFFIX}").open(mode="w")
(align_dir / f"24_primates.epo_extended.{_GAP_STORE_SUFFIX}").open(mode="w")

return InstalledConfig(release="11", install_path=tmp_path)


Expand Down

0 comments on commit 0393854

Please sign in to comment.