Skip to content

Commit

Permalink
Merge pull request #193 from GavinHuttley/develop
Browse files Browse the repository at this point in the history
ENH: display repeat info plus other cli improvements
  • Loading branch information
GavinHuttley authored Feb 23, 2025
2 parents 2f370a3 + a0c5b1a commit 643fc2b
Show file tree
Hide file tree
Showing 9 changed files with 141 additions and 46 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,7 @@ We provide a conventional command line interface for querying the data with subc
Commands:
alignments export multiple alignments in fasta format for named genes
compara-summary summary data for compara
download download data from Ensembl's ftp site
dump-genes export meta-data table for genes from one species to...
exportrc exports sample config and species table to the nominated...
Expand Down
1 change: 0 additions & 1 deletion src/ensembl_tui/_annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,7 +253,6 @@ def distinct(self) -> tuple[str, ...]:
sql = f"SELECT DISTINCT biotype FROM {self._tables[0]}"
return tuple(r[0] for r in self.conn.sql(sql).fetchall())

@functools.cached_property
def count_distinct(self) -> "Table":
sql = (
f"SELECT biotype, COUNT(*) AS freq FROM {self._tables[0]} GROUP BY biotype"
Expand Down
33 changes: 31 additions & 2 deletions src/ensembl_tui/_genome.py
Original file line number Diff line number Diff line change
Expand Up @@ -601,7 +601,7 @@ def get_gene_table_for_species(
return table


def get_species_summary(
def get_species_gene_summary(
*,
annot_db: eti_annots.Annotations,
species: str | None = None,
Expand All @@ -618,7 +618,7 @@ def get_species_summary(
"""
# for now, just biotype
species = species or annot_db.source.parent.name
counts = annot_db.biotypes.count_distinct
counts = annot_db.biotypes.count_distinct()
try:
common_name = eti_species.Species.get_common_name(species)
except ValueError:
Expand All @@ -627,3 +627,32 @@ def get_species_summary(
counts.title = f"{common_name} features"
counts.format_column("count", lambda x: f"{x:,}")
return counts


def get_species_repeat_summary(
    *,
    annot_db: eti_annots.Annotations,
    species: str | None = None,
) -> Table:
    """
    returns a Table summarising the repeat data for a species

    Parameters
    ----------
    annot_db
        feature db
    species
        species name, overrides inference from annot_db.source

    Notes
    -----
    The counts are requested with both repeat_class and repeat_type
    enabled and the rows are sorted on the "repeat_type" and "count"
    columns.
    """
    if not species:
        # no name given, infer it from the parent of annot_db.source
        species = annot_db.source.parent.name

    repeat_counts = annot_db.repeats.count_distinct(
        repeat_class=True,
        repeat_type=True,
    )

    try:
        display_name = eti_species.Species.get_common_name(species)
    except ValueError:
        # get_common_name() failed, fall back to the species name as given
        display_name = species

    summary = repeat_counts.sorted(columns=["repeat_type", "count"])
    summary.title = f"{display_name} repeat"
    # show counts with thousands separators
    summary.format_column("count", lambda x: f"{x:,}")
    return summary
9 changes: 8 additions & 1 deletion src/ensembl_tui/_homology.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@
from ensembl_tui import _storage_mixin as eti_storage
from ensembl_tui import _util as eti_util

if typing.TYPE_CHECKING:
from cogent3.util.table import Table

HOMOLOGY_ATTR_SCHEMA = (
"rowid INTEGER PRIMARY KEY DEFAULT nextval('rowid_seq')",
"homology_id INTEGER",
Expand Down Expand Up @@ -150,7 +153,11 @@ def num_records(self) -> int:
sql = "SELECT COUNT(DISTINCT homology_id) FROM homology_groups_attr"
return self.conn.sql(sql).fetchone()[0]

def count_distinct(self, species: bool = False, homology_type: bool = False) -> int:
def count_distinct(
self,
species: bool = False,
homology_type: bool = False,
) -> "Table":
columns = []
if species:
columns.append("species_db")
Expand Down
4 changes: 2 additions & 2 deletions src/ensembl_tui/_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -493,9 +493,9 @@ class _printer: # noqa: N801
def __init__(self) -> None:
self._console = self.Console()

def __call__(self, text: str, colour: str) -> None:
def __call__(self, text: str, colour: str, style: str = "") -> None:
"""print text in colour"""
msg = rich_text.Text(text)
msg = rich_text.Text(text, style=style)
msg.stylize(colour)
self._console.print(msg)

Expand Down
104 changes: 67 additions & 37 deletions src/ensembl_tui/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from ensembl_tui import _config as eti_config
from ensembl_tui import _download as eti_download
from ensembl_tui import _genome as eti_genome
from ensembl_tui import _homology as eti_homology
from ensembl_tui import _species as eti_species
from ensembl_tui import _util as eti_util

Expand All @@ -27,7 +28,7 @@ def _get_coord_names(ctx, param, coord_names) -> list[str] | None:
return [f.strip() for f in coord_names.split(",")]


def _get_installed_config_path(ctx, param, path) -> eti_util.PathType:
def _get_installed_config_path(ctx, param, path) -> pathlib.Path:
"""path to installed.cfg"""
path = pathlib.Path(path)
if path.name == eti_config.INSTALLED_CONFIG_NAME:
Expand Down Expand Up @@ -72,7 +73,6 @@ def _species_names_from_csv(ctx, param, species) -> list[str] | None:
_cfgpath = click.option(
"-c",
"--configpath",
default=eti_download.DEFAULT_CFG,
type=pathlib.Path,
help="Path to config file specifying databases, (only "
"species or compara at present).",
Expand Down Expand Up @@ -183,7 +183,7 @@ def main():

@main.command(**_click_command_opts)
@_dbrc_out
def exportrc(outpath):
def exportrc(outpath: pathlib.Path) -> None:
"""exports sample config and species table to the nominated path"""

outpath = outpath.expanduser()
Expand All @@ -204,16 +204,18 @@ def exportrc(outpath):
@_cfgpath
@_debug
@_verbose
def download(configpath, debug, verbose):
def download(configpath: pathlib.Path, debug: bool, verbose: bool) -> None:
"""download data from Ensembl's ftp site"""
from rich import progress

if configpath.name == eti_download.DEFAULT_CFG:
# TODO is this statement correct if we're setting a root dir now?
if not configpath:
eti_util.print_colour(
text="WARN: using the built in demo cfg, will write to /tmp",
colour="yellow",
text="No config specified, exiting.",
colour="red",
style="bold",
)
sys.exit(1)

config = eti_config.read_config(configpath, root_dir=pathlib.Path.cwd())

if verbose:
Expand Down Expand Up @@ -323,7 +325,7 @@ def install(

@main.command(**_click_command_opts)
@_installed
def installed(installed):
def installed(installed: pathlib.Path) -> None:
"""show what is installed"""
from cogent3 import make_table

Expand All @@ -340,26 +342,33 @@ def installed(installed):
data["species"].append(name)
data["common name"].append(cn)

table = make_table(data=data, title="Installed genomes")
table = make_table(data=data, title="Installed genomes:")
eti_util.rich_display(table)

if config.homologies_path.exists():
eti_util.print_colour("Installed homologies: ✅", colour="blue", style="bold")

# TODO as above
compara_aligns = config.aligns_path
if compara_aligns.exists():
align_names = {
fn.stem for fn in compara_aligns.glob("*") if not fn.name.startswith(".")
}
eti_util.print_colour(
"Installed whole genome alignments:",
colour="blue",
style="bold",
)
table = make_table(
data={"align name": list(align_names)},
title="Installed whole genome alignments",
)
eti_util.rich_display(table)


@main.command(**_click_command_opts)
@_installed
@_species
def species_summary(installed, species):
def species_summary(installed: pathlib.Path, species: str) -> None:
"""genome summary data for a species"""

config = eti_config.read_installed_cfg(installed)
Expand All @@ -378,10 +387,28 @@ def species_summary(installed, species):
annot_db = eti_genome.load_annotations_for_species(
path=config.installed_genome(species=species),
)
summary = eti_genome.get_species_summary(annot_db=annot_db, species=species)
summary = eti_genome.get_species_gene_summary(annot_db=annot_db, species=species)
eti_util.rich_display(summary)
summary = eti_genome.get_species_repeat_summary(annot_db=annot_db, species=species)
eti_util.rich_display(summary)


@main.command(**_click_command_opts)
@_installed
def compara_summary(installed: pathlib.Path) -> None:
    """summary data for compara"""
    # NOTE: the docstring above is kept verbatim, it is the text shown for
    # this subcommand in the command listing

    config = eti_config.read_installed_cfg(installed)
    if not config.homologies_path.exists():
        # tell the user why there is nothing to display instead of
        # returning silently; the exit status is left unchanged
        eti_util.print_colour(
            text="No installed homology data found.",
            colour="red",
        )
        return

    db = eti_homology.load_homology_db(path=config.homologies_path)
    # one row per homology type with the number of records of that type
    table = db.count_distinct(homology_type=True)
    table.title = "Homology types"
    # show counts with thousands separators
    table.format_column("count", lambda x: f"{x:,}")
    eti_util.rich_display(table)


@main.command(**_click_command_opts)
@_installed
@_outdir
Expand All @@ -394,17 +421,17 @@ def species_summary(installed, species):
@_force
@_verbose
def alignments(
installed,
outdir,
align_name,
ref,
coord_names,
ref_genes_file,
mask_features,
limit,
force_overwrite,
verbose,
):
installed: pathlib.Path,
outdir: pathlib.Path,
align_name: str,
ref: str,
coord_names: str,
ref_genes_file: pathlib.Path,
mask_features: pathlib.Path,
limit: int,
force_overwrite: bool,
verbose: bool,
) -> None:
"""export multiple alignments in fasta format for named genes"""
from cogent3 import load_table
from rich import progress
Expand Down Expand Up @@ -546,21 +573,19 @@ def alignments(
@_force
@_verbose
def homologs(
installed,
outdir,
relationship,
ref,
coord_names,
num_procs,
limit,
force_overwrite,
verbose,
):
installed: pathlib.Path,
outdir: pathlib.Path,
relationship: str,
ref: str,
coord_names: str,
num_procs: int,
limit: int,
force_overwrite: bool,
verbose: bool,
) -> None:
"""exports CDS sequence data in fasta format for homology type relationship"""
from rich import progress

from ensembl_tui import _homology as eti_homology

LOGGER = CachingLogger()
LOGGER.log_args()

Expand Down Expand Up @@ -671,7 +696,12 @@ def homologs(
@_species
@_outdir
@_limit
def dump_genes(installed, species, outdir, limit):
def dump_genes(
installed: pathlib.Path,
species: str,
outdir: pathlib.Path,
limit: int,
) -> None:
"""export meta-data table for genes from one species to <species>-<release>.gene_metadata.tsv"""

config = eti_config.read_installed_cfg(installed)
Expand Down
2 changes: 1 addition & 1 deletion tests/test_annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def test_biotype_view(worm_biotypes):
distinct = bt.distinct
assert "protein_coding" in distinct
assert "miRNA" in distinct
counts = bt.count_distinct
counts = bt.count_distinct()
assert counts["protein_coding", "count"] > 10_000


Expand Down
18 changes: 18 additions & 0 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,12 @@ def test_download(tmp_config):
assert r.exit_code == 0, r.output


def test_download_no_config():
    """download without a config path fails and reports the missing config"""
    result = RUNNER.invoke(
        eti_cli.download,
        ["-d"],
        catch_exceptions=False,
    )
    output = result.output
    assert result.exit_code != 0, output
    assert "No config" in output


def test_exportrc(tmp_dir):
"""exportrc works correctly"""
outdir = tmp_dir / "exported"
Expand Down Expand Up @@ -164,3 +170,15 @@ def test_homologs_coord_name(installed, tmp_dir):
assert r.exit_code == 0, r.output
dstore = cogent3.open_data_store(outdir, suffix="fa", mode="r")
assert len(dstore.completed) == limit


@pytest.mark.slow
def test_compara_summary(installed):
    """compara-summary succeeds and displays the homology type counts"""
    args = [f"-i{installed}"]
    result = RUNNER.invoke(eti_cli.compara_summary, args, catch_exceptions=False)
    output = result.output
    assert result.exit_code == 0, output
    # the column header and at least one known homology type are displayed
    for expected in ("homology_type", "ortholog_one2many"):
        assert expected in output
15 changes: 13 additions & 2 deletions tests/test_genome.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,13 +37,24 @@ def test_get_gene_table_for_species(yeast_db):
assert len(got) > 0


def test_get_species_summary(yeast_db):
def test_get_species_gene_summary(yeast_db):
from cogent3.util.table import Table

got = eti_genome.get_species_summary(annot_db=yeast_db)
got = eti_genome.get_species_gene_summary(annot_db=yeast_db)
# we do not check values here, only the Type and that we have > 0 records
assert isinstance(got, Table)
assert len(got) > 0
assert "biotype" in got.header


def test_get_species_repeat_summary(yeast_db):
    """the repeat summary is a non-empty Table with a repeat_type column"""
    from cogent3.util.table import Table

    summary = eti_genome.get_species_repeat_summary(annot_db=yeast_db)
    # values are not checked, only the type, that there is at least one
    # record and that the expected column is present
    assert isinstance(summary, Table)
    num_rows = len(summary)
    assert num_rows > 0
    assert "repeat_type" in summary.header


def test_genome_coord_names(yeast_db):
Expand Down

0 comments on commit 643fc2b

Please sign in to comment.