Skip to content

Commit

Permalink
Merge pull request #762 from broadinstitute/kl/raw_adj_check
Browse files Browse the repository at this point in the history
Add nhomalt_metric param and tests for check_raw_and_adj_callstats
  • Loading branch information
klaricch authored Feb 13, 2025
2 parents dae5f93 + 388e2b7 commit b0bf901
Show file tree
Hide file tree
Showing 2 changed files with 159 additions and 6 deletions.
29 changes: 23 additions & 6 deletions gnomad/assessment/validity_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -592,6 +592,7 @@ def check_raw_and_adj_callstats(
verbose: bool,
delimiter: str = "-",
metric_first_field: bool = True,
nhomalt_metric: str = "nhomalt",
) -> None:
"""
Perform validity checks on raw and adj data in input Table/MatrixTable.
Expand All @@ -608,6 +609,7 @@ def check_raw_and_adj_callstats(
:param verbose: If True, show top values of annotations being checked, including checks that pass; if False, show only top values of annotations that fail checks.
:param delimiter: String to use as delimiter when making group label combinations. Default is "-".
:param metric_first_field: If True, metric precedes label group, e.g. AC-afr-male. If False, label group precedes metric, afr-male-AC. Default is True.
:param nhomalt_metric: Name of metric denoting homozygous alternate counts. Default is "nhomalt".
:return: None
"""
t = t.rows() if isinstance(t, hl.MatrixTable) else t
Expand All @@ -616,7 +618,7 @@ def check_raw_and_adj_callstats(

for group in ["raw", "adj"]:
# Check AC and nhomalt missing if AN is missing and defined if AN is defined.
for subfield in ["AC", "nhomalt"]:
for subfield in ["AC", nhomalt_metric]:
check_field = f"{subfield}{delimiter}{group}"
an_field = f"AN{delimiter}{group}"
field_check_expr[
Expand All @@ -633,6 +635,21 @@ def check_raw_and_adj_callstats(
),
}

# Check that nhomalt <= AC / 2.
check_field_nhomalt = f"{nhomalt_metric}{delimiter}{group}"
check_field_AC = f"AC{delimiter}{group}"

field_check_expr[f"{check_field_nhomalt} <= {check_field_AC} / 2"] = {
"expr": t.info[check_field_nhomalt] > (t.info[check_field_AC] / 2),
"agg_func": hl.agg.count_where,
"display_fields": hl.struct(
**{
check_field_nhomalt: t.info[check_field_nhomalt],
check_field_AC: t.info[check_field_AC],
}
),
}

# Check AF missing if AN is missing and defined if AN is defined and > 0.
check_field = f"AF{delimiter}{group}"
an_field = f"AN{delimiter}{group}"
Expand Down Expand Up @@ -662,15 +679,15 @@ def check_raw_and_adj_callstats(
}

for subfield in ["AC", "AF"]:
# Check raw AC, AF > 0
# If defined, check that raw AC, AF > 0.
check_field = f"{subfield}{delimiter}raw"
field_check_expr[f"{check_field} > 0"] = {
"expr": t.info[check_field] <= 0,
"agg_func": hl.agg.count_where,
"display_fields": hl.struct(**{check_field: t.info[check_field]}),
}

# Check adj AC, AF > 0
# If defined, check adj AC, AF >= 0.
check_field = f"{subfield}{delimiter}adj"
field_check_expr[f"{check_field} >= 0"] = {
"expr": t.info[check_field] < 0,
Expand All @@ -680,8 +697,8 @@ def check_raw_and_adj_callstats(
),
}

# Check overall gnomad's raw subfields >= adj
for subfield in ["AC", "AN", "nhomalt"]:
# Check overall gnomad's raw subfields >= adj.
for subfield in ["AC", "AN", nhomalt_metric]:
check_field_left = f"{subfield}{delimiter}raw"
check_field_right = f"{subfield}{delimiter}adj"

Expand All @@ -697,7 +714,7 @@ def check_raw_and_adj_callstats(
}

for subset in subsets:
# Add delimiter for subsets but not "" representing entire callset
# Add delimiter for subsets but not "" representing entire callset.
if subset:
subset += delimiter
field_check_label = (
Expand Down
136 changes: 136 additions & 0 deletions tests/assessment/test_validity_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

from gnomad.assessment.validity_checks import (
check_missingness_of_struct,
check_raw_and_adj_callstats,
check_sex_chr_metrics,
flatten_missingness_struct,
make_group_sum_expr_dict,
Expand Down Expand Up @@ -497,3 +498,138 @@ def test_sum_group_callstats(ht_for_group_sums, caplog) -> None:

for msg in expected_logs:
assert msg in log_messages, f"Expected log message is missing: {msg}"


@pytest.fixture
def ht_for_check_raw_and_adj_callstats() -> hl.Table:
"""Fixture to create a Hail Table with the expected structure and test values for check_raw_and_adj_callstats, using underscore as the delimiter."""
data = [
{
"idx": 0,
"info": {
"AC_raw": 5,
"AC_adj": 3,
"AF_raw": 0.02,
"AF_adj": 0.01,
"AN_raw": 2500,
"AN_adj": 2400,
"nhomalt_raw": 1, # Defined since AN_raw is defined
"nhomalt_adj": 0,
},
"filters": hl.empty_set(hl.tstr),
},
{
"idx": 1,
"info": {
"AC_raw": 0,
"AC_adj": 0,
"AF_raw": 0.0,
"AF_adj": 0.0,
"AN_raw": 0,
"AN_adj": 0,
"nhomalt_raw": None,
"nhomalt_adj": None,
},
"filters": hl.empty_set(hl.tstr),
},
{
"idx": 2,
"info": {
"AC_raw": -1,
"AC_adj": -1,
"AF_raw": -0.01,
"AF_adj": -0.01,
"AN_raw": 1000,
"AN_adj": 1100,
"nhomalt_raw": -3,
"nhomalt_adj": 2,
},
"filters": {"LowQual"},
},
{
"idx": 3,
"info": {
"AC_raw": 10,
"AF_raw": 0.05,
"AN_raw": 3000,
"AN_adj": 2000,
"AC_adj": 8,
"AF_adj": 0.02,
"nhomalt_raw": 3,
"nhomalt_adj": 1,
},
"filters": hl.empty_set(hl.tstr),
},
{
"idx": 4,
"info": {
"AC_raw": None,
"AF_raw": 0.05,
"AN_raw": None,
"AN_adj": 1000,
"AC_adj": 8,
"AF_adj": 0.03,
"nhomalt_raw": None,
"nhomalt_adj": 1,
},
"filters": hl.empty_set(hl.tstr),
},
]

ht = hl.Table.parallelize(
data,
hl.tstruct(
idx=hl.tint32,
info=hl.tstruct(
AC_raw=hl.tint32,
AC_adj=hl.tint32,
AF_raw=hl.tfloat64,
AF_adj=hl.tfloat64,
AN_raw=hl.tint32,
AN_adj=hl.tint32,
nhomalt_raw=hl.tint32,
nhomalt_adj=hl.tint32,
),
filters=hl.tset(hl.tstr),
),
)

return ht


def test_check_raw_and_adj_callstats(
ht_for_check_raw_and_adj_callstats, caplog
) -> None:
"""Test check_raw_and_adj_callstats function and it's expected log output."""
ht = ht_for_check_raw_and_adj_callstats
with caplog.at_level(logging.INFO, logger="gnomad.assessment.validity_checks"):
check_raw_and_adj_callstats(
ht, subsets=[""], verbose=True, delimiter="_", metric_first_field=True
)

log_messages = [record.getMessage() for record in caplog.records]

expected_logs = [
# Expected PASSES.
"PASSED AC_raw defined when AN defined and missing when AN missing check",
"PASSED AC_adj defined when AN defined and missing when AN missing check",
"PASSED AF_adj defined when AN defined (and > 0) and missing when AN missing check",
"PASSED AC_raw >= AC_adj check",
"PASSED nhomalt_raw <= AC_raw / 2 check",
# Expected FAILURES.
"Found 1 sites that fail nhomalt_raw defined when AN defined and missing when AN missing check:",
"Found 1 sites that fail AF_raw defined when AN defined (and > 0) and missing when AN missing check:",
"Found 1 sites that fail AF_raw missing when AN 0 check:",
"Found 1 sites that fail nhomalt_adj defined when AN defined and missing when AN missing check:",
"Found 1 sites that fail AF_adj missing when AN 0 check:",
"Found 2 sites that fail AC_raw > 0 check:",
"Found 1 sites that fail AC_adj >= 0 check:",
"Found 2 sites that fail AF_raw > 0 check:",
"Found 1 sites that fail AF_adj >= 0 check:",
"Found 1 sites that fail AN_raw >= AN_adj check:",
"Found 1 sites that fail nhomalt_raw >= nhomalt_adj check:",
"Found 1 sites that fail nhomalt_adj <= AC_adj / 2 check:",
]

for msg in expected_logs:
assert msg in log_messages, f"Expected log message is missing: {msg}"

0 comments on commit b0bf901

Please sign in to comment.