Merge pull request #762 from broadinstitute/kl/raw_adj_check

Add nhomalt_metric param and tests for check_raw_and_adj_callstats
broadinstitute · Feb 13, 2025 · b0bf901 · b0bf901
2 parents dae5f93 + 388e2b7
commit b0bf901
Show file tree

Hide file tree

Showing 2 changed files with 159 additions and 6 deletions.
diff --git a/gnomad/assessment/validity_checks.py b/gnomad/assessment/validity_checks.py
@@ -592,6 +592,7 @@ def check_raw_and_adj_callstats(
     verbose: bool,
     delimiter: str = "-",
     metric_first_field: bool = True,
+    nhomalt_metric: str = "nhomalt",
 ) -> None:
     """
     Perform validity checks on raw and adj data in input Table/MatrixTable.
@@ -608,6 +609,7 @@ def check_raw_and_adj_callstats(
     :param verbose: If True, show top values of annotations being checked, including checks that pass; if False, show only top values of annotations that fail checks.
     :param delimiter: String to use as delimiter when making group label combinations. Default is "-".
     :param metric_first_field: If True, metric precedes label group, e.g. AC-afr-male. If False, label group precedes metric, afr-male-AC. Default is True.
+    :param nhomalt_metric: Name of metric denoting homozygous alternate counts. Default is "nhomalt".
     :return: None
     """
     t = t.rows() if isinstance(t, hl.MatrixTable) else t
@@ -616,7 +618,7 @@ def check_raw_and_adj_callstats(
 
     for group in ["raw", "adj"]:
         # Check AC and nhomalt missing if AN is missing and defined if AN is defined.
-        for subfield in ["AC", "nhomalt"]:
+        for subfield in ["AC", nhomalt_metric]:
             check_field = f"{subfield}{delimiter}{group}"
             an_field = f"AN{delimiter}{group}"
             field_check_expr[
@@ -633,6 +635,21 @@ def check_raw_and_adj_callstats(
                 ),
             }
 
+        # Check that nhomalt <= AC / 2.
+        check_field_nhomalt = f"{nhomalt_metric}{delimiter}{group}"
+        check_field_AC = f"AC{delimiter}{group}"
+
+        field_check_expr[f"{check_field_nhomalt} <= {check_field_AC} / 2"] = {
+            "expr": t.info[check_field_nhomalt] > (t.info[check_field_AC] / 2),
+            "agg_func": hl.agg.count_where,
+            "display_fields": hl.struct(
+                **{
+                    check_field_nhomalt: t.info[check_field_nhomalt],
+                    check_field_AC: t.info[check_field_AC],
+                }
+            ),
+        }
+
         # Check AF missing if AN is missing and defined if AN is defined and > 0.
         check_field = f"AF{delimiter}{group}"
         an_field = f"AN{delimiter}{group}"
@@ -662,15 +679,15 @@ def check_raw_and_adj_callstats(
         }
 
     for subfield in ["AC", "AF"]:
-        # Check raw AC, AF > 0
+        # If defined, check that raw AC, AF > 0.
         check_field = f"{subfield}{delimiter}raw"
         field_check_expr[f"{check_field} > 0"] = {
             "expr": t.info[check_field] <= 0,
             "agg_func": hl.agg.count_where,
             "display_fields": hl.struct(**{check_field: t.info[check_field]}),
         }
 
-        # Check adj AC, AF > 0
+        # If defined, check adj AC, AF >= 0.
         check_field = f"{subfield}{delimiter}adj"
         field_check_expr[f"{check_field} >= 0"] = {
             "expr": t.info[check_field] < 0,
@@ -680,8 +697,8 @@ def check_raw_and_adj_callstats(
             ),
         }
 
-    # Check overall gnomad's raw subfields >= adj
-    for subfield in ["AC", "AN", "nhomalt"]:
+    # Check overall gnomad's raw subfields >= adj.
+    for subfield in ["AC", "AN", nhomalt_metric]:
         check_field_left = f"{subfield}{delimiter}raw"
         check_field_right = f"{subfield}{delimiter}adj"
 
@@ -697,7 +714,7 @@ def check_raw_and_adj_callstats(
         }
 
         for subset in subsets:
-            # Add delimiter for subsets but not "" representing entire callset
+            # Add delimiter for subsets but not "" representing entire callset.
             if subset:
                 subset += delimiter
             field_check_label = (

diff --git a/tests/assessment/test_validity_checks.py b/tests/assessment/test_validity_checks.py
@@ -8,6 +8,7 @@
 
 from gnomad.assessment.validity_checks import (
     check_missingness_of_struct,
+    check_raw_and_adj_callstats,
     check_sex_chr_metrics,
     flatten_missingness_struct,
     make_group_sum_expr_dict,
@@ -497,3 +498,138 @@ def test_sum_group_callstats(ht_for_group_sums, caplog) -> None:
 
     for msg in expected_logs:
         assert msg in log_messages, f"Expected log message is missing: {msg}"
+
+
+@pytest.fixture
+def ht_for_check_raw_and_adj_callstats() -> hl.Table:
+    """Fixture to create a Hail Table with the expected structure and test values for check_raw_and_adj_callstats, using underscore as the delimiter."""
+    data = [
+        {
+            "idx": 0,
+            "info": {
+                "AC_raw": 5,
+                "AC_adj": 3,
+                "AF_raw": 0.02,
+                "AF_adj": 0.01,
+                "AN_raw": 2500,
+                "AN_adj": 2400,
+                "nhomalt_raw": 1,  # Defined since AN_raw is defined
+                "nhomalt_adj": 0,
+            },
+            "filters": hl.empty_set(hl.tstr),
+        },
+        {
+            "idx": 1,
+            "info": {
+                "AC_raw": 0,
+                "AC_adj": 0,
+                "AF_raw": 0.0,
+                "AF_adj": 0.0,
+                "AN_raw": 0,
+                "AN_adj": 0,
+                "nhomalt_raw": None,
+                "nhomalt_adj": None,
+            },
+            "filters": hl.empty_set(hl.tstr),
+        },
+        {
+            "idx": 2,
+            "info": {
+                "AC_raw": -1,
+                "AC_adj": -1,
+                "AF_raw": -0.01,
+                "AF_adj": -0.01,
+                "AN_raw": 1000,
+                "AN_adj": 1100,
+                "nhomalt_raw": -3,
+                "nhomalt_adj": 2,
+            },
+            "filters": {"LowQual"},
+        },
+        {
+            "idx": 3,
+            "info": {
+                "AC_raw": 10,
+                "AF_raw": 0.05,
+                "AN_raw": 3000,
+                "AN_adj": 2000,
+                "AC_adj": 8,
+                "AF_adj": 0.02,
+                "nhomalt_raw": 3,
+                "nhomalt_adj": 1,
+            },
+            "filters": hl.empty_set(hl.tstr),
+        },
+        {
+            "idx": 4,
+            "info": {
+                "AC_raw": None,
+                "AF_raw": 0.05,
+                "AN_raw": None,
+                "AN_adj": 1000,
+                "AC_adj": 8,
+                "AF_adj": 0.03,
+                "nhomalt_raw": None,
+                "nhomalt_adj": 1,
+            },
+            "filters": hl.empty_set(hl.tstr),
+        },
+    ]
+
+    ht = hl.Table.parallelize(
+        data,
+        hl.tstruct(
+            idx=hl.tint32,
+            info=hl.tstruct(
+                AC_raw=hl.tint32,
+                AC_adj=hl.tint32,
+                AF_raw=hl.tfloat64,
+                AF_adj=hl.tfloat64,
+                AN_raw=hl.tint32,
+                AN_adj=hl.tint32,
+                nhomalt_raw=hl.tint32,
+                nhomalt_adj=hl.tint32,
+            ),
+            filters=hl.tset(hl.tstr),
+        ),
+    )
+
+    return ht
+
+
+def test_check_raw_and_adj_callstats(
+    ht_for_check_raw_and_adj_callstats, caplog
+) -> None:
+    """Test check_raw_and_adj_callstats function and it's expected log output."""
+    ht = ht_for_check_raw_and_adj_callstats
+    with caplog.at_level(logging.INFO, logger="gnomad.assessment.validity_checks"):
+        check_raw_and_adj_callstats(
+            ht, subsets=[""], verbose=True, delimiter="_", metric_first_field=True
+        )
+
+    log_messages = [record.getMessage() for record in caplog.records]
+
+    expected_logs = [
+        # Expected PASSES.
+        "PASSED AC_raw defined when AN defined and missing when AN missing check",
+        "PASSED AC_adj defined when AN defined and missing when AN missing check",
+        "PASSED AF_adj defined when AN defined (and > 0) and missing when AN missing check",
+        "PASSED AC_raw >= AC_adj check",
+        "PASSED nhomalt_raw <= AC_raw / 2 check",
+        # Expected FAILURES.
+        "Found 1 sites that fail nhomalt_raw defined when AN defined and missing when AN missing check:",
+        "Found 1 sites that fail AF_raw defined when AN defined (and > 0) and missing when AN missing check:",
+        "Found 1 sites that fail AF_raw missing when AN 0 check:",
+        "Found 1 sites that fail nhomalt_adj defined when AN defined and missing when AN missing check:",
+        "Found 1 sites that fail AF_adj missing when AN 0 check:",
+        "Found 2 sites that fail AC_raw > 0 check:",
+        "Found 1 sites that fail AC_adj >= 0 check:",
+        "Found 2 sites that fail AF_raw > 0 check:",
+        "Found 1 sites that fail AF_adj >= 0 check:",
+        "Found 1 sites that fail AN_raw >= AN_adj check:",
+        "Found 1 sites that fail nhomalt_raw >= nhomalt_adj check:",
+        "Found 1 sites that fail nhomalt_adj <= AC_adj / 2 check:",
+    ]
+
+    for msg in expected_logs:
+        assert msg in log_messages, f"Expected log message is missing: {msg}"