From f24e6a52e1aa8e44408586b4236349bb3b9f86e4 Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Tue, 21 Jan 2025 17:55:08 -0500 Subject: [PATCH 01/56] Add function to compute post probability of de novos --- gnomad/sample_qc/relatedness.py | 148 ++++++++++++++++++++++++++++++++ 1 file changed, 148 insertions(+) diff --git a/gnomad/sample_qc/relatedness.py b/gnomad/sample_qc/relatedness.py index b10880261..76f7566a2 100644 --- a/gnomad/sample_qc/relatedness.py +++ b/gnomad/sample_qc/relatedness.py @@ -7,6 +7,7 @@ import hail as hl import networkx as nx +from gnomad.utils.filtering import add_filters_expr logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s") logger = logging.getLogger(__name__) @@ -1298,3 +1299,150 @@ def _get_alt_count(locus, gt, is_female): ) return sib_stats + + +def get_freq_prior(freq_prior_expr: hl.expr.Float64Expression, min_pop_prior=100 / 3e7): + """ + Get the population frequency prior for a de novo mutation. + + :param freq_prior_expr: The population frequency prior for the variant. + :param min_pop_prior: The minimum population frequency prior. + """ + return hl.max( + hl.or_else( + hl.case() + .when((freq_prior_expr >= 0) & (freq_prior_expr <= 1), freq_prior_expr) + .or_error( + hl.format( + "de_novo: expect 0 <= freq_prior_expr <= 1, found %.3e", + freq_prior_expr, + ) + ), + 0.0, + ), + min_pop_prior, + ) + + +def transform_pl_to_pp(pl_expr: hl.expr.ArrayExpression) -> hl.expr.ArrayExpression: + """ + Transform the PLs into the probability of observing genotype. + + :param pl_expr: ArrayExpression of PL. + :return: ArrayExpression of the probability of observing each genotype. 
+ """ + return hl.bind(lambda x: x / hl.sum(x), 10 ** (-pl_expr / 10)) + + +def calculate_dn_post_prob( + pl_proband: hl.expr.ArrayExpression, + pl_father: hl.expr.ArrayExpression, + pl_mother: hl.expr.ArrayExpression, + freq_prior_expr: hl.expr.Float64Expression, + dn_prior: float = 1 / 3e7, + hemi_x: bool = False, + hemi_y: bool = False, +) -> hl.expr.Float64Expression: + """ + Calculate the posterior probability of a de novo mutation. + + This function computes the posterior probability of a de novo mutation (P_dn) + using the likelihoods of the proband and parents' genotypes and the population + frequency prior for the variant. + + Based on [Samocha et al. 2014](https://github.com/ksamocha/de_novo_scripts), + the posterior probability of a de novo mutation (P_dn) is computed as: + + P_dn = P(DN | data) / (P(DN | data) + P(missed het in parent(s) | data)) + + The terms are defined as: + - P(DN | data): The probability of a de novo mutation given the data. This is computed as: + P(DN | data) = P(data | DN) * P(DN) + + - P(data | DN): The probability of observing the data under the assumption of a de novo mutation: + * Autosomes and PAR regions: + P(data | DN) = P(hom_ref in father) * P(hom_ref in mother) * P(het in proband) + * X non-PAR regions (males only): + P(data | DN) = P(hom_ref in mother) * P(het in proband) + * Y non-PAR regions (males only): + P(data | DN) = P(hom_ref in father) * P(het in proband) + + - P(DN): The prior probability of a de novo mutation, fixed at: + P(DN) = 1 / 3e7 + + - P(missed het in parent(s) | data): The probability of observing missed het in + parent(s) given the data. 
This is computed as: + + P(missed het in parent(s) | data) = P(data | missed het in parent(s)) * P(missed het in parent(s)) + + - P(data | missed het in parent(s)): The probability of observing the data under + the assumption of a missed het in parent(s): + * Autosomes and PAR regions: + P(data | missed het in parents) = (P(het in father) * P(hom_ref in + mother) + P(hom_ref in father) * P(het in mother)) * P(het in proband) * + P(het in one parent) + * X non-PAR regions: + P(data | missed het in mother) = (P(het in mother) * P(hom_var in + mother)) * P(hom_var in proband) * P(het in one parent) + * Y non-PAR regions: + P(data | missed het in father) = (P(het in father) * P(hom_var in + father)) * P(hom_var in proband) * P(het in one parent) + - P(het in one parent): The prior probability of a het in one parent, fixed at: + 1 - (1 - freq_prior)**4, where freq_prior is the population frequency prior for the variant. + + Parameters + ---------- + pl_proband : hl.expr.ArrayExpression + Phred-scaled genotype likelihoods for the proband. + pl_father : hl.expr.ArrayExpression + Phred-scaled genotype likelihoods for the father. + pl_mother : hl.expr.ArrayExpression + Phred-scaled genotype likelihoods for the mother. + freq_prior_expr : hl.expr.Float64Expression + The population frequency prior for the variant. + dn_prior : float, optional + The prior probability of a de novo mutation, by default 1 / 3e7. + hemi_x : bool, optional + Whether the variant is in the non-PAR region of the X chromosome (males only). + hemi_y : bool, optional + Whether the variant is in the non-PAR region of the Y chromosome (males only). + + Returns + ------- + hl.expr.Float64Expression + Posterior probability of a de novo mutation (P_dn). 
+ """ + if hemi_x and hemi_y: + raise ValueError("Both hemi_x and hemi_y cannot be True simultaneously.") + + # Prior probability of a het in one parent + prior_one_het = 1 - (1 - freq_prior_expr) ** 4 + + # Convert PL to probabilities + pp_proband = transform_pl_to_pp(pl_proband) + pp_father = transform_pl_to_pp(pl_father) + pp_mother = transform_pl_to_pp(pl_mother) + + # Compute P(data | DN) and P(data | missed het in parent(s)) + if hemi_x: + prob_data_given_dn = pp_mother[0] * pp_proband[1] + prob_data_missed_het = ( + (pp_mother[1] + pp_mother[2]) * pp_proband[2] * prior_one_het + ) + elif hemi_y: + prob_data_given_dn = pp_father[0] * pp_proband[1] + prob_data_missed_het = ( + (pp_father[1] + pp_father[2]) * pp_proband[2] * prior_one_het + ) + else: + prob_data_given_dn = pp_father[0] * pp_mother[0] * pp_proband[1] + prob_data_missed_het = ( + (pp_father[1] * pp_mother[0] + pp_father[0] * pp_mother[1]) + * pp_proband[1] + * prior_one_het + ) + + # Compute P(DN | data) and normalize + prob_dn_given_data = prob_data_given_dn * dn_prior + p_dn = prob_dn_given_data / (prob_dn_given_data + prob_data_missed_het) + return p_dn From 3f3f8e31dbb5f361246192a178528636df54033d Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Fri, 24 Jan 2025 08:56:47 -0500 Subject: [PATCH 02/56] confidence and fail check function --- gnomad/sample_qc/relatedness.py | 207 +++++++++++++++++++++++++++++++- 1 file changed, 203 insertions(+), 4 deletions(-) diff --git a/gnomad/sample_qc/relatedness.py b/gnomad/sample_qc/relatedness.py index 76f7566a2..6e06160d1 100644 --- a/gnomad/sample_qc/relatedness.py +++ b/gnomad/sample_qc/relatedness.py @@ -1334,12 +1334,13 @@ def transform_pl_to_pp(pl_expr: hl.expr.ArrayExpression) -> hl.expr.ArrayExpress return hl.bind(lambda x: x / hl.sum(x), 10 ** (-pl_expr / 10)) -def calculate_dn_post_prob( +def calculate_de_novo_post_prob( pl_proband: hl.expr.ArrayExpression, pl_father: hl.expr.ArrayExpression, pl_mother: 
hl.expr.ArrayExpression, freq_prior_expr: hl.expr.Float64Expression, - dn_prior: float = 1 / 3e7, + min_pop_prior: float, + de_novo_prior: float, hemi_x: bool = False, hemi_y: bool = False, ) -> hl.expr.Float64Expression: @@ -1400,7 +1401,8 @@ def calculate_dn_post_prob( Phred-scaled genotype likelihoods for the mother. freq_prior_expr : hl.expr.Float64Expression The population frequency prior for the variant. - dn_prior : float, optional + min_pop_prior : float, optional. + de_novo_prior : float, optional. The prior probability of a de novo mutation, by default 1 / 3e7. hemi_x : bool, optional Whether the variant is in the non-PAR region of the X chromosome (males only). @@ -1415,6 +1417,8 @@ def calculate_dn_post_prob( if hemi_x and hemi_y: raise ValueError("Both hemi_x and hemi_y cannot be True simultaneously.") + freq_prior_expr = get_freq_prior(freq_prior_expr, min_pop_prior) + # Prior probability of a het in one parent prior_one_het = 1 - (1 - freq_prior_expr) ** 4 @@ -1443,6 +1447,201 @@ def calculate_dn_post_prob( ) # Compute P(DN | data) and normalize - prob_dn_given_data = prob_data_given_dn * dn_prior + prob_dn_given_data = prob_data_given_dn * de_novo_prior p_dn = prob_dn_given_data / (prob_dn_given_data + prob_data_missed_het) return p_dn + + +def get_de_novo_expr( + locus_expr: hl.expr.LocusExpression, + proband_expr: hl.expr.StructExpression, + father_expr: hl.expr.StructExpression, + mother_expr: hl.expr.StructExpression, + allele_expr: hl.expr.ArrayExpression, + is_female_expr: hl.expr.BooleanExpression, + freq_prior_expr: hl.expr.Float64Expression, + min_pop_prior: float = 100 / 3e7, + de_novo_prior: float = 1 / 3e7, + min_dp_ratio: float = 0.1, + min_gq: int = 20, + min_proband_ab: float = 0.2, + max_parent_ab: float = 0.05, + min_de_novo_p: float = 0.05, + high_conf_dp_ratio: float = 0.2, + dp_threshold_snp: int = 10, + high_med_conf_ab: float = 0.3, + low_conf_ab: float = 0.2, + high_conf_p: float = 0.99, + med_conf_p: float = 0.5, + 
low_conf_p: float = 0.2, +) -> hl.expr.StructExpression: + """ + Get the de novo status of a variant, based on the proband and parent genotypes. + + Thresholds: + ------------ + +----------------------+----------------------------+----------------+------------------+----------------+----------+-------+ + | Metric | FAIL | HIGH (Indel) | HIGH (SNV) 1 | HIGH (SNV 2) | MEDIUM | LOW | + +----------------------+----------------------------+----------------+------------------+----------------+----------+-------+ + | P (de novo) | < 0.05 | > 0.99 | > 0.99 | > 0.5 | > 0.5 | | + | AB | AB(proband) < 0.2 | AB > 0.3 | AB > 0.3 | AB > 0.3 | > 0.3 | > 0.2 | + | | OR AB(parent(s)) > 0.05 | | | | | 0.2 | + | AD | 0 in either parent | | | | | | + | DP | | | DP(proband) > 10 | | | | + | DR (DP ratio) | DP(proband/parent(s)) < 0.1| | | DR > 0.2 | | | + | GQ | GQ(proband) < 20 | | | | | | + | AC | | AC = 1 | | AC < 10 | | | + +----------------------+----------------------------+----------------+------------------+----------------+----------+-------+ + + locus_expr : hl.expr.LocusExpression + Variant's genomic locus. + proband_expr : hl.expr.StructExpression + Proband genotype details (e.g., DP, GQ, AD, GT). + father_expr : hl.expr.StructExpression + Father's genotype details (e.g., DP, AD). + mother_expr : hl.expr.StructExpression + Mother's genotype details (e.g., DP, AD). + allele_expr : hl.expr.ArrayExpression + Variant alleles. + is_female_expr : hl.expr.BooleanExpression + Whether the proband is female. + freq_prior_expr : hl.expr.Float64Expression + Population frequency prior for the variant. + min_pop_prior : float, optional + Minimum population frequency prior (default: 100 / 3e7). + de_novo_prior : float, optional + Prior probability of a de novo mutation (default: 1 / 3e7). + min_dp_ratio : float, optional + Minimum depth ratio for proband to parents (default: 0.1). + min_gq : int, optional + Minimum genotype quality for the proband (default: 20). 
+ min_proband_ab : float, optional + Minimum allele balance for the proband (default: 0.2). + max_parent_ab : float, optional + Maximum allele balance for parents (default: 0.05). + min_de_novo_p : float, optional + Minimum de novo probability to pass (default: 0.05). + high_conf_dp_ratio : float, optional + DP ratio threshold for high confidence (default: 0.2). + dp_threshold_snp : int, optional + Minimum depth for high-confidence SNPs (default: 10). + high_med_conf_ab : float, optional + AB threshold for high/medium confidence (default: 0.3). + low_conf_ab : float, optional + AB threshold for low confidence (default: 0.2). + high_conf_p : float, optional + P(de novo) threshold for high confidence (default: 0.99). + med_conf_p : float, optional + P(de novo) threshold for medium confidence (default: 0.5). + low_conf_p : float, optional + P(de novo) threshold for low confidence (default: 0.2). + + Returns + ------- + hl.expr.StructExpression + A struct containing: + - `confidence`: Confidence level ("HIGH", "MEDIUM", "LOW", or missing). + - `fail`: Boolean indicating if the variant fails any checks. + - `fail_reason`: Set of strings with reasons for failure. 
+ """ + # Determine genomic context + not_hemi_expr = locus_expr.in_autosome_or_par() | ( + locus_expr.in_x_nonpar() & is_female_expr + ) + hemi_x_expr = locus_expr.in_x_nonpar() & ~is_female_expr + hemi_y_expr = locus_expr.in_y_nonpar() & ~is_female_expr + + p_de_novo = calculate_de_novo_post_prob( + proband_expr.PL, + father_expr.PL, + mother_expr.PL, + freq_prior_expr, + min_pop_prior=min_pop_prior, + de_novo_prior=de_novo_prior, + hemi_x=hemi_x_expr, + hemi_y=hemi_y_expr, + ) + + # Calculate DP ratio + parent_dp = ( + hl.case() + .when(not_hemi_expr, father_expr.DP + mother_expr.DP) + .when(hemi_x_expr, mother_expr.DP) + .when(hemi_y_expr, father_expr.DP) + .or_missing() + ) + dp_ratio = proband_expr.DP / parent_dp + + # Key metrics + proband_ab = proband_expr.AD[1] / hl.sum(proband_expr.AD) + is_snp = hl.is_snp(allele_expr[0], allele_expr[1]) + + # Confidence assignment + confidence = ( + hl.case() + .when( + ( + is_snp + & (p_de_novo > 0.99) + & (proband_ab > high_med_conf_ab) + & ( + (proband_expr.DP > dp_threshold_snp) + | (dp_ratio > high_conf_dp_ratio) + ) + ) + | (~is_snp & (p_de_novo > high_conf_p) & (proband_ab > high_med_conf_ab)), + "HIGH", + ) + .when((p_de_novo > med_conf_p) & (proband_ab > high_med_conf_ab), "MEDIUM") + .when((p_de_novo > low_conf_p) & (proband_ab > low_conf_ab), "LOW") + .or_missing() + ) + + # Fail checks + fail_checks = { + "min_dp_ratio": dp_ratio < min_dp_ratio, + "parent_sum_ad_0": ( + hl.case() + .when(not_hemi_expr, (father_expr.AD[0] == 0) | (mother_expr.AD[0] == 0)) + .when(hemi_x_expr, mother_expr.AD[0] == 0) + .when(hemi_y_expr, father_expr.AD[0] == 0) + .or_missing() + ), + "max_parent_ab": ( + hl.case() + .when( + not_hemi_expr, + (father_expr.AD[1] / hl.sum(father_expr.AD) > max_parent_ab) + | (mother_expr.AD[1] / hl.sum(mother_expr.AD) > max_parent_ab), + ) + .when( + hemi_x_expr, mother_expr.AD[1] / hl.sum(mother_expr.AD) > max_parent_ab + ) + .when( + hemi_y_expr, father_expr.AD[1] / hl.sum(father_expr.AD) > 
max_parent_ab + ) + .or_missing() + ), + "min_proband_ab": proband_ab < min_proband_ab, + "min_proband_gq": proband_expr.GQ < min_gq, + "min_de_novo_p": p_de_novo < min_de_novo_p, + "not_de_novo": ( + not_hemi_expr + & ~( + proband_expr.GT.is_het() + & father_expr.GT.is_hom_ref() + & mother_expr.GT.is_hom_ref() + ) + | hemi_x_expr + & ~(proband_expr.GT.is_hom_var() & mother_expr.GT.is_hom_ref()) + | hemi_y_expr + & ~(proband_expr.GT.is_hom_var() & father_expr.GT.is_hom_ref()) + ), + } + + # Combine fail reasons + fail_reason = hl.set([key for key, value in fail_checks.items() if value]) + fail_expr = hl.any(list(fail_checks.values())) + + return hl.struct(p_de_novo=p_de_novo, confidence=confidence, fail=fail_expr, + fail_reason=fail_reason) From e681bdccfe0b11694b4d9ecc0220e4194bdbead1 Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Tue, 28 Jan 2025 15:24:40 -0500 Subject: [PATCH 03/56] Modify de novo function --- gnomad/sample_qc/relatedness.py | 274 ++++++++++++++++++-------------- 1 file changed, 153 insertions(+), 121 deletions(-) diff --git a/gnomad/sample_qc/relatedness.py b/gnomad/sample_qc/relatedness.py index 6e06160d1..bd1db343d 100644 --- a/gnomad/sample_qc/relatedness.py +++ b/gnomad/sample_qc/relatedness.py @@ -7,6 +7,8 @@ import hail as hl import networkx as nx +from hail.utils.tutorial import resources + from gnomad.utils.filtering import add_filters_expr logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s") @@ -1334,15 +1336,40 @@ def transform_pl_to_pp(pl_expr: hl.expr.ArrayExpression) -> hl.expr.ArrayExpress return hl.bind(lambda x: x / hl.sum(x), 10 ** (-pl_expr / 10)) +def get_genomic_context( + locus_expr: hl.expr.LocusExpression, + is_female_expr: hl.expr.BooleanExpression, +) -> Tuple[ + hl.expr.BooleanExpression, hl.expr.BooleanExpression, hl.expr.BooleanExpression +]: + """ + Determine the genomic context of a variant. + + :param locus_expr: LocusExpression of the variant. 
+ :param is_female_expr: BooleanExpression indicating whether the proband is female. + :return: A tuple of BooleanExpressions: + - not_hemi_expr: True if the variant is in autosomes or PAR regions. + - hemi_x_expr: True if the variant is in the X non-PAR region for males. + - hemi_y_expr: True if the variant is in the Y non-PAR region for males. + """ + not_hemi_expr = locus_expr.in_autosome_or_par() | ( + locus_expr.in_x_nonpar() & is_female_expr + ) + hemi_x_expr = locus_expr.in_x_nonpar() & ~is_female_expr + hemi_y_expr = locus_expr.in_y_nonpar() & ~is_female_expr + + return not_hemi_expr, hemi_x_expr, hemi_y_expr + + def calculate_de_novo_post_prob( pl_proband: hl.expr.ArrayExpression, pl_father: hl.expr.ArrayExpression, pl_mother: hl.expr.ArrayExpression, + locus_expr: hl.expr.LocusExpression, + is_female_expr: hl.expr.BooleanExpression, freq_prior_expr: hl.expr.Float64Expression, - min_pop_prior: float, - de_novo_prior: float, - hemi_x: bool = False, - hemi_y: bool = False, + min_pop_prior: Optional[float] = 100 / 3e7, + de_novo_prior: Optional[float] = 1 / 3e7, ) -> hl.expr.Float64Expression: """ Calculate the posterior probability of a de novo mutation. @@ -1383,43 +1410,29 @@ def calculate_de_novo_post_prob( mother) + P(hom_ref in father) * P(het in mother)) * P(het in proband) * P(het in one parent) * X non-PAR regions: - P(data | missed het in mother) = (P(het in mother) * P(hom_var in + P(data | missed het in mother) = (P(het in mother) + P(hom_var in mother)) * P(hom_var in proband) * P(het in one parent) * Y non-PAR regions: - P(data | missed het in father) = (P(het in father) * P(hom_var in + P(data | missed het in father) = (P(het in father) + P(hom_var in father)) * P(hom_var in proband) * P(het in one parent) - P(het in one parent): The prior probability of a het in one parent, fixed at: 1 - (1 - freq_prior)**4, where freq_prior is the population frequency prior for the variant. 
- Parameters - ---------- - pl_proband : hl.expr.ArrayExpression - Phred-scaled genotype likelihoods for the proband. - pl_father : hl.expr.ArrayExpression - Phred-scaled genotype likelihoods for the father. - pl_mother : hl.expr.ArrayExpression - Phred-scaled genotype likelihoods for the mother. - freq_prior_expr : hl.expr.Float64Expression - The population frequency prior for the variant. - min_pop_prior : float, optional. - de_novo_prior : float, optional. - The prior probability of a de novo mutation, by default 1 / 3e7. - hemi_x : bool, optional - Whether the variant is in the non-PAR region of the X chromosome (males only). - hemi_y : bool, optional - Whether the variant is in the non-PAR region of the Y chromosome (males only). - - Returns - ------- - hl.expr.Float64Expression - Posterior probability of a de novo mutation (P_dn). + :param pl_proband: Phred-scaled genotype likelihoods for the proband. + :param pl_father: Phred-scaled genotype likelihoods for the father. + :param pl_mother: Phred-scaled genotype likelihoods for the mother. + :param locus_expr: LocusExpression of the variant. + :param is_female_expr: BooleanExpression indicating the proband's sex. + :param freq_prior_expr: Population frequency prior for the variant. + :param min_pop_prior: Minimum population frequency prior (default: 1e-8). + :param de_novo_prior: Prior probability of a de novo mutation (default: 1/3e7). + :return: Posterior probability of a de novo mutation (P_dn). 
""" - if hemi_x and hemi_y: - raise ValueError("Both hemi_x and hemi_y cannot be True simultaneously.") + # Ensure valid genomic context + not_hemi_expr, hemi_x, hemi_y = get_genomic_context(locus_expr, is_female_expr) + # Adjust frequency prior freq_prior_expr = get_freq_prior(freq_prior_expr, min_pop_prior) - - # Prior probability of a het in one parent prior_one_het = 1 - (1 - freq_prior_expr) ** 4 # Convert PL to probabilities @@ -1427,37 +1440,42 @@ def calculate_de_novo_post_prob( pp_father = transform_pl_to_pp(pl_father) pp_mother = transform_pl_to_pp(pl_mother) - # Compute P(data | DN) and P(data | missed het in parent(s)) - if hemi_x: - prob_data_given_dn = pp_mother[0] * pp_proband[1] - prob_data_missed_het = ( - (pp_mother[1] + pp_mother[2]) * pp_proband[2] * prior_one_het - ) - elif hemi_y: - prob_data_given_dn = pp_father[0] * pp_proband[1] - prob_data_missed_het = ( - (pp_father[1] + pp_father[2]) * pp_proband[2] * prior_one_het - ) - else: - prob_data_given_dn = pp_father[0] * pp_mother[0] * pp_proband[1] - prob_data_missed_het = ( + # Compute P(data | DN) + prob_data_given_dn = ( + hl.case() + .when(hemi_x, pp_mother[0] * pp_proband[1]) + .when(hemi_y, pp_father[0] * pp_proband[1]) + .when(not_hemi_expr, pp_father[0] * pp_mother[0] * pp_proband[1]) + .or_missing() + ) + + # Compute P(data | missed het in parent(s)) + prob_data_missed_het = ( + hl.case() + .when(hemi_x, (pp_mother[1] + pp_mother[2]) * pp_proband[2] * prior_one_het) + .when(hemi_y, (pp_father[1] + pp_father[2]) * pp_proband[2] * prior_one_het) + .when( + not_hemi_expr, (pp_father[1] * pp_mother[0] + pp_father[0] * pp_mother[1]) * pp_proband[1] - * prior_one_het + * prior_one_het, ) + .or_missing() + ) - # Compute P(DN | data) and normalize + # Calculate posterior probability of de novo mutation prob_dn_given_data = prob_data_given_dn * de_novo_prior p_dn = prob_dn_given_data / (prob_dn_given_data + prob_data_missed_het) + return p_dn def get_de_novo_expr( locus_expr: 
hl.expr.LocusExpression, + alleles_expr: hl.expr.ArrayExpression, proband_expr: hl.expr.StructExpression, father_expr: hl.expr.StructExpression, mother_expr: hl.expr.StructExpression, - allele_expr: hl.expr.ArrayExpression, is_female_expr: hl.expr.BooleanExpression, freq_prior_expr: hl.expr.Float64Expression, min_pop_prior: float = 100 / 3e7, @@ -1480,29 +1498,33 @@ def get_de_novo_expr( Thresholds: ------------ - +----------------------+----------------------------+----------------+------------------+----------------+----------+-------+ - | Metric | FAIL | HIGH (Indel) | HIGH (SNV) 1 | HIGH (SNV 2) | MEDIUM | LOW | - +----------------------+----------------------------+----------------+------------------+----------------+----------+-------+ - | P (de novo) | < 0.05 | > 0.99 | > 0.99 | > 0.5 | > 0.5 | | - | AB | AB(proband) < 0.2 | AB > 0.3 | AB > 0.3 | AB > 0.3 | > 0.3 | > 0.2 | - | | OR AB(parent(s)) > 0.05 | | | | | 0.2 | - | AD | 0 in either parent | | | | | | - | DP | | | DP(proband) > 10 | | | | - | DR (DP ratio) | DP(proband/parent(s)) < 0.1| | | DR > 0.2 | | | - | GQ | GQ(proband) < 20 | | | | | | - | AC | | AC = 1 | | AC < 10 | | | - +----------------------+----------------------------+----------------+------------------+----------------+----------+-------+ + +----------------------+----------------------------+----------------+------------------+----------------+----------+-------+-----------+ + | Metric | FAIL | HIGH (Indel) | HIGH (SNV) 1 | HIGH (SNV) 2 | MEDIUM | LOW | VERY LOW | + +----------------------+----------------------------+----------------+------------------+----------------+----------+-------+-----------+ + | P (de novo) | < 0.05 | > 0.99 | > 0.99 | > 0.5 | > 0.5 | > 0.2 | >= 0.05 | + | AB | AB(proband) < 0.2 | AB > 0.3 | AB > 0.3 | AB > 0.3 | > 0.3 | >= 0.2| | + | | OR AB(parent(s)) > 0.05 | | | | | | | + | AD | 0 in either parent | | | | | | | + | DP | | | DP(proband) > 10 | | | | | + | DR (DP ratio) | DP(proband/parent(s)) < 0.1| | | DR 
> 0.2 | | | | + | GQ | GQ(proband) < 20 | | | | | | | + | AC* | | AC = 1 | | AC < 10 | | | | + +----------------------+----------------------------+----------------+------------------+----------------+----------+-------+-----------+ + * AC is supposed to be the sum of the alternate alleles in the proband and + parents, but we have not implemented this yet because we have multiple trios + in one family, in which an allele might be de novo in a parent and + transmitted to a child in the dataset. locus_expr : hl.expr.LocusExpression Variant's genomic locus. + alleles_expr : hl.expr.ArrayExpression + Variant alleles. proband_expr : hl.expr.StructExpression - Proband genotype details (e.g., DP, GQ, AD, GT). + Proband genotype details (e.g., DP, GQ, AD, GT, PL). father_expr : hl.expr.StructExpression Father's genotype details (e.g., DP, AD). mother_expr : hl.expr.StructExpression Mother's genotype details (e.g., DP, AD). - allele_expr : hl.expr.ArrayExpression - Variant alleles. is_female_expr : hl.expr.BooleanExpression Whether the proband is female. freq_prior_expr : hl.expr.Float64Expression @@ -1544,22 +1566,33 @@ def get_de_novo_expr( - `fail`: Boolean indicating if the variant fails any checks. - `fail_reason`: Set of strings with reasons for failure. 
""" - # Determine genomic context - not_hemi_expr = locus_expr.in_autosome_or_par() | ( - locus_expr.in_x_nonpar() & is_female_expr - ) - hemi_x_expr = locus_expr.in_x_nonpar() & ~is_female_expr - hemi_y_expr = locus_expr.in_y_nonpar() & ~is_female_expr - p_de_novo = calculate_de_novo_post_prob( proband_expr.PL, father_expr.PL, mother_expr.PL, + locus_expr, + is_female_expr, freq_prior_expr, min_pop_prior=min_pop_prior, de_novo_prior=de_novo_prior, - hemi_x=hemi_x_expr, - hemi_y=hemi_y_expr, + ) + + # Determine genomic context + not_hemi_expr, hemi_x_expr, hemi_y_expr = get_genomic_context( + locus_expr, is_female_expr + ) + + is_de_novo = ( + not_hemi_expr + & ( + proband_expr.GT.is_het() + & father_expr.GT.is_hom_ref() + & mother_expr.GT.is_hom_ref() + ) + | hemi_x_expr + & (proband_expr.GT.is_hom_var() & mother_expr.GT.is_hom_ref()) + | hemi_y_expr + & (proband_expr.GT.is_hom_var() & father_expr.GT.is_hom_ref()) ) # Calculate DP ratio @@ -1574,74 +1607,73 @@ def get_de_novo_expr( # Key metrics proband_ab = proband_expr.AD[1] / hl.sum(proband_expr.AD) - is_snp = hl.is_snp(allele_expr[0], allele_expr[1]) + is_snp = hl.is_snp(alleles_expr[0], alleles_expr[1]) # Confidence assignment confidence = ( hl.case() .when( ( - is_snp - & (p_de_novo > 0.99) - & (proband_ab > high_med_conf_ab) - & ( - (proband_expr.DP > dp_threshold_snp) - | (dp_ratio > high_conf_dp_ratio) - ) + is_snp + & (p_de_novo > 0.99) + & (proband_ab > high_med_conf_ab) + & ( + (proband_expr.DP > dp_threshold_snp) + | (dp_ratio > high_conf_dp_ratio) + ) ) | (~is_snp & (p_de_novo > high_conf_p) & (proband_ab > high_med_conf_ab)), "HIGH", ) .when((p_de_novo > med_conf_p) & (proband_ab > high_med_conf_ab), "MEDIUM") - .when((p_de_novo > low_conf_p) & (proband_ab > low_conf_ab), "LOW") + .when((p_de_novo > low_conf_p) & (proband_ab >= low_conf_ab), "LOW") + .when( + (p_de_novo >= min_de_novo_p), + "VERY LOW", + ) + .or_missing() + ) + + parent_sum_ad_0 = ( + hl.case() + .when(not_hemi_expr, 
(father_expr.AD[0] == 0) | (mother_expr.AD[0] == 0)) + .when(hemi_x_expr, mother_expr.AD[0] == 0) + .when(hemi_y_expr, father_expr.AD[0] == 0) + .or_missing() + ) + + fail_max_parent_ab = ( + hl.case() + .when( + not_hemi_expr, + (father_expr.AD[1] / hl.sum(father_expr.AD) > max_parent_ab) + | (mother_expr.AD[1] / hl.sum(mother_expr.AD) > max_parent_ab), + ) + .when( + hemi_x_expr, mother_expr.AD[1] / hl.sum(mother_expr.AD) > max_parent_ab + ) + .when( + hemi_y_expr, father_expr.AD[1] / hl.sum(father_expr.AD) > max_parent_ab + ) .or_missing() ) # Fail checks fail_checks = { "min_dp_ratio": dp_ratio < min_dp_ratio, - "parent_sum_ad_0": ( - hl.case() - .when(not_hemi_expr, (father_expr.AD[0] == 0) | (mother_expr.AD[0] == 0)) - .when(hemi_x_expr, mother_expr.AD[0] == 0) - .when(hemi_y_expr, father_expr.AD[0] == 0) - .or_missing() - ), - "max_parent_ab": ( - hl.case() - .when( - not_hemi_expr, - (father_expr.AD[1] / hl.sum(father_expr.AD) > max_parent_ab) - | (mother_expr.AD[1] / hl.sum(mother_expr.AD) > max_parent_ab), - ) - .when( - hemi_x_expr, mother_expr.AD[1] / hl.sum(mother_expr.AD) > max_parent_ab - ) - .when( - hemi_y_expr, father_expr.AD[1] / hl.sum(father_expr.AD) > max_parent_ab - ) - .or_missing() - ), + "parent_sum_ad_0": parent_sum_ad_0, + "max_parent_ab": fail_max_parent_ab, "min_proband_ab": proband_ab < min_proband_ab, "min_proband_gq": proband_expr.GQ < min_gq, "min_de_novo_p": p_de_novo < min_de_novo_p, - "not_de_novo": ( - not_hemi_expr - & ~( - proband_expr.GT.is_het() - & father_expr.GT.is_hom_ref() - & mother_expr.GT.is_hom_ref() - ) - | hemi_x_expr - & ~(proband_expr.GT.is_hom_var() & mother_expr.GT.is_hom_ref()) - | hemi_y_expr - & ~(proband_expr.GT.is_hom_var() & father_expr.GT.is_hom_ref()) - ), } - # Combine fail reasons - fail_reason = hl.set([key for key, value in fail_checks.items() if value]) - fail_expr = hl.any(list(fail_checks.values())) + fail = hl.any(list(fail_checks.values())) + result_expr = hl.struct( + 
is_de_novo=hl.if_else(is_de_novo, True, False), + p_de_novo=hl.if_else(fail, hl.missing(hl.tfloat64), p_de_novo), + confidence=hl.if_else(fail, hl.missing(hl.tstr), confidence), + fail_reason=add_filters_expr(filters=fail_checks), + ) - return hl.struct(p_de_novo=p_de_novo, confidence=confidence, fail=fail_expr, - fail_reason=fail_reason) + return result_expr From 4ee8cdce63efa24ed97b0d302da596ca79b3c2cb Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Wed, 29 Jan 2025 09:52:07 -0500 Subject: [PATCH 04/56] black formatting --- gnomad/sample_qc/relatedness.py | 34 ++++++++++++++------------------- 1 file changed, 14 insertions(+), 20 deletions(-) diff --git a/gnomad/sample_qc/relatedness.py b/gnomad/sample_qc/relatedness.py index bd1db343d..1bd93fc2b 100644 --- a/gnomad/sample_qc/relatedness.py +++ b/gnomad/sample_qc/relatedness.py @@ -1583,16 +1583,14 @@ def get_de_novo_expr( ) is_de_novo = ( - not_hemi_expr - & ( + not_hemi_expr + & ( proband_expr.GT.is_het() & father_expr.GT.is_hom_ref() & mother_expr.GT.is_hom_ref() - ) - | hemi_x_expr - & (proband_expr.GT.is_hom_var() & mother_expr.GT.is_hom_ref()) - | hemi_y_expr - & (proband_expr.GT.is_hom_var() & father_expr.GT.is_hom_ref()) + ) + | hemi_x_expr & (proband_expr.GT.is_hom_var() & mother_expr.GT.is_hom_ref()) + | hemi_y_expr & (proband_expr.GT.is_hom_var() & father_expr.GT.is_hom_ref()) ) # Calculate DP ratio @@ -1614,13 +1612,13 @@ def get_de_novo_expr( hl.case() .when( ( - is_snp - & (p_de_novo > 0.99) - & (proband_ab > high_med_conf_ab) - & ( - (proband_expr.DP > dp_threshold_snp) - | (dp_ratio > high_conf_dp_ratio) - ) + is_snp + & (p_de_novo > 0.99) + & (proband_ab > high_med_conf_ab) + & ( + (proband_expr.DP > dp_threshold_snp) + | (dp_ratio > high_conf_dp_ratio) + ) ) | (~is_snp & (p_de_novo > high_conf_p) & (proband_ab > high_med_conf_ab)), "HIGH", @@ -1649,12 +1647,8 @@ def get_de_novo_expr( (father_expr.AD[1] / hl.sum(father_expr.AD) > max_parent_ab) | 
(mother_expr.AD[1] / hl.sum(mother_expr.AD) > max_parent_ab), ) - .when( - hemi_x_expr, mother_expr.AD[1] / hl.sum(mother_expr.AD) > max_parent_ab - ) - .when( - hemi_y_expr, father_expr.AD[1] / hl.sum(father_expr.AD) > max_parent_ab - ) + .when(hemi_x_expr, mother_expr.AD[1] / hl.sum(mother_expr.AD) > max_parent_ab) + .when(hemi_y_expr, father_expr.AD[1] / hl.sum(father_expr.AD) > max_parent_ab) .or_missing() ) From 376811df2ce45aa0cf6268511aea0349ce881382 Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Wed, 29 Jan 2025 11:27:51 -0500 Subject: [PATCH 05/56] Reformat docstring --- gnomad/sample_qc/relatedness.py | 54 ++++++++++++++++++--------------- 1 file changed, 30 insertions(+), 24 deletions(-) diff --git a/gnomad/sample_qc/relatedness.py b/gnomad/sample_qc/relatedness.py index 1bd93fc2b..6277b8d2d 100644 --- a/gnomad/sample_qc/relatedness.py +++ b/gnomad/sample_qc/relatedness.py @@ -7,7 +7,6 @@ import hail as hl import networkx as nx -from hail.utils.tutorial import resources from gnomad.utils.filtering import add_filters_expr @@ -1384,37 +1383,45 @@ def calculate_de_novo_post_prob( P_dn = P(DN | data) / (P(DN | data) + P(missed het in parent(s) | data)) The terms are defined as: - - P(DN | data): The probability of a de novo mutation given the data. This is computed as: + P(DN | data): The probability of a de novo mutation given the data. 
This is + computed as: + P(DN | data) = P(data | DN) * P(DN) - - P(data | DN): The probability of observing the data under the assumption of a de novo mutation: - * Autosomes and PAR regions: - P(data | DN) = P(hom_ref in father) * P(hom_ref in mother) * P(het in proband) - * X non-PAR regions (males only): - P(data | DN) = P(hom_ref in mother) * P(het in proband) - * Y non-PAR regions (males only): - P(data | DN) = P(hom_ref in father) * P(het in proband) + P(data | DN): The probability of observing the data under the assumption of a de novo mutation: + * Autosomes and PAR regions: + + P(data | DN) = P(hom_ref in father) * P(hom_ref in mother) * P(het in proband) + + * X non-PAR regions (males only): + + P(data | DN) = P(hom_ref in mother) * P(het in proband) + + * Y non-PAR regions (males only): + + P(data | DN) = P(hom_ref in father) * P(het in proband) - - P(DN): The prior probability of a de novo mutation, fixed at: + P(DN): The prior probability of a de novo mutation, fixed at: P(DN) = 1 / 3e7 - - P(missed het in parent(s) | data): The probability of observing missed het in + P(missed het in parent(s) | data): The probability of observing missed het in parent(s) given the data. 
This is computed as: P(missed het in parent(s) | data) = P(data | missed het in parent(s)) * P(missed het in parent(s)) - - P(data | missed het in parent(s)): The probability of observing the data under - the assumption of a missed het in parent(s): - * Autosomes and PAR regions: - P(data | missed het in parents) = (P(het in father) * P(hom_ref in - mother) + P(hom_ref in father) * P(het in mother)) * P(het in proband) * - P(het in one parent) - * X non-PAR regions: - P(data | missed het in mother) = (P(het in mother) + P(hom_var in - mother)) * P(hom_var in proband) * P(het in one parent) - * Y non-PAR regions: - P(data | missed het in father) = (P(het in father) + P(hom_var in - father)) * P(hom_var in proband) * P(het in one parent) + P(data | missed het in parent(s)): The probability of observing the data under the assumption of a missed het in parent(s): + * Autosomes and PAR regions: + + P(data | missed het in parents) = (P(het in father) * P(hom_ref in mother) + P(hom_ref in father) * P(het in mother)) * P(het in proband) * P(het in one parent) + + * X non-PAR regions: + + P(data | missed het in mother) = (P(het in mother) + P(hom_var in mother)) * P(hom_var in proband) * P(het in one parent) + + * Y non-PAR regions: + + P(data | missed het in father) = (P(het in father) + P(hom_var in father)) * P(hom_var in proband) * P(het in one parent) + - P(het in one parent): The prior probability of a het in one parent, fixed at: 1 - (1 - freq_prior)**4, where freq_prior is the population frequency prior for the variant. @@ -1497,7 +1504,6 @@ def get_de_novo_expr( Get the de novo status of a variant, based on the proband and parent genotypes. 
Thresholds: - ------------ +----------------------+----------------------------+----------------+------------------+----------------+----------+-------+-----------+ | Metric | FAIL | HIGH (Indel) | HIGH (SNV) 1 | HIGH (SNV) 2 | MEDIUM | LOW | VERY LOW | +----------------------+----------------------------+----------------+------------------+----------------+----------+-------+-----------+ From f80698c44104f6b4b27eee1edde03735bcca73d1 Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Wed, 29 Jan 2025 12:59:21 -0500 Subject: [PATCH 06/56] Change the citation --- gnomad/sample_qc/relatedness.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gnomad/sample_qc/relatedness.py b/gnomad/sample_qc/relatedness.py index 6277b8d2d..6ca46dc92 100644 --- a/gnomad/sample_qc/relatedness.py +++ b/gnomad/sample_qc/relatedness.py @@ -1377,7 +1377,8 @@ def calculate_de_novo_post_prob( using the likelihoods of the proband and parents' genotypes and the population frequency prior for the variant. - Based on [Samocha et al. 
2014](https://github.com/ksamocha/de_novo_scripts), + Based on Kaitlin Samocha's de novo caller ( + https://github.com/ksamocha/de_novo_scripts), the posterior probability of a de novo mutation (P_dn) is computed as: P_dn = P(DN | data) / (P(DN | data) + P(missed het in parent(s) | data)) From ddf3811d8c9a8443afe4e1e4624394bcd19854e3 Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Thu, 30 Jan 2025 14:38:43 -0500 Subject: [PATCH 07/56] Apply suggestions from code review Co-authored-by: Katherine Chao --- gnomad/sample_qc/relatedness.py | 46 ++++++++++++++++----------------- 1 file changed, 22 insertions(+), 24 deletions(-) diff --git a/gnomad/sample_qc/relatedness.py b/gnomad/sample_qc/relatedness.py index 6ca46dc92..5880373b1 100644 --- a/gnomad/sample_qc/relatedness.py +++ b/gnomad/sample_qc/relatedness.py @@ -1337,7 +1337,7 @@ def transform_pl_to_pp(pl_expr: hl.expr.ArrayExpression) -> hl.expr.ArrayExpress def get_genomic_context( locus_expr: hl.expr.LocusExpression, - is_female_expr: hl.expr.BooleanExpression, + is_XX_expr: hl.expr.BooleanExpression, ) -> Tuple[ hl.expr.BooleanExpression, hl.expr.BooleanExpression, hl.expr.BooleanExpression ]: @@ -1345,18 +1345,17 @@ def get_genomic_context( Determine the genomic context of a variant. :param locus_expr: LocusExpression of the variant. - :param is_female_expr: BooleanExpression indicating whether the proband is female. + :param is_XX_expr: BooleanExpression indicating whether the proband has an XX sex karyotype. :return: A tuple of BooleanExpressions: - not_hemi_expr: True if the variant is in autosomes or PAR regions. - - hemi_x_expr: True if the variant is in the X non-PAR region for males. - - hemi_y_expr: True if the variant is in the Y non-PAR region for males. + - hemi_x_expr: True if the variant is in the X non-PAR region for XY individuals. + - hemi_y_expr: True if the variant is in the Y non-PAR region for XY individuals. 
""" not_hemi_expr = locus_expr.in_autosome_or_par() | ( - locus_expr.in_x_nonpar() & is_female_expr + locus_expr.in_x_nonpar() & is_XX_expr ) - hemi_x_expr = locus_expr.in_x_nonpar() & ~is_female_expr - hemi_y_expr = locus_expr.in_y_nonpar() & ~is_female_expr - + hemi_x_expr = locus_expr.in_x_nonpar() & ~is_XX_expr + hemi_y_expr = locus_expr.in_y_nonpar() & ~is_XX_expr return not_hemi_expr, hemi_x_expr, hemi_y_expr @@ -1365,7 +1364,7 @@ def calculate_de_novo_post_prob( pl_father: hl.expr.ArrayExpression, pl_mother: hl.expr.ArrayExpression, locus_expr: hl.expr.LocusExpression, - is_female_expr: hl.expr.BooleanExpression, + is_XX_expr: hl.expr.BooleanExpression, freq_prior_expr: hl.expr.Float64Expression, min_pop_prior: Optional[float] = 100 / 3e7, de_novo_prior: Optional[float] = 1 / 3e7, @@ -1374,12 +1373,12 @@ def calculate_de_novo_post_prob( Calculate the posterior probability of a de novo mutation. This function computes the posterior probability of a de novo mutation (P_dn) - using the likelihoods of the proband and parents' genotypes and the population + using the likelihoods of the proband's and parents' genotypes and the population frequency prior for the variant. 
- Based on Kaitlin Samocha's de novo caller ( + Based on Kaitlin Samocha's [de novo caller]( https://github.com/ksamocha/de_novo_scripts), - the posterior probability of a de novo mutation (P_dn) is computed as: + the posterior probability of a de novo mutation (`P_dn`) is computed as: P_dn = P(DN | data) / (P(DN | data) + P(missed het in parent(s) | data)) @@ -1394,21 +1393,21 @@ def calculate_de_novo_post_prob( P(data | DN) = P(hom_ref in father) * P(hom_ref in mother) * P(het in proband) - * X non-PAR regions (males only): + * X non-PAR regions (XY only): P(data | DN) = P(hom_ref in mother) * P(het in proband) - * Y non-PAR regions (males only): + * Y non-PAR regions (XY only): - P(data | DN) = P(hom_ref in father) * P(het in proband) + P(data | DN) = P(hemi_ref in father) * P(hemi_alt in proband) - P(DN): The prior probability of a de novo mutation, fixed at: + P(DN): The prior probability of a de novo mutation from the literature, P(DN) = 1 / 3e7 P(missed het in parent(s) | data): The probability of observing missed het in parent(s) given the data. This is computed as: - P(missed het in parent(s) | data) = P(data | missed het in parent(s)) * P(missed het in parent(s)) + P(missed het in parent(s) | data) = P(data | at least one parent is het) * P(one parent is het) P(data | missed het in parent(s)): The probability of observing the data under the assumption of a missed het in parent(s): * Autosomes and PAR regions: @@ -1423,14 +1422,14 @@ def calculate_de_novo_post_prob( P(data | missed het in father) = (P(het in father) + P(hom_var in father)) * P(hom_var in proband) * P(het in one parent) - - P(het in one parent): The prior probability of a het in one parent, fixed at: + - P(het in one parent): The prior probability for at least one alternate allele between the parents depends on the alternate allele frequency: 1 - (1 - freq_prior)**4, where freq_prior is the population frequency prior for the variant. 
:param pl_proband: Phred-scaled genotype likelihoods for the proband. :param pl_father: Phred-scaled genotype likelihoods for the father. :param pl_mother: Phred-scaled genotype likelihoods for the mother. :param locus_expr: LocusExpression of the variant. - :param is_female_expr: BooleanExpression indicating the proband's sex. + :param is_XX_expr: BooleanExpression indicating whether the proband has XX sex karyotype. :param freq_prior_expr: Population frequency prior for the variant. :param min_pop_prior: Minimum population frequency prior (default: 1e-8). :param de_novo_prior: Prior probability of a de novo mutation (default: 1/3e7). @@ -1474,7 +1473,6 @@ def calculate_de_novo_post_prob( # Calculate posterior probability of de novo mutation prob_dn_given_data = prob_data_given_dn * de_novo_prior p_dn = prob_dn_given_data / (prob_dn_given_data + prob_data_missed_het) - return p_dn @@ -1484,7 +1482,7 @@ def get_de_novo_expr( proband_expr: hl.expr.StructExpression, father_expr: hl.expr.StructExpression, mother_expr: hl.expr.StructExpression, - is_female_expr: hl.expr.BooleanExpression, + is_XX_expr: hl.expr.BooleanExpression, freq_prior_expr: hl.expr.Float64Expression, min_pop_prior: float = 100 / 3e7, de_novo_prior: float = 1 / 3e7, @@ -1502,7 +1500,7 @@ def get_de_novo_expr( low_conf_p: float = 0.2, ) -> hl.expr.StructExpression: """ - Get the de novo status of a variant, based on the proband and parent genotypes. + Get the de novo status of a variant based on the proband and parent genotypes. Thresholds: +----------------------+----------------------------+----------------+------------------+----------------+----------+-------+-----------+ @@ -1549,9 +1547,9 @@ def get_de_novo_expr( max_parent_ab : float, optional Maximum allele balance for parents (default: 0.05). min_de_novo_p : float, optional - Minimum de novo probability to pass (default: 0.05). + Minimum probability for variant to be called de novo (default: 0.05). 
high_conf_dp_ratio : float, optional - DP ratio threshold for high confidence (default: 0.2). + DP ratio threshold of proband DP to combined DP in parents for high confidence (default: 0.2). dp_threshold_snp : int, optional Minimum depth for high-confidence SNPs (default: 10). high_med_conf_ab : float, optional From b5fc1b71f04f9d3a3a376bdc705662a1f90d27c1 Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Thu, 30 Jan 2025 14:39:31 -0500 Subject: [PATCH 08/56] Transpose the thresholds table --- gnomad/sample_qc/relatedness.py | 96 +++++++++++++++++---------------- 1 file changed, 49 insertions(+), 47 deletions(-) diff --git a/gnomad/sample_qc/relatedness.py b/gnomad/sample_qc/relatedness.py index 6ca46dc92..295893527 100644 --- a/gnomad/sample_qc/relatedness.py +++ b/gnomad/sample_qc/relatedness.py @@ -1310,17 +1310,15 @@ def get_freq_prior(freq_prior_expr: hl.expr.Float64Expression, min_pop_prior=100 :param min_pop_prior: The minimum population frequency prior. 
""" return hl.max( - hl.or_else( - hl.case() - .when((freq_prior_expr >= 0) & (freq_prior_expr <= 1), freq_prior_expr) - .or_error( - hl.format( - "de_novo: expect 0 <= freq_prior_expr <= 1, found %.3e", - freq_prior_expr, - ) - ), - 0.0, - ), + hl.case() + .when((freq_prior_expr >= 0) & (freq_prior_expr <= 1), freq_prior_expr) + .or_error( + hl.format( + "de_novo: expect 0 <= freq_prior_expr <= 1, found %.3e", + freq_prior_expr, + ) + ) + .default(0.0), min_pop_prior, ) @@ -1396,11 +1394,11 @@ def calculate_de_novo_post_prob( * X non-PAR regions (males only): - P(data | DN) = P(hom_ref in mother) * P(het in proband) + P(data | DN) = P(hom_ref in mother) * P(hom_alt in proband) * Y non-PAR regions (males only): - P(data | DN) = P(hom_ref in father) * P(het in proband) + P(data | DN) = P(hom_ref in father) * P(hom_alt in proband) P(DN): The prior probability of a de novo mutation, fixed at: P(DN) = 1 / 3e7 @@ -1413,15 +1411,15 @@ def calculate_de_novo_post_prob( P(data | missed het in parent(s)): The probability of observing the data under the assumption of a missed het in parent(s): * Autosomes and PAR regions: - P(data | missed het in parents) = (P(het in father) * P(hom_ref in mother) + P(hom_ref in father) * P(het in mother)) * P(het in proband) * P(het in one parent) + P(data | missed het in parents) = (P(het in father) * P(hom_ref in mother) + P(hom_ref in father) * P(het in mother)) * P(het in proband) * X non-PAR regions: - P(data | missed het in mother) = (P(het in mother) + P(hom_var in mother)) * P(hom_var in proband) * P(het in one parent) + P(data | missed het in mother) = (P(het in mother) + P(hom_var in mother)) * P(hom_var in proband) * Y non-PAR regions: - P(data | missed het in father) = (P(het in father) + P(hom_var in father)) * P(hom_var in proband) * P(het in one parent) + P(data | missed het in father) = (P(het in father) + P(hom_var in father)) * P(hom_var in proband) - P(het in one parent): The prior probability of a het in one parent, fixed 
at: 1 - (1 - freq_prior)**4, where freq_prior is the population frequency prior for the variant. @@ -1432,7 +1430,7 @@ def calculate_de_novo_post_prob( :param locus_expr: LocusExpression of the variant. :param is_female_expr: BooleanExpression indicating the proband's sex. :param freq_prior_expr: Population frequency prior for the variant. - :param min_pop_prior: Minimum population frequency prior (default: 1e-8). + :param min_pop_prior: Minimum population frequency prior (default: 100/3e7). :param de_novo_prior: Prior probability of a de novo mutation (default: 1/3e7). :return: Posterior probability of a de novo mutation (P_dn). """ @@ -1451,8 +1449,8 @@ def calculate_de_novo_post_prob( # Compute P(data | DN) prob_data_given_dn = ( hl.case() - .when(hemi_x, pp_mother[0] * pp_proband[1]) - .when(hemi_y, pp_father[0] * pp_proband[1]) + .when(hemi_x, pp_mother[0] * pp_proband[2]) + .when(hemi_y, pp_father[0] * pp_proband[2]) .when(not_hemi_expr, pp_father[0] * pp_mother[0] * pp_proband[1]) .or_missing() ) @@ -1504,32 +1502,35 @@ def get_de_novo_expr( """ Get the de novo status of a variant, based on the proband and parent genotypes. 
- Thresholds: - +----------------------+----------------------------+----------------+------------------+----------------+----------+-------+-----------+ - | Metric | FAIL | HIGH (Indel) | HIGH (SNV) 1 | HIGH (SNV) 2 | MEDIUM | LOW | VERY LOW | - +----------------------+----------------------------+----------------+------------------+----------------+----------+-------+-----------+ - | P (de novo) | < 0.05 | > 0.99 | > 0.99 | > 0.5 | > 0.5 | > 0.2 | >= 0.05 | - | AB | AB(proband) < 0.2 | AB > 0.3 | AB > 0.3 | AB > 0.3 | > 0.3 | >= 0.2| | - | | OR AB(parent(s)) > 0.05 | | | | | | | - | AD | 0 in either parent | | | | | | | - | DP | | | DP(proband) > 10 | | | | | - | DR (DP ratio) | DP(proband/parent(s)) < 0.1| | | DR > 0.2 | | | | - | GQ | GQ(proband) < 20 | | | | | | | - | AC* | | AC = 1 | | AC < 10 | | | | - +----------------------+----------------------------+----------------+------------------+----------------+----------+-------+-----------+ - * AC is supposed to be the sum of the alternate alleles in the proband and - parents, but we have not implemented this yet because we have multiple trios - in one family, in which an allele might be de novo in a parent and - transmitted to a child in the dataset. - - locus_expr : hl.expr.LocusExpression - Variant's genomic locus. - alleles_expr : hl.expr.ArrayExpression - Variant alleles. - proband_expr : hl.expr.StructExpression - Proband genotype details (e.g., DP, GQ, AD, GT, PL). - father_expr : hl.expr.StructExpression - Father's genotype details (e.g., DP, AD). 
+ Thresholds: + + +----------------+------------+----------------------+------+------+------+------+------+ + | Category | P(de novo) | AB* | AD* | DP* | DR* | GQ* | AC* | + +----------------+------------+----------------------+------+------+------+------+------+ + | FAIL | < 0.05 | AB(parents) > 0.05 | 0 | | <0.1 | <20 | | + | | | OR AB(proband) < 0.2 | | | | | | + | HIGH (Indel) | > 0.99 | > 0.3 | | | | | =1 | + | HIGH (SNV) 1 | > 0.99 | > 0.3 | | >10 | | | | + | HIGH (SNV) 2 | > 0.5 | > 0.3 | | | >0.2 | | <10 | + | MEDIUM | > 0.5 | > 0.3 | | | | | | + | LOW | > 0.2 | > 0.2 | | | | | | + | VERY LOW | >= 0.05 | | | | | | | + +----------------+------------+----------------------+------+------+------+------+------+ + + Notes: + - AB: Normally refers to AB for the proband, except when a threshold for + parent(s) is specified for FAIL. + - DP: DP for the proband. + - DR: Defined as DP(proband) / DP(parent(s)). + - GQ: GQ for the proband. + - AC: Intended to be the sum of alternate alleles in the proband and parents. + This has **not been implemented yet** due to multiple trios in one family, + where an allele might be **de novo in a parent** and **transmitted to a child** in the dataset. + + locus_expr: Locus of the variant. + alleles_expr: Variant alleles. + proband_expr: Proband genotype info, required fields: GT, DP, GQ, AD, PL. + father_expr: Fa mother_expr : hl.expr.StructExpression Mother's genotype details (e.g., DP, AD). 
is_female_expr : hl.expr.BooleanExpression @@ -1641,7 +1642,8 @@ def get_de_novo_expr( parent_sum_ad_0 = ( hl.case() - .when(not_hemi_expr, (father_expr.AD[0] == 0) | (mother_expr.AD[0] == 0)) + .when(not_hemi_expr, (hl.sum(father_expr.AD) == 0) | (hl.sum(mother_expr.AD) + == 0)) .when(hemi_x_expr, mother_expr.AD[0] == 0) .when(hemi_y_expr, father_expr.AD[0] == 0) .or_missing() @@ -1666,7 +1668,7 @@ def get_de_novo_expr( "max_parent_ab": fail_max_parent_ab, "min_proband_ab": proband_ab < min_proband_ab, "min_proband_gq": proband_expr.GQ < min_gq, - "min_de_novo_p": p_de_novo < min_de_novo_p, + "min_de_novo_p": p_de_novo <= min_de_novo_p, } fail = hl.any(list(fail_checks.values())) From b565fa251aa5df421fc9c8541ffddb61d6d96562 Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Fri, 31 Jan 2025 12:18:55 -0500 Subject: [PATCH 09/56] Add a call_de_novo function --- gnomad/sample_qc/relatedness.py | 225 ++++++++++++++------------------ gnomad/utils/annotations.py | 25 ++++ 2 files changed, 126 insertions(+), 124 deletions(-) diff --git a/gnomad/sample_qc/relatedness.py b/gnomad/sample_qc/relatedness.py index 9ba78b430..157dccf19 100644 --- a/gnomad/sample_qc/relatedness.py +++ b/gnomad/sample_qc/relatedness.py @@ -9,6 +9,7 @@ import networkx as nx from gnomad.utils.filtering import add_filters_expr +from gnomad.utils.annotations import get_copy_state_by_sex logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s") logger = logging.getLogger(__name__) @@ -1309,7 +1310,7 @@ def get_freq_prior(freq_prior_expr: hl.expr.Float64Expression, min_pop_prior=100 :param freq_prior_expr: The population frequency prior for the variant. :param min_pop_prior: The minimum population frequency prior. 
""" - return hl.max( + return hl.max(hl.or_else( hl.case() .when((freq_prior_expr >= 0) & (freq_prior_expr <= 1), freq_prior_expr) .or_error( @@ -1317,8 +1318,8 @@ def get_freq_prior(freq_prior_expr: hl.expr.Float64Expression, min_pop_prior=100 "de_novo: expect 0 <= freq_prior_expr <= 1, found %.3e", freq_prior_expr, ) - ) - .default(0.0), + ), + 0.0), min_pop_prior, ) @@ -1333,36 +1334,12 @@ def transform_pl_to_pp(pl_expr: hl.expr.ArrayExpression) -> hl.expr.ArrayExpress return hl.bind(lambda x: x / hl.sum(x), 10 ** (-pl_expr / 10)) -def get_genomic_context( - locus_expr: hl.expr.LocusExpression, - is_XX_expr: hl.expr.BooleanExpression, -) -> Tuple[ - hl.expr.BooleanExpression, hl.expr.BooleanExpression, hl.expr.BooleanExpression -]: - """ - Determine the genomic context of a variant. - - :param locus_expr: LocusExpression of the variant. - :param is_XX_expr: BooleanExpression indicating whether the proband has an XX sex karyotype. - :return: A tuple of BooleanExpressions: - - not_hemi_expr: True if the variant is in autosomes or PAR regions. - - hemi_x_expr: True if the variant is in the X non-PAR region for XY individuals. - - hemi_y_expr: True if the variant is in the Y non-PAR region for XY individuals. 
- """ - not_hemi_expr = locus_expr.in_autosome_or_par() | ( - locus_expr.in_x_nonpar() & is_XX_expr - ) - hemi_x_expr = locus_expr.in_x_nonpar() & ~is_XX_expr - hemi_y_expr = locus_expr.in_y_nonpar() & ~is_XX_expr - return not_hemi_expr, hemi_x_expr, hemi_y_expr - - def calculate_de_novo_post_prob( - pl_proband: hl.expr.ArrayExpression, - pl_father: hl.expr.ArrayExpression, - pl_mother: hl.expr.ArrayExpression, + proband_pl: hl.expr.ArrayExpression, + father_pl: hl.expr.ArrayExpression, + mother_pl: hl.expr.ArrayExpression, locus_expr: hl.expr.LocusExpression, - is_XX_expr: hl.expr.BooleanExpression, + is_xx_expr: hl.expr.BooleanExpression, freq_prior_expr: hl.expr.Float64Expression, min_pop_prior: Optional[float] = 100 / 3e7, de_novo_prior: Optional[float] = 1 / 3e7, @@ -1423,47 +1400,47 @@ def calculate_de_novo_post_prob( - P(het in one parent): The prior probability for at least one alternate allele between the parents depends on the alternate allele frequency: 1 - (1 - freq_prior)**4, where freq_prior is the population frequency prior for the variant. - :param pl_proband: Phred-scaled genotype likelihoods for the proband. - :param pl_father: Phred-scaled genotype likelihoods for the father. - :param pl_mother: Phred-scaled genotype likelihoods for the mother. + :param proband_pl: Phred-scaled genotype likelihoods for the proband. + :param father_pl: Phred-scaled genotype likelihoods for the father. + :param mother_pl: Phred-scaled genotype likelihoods for the mother. :param locus_expr: LocusExpression of the variant. - :param is_XX_expr: BooleanExpression indicating whether the proband has XX sex karyotype. + :param is_xx_expr: BooleanExpression indicating whether the proband has XX sex karyotype. :param freq_prior_expr: Population frequency prior for the variant. :param min_pop_prior: Minimum population frequency prior (default: 100/3e7). :param de_novo_prior: Prior probability of a de novo mutation (default: 1/3e7). 
:return: Posterior probability of a de novo mutation (P_dn). """ # Ensure valid genomic context - not_hemi_expr, hemi_x, hemi_y = get_genomic_context(locus_expr, is_XX_expr) + diploid_expr, hemi_x, hemi_y = get_copy_state_by_sex(locus_expr, is_xx_expr) # Adjust frequency prior freq_prior_expr = get_freq_prior(freq_prior_expr, min_pop_prior) - prior_one_het = 1 - (1 - freq_prior_expr) ** 4 + prior_one_parent_het = 1 - (1 - freq_prior_expr) ** 4 # Convert PL to probabilities - pp_proband = transform_pl_to_pp(pl_proband) - pp_father = transform_pl_to_pp(pl_father) - pp_mother = transform_pl_to_pp(pl_mother) + pp_proband = transform_pl_to_pp(proband_pl) + pp_father = transform_pl_to_pp(father_pl) + pp_mother = transform_pl_to_pp(mother_pl) # Compute P(data | DN) prob_data_given_dn = ( hl.case() .when(hemi_x, pp_mother[0] * pp_proband[2]) .when(hemi_y, pp_father[0] * pp_proband[2]) - .when(not_hemi_expr, pp_father[0] * pp_mother[0] * pp_proband[1]) + .when(diploid_expr, pp_father[0] * pp_mother[0] * pp_proband[1]) .or_missing() ) # Compute P(data | missed het in parent(s)) prob_data_missed_het = ( hl.case() - .when(hemi_x, (pp_mother[1] + pp_mother[2]) * pp_proband[2] * prior_one_het) - .when(hemi_y, (pp_father[1] + pp_father[2]) * pp_proband[2] * prior_one_het) + .when(hemi_x, (pp_mother[1] + pp_mother[2]) * pp_proband[2] * prior_one_parent_het) + .when(hemi_y, (pp_father[1] + pp_father[2]) * pp_proband[2] * prior_one_parent_het) .when( - not_hemi_expr, + diploid_expr, (pp_father[1] * pp_mother[0] + pp_father[0] * pp_mother[1]) * pp_proband[1] - * prior_one_het, + * prior_one_parent_het, ) .or_missing() ) @@ -1473,6 +1450,40 @@ def calculate_de_novo_post_prob( p_dn = prob_dn_given_data / (prob_dn_given_data + prob_data_missed_het) return p_dn +def call_de_novo( + locus_expr: hl.expr.LocusExpression, + proband_expr: hl.expr.StructExpression, + father_expr: hl.expr.StructExpression, + mother_expr: hl.expr.StructExpression, + is_xx_expr: hl.expr.BooleanExpression, +) -> 
hl.expr.BooleanExpression: + """ + Call a de novo mutation based on the proband and parent genotypes. + + :param locus_expr: Variant locus. + :param proband_expr: Proband genotype info, required fields: GT. + :param father_expr: Father genotype info, required fields: GT. + :param mother_expr: Mother genotype info, required fields: GT. + :param is_xx_expr: Whether the proband is XX. + :return: BooleanExpression indicating whether the variant is a de novo mutation. + """ + # Ensure valid genomic context + diploid_expr, hemi_x_expr, hemi_y_expr = get_copy_state_by_sex(locus_expr, + is_xx_expr) + + is_de_novo = ( + diploid_expr + & ( + proband_expr.GT.is_het() + & father_expr.GT.is_hom_ref() + & mother_expr.GT.is_hom_ref() + ) + | hemi_x_expr & (proband_expr.GT.is_hom_var() & mother_expr.GT.is_hom_ref()) + | hemi_y_expr & (proband_expr.GT.is_hom_var() & father_expr.GT.is_hom_ref()) + ) + + return is_de_novo + def get_de_novo_expr( locus_expr: hl.expr.LocusExpression, @@ -1480,7 +1491,7 @@ def get_de_novo_expr( proband_expr: hl.expr.StructExpression, father_expr: hl.expr.StructExpression, mother_expr: hl.expr.StructExpression, - is_XX_expr: hl.expr.BooleanExpression, + is_xx_expr: hl.expr.BooleanExpression, freq_prior_expr: hl.expr.Float64Expression, min_pop_prior: float = 100 / 3e7, de_novo_prior: float = 1 / 3e7, @@ -1515,101 +1526,68 @@ def get_de_novo_expr( | VERY LOW | >= 0.05 | | | | | | | +----------------+------------+----------------------+------+------+------+------+------+ - Notes: - - AB: Normally refers to AB for the proband, except when a threshold for - parent(s) is specified for FAIL. - - DP: DP for the proband. - - DR: Defined as DP(proband) / DP(parent(s)). - - GQ: GQ for the proband. - - AC: Intended to be the sum of alternate alleles in the proband and parents. - This has **not been implemented yet** due to multiple trios in one family, - where an allele might be **de novo in a parent** and **transmitted to a child** in the dataset. 
- - locus_expr: Locus of the variant. - alleles_expr: Variant alleles. - proband_expr: Proband genotype info, required fields: GT, DP, GQ, AD, PL. - father_expr: Fa - mother_expr : hl.expr.StructExpression - Mother's genotype details (e.g., DP, AD). - is_female_expr : hl.expr.BooleanExpression - Whether the proband is female. - freq_prior_expr : hl.expr.Float64Expression - Population frequency prior for the variant. - min_pop_prior : float, optional - Minimum population frequency prior (default: 100 / 3e7). - de_novo_prior : float, optional - Prior probability of a de novo mutation (default: 1 / 3e7). - min_dp_ratio : float, optional - Minimum depth ratio for proband to parents (default: 0.1). - min_gq : int, optional - Minimum genotype quality for the proband (default: 20). - min_proband_ab : float, optional - Minimum allele balance for the proband (default: 0.2). - max_parent_ab : float, optional - Maximum allele balance for parents (default: 0.05). - min_de_novo_p : float, optional - Minimum probability for variant to be called de novo (default: 0.05). - high_conf_dp_ratio : float, optional - DP ratio threshold of proband DP to combined DP in parents for high confidence (default: 0.2). - dp_threshold_snp : int, optional - Minimum depth for high-confidence SNPs (default: 10). - high_med_conf_ab : float, optional - AB threshold for high/medium confidence (default: 0.3). - low_conf_ab : float, optional - AB threshold for low confidence (default: 0.2). - high_conf_p : float, optional - P(de novo) threshold for high confidence (default: 0.99). - med_conf_p : float, optional - P(de novo) threshold for medium confidence (default: 0.5). - low_conf_p : float, optional - P(de novo) threshold for low confidence (default: 0.2). - - Returns - ------- - hl.expr.StructExpression - A struct containing: - - `confidence`: Confidence level ("HIGH", "MEDIUM", "LOW", or missing). - - `fail`: Boolean indicating if the variant fails any checks. 
- - `fail_reason`: Set of strings with reasons for failure. + * AB: Normally refers to AB for the proband, except when a threshold for parent(s) is specified for FAIL. + + * DP: DP for the proband. + + * DR: Defined as DP(proband) / DP(parent(s)). + + * GQ: GQ for the proband. + + * AC: Supposed to be the sum of alternate alleles in the proband and parents. This has not been implemented yet due to multiple trios in one family, where an allele might be de novo in a parent and transmitted to a child in the dataset. + + :param locus_expr: Variant locus. + :param alleles_expr: Variant alleles. It assumes bi-allelic variants, meaning + that the matrix table or table should be already split to bi-allelics. + :param proband_expr: Proband genotype info, required fields: GT, DP, GQ, AD, PL. + :param father_expr: Father genotype info, required fields: GT, DP, GQ, AD, PL. + :param mother_expr: Mother genotype info, required fields: GT, DP, GQ, AD, PL. + :param is_xx_expr: Whether the proband is XX. + :param freq_prior_expr: Population frequency prior for the variant. + :param min_pop_prior: Minimum population frequency prior, default to 100 / 3e7. + :param de_novo_prior: Prior probability of a de novo mutation, default to 1 / 3e7. + :param min_dp_ratio: Minimum depth ratio for proband to parents, default to 0.1. + :param min_gq: Minimum genotype quality for the proband, default to 20. + :param min_proband_ab: Minimum allele balance for the proband, default to 0.2. + :param max_parent_ab: Maximum allele balance for parents, default to 0.05. + :param min_de_novo_p: Minimum probability for variant to be called de novo, default to 0.05. + :param high_conf_dp_ratio: DP ratio threshold of proband DP to combined DP in parents for high confidence, default to 0.2. + :param dp_threshold_snp: Minimum depth for high-confidence SNPs, default to 10. + :param high_med_conf_ab: AB threshold for high/medium confidence, default to 0.3. 
+ :param low_conf_ab: AB threshold for low confidence, default to 0.2. + :param high_conf_p: P(de novo) threshold for high confidence, default to 0.99. + :param med_conf_p: P(de novo) threshold for medium confidence, default to 0.5. + :param low_conf_p: P(de novo) threshold for low confidence, default to 0.2. + + :return: A StructExpression with the de novo status and confidence. """ + # Determine genomic context + diploid_expr, hemi_x_expr, hemi_y_expr = get_copy_state_by_sex( + locus_expr, is_xx_expr + ) + p_de_novo = calculate_de_novo_post_prob( proband_expr.PL, father_expr.PL, mother_expr.PL, locus_expr, - is_XX_expr, + is_xx_expr, freq_prior_expr, min_pop_prior=min_pop_prior, de_novo_prior=de_novo_prior, ) - # Determine genomic context - not_hemi_expr, hemi_x_expr, hemi_y_expr = get_genomic_context( - locus_expr, is_XX_expr - ) - - is_de_novo = ( - not_hemi_expr - & ( - proband_expr.GT.is_het() - & father_expr.GT.is_hom_ref() - & mother_expr.GT.is_hom_ref() - ) - | hemi_x_expr & (proband_expr.GT.is_hom_var() & mother_expr.GT.is_hom_ref()) - | hemi_y_expr & (proband_expr.GT.is_hom_var() & father_expr.GT.is_hom_ref()) - ) - # Calculate DP ratio parent_dp = ( hl.case() - .when(not_hemi_expr, father_expr.DP + mother_expr.DP) + .when(diploid_expr, father_expr.DP + mother_expr.DP) .when(hemi_x_expr, mother_expr.DP) .when(hemi_y_expr, father_expr.DP) .or_missing() ) dp_ratio = proband_expr.DP / parent_dp - # Key metrics + # Calculate proband AB and assign variant type proband_ab = proband_expr.AD[1] / hl.sum(proband_expr.AD) is_snp = hl.is_snp(alleles_expr[0], alleles_expr[1]) @@ -1640,7 +1618,7 @@ def get_de_novo_expr( parent_sum_ad_0 = ( hl.case() - .when(not_hemi_expr, (hl.sum(father_expr.AD) == 0) | (hl.sum(mother_expr.AD) + .when(diploid_expr, (hl.sum(father_expr.AD) == 0) | (hl.sum(mother_expr.AD) == 0)) .when(hemi_x_expr, hl.sum(mother_expr.AD) == 0) .when(hemi_y_expr, hl.sum(father_expr.AD) == 0) @@ -1650,7 +1628,7 @@ def get_de_novo_expr( fail_max_parent_ab 
= ( hl.case() .when( - not_hemi_expr, + diploid_expr, (father_expr.AD[1] / hl.sum(father_expr.AD) > max_parent_ab) | (mother_expr.AD[1] / hl.sum(mother_expr.AD) > max_parent_ab), ) @@ -1671,7 +1649,6 @@ def get_de_novo_expr( fail = hl.any(list(fail_checks.values())) result_expr = hl.struct( - is_de_novo=hl.if_else(is_de_novo, True, False), p_de_novo=hl.if_else(fail, hl.missing(hl.tfloat64), p_de_novo), confidence=hl.if_else(fail, hl.missing(hl.tstr), confidence), fail_reason=add_filters_expr(filters=fail_checks), diff --git a/gnomad/utils/annotations.py b/gnomad/utils/annotations.py index 350735a37..6def6292d 100644 --- a/gnomad/utils/annotations.py +++ b/gnomad/utils/annotations.py @@ -2780,3 +2780,28 @@ def _create_group_dicts( final_freq_dict["subcohortFrequency"] = list_of_group_info_dicts return final_freq_dict + + +def get_copy_state_by_sex( + locus_expr: hl.expr.LocusExpression, + is_xx_expr: hl.expr.BooleanExpression, +) -> Tuple[ + hl.expr.BooleanExpression, hl.expr.BooleanExpression, hl.expr.BooleanExpression +]: + """ + Determine the copy state of a variant by its locus and the sex karotype of a sample. + + :param locus_expr: LocusExpression of the variant. + :param is_xx_expr: BooleanExpression indicating whether the sample has an XX sex + karyotype. + :return: A tuple of BooleanExpressions: + - diploid_expr: True if the variant is in autosomes or PAR regions, or in the X non-PAR region for XX individuals. + - hemi_x_expr: True if the variant is in the X non-PAR region for XY individuals. + - hemi_y_expr: True if the variant is in the Y non-PAR region for XY individuals. 
+ """ + diploid_expr = locus_expr.in_autosome_or_par() | ( + locus_expr.in_x_nonpar() & is_xx_expr + ) + hemi_x_expr = locus_expr.in_x_nonpar() & ~is_xx_expr + hemi_y_expr = locus_expr.in_y_nonpar() & ~is_xx_expr + return diploid_expr, hemi_x_expr, hemi_y_expr From 2f11811be009029d80bb7f6061253a2fb84b9a2d Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Fri, 31 Jan 2025 12:28:33 -0500 Subject: [PATCH 10/56] Formatting --- gnomad/sample_qc/relatedness.py | 65 +++++++++++++++++++-------------- 1 file changed, 37 insertions(+), 28 deletions(-) diff --git a/gnomad/sample_qc/relatedness.py b/gnomad/sample_qc/relatedness.py index 157dccf19..6ef336846 100644 --- a/gnomad/sample_qc/relatedness.py +++ b/gnomad/sample_qc/relatedness.py @@ -1310,16 +1310,18 @@ def get_freq_prior(freq_prior_expr: hl.expr.Float64Expression, min_pop_prior=100 :param freq_prior_expr: The population frequency prior for the variant. :param min_pop_prior: The minimum population frequency prior. 
""" - return hl.max(hl.or_else( - hl.case() - .when((freq_prior_expr >= 0) & (freq_prior_expr <= 1), freq_prior_expr) - .or_error( - hl.format( - "de_novo: expect 0 <= freq_prior_expr <= 1, found %.3e", - freq_prior_expr, - ) + return hl.max( + hl.or_else( + hl.case() + .when((freq_prior_expr >= 0) & (freq_prior_expr <= 1), freq_prior_expr) + .or_error( + hl.format( + "de_novo: expect 0 <= freq_prior_expr <= 1, found %.3e", + freq_prior_expr, + ) + ), + 0.0, ), - 0.0), min_pop_prior, ) @@ -1434,8 +1436,12 @@ def calculate_de_novo_post_prob( # Compute P(data | missed het in parent(s)) prob_data_missed_het = ( hl.case() - .when(hemi_x, (pp_mother[1] + pp_mother[2]) * pp_proband[2] * prior_one_parent_het) - .when(hemi_y, (pp_father[1] + pp_father[2]) * pp_proband[2] * prior_one_parent_het) + .when( + hemi_x, (pp_mother[1] + pp_mother[2]) * pp_proband[2] * prior_one_parent_het + ) + .when( + hemi_y, (pp_father[1] + pp_father[2]) * pp_proband[2] * prior_one_parent_het + ) .when( diploid_expr, (pp_father[1] * pp_mother[0] + pp_father[0] * pp_mother[1]) @@ -1450,12 +1456,13 @@ def calculate_de_novo_post_prob( p_dn = prob_dn_given_data / (prob_dn_given_data + prob_data_missed_het) return p_dn + def call_de_novo( - locus_expr: hl.expr.LocusExpression, - proband_expr: hl.expr.StructExpression, - father_expr: hl.expr.StructExpression, - mother_expr: hl.expr.StructExpression, - is_xx_expr: hl.expr.BooleanExpression, + locus_expr: hl.expr.LocusExpression, + proband_expr: hl.expr.StructExpression, + father_expr: hl.expr.StructExpression, + mother_expr: hl.expr.StructExpression, + is_xx_expr: hl.expr.BooleanExpression, ) -> hl.expr.BooleanExpression: """ Call a de novo mutation based on the proband and parent genotypes. @@ -1468,18 +1475,19 @@ def call_de_novo( :return: BooleanExpression indicating whether the variant is a de novo mutation. 
""" # Ensure valid genomic context - diploid_expr, hemi_x_expr, hemi_y_expr = get_copy_state_by_sex(locus_expr, - is_xx_expr) + diploid_expr, hemi_x_expr, hemi_y_expr = get_copy_state_by_sex( + locus_expr, is_xx_expr + ) is_de_novo = ( - diploid_expr - & ( - proband_expr.GT.is_het() - & father_expr.GT.is_hom_ref() - & mother_expr.GT.is_hom_ref() - ) - | hemi_x_expr & (proband_expr.GT.is_hom_var() & mother_expr.GT.is_hom_ref()) - | hemi_y_expr & (proband_expr.GT.is_hom_var() & father_expr.GT.is_hom_ref()) + diploid_expr + & ( + proband_expr.GT.is_het() + & father_expr.GT.is_hom_ref() + & mother_expr.GT.is_hom_ref() + ) + | hemi_x_expr & (proband_expr.GT.is_hom_var() & mother_expr.GT.is_hom_ref()) + | hemi_y_expr & (proband_expr.GT.is_hom_var() & father_expr.GT.is_hom_ref()) ) return is_de_novo @@ -1618,8 +1626,9 @@ def get_de_novo_expr( parent_sum_ad_0 = ( hl.case() - .when(diploid_expr, (hl.sum(father_expr.AD) == 0) | (hl.sum(mother_expr.AD) - == 0)) + .when( + diploid_expr, (hl.sum(father_expr.AD) == 0) | (hl.sum(mother_expr.AD) == 0) + ) .when(hemi_x_expr, hl.sum(mother_expr.AD) == 0) .when(hemi_y_expr, hl.sum(father_expr.AD) == 0) .or_missing() From f4dfe3232d5a2e23ab28e0f5dc802499a58cec0b Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Fri, 31 Jan 2025 12:31:05 -0500 Subject: [PATCH 11/56] isort --- gnomad/sample_qc/relatedness.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gnomad/sample_qc/relatedness.py b/gnomad/sample_qc/relatedness.py index 6ef336846..d03e28306 100644 --- a/gnomad/sample_qc/relatedness.py +++ b/gnomad/sample_qc/relatedness.py @@ -8,8 +8,8 @@ import hail as hl import networkx as nx -from gnomad.utils.filtering import add_filters_expr from gnomad.utils.annotations import get_copy_state_by_sex +from gnomad.utils.filtering import add_filters_expr logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s") logger = logging.getLogger(__name__) From 
dce95f08fe4089ed8f75252b21cbd0d3452c9853 Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Mon, 3 Feb 2025 09:00:09 -0500 Subject: [PATCH 12/56] Add test module for de novo functions --- gnomad/sample_qc/relatedness.py | 6 +- tests/sample_qc/test_de_novo.py | 228 ++++++++++++++++++++++++++++++++ 2 files changed, 231 insertions(+), 3 deletions(-) create mode 100644 tests/sample_qc/test_de_novo.py diff --git a/gnomad/sample_qc/relatedness.py b/gnomad/sample_qc/relatedness.py index d03e28306..a9a2ceb51 100644 --- a/gnomad/sample_qc/relatedness.py +++ b/gnomad/sample_qc/relatedness.py @@ -1468,9 +1468,9 @@ def call_de_novo( Call a de novo mutation based on the proband and parent genotypes. :param locus_expr: Variant locus. - :param proband_expr: Proband genotype info, required fields: GT. - :param father_expr: Father genotype info, required fields: GT. - :param mother_expr: Mother genotype info, required fields: GT. + :param proband_expr: Proband genotype info, required field: GT. + :param father_expr: Father genotype info, required field: GT. + :param mother_expr: Mother genotype info, required field: GT. :param is_xx_expr: Whether the proband is XX. :return: BooleanExpression indicating whether the variant is a de novo mutation. 
""" diff --git a/tests/sample_qc/test_de_novo.py b/tests/sample_qc/test_de_novo.py new file mode 100644 index 000000000..26c0fbb8e --- /dev/null +++ b/tests/sample_qc/test_de_novo.py @@ -0,0 +1,228 @@ +"""Test suite for de novo mutation functions.""" + +import pytest +import hail as hl + +from gnomad.sample_qc.relatedness import ( + get_freq_prior, + transform_pl_to_pp, + calculate_de_novo_post_prob, + call_de_novo, + get_de_novo_expr, +) + +from gnomad.utils.annotations import get_copy_state_by_sex + + +class TestDeNovoMutation: + """Test suite for de novo mutation functions.""" + + loci: dict[str, hl.expr.LocusExpression] + + @classmethod + def setup_class(cls): + """Set up common test data for all tests.""" + cls.locus_expr = hl.locus("1", 123456) + cls.alleles_expr = hl.literal(["A", "T"]) + cls.freq_prior_expr = hl.literal(0.01) + cls.is_xx_expr = hl.literal(False) + + # Mock Genotype Likelihoods (PL) + cls.proband_pl = hl.literal([0, 10, 100]) + cls.father_pl = hl.literal([0, 100, 100]) + cls.mother_pl = hl.literal([0, 100, 100]) + + # Mock Genotype Calls + cls.proband_expr = hl.struct( + GT=hl.call(0, 1), DP=10, GQ=30, AD=[3, 7], PL=cls.proband_pl + ) + cls.father_expr = hl.struct( + GT=hl.call(0, 0), DP=12, GQ=40, AD=[12, 0], PL=cls.father_pl + ) + cls.mother_expr = hl.struct( + GT=hl.call(0, 0), DP=15, GQ=50, AD=[15, 0], PL=cls.mother_pl + ) + + cls.loci = { + "autosomal": hl.locus("chr1", 100000, reference_genome="GRCh38"), + # PAR regions (always diploid) + "par1": hl.locus("chrX", 2781479, reference_genome="GRCh38"), # PAR1 start + "par2": hl.locus("chrX", 155701383, reference_genome="GRCh38"), # PAR2 end + # X non-PAR (diploid for XX, hemizygous for XY) + "x_nonpar": hl.locus("chrX", 3000000, reference_genome="GRCh38"), + # Y non-PAR (hemizygous for XY) + "y_nonpar": hl.locus("chrY", 10000000, reference_genome="GRCh38"), + } + + @pytest.mark.parametrize( + "freq_prior, min_pop_prior, expect_error, expected", + [ + (0.05, 100 / 3e7, False, 0.05), + 
(-0.01, 100 / 3e7, True, None), + (1.2, 100 / 3e7, True, None), + (hl.missing(hl.tfloat64), 100 / 3e7, False, 100 / 3e7), + ], + ) + def test_get_freq_prior( + self, freq_prior, min_pop_prior, expect_error, expected + ) -> None: + """ + Test frequency prior computation. + + :param freq_prior: Frequency prior value. + :param min_pop_prior: Minimum population prior. + :param expect_error: Whether an error is expected. + :param expected: Expected frequency prior. + :return: None. + """ + if expect_error: + with pytest.raises( + hl.utils.java.HailUserError, + match="de_novo: expect 0 <= freq_prior_expr <= 1", + ): + expr = get_freq_prior(hl.literal(freq_prior), min_pop_prior) + hl.eval(expr) # Hail will throw an error here + else: + expr = get_freq_prior(hl.literal(freq_prior), min_pop_prior) + result = hl.eval(expr) # Evaluate the expression + assert result == pytest.approx( + expected, rel=1e-6 + ) # Compare floating point values safely + + @pytest.mark.parametrize( + "pl_input, expected", + [ + ( + [0, 10, 100], + [0.9090909090082644, 0.09090909090082644, 9.090909090082645e-11], + ), + ([0, 0, 0], [0.3333333333333333, 0.3333333333333333, 0.3333333333333333]), + ], + ) + def test_transform_pl_to_pp(self, pl_input, expected) -> None: + """ + Test PL to PP transformation. + + :param pl_input: Input PL values. + :param expected: Expected PP values. + :return: None. 
+ """ + expr = transform_pl_to_pp(hl.literal(pl_input)) + result = hl.eval(expr) + + assert result == pytest.approx( + expected, abs=1e-12 + ), f"Got {result}, expected {expected}" + + @pytest.mark.parametrize( + "locus_key, is_xx, expected_diploid, expected_hemi_x, expected_hemi_y", + [ + ("autosomal", True, True, False, False), + ("autosomal", False, True, False, False), + ("par1", True, True, False, False), + ("par2", False, True, False, False), + ("x_nonpar", True, True, False, False), + ("x_nonpar", False, False, True, False), + ("y_nonpar", True, False, False, False), + ("y_nonpar", False, False, False, True), + ], + ) + def test_get_copy_state_by_sex( + self, locus_key, is_xx, expected_diploid, expected_hemi_x, expected_hemi_y + ) -> None: + """ + Test copy state determination based on locus type and sex. + + :param locus_key: Locus key. + :param is_xx: Whether the individual is XX. + :param expected_diploid: Expected diploid state. + :param expected_hemi_x: Expected hemizygous X state. + :param expected_hemi_y: Expected hemizygous Y state. + :return: None. + """ + locus = self.loci[locus_key] + is_xx_expr = hl.literal(is_xx) + + diploid, hemi_x, hemi_y = get_copy_state_by_sex(locus, is_xx_expr) + result = hl.eval([diploid, hemi_x, hemi_y]) + + assert result == [ + expected_diploid, + expected_hemi_x, + expected_hemi_y, + ], f"Failed for locus={locus}, is_xx={is_xx}. 
Expected {[expected_diploid, expected_hemi_x, expected_hemi_y]}, got {result}" + + @pytest.mark.parametrize( + "locus_key, proband_gt, father_gt, mother_gt, is_xx, expected", + [ + ("autosomal", (0, 1), (0, 0), (0, 0), False, True), + ("autosomal", (1, 1), (0, 0), (0, 0), False, False), + ("x_nonpar", (1, 1), None, (0, 0), False, True), + ("x_nonpar", (1, 1), (0, 0), None, False, None), + ("y_nonpar", (1, 1), None, None, False, None), + ], + ) + def test_call_de_novo( + self, locus_key, proband_gt, father_gt, mother_gt, is_xx, expected + ) -> None: + """ + Test de novo mutation detection with different loci and parental genotypes. + + :param locus_key: Locus key. + :param proband_gt: Proband genotype. + :param father_gt: Father genotype. + :param mother_gt: Mother genotype. + :param is_xx: Whether the individual is XX. + :param expected: Expected de novo mutation status. + :return: None. + """ + locus_expr = self.loci[locus_key] + proband_expr = hl.struct( + GT=hl.call(*proband_gt) if proband_gt else hl.missing(hl.tcall) + ) + father_expr = hl.struct( + GT=hl.call(*father_gt) if father_gt else hl.missing(hl.tcall) + ) + mother_expr = hl.struct( + GT=hl.call(*mother_gt) if mother_gt else hl.missing(hl.tcall) + ) + is_xx_expr = hl.literal(is_xx) + + expr = call_de_novo( + locus_expr, proband_expr, father_expr, mother_expr, is_xx_expr + ) + result = hl.eval(expr) + + assert ( + result == expected + ), f"Mismatch in {locus_key}: Expected {expected}, got {result}" + + def test_calculate_de_novo_post_prob(self): + """Test posterior probability computation for de novo mutations.""" + expr = calculate_de_novo_post_prob( + self.proband_pl, + self.father_pl, + self.mother_pl, + self.locus_expr, + self.is_xx_expr, + self.freq_prior_expr, + ) + result = hl.eval(expr) + assert 0 <= result <= 1 # Posterior probability should be within valid range + + def test_get_de_novo_expr(self): + """Test the de novo expression struct output.""" + expr = get_de_novo_expr( + self.locus_expr, 
+ self.alleles_expr, + self.proband_expr, + self.father_expr, + self.mother_expr, + self.is_xx_expr, + self.freq_prior_expr, + ) + result = hl.eval(expr) + + assert "p_de_novo" in result + assert "confidence" in result + assert 0 <= result.p_de_novo <= 1 # Probability must be valid From 7ccda174add1a6ea890fdc85b39832fc9d85da06 Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Mon, 3 Feb 2025 09:15:19 -0500 Subject: [PATCH 13/56] small formatting --- tests/sample_qc/test_de_novo.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/sample_qc/test_de_novo.py b/tests/sample_qc/test_de_novo.py index 26c0fbb8e..0620f1e83 100644 --- a/tests/sample_qc/test_de_novo.py +++ b/tests/sample_qc/test_de_novo.py @@ -1,7 +1,6 @@ """Test suite for de novo mutation functions.""" import pytest -import hail as hl from gnomad.sample_qc.relatedness import ( get_freq_prior, @@ -150,7 +149,8 @@ def test_get_copy_state_by_sex( expected_diploid, expected_hemi_x, expected_hemi_y, - ], f"Failed for locus={locus}, is_xx={is_xx}. Expected {[expected_diploid, expected_hemi_x, expected_hemi_y]}, got {result}" + ], (f"Failed for locus={locus}, is_xx={is_xx}. 
Expected" + f" {[expected_diploid, expected_hemi_x, expected_hemi_y]}, got {result}") @pytest.mark.parametrize( "locus_key, proband_gt, father_gt, mother_gt, is_xx, expected", From e9d49d4fdbcbe68dc5f0448569869a59adf3ae62 Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Mon, 3 Feb 2025 09:17:55 -0500 Subject: [PATCH 14/56] black formatting --- tests/sample_qc/test_de_novo.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/sample_qc/test_de_novo.py b/tests/sample_qc/test_de_novo.py index 0620f1e83..ea79037ab 100644 --- a/tests/sample_qc/test_de_novo.py +++ b/tests/sample_qc/test_de_novo.py @@ -1,5 +1,6 @@ """Test suite for de novo mutation functions.""" +import hail as hl import pytest from gnomad.sample_qc.relatedness import ( @@ -12,7 +13,6 @@ from gnomad.utils.annotations import get_copy_state_by_sex - class TestDeNovoMutation: """Test suite for de novo mutation functions.""" @@ -149,8 +149,10 @@ def test_get_copy_state_by_sex( expected_diploid, expected_hemi_x, expected_hemi_y, - ], (f"Failed for locus={locus}, is_xx={is_xx}. Expected" - f" {[expected_diploid, expected_hemi_x, expected_hemi_y]}, got {result}") + ], ( + f"Failed for locus={locus}, is_xx={is_xx}. 
Expected" + f" {[expected_diploid, expected_hemi_x, expected_hemi_y]}, got {result}" + ) @pytest.mark.parametrize( "locus_key, proband_gt, father_gt, mother_gt, is_xx, expected", From ef6dbe1b22eefcb4f6ae6f2e5ae63ee016dabdfb Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Mon, 3 Feb 2025 09:20:45 -0500 Subject: [PATCH 15/56] Black --- tests/sample_qc/test_de_novo.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/sample_qc/test_de_novo.py b/tests/sample_qc/test_de_novo.py index ea79037ab..3baa77e68 100644 --- a/tests/sample_qc/test_de_novo.py +++ b/tests/sample_qc/test_de_novo.py @@ -13,6 +13,7 @@ from gnomad.utils.annotations import get_copy_state_by_sex + class TestDeNovoMutation: """Test suite for de novo mutation functions.""" From 0a5616d3ca2af39465d2e07566e542eec79c7a3a Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Mon, 3 Feb 2025 09:23:47 -0500 Subject: [PATCH 16/56] isort --- tests/sample_qc/test_de_novo.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/sample_qc/test_de_novo.py b/tests/sample_qc/test_de_novo.py index 3baa77e68..7509f434a 100644 --- a/tests/sample_qc/test_de_novo.py +++ b/tests/sample_qc/test_de_novo.py @@ -4,13 +4,12 @@ import pytest from gnomad.sample_qc.relatedness import ( - get_freq_prior, - transform_pl_to_pp, calculate_de_novo_post_prob, call_de_novo, get_de_novo_expr, + get_freq_prior, + transform_pl_to_pp, ) - from gnomad.utils.annotations import get_copy_state_by_sex From 4376c66c07b09a016f01a2e7fe222ea0f1ca1986 Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Mon, 3 Feb 2025 13:56:00 -0500 Subject: [PATCH 17/56] Add missing docstring and notes --- gnomad/sample_qc/relatedness.py | 107 ++++++++++++++++++++------------ 1 file changed, 69 insertions(+), 38 deletions(-) diff --git a/gnomad/sample_qc/relatedness.py b/gnomad/sample_qc/relatedness.py index a9a2ceb51..61e7b6d33 
100644 --- a/gnomad/sample_qc/relatedness.py +++ b/gnomad/sample_qc/relatedness.py @@ -1308,7 +1308,7 @@ def get_freq_prior(freq_prior_expr: hl.expr.Float64Expression, min_pop_prior=100 Get the population frequency prior for a de novo mutation. :param freq_prior_expr: The population frequency prior for the variant. - :param min_pop_prior: The minimum population frequency prior. + :param min_pop_prior: The minimum population frequency prior. default is 100/3e7, same format as Samocha's original code. """ return hl.max( hl.or_else( @@ -1327,11 +1327,18 @@ def get_freq_prior(freq_prior_expr: hl.expr.Float64Expression, min_pop_prior=100 def transform_pl_to_pp(pl_expr: hl.expr.ArrayExpression) -> hl.expr.ArrayExpression: - """ - Transform the PLs into the probability of observing genotype. + r""" + Transform the Phred-scaled likelihoods (PL) into the probability of observing each genotype (PP). + + .. note:: + The Phred-scaled likelihoods (PL) are transformed back into probabilities (PP) + using the relationship: + + .. math:: + PL = -10 \\times \\log_{10}{P(\\text{Genotype} | \\text{Data})} - :param pl_expr: ArrayExpression of PL. - :return: ArrayExpression of the probability of observing each genotype. + :param pl_expr: ArrayExpression of PL values. + :return: ArrayExpression of the probability of observing each genotype (PP). """ return hl.bind(lambda x: x / hl.sum(x), 10 ** (-pl_expr / 10)) @@ -1346,61 +1353,83 @@ def calculate_de_novo_post_prob( min_pop_prior: Optional[float] = 100 / 3e7, de_novo_prior: Optional[float] = 1 / 3e7, ) -> hl.expr.Float64Expression: - """ + r""" Calculate the posterior probability of a de novo mutation. This function computes the posterior probability of a de novo mutation (P_dn) using the likelihoods of the proband's and parents' genotypes and the population - frequency prior for the variant. + frequency prior for the variant. 
It's based on Kaitlin Samocha's [de novo caller]( + https://github.com/ksamocha/de_novo_scripts) and Hail's [de_novo]( + https://hail.is/docs/0.2/methods/genetics.html#hail.methods.de_novo) + method, however, the original docstring didn't provide a clear explanation on how + to calculate for hemizygous regions of the XY individuals. + + The posterior probability of a de novo mutation (:math:`P_{dn}`) is computed as: + + .. math:: + P_{dn} = \\frac{P(DN | \\text{data})}{P(DN | \\text{data}) + P(\\text{missed het in parent(s)} | \\text{data})} + + The terms are defined as follows: + + **Probability of a de novo mutation given the data** (:math:`P(DN | \text{data})`): + + .. math:: + P(DN | \\text{data}) = P(\\text{data} | DN) \\times P(DN) + + where: + + - :math:`P(\\text{data} | DN)`: Probability of observing the data under the assumption of a de novo mutation. + + - **Autosomes and PAR regions**: - Based on Kaitlin Samocha's [de novo caller]( - https://github.com/ksamocha/de_novo_scripts), - the posterior probability of a de novo mutation (`P_dn`) is computed as: + .. math:: + P(\\text{data} | DN) = P(\\text{hom\\_ref in father}) \\times P(\\text{hom\\_ref in mother}) \\times P(\\text{het in proband}) - P_dn = P(DN | data) / (P(DN | data) + P(missed het in parent(s) | data)) + - **X non-PAR regions (XY only)**: - The terms are defined as: - P(DN | data): The probability of a de novo mutation given the data. This is - computed as: + .. math:: + P(\\text{data} | DN) = P(\\text{hom\\_ref in mother}) \\times P(\\text{hom\\_alt in proband}) - P(DN | data) = P(data | DN) * P(DN) + - **Y non-PAR regions (XY only)**: - P(data | DN): The probability of observing the data under the assumption of a de novo mutation: - * Autosomes and PAR regions: + .. 
math:: + P(\\text{data} | DN) = P(\\text{hom\\_ref in father}) \\times P(\\text{hom\\_alt in proband}) - P(data | DN) = P(hom_ref in father) * P(hom_ref in mother) * P(het in proband) + - :math:`P(DN)`: The prior probability of a de novo mutation from literature, defined as: - * X non-PAR regions (XY only): + .. math:: + P(DN) = \\frac{1}{3 \\times 10^7} - P(data | DN) = P(hom_ref in mother) * P(hom_alt in proband) + **Probability of missed heterozygous parent(s) given the data** (:math:`P(\text{missed het in parent(s)} | \text{data})`): - * Y non-PAR regions (XY only): + .. math:: + P(\\text{missed het in parent(s)} | \\text{data}) = P(\\text{data} | \\text{at least one parent is het}) \\times P(\\text{one parent is het}) - P(data | DN) = P(hom_ref in father) * P(hom_alt in proband) + where: - P(DN): The prior probability of a de novo mutation from the literature, - P(DN) = 1 / 3e7 + - :math:`P(\\text{data} | \\text{missed het in parent(s)})`: Probability of observing the data under the assumption of a missed het in a parent. - P(missed het in parent(s) | data): The probability of observing missed het in - parent(s) given the data. This is computed as: + - **Autosomes and PAR regions**: - P(missed het in parent(s) | data) = P(data | at least one parent is het) * P(one parent is het) + .. math:: + P(\\text{data} | \\text{missed het in parents}) = \\left( P(\\text{het in father}) \\times P(\\text{hom\\_ref in mother}) + P(\\text{hom\\_ref in father}) \\times P(\\text{het in mother}) \\right) \\times P(\\text{het in proband}) - P(data | missed het in parent(s)): The probability of observing the data under the assumption of a missed het in parent(s): - * Autosomes and PAR regions: + - **X non-PAR regions**: - P(data | missed het in parents) = (P(het in father) * P(hom_ref in mother) + P(hom_ref in father) * P(het in mother)) * P(het in proband) + .. 
math:: + P(\\text{data} | \\text{missed het in mother}) = (P(\\text{het in mother}) + P(\\text{hom\\_var in mother})) \\times P(\\text{hom\\_var in proband}) - * X non-PAR regions: + - **Y non-PAR regions**: - P(data | missed het in mother) = (P(het in mother) + P(hom_var in mother)) * P(hom_var in proband) + .. math:: + P(\\text{data} | \\text{missed het in father}) = (P(\\text{het in father}) + P(\\text{hom\\_var in father})) \\times P(\\text{hom\\_var in proband}) - * Y non-PAR regions: + **Prior probability for at least one heterozygous parent**: - P(data | missed het in father) = (P(het in father) + P(hom_var in father)) * P(hom_var in proband) + .. math:: + P(\\text{het in one parent}) = 1 - (1 - \\text{freq\\_prior})^4 - - P(het in one parent): The prior probability for at least one alternate allele between the parents depends on the alternate allele frequency: - 1 - (1 - freq_prior)**4, where freq_prior is the population frequency prior for the variant. + where :math:`\\text{freq\\_prior}` is the population frequency prior for the variant. :param proband_pl: Phred-scaled genotype likelihoods for the proband. :param father_pl: Phred-scaled genotype likelihoods for the father. @@ -1530,7 +1559,7 @@ def get_de_novo_expr( | HIGH (SNV) 1 | > 0.99 | > 0.3 | | >10 | | | | | HIGH (SNV) 2 | > 0.5 | > 0.3 | | | >0.2 | | <10 | | MEDIUM | > 0.5 | > 0.3 | | | | | | - | LOW | > 0.2 | > 0.2 | | | | | | + | LOW | > 0.2 | >= 0.2 | | | | | | | VERY LOW | >= 0.05 | | | | | | | +----------------+------------+----------------------+------+------+------+------+------+ @@ -1619,7 +1648,9 @@ def get_de_novo_expr( .when((p_de_novo > low_conf_p) & (proband_ab >= low_conf_ab), "LOW") .when( (p_de_novo >= min_de_novo_p), - "VERY LOW", + "VERY LOW", # This is added to give a confidence level for variants that + # don't fail but don't meet the other thresholds for high, medium, + # or low confidence, and it's not in Samocha's original code. 
) .or_missing() ) From 6bfd82e9c14df5b11446a04ba40005ba7fe168e5 Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Mon, 3 Feb 2025 13:57:09 -0500 Subject: [PATCH 18/56] formatting --- gnomad/sample_qc/relatedness.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gnomad/sample_qc/relatedness.py b/gnomad/sample_qc/relatedness.py index 61e7b6d33..fe959321d 100644 --- a/gnomad/sample_qc/relatedness.py +++ b/gnomad/sample_qc/relatedness.py @@ -1308,7 +1308,8 @@ def get_freq_prior(freq_prior_expr: hl.expr.Float64Expression, min_pop_prior=100 Get the population frequency prior for a de novo mutation. :param freq_prior_expr: The population frequency prior for the variant. - :param min_pop_prior: The minimum population frequency prior. default is 100/3e7, same format as Samocha's original code. + :param min_pop_prior: The minimum population frequency prior, default is + 100/3e7, same format as Samocha's original code. """ return hl.max( hl.or_else( From 3d977d7e436f25c8294b9951f2f734b18794238e Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Wed, 5 Feb 2025 09:18:44 -0500 Subject: [PATCH 19/56] Apply suggestions from code review Co-authored-by: Katherine Chao --- gnomad/sample_qc/relatedness.py | 65 ++++++++++++++++----------------- gnomad/utils/annotations.py | 2 +- tests/sample_qc/test_de_novo.py | 4 +- 3 files changed, 35 insertions(+), 36 deletions(-) diff --git a/gnomad/sample_qc/relatedness.py b/gnomad/sample_qc/relatedness.py index fe959321d..8afcd901c 100644 --- a/gnomad/sample_qc/relatedness.py +++ b/gnomad/sample_qc/relatedness.py @@ -1308,8 +1308,8 @@ def get_freq_prior(freq_prior_expr: hl.expr.Float64Expression, min_pop_prior=100 Get the population frequency prior for a de novo mutation. :param freq_prior_expr: The population frequency prior for the variant. 
- :param min_pop_prior: The minimum population frequency prior, default is - 100/3e7, same format as Samocha's original code. + :param min_pop_prior: The minimum population frequency prior. Default is + 100/3e7, taken from Kaitlin Samocha's [de novo caller](https://github.com/ksamocha/de_novo_scripts). """ return hl.max( hl.or_else( @@ -1362,10 +1362,11 @@ def calculate_de_novo_post_prob( frequency prior for the variant. It's based on Kaitlin Samocha's [de novo caller]( https://github.com/ksamocha/de_novo_scripts) and Hail's [de_novo]( https://hail.is/docs/0.2/methods/genetics.html#hail.methods.de_novo) - method, however, the original docstring didn't provide a clear explanation on how - to calculate for hemizygous regions of the XY individuals. + method. Please refer to these sources for more information on the de novo model. + + Neither Kaitlin's de novo caller nor Hail's de novo method provide a clear description on how + to calculate for de novo calls for hemizygous genotypes in XY individuals. These equations are included below. - The posterior probability of a de novo mutation (:math:`P_{dn}`) is computed as: .. math:: P_{dn} = \\frac{P(DN | \\text{data})}{P(DN | \\text{data}) + P(\\text{missed het in parent(s)} | \\text{data})} @@ -1386,7 +1387,8 @@ def calculate_de_novo_post_prob( .. math:: P(\\text{data} | DN) = P(\\text{hom\\_ref in father}) \\times P(\\text{hom\\_ref in mother}) \\times P(\\text{het in proband}) - - **X non-PAR regions (XY only)**: + **Probability of a de novo mutation given the data for hemizygous calls in XY individuals** + - **X non-PAR regions (XY only)**: .. math:: P(\\text{data} | DN) = P(\\text{hom\\_ref in mother}) \\times P(\\text{hom\\_alt in proband}) @@ -1440,7 +1442,7 @@ def calculate_de_novo_post_prob( :param freq_prior_expr: Population frequency prior for the variant. :param min_pop_prior: Minimum population frequency prior (default: 100/3e7). :param de_novo_prior: Prior probability of a de novo mutation (default: 1/3e7). 
- :return: Posterior probability of a de novo mutation (P_dn). + :return: Posterior probability of a de novo mutation (`P_dn`). """ # Ensure valid genomic context diploid_expr, hemi_x, hemi_y = get_copy_state_by_sex(locus_expr, is_xx_expr) @@ -1454,8 +1456,8 @@ def calculate_de_novo_post_prob( pp_father = transform_pl_to_pp(father_pl) pp_mother = transform_pl_to_pp(mother_pl) - # Compute P(data | DN) - prob_data_given_dn = ( + # Compute `P(data | DN)` + prob_data_given_dn_expr = ( hl.case() .when(hemi_x, pp_mother[0] * pp_proband[2]) .when(hemi_y, pp_father[0] * pp_proband[2]) @@ -1463,8 +1465,8 @@ def calculate_de_novo_post_prob( .or_missing() ) - # Compute P(data | missed het in parent(s)) - prob_data_missed_het = ( + # Compute `P(data | missed het in parent(s))` + prob_data_missed_het_expr = ( hl.case() .when( hemi_x, (pp_mother[1] + pp_mother[2]) * pp_proband[2] * prior_one_parent_het @@ -1482,9 +1484,9 @@ def calculate_de_novo_post_prob( ) # Calculate posterior probability of de novo mutation - prob_dn_given_data = prob_data_given_dn * de_novo_prior - p_dn = prob_dn_given_data / (prob_dn_given_data + prob_data_missed_het) - return p_dn + prob_dn_given_data_expr = prob_data_given_dn_expr * de_novo_prior + p_dn_expr = prob_dn_given_data_expr / (prob_dn_given_data_expr + prob_data_missed_het_expr) + return p_dn_expr def call_de_novo( @@ -1519,7 +1521,6 @@ def call_de_novo( | hemi_x_expr & (proband_expr.GT.is_hom_var() & mother_expr.GT.is_hom_ref()) | hemi_y_expr & (proband_expr.GT.is_hom_var() & father_expr.GT.is_hom_ref()) ) - return is_de_novo @@ -1564,22 +1565,20 @@ def get_de_novo_expr( | VERY LOW | >= 0.05 | | | | | | | +----------------+------------+----------------------+------+------+------+------+------+ - * AB: Normally refers to AB for the proband, except when a threshold for parent(s) is specified for FAIL. + * AB: Proband AB. FAIL criteria also includes threshold for parent(s). - * DP: DP for the proband. + * DP: Proband DP. 
* DR: Defined as DP(proband) / DP(parent(s)). - * GQ: GQ for the proband. - - * AC: Supposed to be the sum of alternate alleles in the proband and parents. This has not been implemented yet due to multiple trios in one family, where an allele might be de novo in a parent and transmitted to a child in the dataset. + * GQ: Proband GQ. :param locus_expr: Variant locus. :param alleles_expr: Variant alleles. It assumes bi-allelic variants, meaning that the matrix table or table should be already split to bi-allelics. - :param proband_expr: Proband genotype info, required fields: GT, DP, GQ, AD, PL. - :param father_expr: Father genotype info, required fields: GT, DP, GQ, AD, PL. - :param mother_expr: Mother genotype info, required fields: GT, DP, GQ, AD, PL. + :param proband_expr: Proband genotype info; required fields: GT, DP, GQ, AD, PL. + :param father_expr: Father genotype info; required fields: GT, DP, GQ, AD, PL. + :param mother_expr: Mother genotype info; required fields: GT, DP, GQ, AD, PL. :param is_xx_expr: Whether the proband is XX. :param freq_prior_expr: Population frequency prior for the variant. :param min_pop_prior: Minimum population frequency prior, default to 100 / 3e7. @@ -1596,8 +1595,7 @@ def get_de_novo_expr( :param high_conf_p: P(de novo) threshold for high confidence, default to 0.99. :param med_conf_p: P(de novo) threshold for medium confidence, default to 0.5. :param low_conf_p: P(de novo) threshold for low confidence, default to 0.2. - - :return: A StructExpression with the de novo status and confidence. + :return: A StructExpression with variant de novo status and confidence of de novo call. 
""" # Determine genomic context diploid_expr, hemi_x_expr, hemi_y_expr = get_copy_state_by_sex( @@ -1630,12 +1628,12 @@ def get_de_novo_expr( is_snp = hl.is_snp(alleles_expr[0], alleles_expr[1]) # Confidence assignment - confidence = ( + confidence_expr = ( hl.case() .when( ( is_snp - & (p_de_novo > 0.99) + & (p_de_novo > high_conf_p) & (proband_ab > high_med_conf_ab) & ( (proband_expr.DP > dp_threshold_snp) @@ -1647,16 +1645,17 @@ def get_de_novo_expr( ) .when((p_de_novo > med_conf_p) & (proband_ab > high_med_conf_ab), "MEDIUM") .when((p_de_novo > low_conf_p) & (proband_ab >= low_conf_ab), "LOW") + # This level (`VERY LOW`) is added to give a confidence level for variants that + # don't fail but don't meet the other thresholds. + # This was not present in Kaitlin's original de novo caller or Hail's de novo method .when( (p_de_novo >= min_de_novo_p), - "VERY LOW", # This is added to give a confidence level for variants that - # don't fail but don't meet the other thresholds for high, medium, - # or low confidence, and it's not in Samocha's original code. 
+ "VERY LOW", ) .or_missing() ) - parent_sum_ad_0 = ( + parent_sum_ad_0_expr = ( hl.case() .when( diploid_expr, (hl.sum(father_expr.AD) == 0) | (hl.sum(mother_expr.AD) == 0) @@ -1666,7 +1665,7 @@ def get_de_novo_expr( .or_missing() ) - fail_max_parent_ab = ( + fail_max_parent_ab_expr = ( hl.case() .when( diploid_expr, @@ -1679,7 +1678,7 @@ def get_de_novo_expr( ) # Fail checks - fail_checks = { + fail_checks_expr = { "min_dp_ratio": dp_ratio < min_dp_ratio, "parent_sum_ad_0": parent_sum_ad_0, "max_parent_ab": fail_max_parent_ab, diff --git a/gnomad/utils/annotations.py b/gnomad/utils/annotations.py index 6def6292d..66f146911 100644 --- a/gnomad/utils/annotations.py +++ b/gnomad/utils/annotations.py @@ -2789,7 +2789,7 @@ def get_copy_state_by_sex( hl.expr.BooleanExpression, hl.expr.BooleanExpression, hl.expr.BooleanExpression ]: """ - Determine the copy state of a variant by its locus and the sex karotype of a sample. + Determine the copy state of a variant by its locus and sample sex karyotype. :param locus_expr: LocusExpression of the variant. 
:param is_xx_expr: BooleanExpression indicating whether the sample has an XX sex diff --git a/tests/sample_qc/test_de_novo.py b/tests/sample_qc/test_de_novo.py index 7509f434a..69a9a6da3 100644 --- a/tests/sample_qc/test_de_novo.py +++ b/tests/sample_qc/test_de_novo.py @@ -83,10 +83,10 @@ def test_get_freq_prior( hl.eval(expr) # Hail will throw an error here else: expr = get_freq_prior(hl.literal(freq_prior), min_pop_prior) - result = hl.eval(expr) # Evaluate the expression + result = hl.eval(expr) assert result == pytest.approx( expected, rel=1e-6 - ) # Compare floating point values safely + ) @pytest.mark.parametrize( "pl_input, expected", From 72525836b6da1e540f468dbb93fe40ae05df3943 Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Wed, 5 Feb 2025 11:49:25 -0500 Subject: [PATCH 20/56] Reformat docstring and move helper functions --- gnomad/sample_qc/relatedness.py | 159 +++++++++++++++++--------------- tests/sample_qc/test_de_novo.py | 4 +- 2 files changed, 87 insertions(+), 76 deletions(-) diff --git a/gnomad/sample_qc/relatedness.py b/gnomad/sample_qc/relatedness.py index 8afcd901c..c7d3395fb 100644 --- a/gnomad/sample_qc/relatedness.py +++ b/gnomad/sample_qc/relatedness.py @@ -1303,47 +1303,6 @@ def _get_alt_count(locus, gt, is_female): return sib_stats -def get_freq_prior(freq_prior_expr: hl.expr.Float64Expression, min_pop_prior=100 / 3e7): - """ - Get the population frequency prior for a de novo mutation. - - :param freq_prior_expr: The population frequency prior for the variant. - :param min_pop_prior: The minimum population frequency prior. Default is - 100/3e7, taken from Kaitlin Samocha's [de novo caller](https://github.com/ksamocha/de_novo_scripts). 
- """ - return hl.max( - hl.or_else( - hl.case() - .when((freq_prior_expr >= 0) & (freq_prior_expr <= 1), freq_prior_expr) - .or_error( - hl.format( - "de_novo: expect 0 <= freq_prior_expr <= 1, found %.3e", - freq_prior_expr, - ) - ), - 0.0, - ), - min_pop_prior, - ) - - -def transform_pl_to_pp(pl_expr: hl.expr.ArrayExpression) -> hl.expr.ArrayExpression: - r""" - Transform the Phred-scaled likelihoods (PL) into the probability of observing each genotype (PP). - - .. note:: - The Phred-scaled likelihoods (PL) are transformed back into probabilities (PP) - using the relationship: - - .. math:: - PL = -10 \\times \\log_{10}{P(\\text{Genotype} | \\text{Data})} - - :param pl_expr: ArrayExpression of PL values. - :return: ArrayExpression of the probability of observing each genotype (PP). - """ - return hl.bind(lambda x: x / hl.sum(x), 10 ** (-pl_expr / 10)) - - def calculate_de_novo_post_prob( proband_pl: hl.expr.ArrayExpression, father_pl: hl.expr.ArrayExpression, @@ -1362,77 +1321,83 @@ def calculate_de_novo_post_prob( frequency prior for the variant. It's based on Kaitlin Samocha's [de novo caller]( https://github.com/ksamocha/de_novo_scripts) and Hail's [de_novo]( https://hail.is/docs/0.2/methods/genetics.html#hail.methods.de_novo) - method. Please refer to these sources for more information on the de novo model. - - Neither Kaitlin's de novo caller nor Hail's de novo method provide a clear description on how - to calculate for de novo calls for hemizygous genotypes in XY individuals. These equations are included below. + method. Please refer to these sources for more information on the de novo model. + Neither Kaitlin's de novo caller nor Hail's de novo method provide a clear + description on how to calculate for de novo calls for hemizygous genotypes in XY + individuals. These equations are included below: .. 
math:: - P_{dn} = \\frac{P(DN | \\text{data})}{P(DN | \\text{data}) + P(\\text{missed het in parent(s)} | \\text{data})} + + P_{dn} = \frac{P(DN \mid \text{data})}{P(DN \mid \text{data}) + P(\text{missed het in parent(s)} \mid \text{data})} The terms are defined as follows: - **Probability of a de novo mutation given the data** (:math:`P(DN | \text{data})`): + - :math:`P(DN \mid \text{data})` is the probability that the variant is **de novo**, given the observed genotype data. + + - :math:`P(\text{missed het in parent(s)} \mid \text{data})` is the probability that the heterozygous variant was **missed in the parent(s)**. + + Applying Bayes' theorem to the numerator and denominator yields: .. math:: - P(DN | \\text{data}) = P(\\text{data} | DN) \\times P(DN) + + P_{dn} = \frac{P(\text{data} \mid DN) \, P(DN)}{P(\text{data} \mid DN) \, P(DN) + P(\text{data} \mid \text{missed het in parent(s)}) \, P(\text{missed het in parent(s)})} where: - - :math:`P(\\text{data} | DN)`: Probability of observing the data under the assumption of a de novo mutation. + - :math:`P(\text{data} \mid DN)`: Probability of observing the data under the assumption of a de novo mutation. - **Autosomes and PAR regions**: .. math:: - P(\\text{data} | DN) = P(\\text{hom\\_ref in father}) \\times P(\\text{hom\\_ref in mother}) \\times P(\\text{het in proband}) - **Probability of a de novo mutation given the data for hemizygous calls in XY individuals** - - **X non-PAR regions (XY only)**: + P(\text{data} \mid DN) = P(\text{hom_ref in father}) \, P(\text{hom_ref in mother}) \, P(\text{het in proband}) + + - **X non-PAR regions (XY only)**: .. math:: - P(\\text{data} | DN) = P(\\text{hom\\_ref in mother}) \\times P(\\text{hom\\_alt in proband}) + + P(\text{data} \mid DN) = P(\text{hom_ref in mother}) \, P(\text{hom_alt in proband}) - **Y non-PAR regions (XY only)**: .. 
math:: - P(\\text{data} | DN) = P(\\text{hom\\_ref in father}) \\times P(\\text{hom\\_alt in proband}) + + P(\text{data} \mid DN) = P(\text{hom_ref in father}) \, P(\text{hom_alt in proband}) - :math:`P(DN)`: The prior probability of a de novo mutation from literature, defined as: .. math:: - P(DN) = \\frac{1}{3 \\times 10^7} - **Probability of missed heterozygous parent(s) given the data** (:math:`P(\text{missed het in parent(s)} | \text{data})`): + P(DN) = \frac{1}{3 \times 10^7} - .. math:: - P(\\text{missed het in parent(s)} | \\text{data}) = P(\\text{data} | \\text{at least one parent is het}) \\times P(\\text{one parent is het}) - - where: - - - :math:`P(\\text{data} | \\text{missed het in parent(s)})`: Probability of observing the data under the assumption of a missed het in a parent. + - :math:`P(\text{data} \mid \text{missed het in parent(s)})`: Probability of observing the data under the assumption of a missed het in a parent. - **Autosomes and PAR regions**: .. math:: - P(\\text{data} | \\text{missed het in parents}) = \\left( P(\\text{het in father}) \\times P(\\text{hom\\_ref in mother}) + P(\\text{hom\\_ref in father}) \\times P(\\text{het in mother}) \\right) \\times P(\\text{het in proband}) + + P(\text{data} \mid \text{missed het in parents}) = ( P(\text{het in father}) \times P(\text{hom_ref in mother}) + P(\text{hom_ref in father}) \times P(\text{het in mother})) \times P(\text{het in proband}) - **X non-PAR regions**: .. math:: - P(\\text{data} | \\text{missed het in mother}) = (P(\\text{het in mother}) + P(\\text{hom\\_var in mother})) \\times P(\\text{hom\\_var in proband}) + + P(\text{data} \mid \text{missed het in mother}) = (P(\text{het in mother}) + P(\text{hom_alt in mother})) \times P(\text{hom_alt in proband}) - **Y non-PAR regions**: .. 
math:: - P(\\text{data} | \\text{missed het in father}) = (P(\\text{het in father}) + P(\\text{hom\\_var in father})) \\times P(\\text{hom\\_var in proband}) - **Prior probability for at least one heterozygous parent**: + P(\text{data} \mid \text{missed het in father}) = (P(\text{het in father}) + P(\text{hom_alt in father})) \times P(\text{hom_alt in proband}) + + - :math:`P(\text{missed het in parent(s)})` equals the **probability for at least one heterozygous parent**: .. math:: - P(\\text{het in one parent}) = 1 - (1 - \\text{freq\\_prior})^4 - where :math:`\\text{freq\\_prior}` is the population frequency prior for the variant. + P(\text{het in one parent}) = 1 - (1 - \text{freq_prior})^4 + + where :math:`\text{freq_prior}` is the population frequency prior for the variant. :param proband_pl: Phred-scaled genotype likelihoods for the proband. :param father_pl: Phred-scaled genotype likelihoods for the father. @@ -1444,17 +1409,63 @@ def calculate_de_novo_post_prob( :param de_novo_prior: Prior probability of a de novo mutation (default: 1/3e7). :return: Posterior probability of a de novo mutation (`P_dn`). """ + + def _get_freq_prior( + freq_prior_expr: hl.expr.Float64Expression, min_pop_prior=100 / 3e7 + ): + """ + Get the population frequency prior for a de novo mutation. + + :param freq_prior_expr: The population frequency prior for the variant. + :param min_pop_prior: The minimum population frequency prior. Default is + 100/3e7, taken from Kaitlin Samocha's [de novo caller](https://github.com/ksamocha/de_novo_scripts). 
+ """ + return hl.max( + hl.or_else( + hl.case() + .when((freq_prior_expr >= 0) & (freq_prior_expr <= 1), freq_prior_expr) + .or_error( + hl.format( + "de_novo: expect 0 <= freq_prior_expr <= 1, found %.3e", + freq_prior_expr, + ) + ), + 0.0, + ), + min_pop_prior, + ) + + def _transform_pl_to_pp( + pl_expr: hl.expr.ArrayExpression, + ) -> hl.expr.ArrayExpression: + r""" + Transform the Phred-scaled likelihoods (PL) into the probability of observing each genotype (PP). + + .. note:: + The Phred-scaled likelihoods (PL) are converted back into conditional genotype + probabilities (PP) given the data, as computed by HaplotypeCaller, using the + following relationship: + + .. math:: + + {PL} = -10 \times \log_{10}{P(\text{Genotype} \mid \text{Data})} + + :param pl_expr: ArrayExpression of PL values. + :return: ArrayExpression of the probability of observing each genotype (PP). + """ + return hl.bind(lambda x: x / hl.sum(x), 10 ** (-pl_expr / 10)) + # Ensure valid genomic context diploid_expr, hemi_x, hemi_y = get_copy_state_by_sex(locus_expr, is_xx_expr) # Adjust frequency prior - freq_prior_expr = get_freq_prior(freq_prior_expr, min_pop_prior) + freq_prior_expr = _get_freq_prior(freq_prior_expr, min_pop_prior) prior_one_parent_het = 1 - (1 - freq_prior_expr) ** 4 # Convert PL to probabilities - pp_proband = transform_pl_to_pp(proband_pl) - pp_father = transform_pl_to_pp(father_pl) - pp_mother = transform_pl_to_pp(mother_pl) + pp_proband = _transform_pl_to_pp(proband_pl) + pp_father = _transform_pl_to_pp(father_pl) + pp_mother = _transform_pl_to_pp(mother_pl) # Compute `P(data | DN)` prob_data_given_dn_expr = ( @@ -1485,7 +1496,9 @@ def calculate_de_novo_post_prob( # Calculate posterior probability of de novo mutation prob_dn_given_data_expr = prob_data_given_dn_expr * de_novo_prior - p_dn_expr = prob_dn_given_data_expr / (prob_dn_given_data_expr + prob_data_missed_het_expr) + p_dn_expr = prob_dn_given_data_expr / ( + prob_dn_given_data_expr + prob_data_missed_het_expr 
+ ) return p_dn_expr diff --git a/tests/sample_qc/test_de_novo.py b/tests/sample_qc/test_de_novo.py index 69a9a6da3..f3c2208ef 100644 --- a/tests/sample_qc/test_de_novo.py +++ b/tests/sample_qc/test_de_novo.py @@ -84,9 +84,7 @@ def test_get_freq_prior( else: expr = get_freq_prior(hl.literal(freq_prior), min_pop_prior) result = hl.eval(expr) - assert result == pytest.approx( - expected, rel=1e-6 - ) + assert result == pytest.approx(expected, rel=1e-6) @pytest.mark.parametrize( "pl_input, expected", From 90ab407163a11fca1f1c14ab6df895b5003d2d1b Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Wed, 5 Feb 2025 12:40:32 -0500 Subject: [PATCH 21/56] Correct the confidence errors --- gnomad/sample_qc/relatedness.py | 126 +++++++++++++++++--------------- 1 file changed, 69 insertions(+), 57 deletions(-) diff --git a/gnomad/sample_qc/relatedness.py b/gnomad/sample_qc/relatedness.py index c7d3395fb..d89b6d579 100644 --- a/gnomad/sample_qc/relatedness.py +++ b/gnomad/sample_qc/relatedness.py @@ -1304,9 +1304,9 @@ def _get_alt_count(locus, gt, is_female): def calculate_de_novo_post_prob( - proband_pl: hl.expr.ArrayExpression, - father_pl: hl.expr.ArrayExpression, - mother_pl: hl.expr.ArrayExpression, + proband_pl_expr: hl.expr.ArrayExpression, + father_pl_expr: hl.expr.ArrayExpression, + mother_pl_expr: hl.expr.ArrayExpression, locus_expr: hl.expr.LocusExpression, is_xx_expr: hl.expr.BooleanExpression, freq_prior_expr: hl.expr.Float64Expression, @@ -1353,6 +1353,7 @@ def calculate_de_novo_post_prob( P(\text{data} \mid DN) = P(\text{hom_ref in father}) \, P(\text{hom_ref in mother}) \, P(\text{het in proband}) + **Probability of a de novo mutation given the data for hemizygous calls in XY individuals** - **X non-PAR regions (XY only)**: .. math:: @@ -1399,9 +1400,9 @@ def calculate_de_novo_post_prob( where :math:`\text{freq_prior}` is the population frequency prior for the variant. 
- :param proband_pl: Phred-scaled genotype likelihoods for the proband. - :param father_pl: Phred-scaled genotype likelihoods for the father. - :param mother_pl: Phred-scaled genotype likelihoods for the mother. + :param proband_pl_expr: Phred-scaled genotype likelihoods for the proband. + :param father_pl_expr: Phred-scaled genotype likelihoods for the father. + :param mother_pl_expr: Phred-scaled genotype likelihoods for the mother. :param locus_expr: LocusExpression of the variant. :param is_xx_expr: BooleanExpression indicating whether the proband has XX sex karyotype. :param freq_prior_expr: Population frequency prior for the variant. @@ -1463,9 +1464,9 @@ def _transform_pl_to_pp( prior_one_parent_het = 1 - (1 - freq_prior_expr) ** 4 # Convert PL to probabilities - pp_proband = _transform_pl_to_pp(proband_pl) - pp_father = _transform_pl_to_pp(father_pl) - pp_mother = _transform_pl_to_pp(mother_pl) + pp_proband = _transform_pl_to_pp(proband_pl_expr) + pp_father = _transform_pl_to_pp(father_pl_expr) + pp_mother = _transform_pl_to_pp(mother_pl_expr) # Compute `P(data | DN)` prob_data_given_dn_expr = ( @@ -1558,25 +1559,23 @@ def get_de_novo_expr( low_conf_ab: float = 0.2, high_conf_p: float = 0.99, med_conf_p: float = 0.5, - low_conf_p: float = 0.2, ) -> hl.expr.StructExpression: """ Get the de novo status of a variant based on the proband and parent genotypes. 
Thresholds: - +----------------+------------+----------------------+------+------+------+------+------+ - | Category | P(de novo) | AB* | AD* | DP* | DR* | GQ* | AC* | - +----------------+------------+----------------------+------+------+------+------+------+ - | FAIL | < 0.05 | AB(parents) > 0.05 | 0 | | <0.1 | <20 | | - | | | OR AB(proband) < 0.2 | | | | | | - | HIGH (Indel) | > 0.99 | > 0.3 | | | | | =1 | - | HIGH (SNV) 1 | > 0.99 | > 0.3 | | >10 | | | | - | HIGH (SNV) 2 | > 0.5 | > 0.3 | | | >0.2 | | <10 | - | MEDIUM | > 0.5 | > 0.3 | | | | | | - | LOW | > 0.2 | >= 0.2 | | | | | | - | VERY LOW | >= 0.05 | | | | | | | - +----------------+------------+----------------------+------+------+------+------+------+ + +----------------+------------+----------------------+------+------+------+------+ + | Category | P(de novo) | AB | AD | DP | DR | GQ | + +----------------+------------+----------------------+------+------+------+------+ + | FAIL | < 0.05 | AB(parents) > 0.05 | 0 | | <0.1 | <20 | + | | | OR AB(proband) < 0.2 | | | | | + | HIGH (Indel) | > 0.99 | > 0.3 | | | | | + | HIGH (SNV) 1 | > 0.99 | > 0.3 | | | >0.2 | | + | HIGH (SNV) 2 | > 0.5 | > 0.3 | | >10 | | | + | MEDIUM | > 0.5 | > 0.3 | | | | | + | LOW | >= 0.05 | >= 0.2 | | | | | + +----------------+------------+----------------------+------+------+------+------+ * AB: Proband AB. FAIL criteria also includes threshold for parent(s). @@ -1586,6 +1585,18 @@ def get_de_novo_expr( * GQ: Proband GQ. + .. note:: + + The simplified version is the same as Hail's methods when using the + `ignore_in_sample_allele_frequency` parameter. The main difference is that + this mode should be used when families larger than a single trio are in the + dataset, in which an allele might be de novo in a parent and transmitted to a + child in the dataset. 
This mode will not consider the allele count (AC) in + the dataset, and will only consider the Phred-scaled likelihoods (PL) of the + child and parents, allele balance (AB) of the child and parents, + the genotype quality (GQ) of the child, the depth (DP) of the child and + parents, and the population frequency prior. + :param locus_expr: Variant locus. :param alleles_expr: Variant alleles. It assumes bi-allelic variants, meaning that the matrix table or table should be already split to bi-allelics. @@ -1594,20 +1605,19 @@ def get_de_novo_expr( :param mother_expr: Mother genotype info; required fields: GT, DP, GQ, AD, PL. :param is_xx_expr: Whether the proband is XX. :param freq_prior_expr: Population frequency prior for the variant. - :param min_pop_prior: Minimum population frequency prior, default to 100 / 3e7. - :param de_novo_prior: Prior probability of a de novo mutation, default to 1 / 3e7. - :param min_dp_ratio: Minimum depth ratio for proband to parents, default to 0.1. - :param min_gq: Minimum genotype quality for the proband, default to 20. - :param min_proband_ab: Minimum allele balance for the proband, default to 0.2. - :param max_parent_ab: Maximum allele balance for parents, default to 0.05. - :param min_de_novo_p: Minimum probability for variant to be called de novo, default to 0.05. - :param high_conf_dp_ratio: DP ratio threshold of proband DP to combined DP in parents for high confidence, default to 0.2. - :param dp_threshold_snp: Minimum depth for high-confidence SNPs, default to 10. - :param high_med_conf_ab: AB threshold for high/medium confidence, default to 0.3. - :param low_conf_ab: AB threshold for low confidence, default to 0.2. - :param high_conf_p: P(de novo) threshold for high confidence, default to 0.99. - :param med_conf_p: P(de novo) threshold for medium confidence, default to 0.5. - :param low_conf_p: P(de novo) threshold for low confidence, default to 0.2. + :param min_pop_prior: Minimum population frequency prior. 
Default is 100 / 3e7. + :param de_novo_prior: Prior probability of a de novo mutation. Default is 1 / 3e7. + :param min_dp_ratio: Minimum depth ratio for proband to parents. Default is 0.1. + :param min_gq: Minimum genotype quality for the proband. Default is 20. + :param min_proband_ab: Minimum allele balance for the proband. Default is 0.2. + :param max_parent_ab: Maximum allele balance for parents. Default is 0.05. + :param min_de_novo_p: Minimum probability for variant to be called de novo. Default is 0.05. + :param high_conf_dp_ratio: DP ratio threshold of proband DP to combined DP in parents for high confidence. Default is 0.2. + :param dp_threshold_snp: Minimum depth for high-confidence SNPs. Default is 10. + :param high_med_conf_ab: AB threshold for high/medium confidence. Default is 0.3. + :param low_conf_ab: AB threshold for low confidence. Default is 0.2. + :param high_conf_p: P(de novo) threshold for high confidence. Default is 0.99. + :param med_conf_p: P(de novo) threshold for medium confidence. Default is 0.5. :return: A StructExpression with variant de novo status and confidence of de novo call. 
""" # Determine genomic context @@ -1645,26 +1655,28 @@ def get_de_novo_expr( hl.case() .when( ( - is_snp - & (p_de_novo > high_conf_p) - & (proband_ab > high_med_conf_ab) - & ( - (proband_expr.DP > dp_threshold_snp) - | (dp_ratio > high_conf_dp_ratio) + ( + is_snp + & (p_de_novo > high_conf_p) + & (proband_ab > high_med_conf_ab) + & (dp_ratio > high_conf_dp_ratio) ) - ) - | (~is_snp & (p_de_novo > high_conf_p) & (proband_ab > high_med_conf_ab)), + | ( + is_snp + & (p_de_novo > med_conf_p) + & (proband_ab > high_med_conf_ab) + & (proband_expr.DP > dp_threshold_snp) + ) + | ( + ~is_snp + & (p_de_novo > high_conf_p) + & (proband_ab > high_med_conf_ab) + ) + ), "HIGH", ) .when((p_de_novo > med_conf_p) & (proband_ab > high_med_conf_ab), "MEDIUM") - .when((p_de_novo > low_conf_p) & (proband_ab >= low_conf_ab), "LOW") - # This level (`VERY LOW`) is added to give a confidence level for variants that - # don't fail but don't meet the other thresholds. - # This was not present in Kaitlin's original de novo caller or Hail's de novo method - .when( - (p_de_novo >= min_de_novo_p), - "VERY LOW", - ) + .when((p_de_novo >= min_de_novo_p) & (proband_ab >= low_conf_ab), "LOW") .or_missing() ) @@ -1693,18 +1705,18 @@ def get_de_novo_expr( # Fail checks fail_checks_expr = { "min_dp_ratio": dp_ratio < min_dp_ratio, - "parent_sum_ad_0": parent_sum_ad_0, - "max_parent_ab": fail_max_parent_ab, + "parent_sum_ad_0": parent_sum_ad_0_expr, + "max_parent_ab": fail_max_parent_ab_expr, "min_proband_ab": proband_ab < min_proband_ab, "min_proband_gq": proband_expr.GQ < min_gq, "min_de_novo_p": p_de_novo <= min_de_novo_p, } - fail = hl.any(list(fail_checks.values())) + fail = hl.any(list(fail_checks_expr.values())) result_expr = hl.struct( p_de_novo=hl.if_else(fail, hl.missing(hl.tfloat64), p_de_novo), - confidence=hl.if_else(fail, hl.missing(hl.tstr), confidence), - fail_reason=add_filters_expr(filters=fail_checks), + confidence=hl.if_else(fail, hl.missing(hl.tstr), confidence_expr), + 
fail_reason=add_filters_expr(filters=fail_checks_expr), ) return result_expr From 2816f9c4a7d2e41d68f9f0995d0bcac33cd43d08 Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Wed, 5 Feb 2025 15:06:53 -0500 Subject: [PATCH 22/56] Move is_de_novo --- gnomad/sample_qc/relatedness.py | 87 +++++++++++++-------------------- 1 file changed, 35 insertions(+), 52 deletions(-) diff --git a/gnomad/sample_qc/relatedness.py b/gnomad/sample_qc/relatedness.py index d89b6d579..50cf7672e 100644 --- a/gnomad/sample_qc/relatedness.py +++ b/gnomad/sample_qc/relatedness.py @@ -1307,8 +1307,9 @@ def calculate_de_novo_post_prob( proband_pl_expr: hl.expr.ArrayExpression, father_pl_expr: hl.expr.ArrayExpression, mother_pl_expr: hl.expr.ArrayExpression, - locus_expr: hl.expr.LocusExpression, - is_xx_expr: hl.expr.BooleanExpression, + diploid_expr: hl.expr.BooleanExpression, + hemi_x_expr: hl.expr.BooleanExpression, + hemi_y_expr: hl.expr.BooleanExpression, freq_prior_expr: hl.expr.Float64Expression, min_pop_prior: Optional[float] = 100 / 3e7, de_novo_prior: Optional[float] = 1 / 3e7, @@ -1403,8 +1404,9 @@ def calculate_de_novo_post_prob( :param proband_pl_expr: Phred-scaled genotype likelihoods for the proband. :param father_pl_expr: Phred-scaled genotype likelihoods for the father. :param mother_pl_expr: Phred-scaled genotype likelihoods for the mother. - :param locus_expr: LocusExpression of the variant. - :param is_xx_expr: BooleanExpression indicating whether the proband has XX sex karyotype. + :param diploid_expr: Boolean expression indicating a diploid genotype. + :param hemi_x_expr: Boolean expression indicating a hemizygous genotype on the X chromosome. + :param hemi_y_expr: Boolean expression indicating a hemizygous genotype on the Y chromosome. :param freq_prior_expr: Population frequency prior for the variant. :param min_pop_prior: Minimum population frequency prior (default: 100/3e7). 
:param de_novo_prior: Prior probability of a de novo mutation (default: 1/3e7). @@ -1412,7 +1414,7 @@ def calculate_de_novo_post_prob( """ def _get_freq_prior( - freq_prior_expr: hl.expr.Float64Expression, min_pop_prior=100 / 3e7 + freq_prior: hl.expr.Float64Expression, min_prior=100 / 3e7 ): """ Get the population frequency prior for a de novo mutation. @@ -1424,16 +1426,16 @@ def _get_freq_prior( return hl.max( hl.or_else( hl.case() - .when((freq_prior_expr >= 0) & (freq_prior_expr <= 1), freq_prior_expr) + .when((freq_prior >= 0) & (freq_prior <= 1), freq_prior) .or_error( hl.format( "de_novo: expect 0 <= freq_prior_expr <= 1, found %.3e", - freq_prior_expr, + freq_prior, ) ), 0.0, ), - min_pop_prior, + min_prior, ) def _transform_pl_to_pp( @@ -1456,9 +1458,6 @@ def _transform_pl_to_pp( """ return hl.bind(lambda x: x / hl.sum(x), 10 ** (-pl_expr / 10)) - # Ensure valid genomic context - diploid_expr, hemi_x, hemi_y = get_copy_state_by_sex(locus_expr, is_xx_expr) - # Adjust frequency prior freq_prior_expr = _get_freq_prior(freq_prior_expr, min_pop_prior) prior_one_parent_het = 1 - (1 - freq_prior_expr) ** 4 @@ -1471,8 +1470,8 @@ def _transform_pl_to_pp( # Compute `P(data | DN)` prob_data_given_dn_expr = ( hl.case() - .when(hemi_x, pp_mother[0] * pp_proband[2]) - .when(hemi_y, pp_father[0] * pp_proband[2]) + .when(hemi_x_expr, pp_mother[0] * pp_proband[2]) + .when(hemi_y_expr, pp_father[0] * pp_proband[2]) .when(diploid_expr, pp_father[0] * pp_mother[0] * pp_proband[1]) .or_missing() ) @@ -1481,10 +1480,12 @@ def _transform_pl_to_pp( prob_data_missed_het_expr = ( hl.case() .when( - hemi_x, (pp_mother[1] + pp_mother[2]) * pp_proband[2] * prior_one_parent_het + hemi_x_expr, (pp_mother[1] + pp_mother[2]) * pp_proband[2] * + prior_one_parent_het ) .when( - hemi_y, (pp_father[1] + pp_father[2]) * pp_proband[2] * prior_one_parent_het + hemi_y_expr, (pp_father[1] + pp_father[2]) * pp_proband[2] * + prior_one_parent_het ) .when( diploid_expr, @@ -1503,39 +1504,6 @@ def 
_transform_pl_to_pp( return p_dn_expr -def call_de_novo( - locus_expr: hl.expr.LocusExpression, - proband_expr: hl.expr.StructExpression, - father_expr: hl.expr.StructExpression, - mother_expr: hl.expr.StructExpression, - is_xx_expr: hl.expr.BooleanExpression, -) -> hl.expr.BooleanExpression: - """ - Call a de novo mutation based on the proband and parent genotypes. - - :param locus_expr: Variant locus. - :param proband_expr: Proband genotype info, required field: GT. - :param father_expr: Father genotype info, required field: GT. - :param mother_expr: Mother genotype info, required field: GT. - :param is_xx_expr: Whether the proband is XX. - :return: BooleanExpression indicating whether the variant is a de novo mutation. - """ - # Ensure valid genomic context - diploid_expr, hemi_x_expr, hemi_y_expr = get_copy_state_by_sex( - locus_expr, is_xx_expr - ) - - is_de_novo = ( - diploid_expr - & ( - proband_expr.GT.is_het() - & father_expr.GT.is_hom_ref() - & mother_expr.GT.is_hom_ref() - ) - | hemi_x_expr & (proband_expr.GT.is_hom_var() & mother_expr.GT.is_hom_ref()) - | hemi_y_expr & (proband_expr.GT.is_hom_var() & father_expr.GT.is_hom_ref()) - ) - return is_de_novo def get_de_novo_expr( @@ -1621,16 +1589,19 @@ def get_de_novo_expr( :return: A StructExpression with variant de novo status and confidence of de novo call. 
""" # Determine genomic context - diploid_expr, hemi_x_expr, hemi_y_expr = get_copy_state_by_sex( - locus_expr, is_xx_expr + diploid_expr = locus_expr.in_autosome_or_par() | ( + locus_expr.in_x_nonpar() & is_xx_expr ) + hemi_x_expr = locus_expr.in_x_nonpar() & ~is_xx_expr + hemi_y_expr = locus_expr.in_y_nonpar() & ~is_xx_expr p_de_novo = calculate_de_novo_post_prob( proband_expr.PL, father_expr.PL, mother_expr.PL, - locus_expr, - is_xx_expr, + diploid_expr, + hemi_x_expr, + hemi_y_expr, freq_prior_expr, min_pop_prior=min_pop_prior, de_novo_prior=de_novo_prior, @@ -1650,6 +1621,17 @@ def get_de_novo_expr( proband_ab = proband_expr.AD[1] / hl.sum(proband_expr.AD) is_snp = hl.is_snp(alleles_expr[0], alleles_expr[1]) + is_de_novo = ( + diploid_expr + & ( + proband_expr.GT.is_het() + & father_expr.GT.is_hom_ref() + & mother_expr.GT.is_hom_ref() + ) + | hemi_x_expr & (proband_expr.GT.is_hom_var() & mother_expr.GT.is_hom_ref()) + | hemi_y_expr & (proband_expr.GT.is_hom_var() & father_expr.GT.is_hom_ref()) + ) + # Confidence assignment confidence_expr = ( hl.case() @@ -1714,6 +1696,7 @@ def get_de_novo_expr( fail = hl.any(list(fail_checks_expr.values())) result_expr = hl.struct( + is_de_novo=is_de_novo, p_de_novo=hl.if_else(fail, hl.missing(hl.tfloat64), p_de_novo), confidence=hl.if_else(fail, hl.missing(hl.tstr), confidence_expr), fail_reason=add_filters_expr(filters=fail_checks_expr), From 50f4745f8853ebd6d9ed52b32d0fe84f05e01431 Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Wed, 5 Feb 2025 16:12:30 -0500 Subject: [PATCH 23/56] reST hyperlink format in docstring --- gnomad/sample_qc/relatedness.py | 77 ++++++++++++++++++--------------- 1 file changed, 42 insertions(+), 35 deletions(-) diff --git a/gnomad/sample_qc/relatedness.py b/gnomad/sample_qc/relatedness.py index 50cf7672e..a494ae05f 100644 --- a/gnomad/sample_qc/relatedness.py +++ b/gnomad/sample_qc/relatedness.py @@ -1319,9 +1319,8 @@ def calculate_de_novo_post_prob( This 
function computes the posterior probability of a de novo mutation (P_dn) using the likelihoods of the proband's and parents' genotypes and the population - frequency prior for the variant. It's based on Kaitlin Samocha's [de novo caller]( - https://github.com/ksamocha/de_novo_scripts) and Hail's [de_novo]( - https://hail.is/docs/0.2/methods/genetics.html#hail.methods.de_novo) + frequency prior for the variant. It's based on Kaitlin Samocha's `de novo caller `_ + and Hail's `de_novo `_ method. Please refer to these sources for more information on the de novo model. Neither Kaitlin's de novo caller nor Hail's de novo method provide a clear @@ -1355,6 +1354,7 @@ def calculate_de_novo_post_prob( P(\text{data} \mid DN) = P(\text{hom_ref in father}) \, P(\text{hom_ref in mother}) \, P(\text{het in proband}) **Probability of a de novo mutation given the data for hemizygous calls in XY individuals** + - **X non-PAR regions (XY only)**: .. math:: @@ -1413,14 +1413,12 @@ def calculate_de_novo_post_prob( :return: Posterior probability of a de novo mutation (`P_dn`). """ - def _get_freq_prior( - freq_prior: hl.expr.Float64Expression, min_prior=100 / 3e7 - ): + def _get_freq_prior(freq_prior: hl.expr.Float64Expression, min_prior=100 / 3e7): """ Get the population frequency prior for a de novo mutation. - :param freq_prior_expr: The population frequency prior for the variant. - :param min_pop_prior: The minimum population frequency prior. Default is + :param freq_prior: The population frequency prior for the variant. + :param min_prior: The minimum population frequency prior. Default is 100/3e7, taken from Kaitlin Samocha's [de novo caller](https://github.com/ksamocha/de_novo_scripts). 
""" return hl.max( @@ -1480,12 +1478,12 @@ def _transform_pl_to_pp( prob_data_missed_het_expr = ( hl.case() .when( - hemi_x_expr, (pp_mother[1] + pp_mother[2]) * pp_proband[2] * - prior_one_parent_het + hemi_x_expr, + (pp_mother[1] + pp_mother[2]) * pp_proband[2] * prior_one_parent_het, ) .when( - hemi_y_expr, (pp_father[1] + pp_father[2]) * pp_proband[2] * - prior_one_parent_het + hemi_y_expr, + (pp_father[1] + pp_father[2]) * pp_proband[2] * prior_one_parent_het, ) .when( diploid_expr, @@ -1504,9 +1502,7 @@ def _transform_pl_to_pp( return p_dn_expr - - -def get_de_novo_expr( +def default_get_de_novo_expr( locus_expr: hl.expr.LocusExpression, alleles_expr: hl.expr.ArrayExpression, proband_expr: hl.expr.StructExpression, @@ -1533,20 +1529,27 @@ def get_de_novo_expr( Thresholds: - +----------------+------------+----------------------+------+------+------+------+ - | Category | P(de novo) | AB | AD | DP | DR | GQ | - +----------------+------------+----------------------+------+------+------+------+ - | FAIL | < 0.05 | AB(parents) > 0.05 | 0 | | <0.1 | <20 | - | | | OR AB(proband) < 0.2 | | | | | - | HIGH (Indel) | > 0.99 | > 0.3 | | | | | - | HIGH (SNV) 1 | > 0.99 | > 0.3 | | | >0.2 | | - | HIGH (SNV) 2 | > 0.5 | > 0.3 | | >10 | | | - | MEDIUM | > 0.5 | > 0.3 | | | | | - | LOW | >= 0.05 | >= 0.2 | | | | | - +----------------+------------+----------------------+------+------+------+------+ + +----------------+------------+-----------------------+------+-----+------+-----+ + | Category | P(de novo) | AB | AD | DP | DR | GQ | + +================+============+=======================+======+=====+======+=====+ + | FAIL | < 0.05 | AB(parents) > 0.05 OR | 0 | | <0.1 | <20 | + | | | AB(proband) < 0.2 | | | | | + +----------------+------------+-----------------------+------+-----+------+-----+ + | HIGH (Indel) | > 0.99 | > 0.3 | | | | | + +----------------+------------+-----------------------+------+-----+------+-----+ + | HIGH (SNV) 1 | > 0.99 | > 0.3 | | | >0.2 | | + 
+----------------+------------+-----------------------+------+-----+------+-----+ + | HIGH (SNV) 2 | > 0.5 | > 0.3 | | >10 | | | + +----------------+------------+-----------------------+------+-----+------+-----+ + | MEDIUM | > 0.5 | > 0.3 | | | | | + +----------------+------------+-----------------------+------+-----+------+-----+ + | LOW | >= 0.05 | >= 0.2 | | | | | + +----------------+------------+-----------------------+------+-----+------+-----+ * AB: Proband AB. FAIL criteria also includes threshold for parent(s). + * AD: Parent(s) AD sum. + * DP: Proband DP. * DR: Defined as DP(proband) / DP(parent(s)). @@ -1588,9 +1591,13 @@ def get_de_novo_expr( :param med_conf_p: P(de novo) threshold for medium confidence. Default is 0.5. :return: A StructExpression with variant de novo status and confidence of de novo call. """ + # Check if the alleles are bi-allelic + if hl.len(alleles_expr) != 2: + raise ValueError("Alleles must be bi-allelic, please split multi if it's not.") + # Determine genomic context diploid_expr = locus_expr.in_autosome_or_par() | ( - locus_expr.in_x_nonpar() & is_xx_expr + locus_expr.in_x_nonpar() & is_xx_expr ) hemi_x_expr = locus_expr.in_x_nonpar() & ~is_xx_expr hemi_y_expr = locus_expr.in_y_nonpar() & ~is_xx_expr @@ -1622,14 +1629,14 @@ def get_de_novo_expr( is_snp = hl.is_snp(alleles_expr[0], alleles_expr[1]) is_de_novo = ( - diploid_expr - & ( - proband_expr.GT.is_het() - & father_expr.GT.is_hom_ref() - & mother_expr.GT.is_hom_ref() - ) - | hemi_x_expr & (proband_expr.GT.is_hom_var() & mother_expr.GT.is_hom_ref()) - | hemi_y_expr & (proband_expr.GT.is_hom_var() & father_expr.GT.is_hom_ref()) + diploid_expr + & ( + proband_expr.GT.is_het() + & father_expr.GT.is_hom_ref() + & mother_expr.GT.is_hom_ref() + ) + | hemi_x_expr & (proband_expr.GT.is_hom_var() & mother_expr.GT.is_hom_ref()) + | hemi_y_expr & (proband_expr.GT.is_hom_var() & father_expr.GT.is_hom_ref()) ) # Confidence assignment From bffe5741360e39d67a5b7321198f1c36c61d800f Mon Sep 
17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Thu, 6 Feb 2025 11:28:14 -0500 Subject: [PATCH 24/56] Add test --- gnomad/sample_qc/relatedness.py | 12 +- gnomad/utils/annotations.py | 25 --- tests/sample_qc/test_de_novo.py | 275 +++++++++----------------------- 3 files changed, 80 insertions(+), 232 deletions(-) diff --git a/gnomad/sample_qc/relatedness.py b/gnomad/sample_qc/relatedness.py index a494ae05f..f5216945d 100644 --- a/gnomad/sample_qc/relatedness.py +++ b/gnomad/sample_qc/relatedness.py @@ -8,7 +8,6 @@ import hail as hl import networkx as nx -from gnomad.utils.annotations import get_copy_state_by_sex from gnomad.utils.filtering import add_filters_expr logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s") @@ -1592,8 +1591,11 @@ def default_get_de_novo_expr( :return: A StructExpression with variant de novo status and confidence of de novo call. """ # Check if the alleles are bi-allelic - if hl.len(alleles_expr) != 2: - raise ValueError("Alleles must be bi-allelic, please split multi if it's not.") + alleles_expr = ( + hl.case() + .when(hl.len(alleles_expr) == 2, alleles_expr) + .or_error("Alleles must be bi-allelic, please split multi if it's not.") + ) # Determine genomic context diploid_expr = locus_expr.in_autosome_or_par() | ( @@ -1704,8 +1706,8 @@ def default_get_de_novo_expr( fail = hl.any(list(fail_checks_expr.values())) result_expr = hl.struct( is_de_novo=is_de_novo, - p_de_novo=hl.if_else(fail, hl.missing(hl.tfloat64), p_de_novo), - confidence=hl.if_else(fail, hl.missing(hl.tstr), confidence_expr), + p_de_novo=hl.if_else(~is_de_novo | fail, hl.missing(hl.tfloat64), p_de_novo), + confidence=hl.if_else(~is_de_novo | fail, hl.missing(hl.tstr), confidence_expr), fail_reason=add_filters_expr(filters=fail_checks_expr), ) diff --git a/gnomad/utils/annotations.py b/gnomad/utils/annotations.py index 66f146911..350735a37 100644 --- a/gnomad/utils/annotations.py +++ b/gnomad/utils/annotations.py @@ 
-2780,28 +2780,3 @@ def _create_group_dicts( final_freq_dict["subcohortFrequency"] = list_of_group_info_dicts return final_freq_dict - - -def get_copy_state_by_sex( - locus_expr: hl.expr.LocusExpression, - is_xx_expr: hl.expr.BooleanExpression, -) -> Tuple[ - hl.expr.BooleanExpression, hl.expr.BooleanExpression, hl.expr.BooleanExpression -]: - """ - Determine the copy state of a variant by its locus and sample sex karyotype. - - :param locus_expr: LocusExpression of the variant. - :param is_xx_expr: BooleanExpression indicating whether the sample has an XX sex - karyotype. - :return: A tuple of BooleanExpressions: - - diploid_expr: True if the variant is in autosomes or PAR regions, or in the X non-PAR region for XX individuals. - - hemi_x_expr: True if the variant is in the X non-PAR region for XY individuals. - - hemi_y_expr: True if the variant is in the Y non-PAR region for XY individuals. - """ - diploid_expr = locus_expr.in_autosome_or_par() | ( - locus_expr.in_x_nonpar() & is_xx_expr - ) - hemi_x_expr = locus_expr.in_x_nonpar() & ~is_xx_expr - hemi_y_expr = locus_expr.in_y_nonpar() & ~is_xx_expr - return diploid_expr, hemi_x_expr, hemi_y_expr diff --git a/tests/sample_qc/test_de_novo.py b/tests/sample_qc/test_de_novo.py index f3c2208ef..13b315d0b 100644 --- a/tests/sample_qc/test_de_novo.py +++ b/tests/sample_qc/test_de_novo.py @@ -5,224 +5,95 @@ from gnomad.sample_qc.relatedness import ( calculate_de_novo_post_prob, - call_de_novo, - get_de_novo_expr, - get_freq_prior, - transform_pl_to_pp, + default_get_de_novo_expr, ) -from gnomad.utils.annotations import get_copy_state_by_sex +"""Test suite for de novo mutation functions.""" -class TestDeNovoMutation: - """Test suite for de novo mutation functions.""" - - loci: dict[str, hl.expr.LocusExpression] - - @classmethod - def setup_class(cls): - """Set up common test data for all tests.""" - cls.locus_expr = hl.locus("1", 123456) - cls.alleles_expr = hl.literal(["A", "T"]) - cls.freq_prior_expr = 
hl.literal(0.01) - cls.is_xx_expr = hl.literal(False) +import hail as hl +import pytest - # Mock Genotype Likelihoods (PL) - cls.proband_pl = hl.literal([0, 10, 100]) - cls.father_pl = hl.literal([0, 100, 100]) - cls.mother_pl = hl.literal([0, 100, 100]) +from gnomad.sample_qc.relatedness import ( + calculate_de_novo_post_prob, + default_get_de_novo_expr, +) - # Mock Genotype Calls - cls.proband_expr = hl.struct( - GT=hl.call(0, 1), DP=10, GQ=30, AD=[3, 7], PL=cls.proband_pl - ) - cls.father_expr = hl.struct( - GT=hl.call(0, 0), DP=12, GQ=40, AD=[12, 0], PL=cls.father_pl - ) - cls.mother_expr = hl.struct( - GT=hl.call(0, 0), DP=15, GQ=50, AD=[15, 0], PL=cls.mother_pl - ) - cls.loci = { - "autosomal": hl.locus("chr1", 100000, reference_genome="GRCh38"), - # PAR regions (always diploid) - "par1": hl.locus("chrX", 2781479, reference_genome="GRCh38"), # PAR1 start - "par2": hl.locus("chrX", 155701383, reference_genome="GRCh38"), # PAR2 end - # X non-PAR (diploid for XX, hemizygous for XY) - "x_nonpar": hl.locus("chrX", 3000000, reference_genome="GRCh38"), - # Y non-PAR (hemizygous for XY) - "y_nonpar": hl.locus("chrY", 10000000, reference_genome="GRCh38"), - } +class TestDeNovoMutation: + """Test suite for de novo mutation functions.""" @pytest.mark.parametrize( - "freq_prior, min_pop_prior, expect_error, expected", + "proband_pl, father_pl, mother_pl, diploid, hemi_x, hemi_y, freq_prior, min_pop_prior, expected_p_dn", [ - (0.05, 100 / 3e7, False, 0.05), - (-0.01, 100 / 3e7, True, None), - (1.2, 100 / 3e7, True, None), - (hl.missing(hl.tfloat64), 100 / 3e7, False, 100 / 3e7), - ], + ([73, 0, 161], [0, 99, 198], [0, 99, 198], True, False, False, 1.80e-02, + 100 / 3e7, 0.999), + ([152, 0, 283], [0, 99, 198], [0, 63, 126], True, False, False, 7.55e-02, + 100 / 3e7, 0.198), + ([99, 50, 0], [0, 99, 198], [0, 99, 198], False, True, False, 0.005, + 100 / 3e7, 1), + ([2326, 140, 0], [0, 40, 80], [0, 40, 80], False, False, True, 1.97e-04, + 100 / 3e7, 0.297), + ([99, 50, 0], 
[0, 99, 198], [0, 99, 198], False, True, False, 0.005, + 100 / 3e7, 1), + ([99, 50, 0], [0, 99, 198], [0, 99, 198], False, False, True, 0.005, + 100 / 3e7, 1), + ([2, 0, 230], [0, 0, 0], [0, 0, 0], True, False, False, 2.03e-02, + 100 / 3e7, 0) + ] ) - def test_get_freq_prior( - self, freq_prior, min_pop_prior, expect_error, expected - ) -> None: - """ - Test frequency prior computation. - - :param freq_prior: Frequency prior value. - :param min_pop_prior: Minimum population prior. - :param expect_error: Whether an error is expected. - :param expected: Expected frequency prior. - :return: None. - """ - if expect_error: - with pytest.raises( - hl.utils.java.HailUserError, - match="de_novo: expect 0 <= freq_prior_expr <= 1", - ): - expr = get_freq_prior(hl.literal(freq_prior), min_pop_prior) - hl.eval(expr) # Hail will throw an error here - else: - expr = get_freq_prior(hl.literal(freq_prior), min_pop_prior) - result = hl.eval(expr) - assert result == pytest.approx(expected, rel=1e-6) - @pytest.mark.parametrize( - "pl_input, expected", - [ - ( - [0, 10, 100], - [0.9090909090082644, 0.09090909090082644, 9.090909090082645e-11], - ), - ([0, 0, 0], [0.3333333333333333, 0.3333333333333333, 0.3333333333333333]), - ], - ) - def test_transform_pl_to_pp(self, pl_input, expected) -> None: - """ - Test PL to PP transformation. + def test_calculate_de_novo_post_prob( + self, proband_pl, father_pl, mother_pl, diploid, hemi_x, hemi_y, freq_prior, + min_pop_prior, expected_p_dn + ): + """Test `calculate_de_novo_post_prob` function.""" + # Compute posterior probability of de novo mutation + p_dn_expr = calculate_de_novo_post_prob( + hl.literal(proband_pl), + hl.literal(father_pl), + hl.literal(mother_pl), + hl.literal(diploid), + hl.literal(hemi_x), + hl.literal(hemi_y), + hl.literal(freq_prior), + min_pop_prior, + ) - :param pl_input: Input PL values. - :param expected: Expected PP values. - :return: None. 
- """ - expr = transform_pl_to_pp(hl.literal(pl_input)) - result = hl.eval(expr) + # Assert with floating-point tolerance + assert round(hl.eval(p_dn_expr), 3) == expected_p_dn - assert result == pytest.approx( - expected, abs=1e-12 - ), f"Got {result}, expected {expected}" + def test_default_get_de_novo_expr_fail_conditions(self): + """Test default_get_de_novo_expr with a failing case where multiple fail conditions apply.""" + # Define locus and alleles (Autosomal) + locus = hl.locus("1", 10000) + alleles = hl.literal(["A", "C"]) - @pytest.mark.parametrize( - "locus_key, is_xx, expected_diploid, expected_hemi_x, expected_hemi_y", - [ - ("autosomal", True, True, False, False), - ("autosomal", False, True, False, False), - ("par1", True, True, False, False), - ("par2", False, True, False, False), - ("x_nonpar", True, True, False, False), - ("x_nonpar", False, False, True, False), - ("y_nonpar", True, False, False, False), - ("y_nonpar", False, False, False, True), - ], - ) - def test_get_copy_state_by_sex( - self, locus_key, is_xx, expected_diploid, expected_hemi_x, expected_hemi_y - ) -> None: - """ - Test copy state determination based on locus type and sex. - - :param locus_key: Locus key. - :param is_xx: Whether the individual is XX. - :param expected_diploid: Expected diploid state. - :param expected_hemi_x: Expected hemizygous X state. - :param expected_hemi_y: Expected hemizygous Y state. - :return: None. - """ - locus = self.loci[locus_key] - is_xx_expr = hl.literal(is_xx) - - diploid, hemi_x, hemi_y = get_copy_state_by_sex(locus, is_xx_expr) - result = hl.eval([diploid, hemi_x, hemi_y]) - - assert result == [ - expected_diploid, - expected_hemi_x, - expected_hemi_y, - ], ( - f"Failed for locus={locus}, is_xx={is_xx}. 
Expected" - f" {[expected_diploid, expected_hemi_x, expected_hemi_y]}, got {result}" - ) + # Define proband, father, and mother genotype structures + proband_expr = hl.struct(GT=hl.call(0, 1), AD=[9, 2], DP=11, GQ=2, PL=[2, 0, 230]) + father_expr = hl.struct(GT=hl.call(0, 0), AD=[0, 0], DP=0, GQ=0, PL=[0, 0, 0]) + mother_expr = hl.struct(GT=hl.call(0, 0), AD=[0, 0], DP=0, GQ=0, PL=[0, 0, 0]) - @pytest.mark.parametrize( - "locus_key, proband_gt, father_gt, mother_gt, is_xx, expected", - [ - ("autosomal", (0, 1), (0, 0), (0, 0), False, True), - ("autosomal", (1, 1), (0, 0), (0, 0), False, False), - ("x_nonpar", (1, 1), None, (0, 0), False, True), - ("x_nonpar", (1, 1), (0, 0), None, False, None), - ("y_nonpar", (1, 1), None, None, False, None), - ], - ) - def test_call_de_novo( - self, locus_key, proband_gt, father_gt, mother_gt, is_xx, expected - ) -> None: - """ - Test de novo mutation detection with different loci and parental genotypes. - - :param locus_key: Locus key. - :param proband_gt: Proband genotype. - :param father_gt: Father genotype. - :param mother_gt: Mother genotype. - :param is_xx: Whether the individual is XX. - :param expected: Expected de novo mutation status. - :return: None. 
- """ - locus_expr = self.loci[locus_key] - proband_expr = hl.struct( - GT=hl.call(*proband_gt) if proband_gt else hl.missing(hl.tcall) - ) - father_expr = hl.struct( - GT=hl.call(*father_gt) if father_gt else hl.missing(hl.tcall) - ) - mother_expr = hl.struct( - GT=hl.call(*mother_gt) if mother_gt else hl.missing(hl.tcall) - ) - is_xx_expr = hl.literal(is_xx) + # Population frequency prior + freq_prior_expr = hl.literal(1e-5) + is_xx_expr = hl.literal(True) - expr = call_de_novo( - locus_expr, proband_expr, father_expr, mother_expr, is_xx_expr - ) - result = hl.eval(expr) - - assert ( - result == expected - ), f"Mismatch in {locus_key}: Expected {expected}, got {result}" - - def test_calculate_de_novo_post_prob(self): - """Test posterior probability computation for de novo mutations.""" - expr = calculate_de_novo_post_prob( - self.proband_pl, - self.father_pl, - self.mother_pl, - self.locus_expr, - self.is_xx_expr, - self.freq_prior_expr, + # Compute de novo classification + result_expr = default_get_de_novo_expr( + locus, alleles, proband_expr, father_expr, mother_expr, is_xx_expr, + freq_prior_expr ) - result = hl.eval(expr) - assert 0 <= result <= 1 # Posterior probability should be within valid range - - def test_get_de_novo_expr(self): - """Test the de novo expression struct output.""" - expr = get_de_novo_expr( - self.locus_expr, - self.alleles_expr, - self.proband_expr, - self.father_expr, - self.mother_expr, - self.is_xx_expr, - self.freq_prior_expr, + + # Expected result structure + expected_result_expr = hl.struct( + is_de_novo=True, + p_de_novo=hl.missing(hl.tfloat64), + confidence=hl.missing(hl.tstr), + fail_reason=hl.set({"min_de_novo_p", "min_proband_ab", "min_proband_gq", "parent_sum_ad_0"}), ) - result = hl.eval(expr) - assert "p_de_novo" in result - assert "confidence" in result - assert 0 <= result.p_de_novo <= 1 # Probability must be valid + # Evaluate Hail expressions to convert to Python-native objects + result = hl.eval(result_expr) + 
expected_result = hl.eval(expected_result_expr) + + # Convert fail_reason to set for direct comparison + assert result == expected_result From f9a625b70009f86e070dbdc244dc656bcb4b1064 Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Thu, 6 Feb 2025 11:35:26 -0500 Subject: [PATCH 25/56] Black --- tests/sample_qc/test_de_novo.py | 125 ++++++++++++++++++++++++++------ 1 file changed, 102 insertions(+), 23 deletions(-) diff --git a/tests/sample_qc/test_de_novo.py b/tests/sample_qc/test_de_novo.py index 13b315d0b..7a9006ed2 100644 --- a/tests/sample_qc/test_de_novo.py +++ b/tests/sample_qc/test_de_novo.py @@ -25,26 +25,96 @@ class TestDeNovoMutation: @pytest.mark.parametrize( "proband_pl, father_pl, mother_pl, diploid, hemi_x, hemi_y, freq_prior, min_pop_prior, expected_p_dn", [ - ([73, 0, 161], [0, 99, 198], [0, 99, 198], True, False, False, 1.80e-02, - 100 / 3e7, 0.999), - ([152, 0, 283], [0, 99, 198], [0, 63, 126], True, False, False, 7.55e-02, - 100 / 3e7, 0.198), - ([99, 50, 0], [0, 99, 198], [0, 99, 198], False, True, False, 0.005, - 100 / 3e7, 1), - ([2326, 140, 0], [0, 40, 80], [0, 40, 80], False, False, True, 1.97e-04, - 100 / 3e7, 0.297), - ([99, 50, 0], [0, 99, 198], [0, 99, 198], False, True, False, 0.005, - 100 / 3e7, 1), - ([99, 50, 0], [0, 99, 198], [0, 99, 198], False, False, True, 0.005, - 100 / 3e7, 1), - ([2, 0, 230], [0, 0, 0], [0, 0, 0], True, False, False, 2.03e-02, - 100 / 3e7, 0) - ] + ( + [73, 0, 161], + [0, 99, 198], + [0, 99, 198], + True, + False, + False, + 1.80e-02, + 100 / 3e7, + 0.999, + ), + ( + [152, 0, 283], + [0, 99, 198], + [0, 63, 126], + True, + False, + False, + 7.55e-02, + 100 / 3e7, + 0.198, + ), + ( + [99, 50, 0], + [0, 99, 198], + [0, 99, 198], + False, + True, + False, + 0.005, + 100 / 3e7, + 1, + ), + ( + [2326, 140, 0], + [0, 40, 80], + [0, 40, 80], + False, + False, + True, + 1.97e-04, + 100 / 3e7, + 0.297, + ), + ( + [99, 50, 0], + [0, 99, 198], + [0, 99, 198], + False, + True, + 
False, + 0.005, + 100 / 3e7, + 1, + ), + ( + [99, 50, 0], + [0, 99, 198], + [0, 99, 198], + False, + False, + True, + 0.005, + 100 / 3e7, + 1, + ), + ( + [2, 0, 230], + [0, 0, 0], + [0, 0, 0], + True, + False, + False, + 2.03e-02, + 100 / 3e7, + 0, + ), + ], ) - def test_calculate_de_novo_post_prob( - self, proband_pl, father_pl, mother_pl, diploid, hemi_x, hemi_y, freq_prior, - min_pop_prior, expected_p_dn + self, + proband_pl, + father_pl, + mother_pl, + diploid, + hemi_x, + hemi_y, + freq_prior, + min_pop_prior, + expected_p_dn, ): """Test `calculate_de_novo_post_prob` function.""" # Compute posterior probability of de novo mutation @@ -60,7 +130,7 @@ def test_calculate_de_novo_post_prob( ) # Assert with floating-point tolerance - assert round(hl.eval(p_dn_expr), 3) == expected_p_dn + assert round(hl.eval(p_dn_expr), 3) == expected_p_dn def test_default_get_de_novo_expr_fail_conditions(self): """Test default_get_de_novo_expr with a failing case where multiple fail conditions apply.""" @@ -69,7 +139,9 @@ def test_default_get_de_novo_expr_fail_conditions(self): alleles = hl.literal(["A", "C"]) # Define proband, father, and mother genotype structures - proband_expr = hl.struct(GT=hl.call(0, 1), AD=[9, 2], DP=11, GQ=2, PL=[2, 0, 230]) + proband_expr = hl.struct( + GT=hl.call(0, 1), AD=[9, 2], DP=11, GQ=2, PL=[2, 0, 230] + ) father_expr = hl.struct(GT=hl.call(0, 0), AD=[0, 0], DP=0, GQ=0, PL=[0, 0, 0]) mother_expr = hl.struct(GT=hl.call(0, 0), AD=[0, 0], DP=0, GQ=0, PL=[0, 0, 0]) @@ -79,8 +151,13 @@ def test_default_get_de_novo_expr_fail_conditions(self): # Compute de novo classification result_expr = default_get_de_novo_expr( - locus, alleles, proband_expr, father_expr, mother_expr, is_xx_expr, - freq_prior_expr + locus, + alleles, + proband_expr, + father_expr, + mother_expr, + is_xx_expr, + freq_prior_expr, ) # Expected result structure @@ -88,7 +165,9 @@ def test_default_get_de_novo_expr_fail_conditions(self): is_de_novo=True, p_de_novo=hl.missing(hl.tfloat64), 
confidence=hl.missing(hl.tstr), - fail_reason=hl.set({"min_de_novo_p", "min_proband_ab", "min_proband_gq", "parent_sum_ad_0"}), + fail_reason=hl.set( + {"min_de_novo_p", "min_proband_ab", "min_proband_gq", "parent_sum_ad_0"} + ), ) # Evaluate Hail expressions to convert to Python-native objects From 46538d21e6b5a41e8542c507236ad428e6a9e04c Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Thu, 6 Feb 2025 13:46:47 -0500 Subject: [PATCH 26/56] remove imports --- tests/sample_qc/test_de_novo.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/tests/sample_qc/test_de_novo.py b/tests/sample_qc/test_de_novo.py index 7a9006ed2..d373c4228 100644 --- a/tests/sample_qc/test_de_novo.py +++ b/tests/sample_qc/test_de_novo.py @@ -8,16 +8,6 @@ default_get_de_novo_expr, ) -"""Test suite for de novo mutation functions.""" - -import hail as hl -import pytest - -from gnomad.sample_qc.relatedness import ( - calculate_de_novo_post_prob, - default_get_de_novo_expr, -) - class TestDeNovoMutation: """Test suite for de novo mutation functions.""" From d7d69b57c41ae4a56d4fea18e1a99a8f86807225 Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Thu, 6 Feb 2025 15:42:15 -0500 Subject: [PATCH 27/56] docstring typo --- gnomad/sample_qc/relatedness.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gnomad/sample_qc/relatedness.py b/gnomad/sample_qc/relatedness.py index f5216945d..f0d551db4 100644 --- a/gnomad/sample_qc/relatedness.py +++ b/gnomad/sample_qc/relatedness.py @@ -1392,7 +1392,7 @@ def calculate_de_novo_post_prob( P(\text{data} \mid \text{missed het in father}) = (P(\text{het in father}) + P(\text{hom_alt in father})) \times P(\text{hom_alt in proband}) - - :math:`P(\text{missed het in parent(s)` equals the **probability for at least one heterozygous parent**: + - :math:`P(\text{missed het in parent(s)}` equals the **probability for at least one heterozygous parent**: .. 
math:: From 9d37bcb2674c76d2d8a4b9caab9b37cbb160d88c Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Fri, 7 Feb 2025 10:46:58 -0500 Subject: [PATCH 28/56] Apply suggestions from code review Co-authored-by: Katherine Chao --- gnomad/sample_qc/relatedness.py | 35 ++++++++++++++++----------------- 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/gnomad/sample_qc/relatedness.py b/gnomad/sample_qc/relatedness.py index f0d551db4..bf01476b6 100644 --- a/gnomad/sample_qc/relatedness.py +++ b/gnomad/sample_qc/relatedness.py @@ -1316,7 +1316,7 @@ def calculate_de_novo_post_prob( r""" Calculate the posterior probability of a de novo mutation. - This function computes the posterior probability of a de novo mutation (P_dn) + This function computes the posterior probability of a de novo mutation (`P_dn`) using the likelihoods of the proband's and parents' genotypes and the population frequency prior for the variant. It's based on Kaitlin Samocha's `de novo caller `_ and Hail's `de_novo `_ @@ -1334,7 +1334,7 @@ def calculate_de_novo_post_prob( - :math:`P(DN \mid \text{data})` is the probability that the variant is **de novo**, given the observed genotype data. - - :math:`P(\text{missed het in parent(s)} \mid \text{data})` is the probability that the heterozygous variant was **missed in the parent(s)**. + - :math:`P(\text{missed het in parent(s)} \mid \text{data})` is the probability that the heterozygous variant was **missed in at least one parent**. 
Applying Bayesian Theorem to the numerator and denominator yields: @@ -1353,6 +1353,7 @@ def calculate_de_novo_post_prob( P(\text{data} \mid DN) = P(\text{hom_ref in father}) \, P(\text{hom_ref in mother}) \, P(\text{het in proband}) **Probability of a de novo mutation given the data for hemizygous calls in XY individuals** + Note that hemizygous calls in XY individuals will be reported as homozygous alternate without any sex ploidy adjustments, which is why the formulas below use `P(hom_alt in proband)` - **X non-PAR regions (XY only)**: @@ -1407,8 +1408,8 @@ def calculate_de_novo_post_prob( :param hemi_x_expr: Boolean expression indicating a hemizygous genotype on the X chromosome. :param hemi_y_expr: Boolean expression indicating a hemizygous genotype on the Y chromosome. :param freq_prior_expr: Population frequency prior for the variant. - :param min_pop_prior: Minimum population frequency prior (default: 100/3e7). - :param de_novo_prior: Prior probability of a de novo mutation (default: 1/3e7). + :param min_pop_prior: Minimum population frequency prior (default: :math:`\text{100/3e7}`). + :param de_novo_prior: Prior probability of a de novo mutation (default: :math:`text{1/3e7}`). :return: Posterior probability of a de novo mutation (`P_dn`). """ @@ -1460,9 +1461,8 @@ def _transform_pl_to_pp( prior_one_parent_het = 1 - (1 - freq_prior_expr) ** 4 # Convert PL to probabilities - pp_proband = _transform_pl_to_pp(proband_pl_expr) - pp_father = _transform_pl_to_pp(father_pl_expr) - pp_mother = _transform_pl_to_pp(mother_pl_expr) + pl_expr = {"proband": proband_pl_expr, "father": father_pl_expr, "mother": mother_pl_expr} + pp_expr = {k: _transform_pl_to_pp(v.PL) for k, v in entry_expr.items()} # Compute `P(data | DN)` prob_data_given_dn_expr = ( @@ -1547,7 +1547,7 @@ def default_get_de_novo_expr( * AB: Proband AB. FAIL criteria also includes threshold for parent(s). - * AD: Parent(s) AD sum. + * AD: Sum of parent(s) AD. * DP: Proband DP. 
@@ -1573,7 +1573,7 @@ def default_get_de_novo_expr( :param proband_expr: Proband genotype info; required fields: GT, DP, GQ, AD, PL. :param father_expr: Father genotype info; required fields: GT, DP, GQ, AD, PL. :param mother_expr: Mother genotype info; required fields: GT, DP, GQ, AD, PL. - :param is_xx_expr: Whether the proband is XX. + :param is_xx_expr: Whether the proband has XX sex karyotype. :param freq_prior_expr: Population frequency prior for the variant. :param min_pop_prior: Minimum population frequency prior. Default is 100 / 3e7. :param de_novo_prior: Prior probability of a de novo mutation. Default is 1 / 3e7. @@ -1583,18 +1583,18 @@ def default_get_de_novo_expr( :param max_parent_ab: Maximum allele balance for parents. Default is 0.05. :param min_de_novo_p: Minimum probability for variant to be called de novo. Default is 0.05. :param high_conf_dp_ratio: DP ratio threshold of proband DP to combined DP in parents for high confidence. Default is 0.2. - :param dp_threshold_snp: Minimum depth for high-confidence SNPs. Default is 10. - :param high_med_conf_ab: AB threshold for high/medium confidence. Default is 0.3. - :param low_conf_ab: AB threshold for low confidence. Default is 0.2. + :param dp_threshold_snp: Minimum depth for high-confidence SNPs. Default is 10. + :param high_med_conf_ab: AB threshold for high/medium confidence. Default is 0.3. + :param low_conf_ab: AB threshold for low confidence. Default is 0.2. :param high_conf_p: P(de novo) threshold for high confidence. Default is 0.99. :param med_conf_p: P(de novo) threshold for medium confidence. Default is 0.5. - :return: A StructExpression with variant de novo status and confidence of de novo call. + :return: StructExpression with variant de novo status and confidence of de novo call. 
""" - # Check if the alleles are bi-allelic + # Check whether multiallelics have been split alleles_expr = ( hl.case() .when(hl.len(alleles_expr) == 2, alleles_expr) - .or_error("Alleles must be bi-allelic, please split multi if it's not.") + .or_error("Must split multiallelic variants prior to running this function.") ) # Determine genomic context @@ -1671,7 +1671,7 @@ def default_get_de_novo_expr( .or_missing() ) - parent_sum_ad_0_expr = ( + fail_parent_sum_ad_0_expr = ( hl.case() .when( diploid_expr, (hl.sum(father_expr.AD) == 0) | (hl.sum(mother_expr.AD) == 0) @@ -1696,7 +1696,7 @@ def default_get_de_novo_expr( # Fail checks fail_checks_expr = { "min_dp_ratio": dp_ratio < min_dp_ratio, - "parent_sum_ad_0": parent_sum_ad_0_expr, + "parent_sum_ad_0": fail_parent_sum_ad_0_expr, "max_parent_ab": fail_max_parent_ab_expr, "min_proband_ab": proband_ab < min_proband_ab, "min_proband_gq": proband_expr.GQ < min_gq, @@ -1710,5 +1710,4 @@ def default_get_de_novo_expr( confidence=hl.if_else(~is_de_novo | fail, hl.missing(hl.tstr), confidence_expr), fail_reason=add_filters_expr(filters=fail_checks_expr), ) - return result_expr From f70125df2a7e128650e462fa0d7d83f10ae3521c Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Fri, 7 Feb 2025 10:56:29 -0500 Subject: [PATCH 29/56] Docstring changes --- gnomad/sample_qc/relatedness.py | 108 +++++++++++++++++--------------- 1 file changed, 57 insertions(+), 51 deletions(-) diff --git a/gnomad/sample_qc/relatedness.py b/gnomad/sample_qc/relatedness.py index f0d551db4..5255537fc 100644 --- a/gnomad/sample_qc/relatedness.py +++ b/gnomad/sample_qc/relatedness.py @@ -1314,17 +1314,18 @@ def calculate_de_novo_post_prob( de_novo_prior: Optional[float] = 1 / 3e7, ) -> hl.expr.Float64Expression: r""" - Calculate the posterior probability of a de novo mutation. + Calculate the posterior probability of a *de novo* mutation. 
- This function computes the posterior probability of a de novo mutation (P_dn) - using the likelihoods of the proband's and parents' genotypes and the population - frequency prior for the variant. It's based on Kaitlin Samocha's `de novo caller `_ - and Hail's `de_novo `_ - method. Please refer to these sources for more information on the de novo model. + This function computes the posterior probability of a *de novo* mutation (*P_dn*) + based on the genotype likelihoods of the proband and parents, along with the + population frequency prior for the variant. - Neither Kaitlin's de novo caller nor Hail's de novo method provide a clear - description on how to calculate for de novo calls for hemizygous genotypes in XY - individuals. These equations are included below: + The method is adapted from Kaitlin Samocha's `de novo caller `_ + and Hail's `de_novo `_ function. + + However, neither approach explicitly defines how to compute *de novo* + probabilities for hemizygous genotypes in XY individuals. To address this, + we provide the full set of equations in this docstring. .. math:: @@ -1340,33 +1341,37 @@ def calculate_de_novo_post_prob( .. math:: - P_{dn} = \frac{P(\text{data} \mid DN) \, P(DN)}{P(\text{data} \mid DN) \, P(DN) + P(\text{data} \mid \text{missed het in parent(s)}) \, P(\text{missed het in parent(s)})} + P_{dn} = \frac{P(\text{data} \mid DN) \cdot P(DN)}{P(\text{data} \mid DN) \cdot P(DN) + P(\text{data} \mid \text{missed het in parent(s)}) \cdot P(\text{missed het in parent(s)})} where: - - :math:`P(\text{data} \mid DN)`: Probability of observing the data under the assumption of a de novo mutation. + - :math:`P(\text{data} \mid DN)`: Probability of observing the data under the assumption of a *de novo* mutation. - **Autosomes and PAR regions**: .. 
math:: - P(\text{data} \mid DN) = P(\text{hom_ref in father}) \, P(\text{hom_ref in mother}) \, P(\text{het in proband}) + P(\text{data} \mid DN) = P(\text{hom_ref in father}) \cdot P(\text{hom_ref in mother}) \cdot P(\text{het in proband}) + + **Probability of a *de novo* mutation given the data for hemizygous calls in XY individuals** - **Probability of a de novo mutation given the data for hemizygous calls in XY individuals** + Neither Kaitlin's *de novo* caller nor Hail's *de novo* method provide a clear + description on how to calculate for *de novo* calls for hemizygous genotypes + in XY individuals. These equations are included below: - **X non-PAR regions (XY only)**: .. math:: - P(\text{data} \mid DN) = P(\text{hom_ref in mother}) \, P(\text{hom_alt in proband}) + P(\text{data} \mid DN) = P(\text{hom_ref in mother}) \cdot P(\text{hom_alt in proband}) - **Y non-PAR regions (XY only)**: .. math:: - P(\text{data} \mid DN) = P(\text{hom_ref in father}) \, P(\text{hom_alt in proband}) + P(\text{data} \mid DN) = P(\text{hom_ref in father}) \cdot P(\text{hom_alt in proband}) - - :math:`P(DN)`: The prior probability of a de novo mutation from literature, defined as: + - :math:`P(DN)`: The prior probability of a *de novo* mutation from literature, defined as: .. math:: @@ -1378,21 +1383,21 @@ def calculate_de_novo_post_prob( .. math:: - P(\text{data} \mid \text{missed het in parents}) = ( P(\text{het in father}) \times P(\text{hom_ref in mother}) + P(\text{hom_ref in father}) \times P(\text{het in mother})) \times P(\text{het in proband}) + P(\text{data} \mid \text{missed het in parents}) = ( P(\text{het in father}) \cdot P(\text{hom_ref in mother}) + P(\text{hom_ref in father}) \cdot P(\text{het in mother})) \cdot P(\text{het in proband}) - **X non-PAR regions**: .. 
math:: - P(\text{data} \mid \text{missed het in mother}) = (P(\text{het in mother}) + P(\text{hom_alt in mother})) \times P(\text{hom_alt in proband}) + P(\text{data} \mid \text{missed het in mother}) = (P(\text{het in mother}) + P(\text{hom_alt in mother})) \cdot P(\text{hom_alt in proband}) - **Y non-PAR regions**: .. math:: - P(\text{data} \mid \text{missed het in father}) = (P(\text{het in father}) + P(\text{hom_alt in father})) \times P(\text{hom_alt in proband}) + P(\text{data} \mid \text{missed het in father}) = (P(\text{het in father}) + P(\text{hom_alt in father})) \cdot P(\text{hom_alt in proband}) - - :math:`P(\text{missed het in parent(s)}` equals the **probability for at least one heterozygous parent**: + - :math:`P(\text{missed het in parent(s)}`: Prior that at least one parent is heterozygous. Depends on alternate allele frequency: .. math:: @@ -1408,17 +1413,17 @@ def calculate_de_novo_post_prob( :param hemi_y_expr: Boolean expression indicating a hemizygous genotype on the Y chromosome. :param freq_prior_expr: Population frequency prior for the variant. :param min_pop_prior: Minimum population frequency prior (default: 100/3e7). - :param de_novo_prior: Prior probability of a de novo mutation (default: 1/3e7). - :return: Posterior probability of a de novo mutation (`P_dn`). + :param de_novo_prior: Prior probability of a *de novo* mutation (default: 1/3e7). + :return: Posterior probability of a *de novo* mutation (`P_dn`). """ def _get_freq_prior(freq_prior: hl.expr.Float64Expression, min_prior=100 / 3e7): """ - Get the population frequency prior for a de novo mutation. + Get the population frequency prior for a *de novo* mutation. :param freq_prior: The population frequency prior for the variant. :param min_prior: The minimum population frequency prior. Default is - 100/3e7, taken from Kaitlin Samocha's [de novo caller](https://github.com/ksamocha/de_novo_scripts). 
+ 100/3e7, taken from Kaitlin Samocha's [*de novo* caller](https://github.com/ksamocha/de_novo_scripts). """ return hl.max( hl.or_else( @@ -1448,7 +1453,7 @@ def _transform_pl_to_pp( .. math:: - {PL} = -10 \times \log_{10}{P(\text{Genotype} \mid \text{Data})} + {PL} = -10 \cdot \log_{10}{P(\text{Genotype} \mid \text{Data})} :param pl_expr: ArrayExpression of PL values. :return: ArrayExpression of the probability of observing each genotype (PP). @@ -1493,7 +1498,7 @@ def _transform_pl_to_pp( .or_missing() ) - # Calculate posterior probability of de novo mutation + # Calculate posterior probability of *de novo* mutation prob_dn_given_data_expr = prob_data_given_dn_expr * de_novo_prior p_dn_expr = prob_dn_given_data_expr / ( prob_dn_given_data_expr + prob_data_missed_het_expr @@ -1524,26 +1529,26 @@ def default_get_de_novo_expr( med_conf_p: float = 0.5, ) -> hl.expr.StructExpression: """ - Get the de novo status of a variant based on the proband and parent genotypes. + Get the *de novo* status of a variant based on the proband and parent genotypes. 
Thresholds: - +----------------+------------+-----------------------+------+-----+------+-----+ - | Category | P(de novo) | AB | AD | DP | DR | GQ | - +================+============+=======================+======+=====+======+=====+ - | FAIL | < 0.05 | AB(parents) > 0.05 OR | 0 | | <0.1 | <20 | - | | | AB(proband) < 0.2 | | | | | - +----------------+------------+-----------------------+------+-----+------+-----+ - | HIGH (Indel) | > 0.99 | > 0.3 | | | | | - +----------------+------------+-----------------------+------+-----+------+-----+ - | HIGH (SNV) 1 | > 0.99 | > 0.3 | | | >0.2 | | - +----------------+------------+-----------------------+------+-----+------+-----+ - | HIGH (SNV) 2 | > 0.5 | > 0.3 | | >10 | | | - +----------------+------------+-----------------------+------+-----+------+-----+ - | MEDIUM | > 0.5 | > 0.3 | | | | | - +----------------+------------+-----------------------+------+-----+------+-----+ - | LOW | >= 0.05 | >= 0.2 | | | | | - +----------------+------------+-----------------------+------+-----+------+-----+ + +----------------+--------------+-----------------------+------+-----+------+-----+ + | Category | P(*de novo*) | AB | AD | DP | DR | GQ | + +================+==============+=======================+======+=====+======+=====+ + | FAIL | < 0.05 | AB(parents) > 0.05 OR | 0 | | <0.1 | <20 | + | | | AB(proband) < 0.2 | | | | | + +----------------+--------------+-----------------------+------+-----+------+-----+ + | HIGH (Indel) | > 0.99 | > 0.3 | | | | | + +----------------+--------------+-----------------------+------+-----+------+-----+ + | HIGH (SNV) 1 | > 0.99 | > 0.3 | | | >0.2 | | + +----------------+--------------+-----------------------+------+-----+------+-----+ + | HIGH (SNV) 2 | > 0.5 | > 0.3 | | >10 | | | + +----------------+--------------+-----------------------+------+-----+------+-----+ + | MEDIUM | > 0.5 | > 0.3 | | | | | + +----------------+--------------+-----------------------+------+-----+------+-----+ + | LOW | >= 
0.05 | >= 0.2 | | | | | + +----------------+--------------+-----------------------+------+-----+------+-----+ * AB: Proband AB. FAIL criteria also includes threshold for parent(s). @@ -1560,7 +1565,7 @@ def default_get_de_novo_expr( The simplified version is the same as Hail's methods when using the `ignore_in_sample_allele_frequency` parameter. The main difference is that this mode should be used when families larger than a single trio are in the - dataset, in which an allele might be de novo in a parent and transmitted to a + dataset, in which an allele might be *de novo* in a parent and transmitted to a child in the dataset. This mode will not consider the allele count (AC) in the dataset, and will only consider the Phred-scaled likelihoods (PL) of the child and parents, allele balance (AB) of the child and parents, @@ -1568,27 +1573,28 @@ def default_get_de_novo_expr( parents, and the population frequency prior. :param locus_expr: Variant locus. - :param alleles_expr: Variant alleles. It assumes bi-allelic variants, meaning - that the matrix table or table should be already split to bi-allelics. + :param alleles_expr: Variant alleles. Function assumes all variants are + biallelic, meaning that multiallelic variants in the input dataset should be + split prior to running this function. :param proband_expr: Proband genotype info; required fields: GT, DP, GQ, AD, PL. :param father_expr: Father genotype info; required fields: GT, DP, GQ, AD, PL. :param mother_expr: Mother genotype info; required fields: GT, DP, GQ, AD, PL. :param is_xx_expr: Whether the proband is XX. :param freq_prior_expr: Population frequency prior for the variant. :param min_pop_prior: Minimum population frequency prior. Default is 100 / 3e7. - :param de_novo_prior: Prior probability of a de novo mutation. Default is 1 / 3e7. + :param de_novo_prior: Prior probability of a *de novo* mutation. Default is 1 / 3e7. :param min_dp_ratio: Minimum depth ratio for proband to parents. Default is 0.1. 
:param min_gq: Minimum genotype quality for the proband. Default is 20. :param min_proband_ab: Minimum allele balance for the proband. Default is 0.2. :param max_parent_ab: Maximum allele balance for parents. Default is 0.05. - :param min_de_novo_p: Minimum probability for variant to be called de novo. Default is 0.05. + :param min_de_novo_p: Minimum probability for variant to be called *de novo*. Default is 0.05. :param high_conf_dp_ratio: DP ratio threshold of proband DP to combined DP in parents for high confidence. Default is 0.2. :param dp_threshold_snp: Minimum depth for high-confidence SNPs. Default is 10. :param high_med_conf_ab: AB threshold for high/medium confidence. Default is 0.3. :param low_conf_ab: AB threshold for low confidence. Default is 0.2. - :param high_conf_p: P(de novo) threshold for high confidence. Default is 0.99. - :param med_conf_p: P(de novo) threshold for medium confidence. Default is 0.5. - :return: A StructExpression with variant de novo status and confidence of de novo call. + :param high_conf_p: P(*de novo*) threshold for high confidence. Default is 0.99. + :param med_conf_p: P(*de novo*) threshold for medium confidence. Default is 0.5. + :return: A StructExpression with variant *de novo* status and confidence of *de novo* call. 
""" # Check if the alleles are bi-allelic alleles_expr = ( From a387d1938518763321e39a5b17b3336c067cc579 Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Fri, 7 Feb 2025 11:34:43 -0500 Subject: [PATCH 30/56] A small change --- gnomad/sample_qc/relatedness.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/gnomad/sample_qc/relatedness.py b/gnomad/sample_qc/relatedness.py index 729c2012d..3969ba5f6 100644 --- a/gnomad/sample_qc/relatedness.py +++ b/gnomad/sample_qc/relatedness.py @@ -1463,8 +1463,7 @@ def _transform_pl_to_pp( prior_one_parent_het = 1 - (1 - freq_prior_expr) ** 4 # Convert PL to probabilities - pl_expr = {"proband": proband_pl_expr, "father": father_pl_expr, "mother": mother_pl_expr} - pp_expr = {k: _transform_pl_to_pp(v.PL) for k, v in entry_expr.items()} + pp_proband, pp_father, pp_mother = [_transform_pl_to_pp(pl) for pl in [proband_pl_expr, father_pl_expr, mother_pl_expr]] # Compute `P(data | DN)` prob_data_given_dn_expr = ( From f45bc3c40400fb8a0ab8f7fe93be5e7549b3a3a6 Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Fri, 7 Feb 2025 12:18:08 -0500 Subject: [PATCH 31/56] Add error case --- tests/sample_qc/test_de_novo.py | 141 ++++++++++---------------------- 1 file changed, 41 insertions(+), 100 deletions(-) diff --git a/tests/sample_qc/test_de_novo.py b/tests/sample_qc/test_de_novo.py index d373c4228..c5e59cbc1 100644 --- a/tests/sample_qc/test_de_novo.py +++ b/tests/sample_qc/test_de_novo.py @@ -13,114 +13,55 @@ class TestDeNovoMutation: """Test suite for de novo mutation functions.""" @pytest.mark.parametrize( - "proband_pl, father_pl, mother_pl, diploid, hemi_x, hemi_y, freq_prior, min_pop_prior, expected_p_dn", + "proband_pl, father_pl, mother_pl, diploid, hemi_x, hemi_y, freq_prior, min_pop_prior, expected", [ + # ✅ Valid test cases (should return numeric values) + ([73, 0, 161], [0, 99, 198], [0, 99, 198], True, False, False, 1.80e-02, + 100 
/ 3e7, 0.999), + ([152, 0, 283], [0, 99, 198], [0, 63, 126], True, False, False, 7.55e-02, + 100 / 3e7, 0.198), + # ❌ Invalid `freq_prior` case (should raise `HailUserError`) ( - [73, 0, 161], - [0, 99, 198], - [0, 99, 198], - True, - False, - False, - 1.80e-02, - 100 / 3e7, - 0.999, - ), - ( - [152, 0, 283], - [0, 99, 198], - [0, 63, 126], - True, - False, - False, - 7.55e-02, - 100 / 3e7, - 0.198, - ), - ( - [99, 50, 0], - [0, 99, 198], - [0, 99, 198], - False, - True, - False, - 0.005, - 100 / 3e7, - 1, - ), - ( - [2326, 140, 0], - [0, 40, 80], - [0, 40, 80], - False, - False, - True, - 1.97e-04, - 100 / 3e7, - 0.297, - ), - ( - [99, 50, 0], - [0, 99, 198], - [0, 99, 198], - False, - True, - False, - 0.005, - 100 / 3e7, - 1, - ), - ( - [99, 50, 0], - [0, 99, 198], - [0, 99, 198], - False, - False, - True, - 0.005, - 100 / 3e7, - 1, - ), - ( - [2, 0, 230], - [0, 0, 0], - [0, 0, 0], - True, - False, - False, - 2.03e-02, - 100 / 3e7, - 0, - ), + [99, 50, 0], [0, 99, 198], [0, 99, 198], False, True, False, 1.2, 100 / 3e7, + None), ], ) def test_calculate_de_novo_post_prob( - self, - proband_pl, - father_pl, - mother_pl, - diploid, - hemi_x, - hemi_y, - freq_prior, - min_pop_prior, - expected_p_dn, + self, proband_pl, father_pl, mother_pl, diploid, hemi_x, hemi_y, freq_prior, + min_pop_prior, expected ): """Test `calculate_de_novo_post_prob` function.""" - # Compute posterior probability of de novo mutation - p_dn_expr = calculate_de_novo_post_prob( - hl.literal(proband_pl), - hl.literal(father_pl), - hl.literal(mother_pl), - hl.literal(diploid), - hl.literal(hemi_x), - hl.literal(hemi_y), - hl.literal(freq_prior), - min_pop_prior, - ) + # Case where we expect an error (freq_prior is out of range) + if expected is None: + with pytest.raises(hl.utils.HailUserError, + match=r"de_novo: expect 0 <= freq_prior_expr <= 1, found .*"): + hl.eval( + calculate_de_novo_post_prob( + hl.literal(proband_pl), + hl.literal(father_pl), + hl.literal(mother_pl), + hl.literal(diploid), 
+ hl.literal(hemi_x), + hl.literal(hemi_y), + hl.literal(freq_prior), # Invalid frequency prior + min_pop_prior, + ) + ) + else: + # Case where we expect a valid float result + p_dn_expr = calculate_de_novo_post_prob( + hl.literal(proband_pl), + hl.literal(father_pl), + hl.literal(mother_pl), + hl.literal(diploid), + hl.literal(hemi_x), + hl.literal(hemi_y), + hl.literal(freq_prior), + min_pop_prior, + ) - # Assert with floating-point tolerance - assert round(hl.eval(p_dn_expr), 3) == expected_p_dn + # Assert with floating-point tolerance + assert round(hl.eval(p_dn_expr), 3) == expected def test_default_get_de_novo_expr_fail_conditions(self): """Test default_get_de_novo_expr with a failing case where multiple fail conditions apply.""" From 9c36896cae08323d79ae7538dcc9a66e5ae6333b Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Fri, 7 Feb 2025 12:41:40 -0500 Subject: [PATCH 32/56] Add suggested test cases --- gnomad/sample_qc/relatedness.py | 11 +- tests/sample_qc/test_de_novo.py | 184 ++++++++++++++++++++++++-------- 2 files changed, 150 insertions(+), 45 deletions(-) diff --git a/gnomad/sample_qc/relatedness.py b/gnomad/sample_qc/relatedness.py index 3969ba5f6..6480bd7bb 100644 --- a/gnomad/sample_qc/relatedness.py +++ b/gnomad/sample_qc/relatedness.py @@ -1463,7 +1463,10 @@ def _transform_pl_to_pp( prior_one_parent_het = 1 - (1 - freq_prior_expr) ** 4 # Convert PL to probabilities - pp_proband, pp_father, pp_mother = [_transform_pl_to_pp(pl) for pl in [proband_pl_expr, father_pl_expr, mother_pl_expr]] + pp_proband, pp_father, pp_mother = [ + _transform_pl_to_pp(pl) + for pl in [proband_pl_expr, father_pl_expr, mother_pl_expr] + ] # Compute `P(data | DN)` prob_data_given_dn_expr = ( @@ -1710,6 +1713,10 @@ def default_get_de_novo_expr( is_de_novo=is_de_novo, p_de_novo=hl.if_else(~is_de_novo | fail, hl.missing(hl.tfloat64), p_de_novo), confidence=hl.if_else(~is_de_novo | fail, hl.missing(hl.tstr), confidence_expr), - 
fail_reason=add_filters_expr(filters=fail_checks_expr), + fail_reason=hl.if_else( + is_de_novo & fail, + add_filters_expr(filters=fail_checks_expr), + hl.empty_set(hl.tstr), + ), ) return result_expr diff --git a/tests/sample_qc/test_de_novo.py b/tests/sample_qc/test_de_novo.py index c5e59cbc1..e5bbc5a7c 100644 --- a/tests/sample_qc/test_de_novo.py +++ b/tests/sample_qc/test_de_novo.py @@ -8,7 +8,6 @@ default_get_de_novo_expr, ) - class TestDeNovoMutation: """Test suite for de novo mutation functions.""" @@ -16,25 +15,61 @@ class TestDeNovoMutation: "proband_pl, father_pl, mother_pl, diploid, hemi_x, hemi_y, freq_prior, min_pop_prior, expected", [ # ✅ Valid test cases (should return numeric values) - ([73, 0, 161], [0, 99, 198], [0, 99, 198], True, False, False, 1.80e-02, - 100 / 3e7, 0.999), - ([152, 0, 283], [0, 99, 198], [0, 63, 126], True, False, False, 7.55e-02, - 100 / 3e7, 0.198), + ( + [73, 0, 161], + [0, 99, 198], + [0, 99, 198], + True, + False, + False, + 1.80e-02, + 100 / 3e7, + 0.999, + ), + ( + [152, 0, 283], + [0, 99, 198], + [0, 63, 126], + True, + False, + False, + 7.55e-02, + 100 / 3e7, + 0.198, + ), # ❌ Invalid `freq_prior` case (should raise `HailUserError`) ( - [99, 50, 0], [0, 99, 198], [0, 99, 198], False, True, False, 1.2, 100 / 3e7, - None), + [99, 50, 0], + [0, 99, 198], + [0, 99, 198], + False, + True, + False, + 1.2, + 100 / 3e7, + None, + ), ], ) def test_calculate_de_novo_post_prob( - self, proband_pl, father_pl, mother_pl, diploid, hemi_x, hemi_y, freq_prior, - min_pop_prior, expected + self, + proband_pl, + father_pl, + mother_pl, + diploid, + hemi_x, + hemi_y, + freq_prior, + min_pop_prior, + expected, ): """Test `calculate_de_novo_post_prob` function.""" # Case where we expect an error (freq_prior is out of range) if expected is None: - with pytest.raises(hl.utils.HailUserError, - match=r"de_novo: expect 0 <= freq_prior_expr <= 1, found .*"): + with pytest.raises( + hl.utils.HailUserError, + match=r"de_novo: expect 0 <= 
freq_prior_expr <= 1, found .*", + ): hl.eval( calculate_de_novo_post_prob( hl.literal(proband_pl), @@ -63,24 +98,94 @@ def test_calculate_de_novo_post_prob( # Assert with floating-point tolerance assert round(hl.eval(p_dn_expr), 3) == expected - def test_default_get_de_novo_expr_fail_conditions(self): - """Test default_get_de_novo_expr with a failing case where multiple fail conditions apply.""" - # Define locus and alleles (Autosomal) - locus = hl.locus("1", 10000) - alleles = hl.literal(["A", "C"]) - - # Define proband, father, and mother genotype structures - proband_expr = hl.struct( - GT=hl.call(0, 1), AD=[9, 2], DP=11, GQ=2, PL=[2, 0, 230] - ) - father_expr = hl.struct(GT=hl.call(0, 0), AD=[0, 0], DP=0, GQ=0, PL=[0, 0, 0]) - mother_expr = hl.struct(GT=hl.call(0, 0), AD=[0, 0], DP=0, GQ=0, PL=[0, 0, 0]) - - # Population frequency prior - freq_prior_expr = hl.literal(1e-5) - is_xx_expr = hl.literal(True) - - # Compute de novo classification + @pytest.mark.parametrize( + "locus, alleles, proband_expr, father_expr, mother_expr, is_xx_expr, freq_prior_expr, expected", + [ + # Case 1: Multiple fail conditions + ( + hl.locus("chr1", 10000, reference_genome="GRCh38"), + hl.literal(["A", "C"]), + hl.struct(GT=hl.call(0, 1), AD=[9, 2], DP=11, GQ=2, PL=[2, 0, 230]), + hl.struct(GT=hl.call(0, 0), AD=[0, 0], DP=0, GQ=0, PL=[0, 0, 0]), + hl.struct(GT=hl.call(0, 0), AD=[0, 0], DP=0, GQ=0, PL=[0, 0, 0]), + hl.literal(True), + hl.literal(1e-5), + hl.struct( + is_de_novo=True, + p_de_novo=hl.missing(hl.tfloat64), + confidence=hl.missing(hl.tstr), + fail_reason=hl.set( + { + "min_de_novo_p", + "min_proband_ab", + "min_proband_gq", + "parent_sum_ad_0", + } + ), + ), + ), + # Case 2: One fail condition (low DP ratio) + ( + hl.locus("chr1", 20000, reference_genome="GRCh38"), + hl.literal(["A", "T"]), + hl.struct(GT=hl.call(0, 1), AD=[20, 5], DP=10, GQ=50, PL=[10, 0, 100]), + hl.struct(GT=hl.call(0, 0), AD=[10, 0], DP=100, GQ=99, PL=[0, 99, 198]), + hl.struct(GT=hl.call(0, 0), 
AD=[10, 0], DP=100, GQ=99, PL=[0, 99, 198]), + hl.literal(False), + hl.literal(1e-5), + hl.struct( + is_de_novo=True, + p_de_novo=hl.missing(hl.tfloat64), + confidence=hl.missing(hl.tstr), + fail_reason=hl.set({"min_dp_ratio"}), + ), + ), + # Case 3: Variant is inherited (not de novo) + ( + hl.locus("chr1", 30000, reference_genome="GRCh38"), + hl.literal(["G", "T"]), + hl.struct(GT=hl.call(0, 1), AD=[15, 10], DP=30, GQ=50, PL=[10, 0, 100]), + hl.struct(GT=hl.call(0, 1), AD=[10, 5], DP=20, GQ=40, PL=[0, 20, 80]), + hl.struct(GT=hl.call(0, 0), AD=[20, 0], DP=20, GQ=50, PL=[0, 99, 198]), + hl.literal(True), + hl.literal(1e-5), + hl.struct( + is_de_novo=False, + p_de_novo=hl.missing(hl.tfloat64), + confidence=hl.missing(hl.tstr), + fail_reason=hl.empty_set(hl.tstr), + ), + ), + # Case 4: A passing case (high confidence de novo) + ( + hl.locus("chr1", 40000, reference_genome="GRCh38"), + hl.literal(["C", "G"]), + hl.struct(GT=hl.call(0, 1), AD=[5, 30], DP=35, GQ=99, PL=[99, 0, 1]), + hl.struct(GT=hl.call(0, 0), AD=[20, 0], DP=20, GQ=60, PL=[0, 60, 120]), + hl.struct(GT=hl.call(0, 0), AD=[25, 0], DP=25, GQ=80, PL=[0, 80, 150]), + hl.literal(True), + hl.literal(1e-5), + hl.struct( + is_de_novo=True, + p_de_novo=0.999, # High confidence P(de novo) + confidence="HIGH", + fail_reason=hl.empty_set(hl.tstr), + ), + ), + ], + ) + def test_default_get_de_novo_expr( + self, + locus, + alleles, + proband_expr, + father_expr, + mother_expr, + is_xx_expr, + freq_prior_expr, + expected, + ): + """Test different scenarios of `default_get_de_novo_expr` in one function.""" result_expr = default_get_de_novo_expr( locus, alleles, @@ -91,19 +196,12 @@ def test_default_get_de_novo_expr_fail_conditions(self): freq_prior_expr, ) - # Expected result structure - expected_result_expr = hl.struct( - is_de_novo=True, - p_de_novo=hl.missing(hl.tfloat64), - confidence=hl.missing(hl.tstr), - fail_reason=hl.set( - {"min_de_novo_p", "min_proband_ab", "min_proband_gq", "parent_sum_ad_0"} - ), - ) - - # 
Evaluate Hail expressions to convert to Python-native objects result = hl.eval(result_expr) - expected_result = hl.eval(expected_result_expr) + expected_result = hl.eval(expected) - # Convert fail_reason to set for direct comparison - assert result == expected_result + assert result.is_de_novo == expected_result.is_de_novo + assert ( + None if result.p_de_novo is None else round(result.p_de_novo, 3) + ) == expected_result.p_de_novo + assert result.confidence == expected_result.confidence + assert result.fail_reason == expected_result.fail_reason From 768d9f4fd12379de613a5b4e94e739a7a966c3c4 Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Fri, 7 Feb 2025 12:48:46 -0500 Subject: [PATCH 33/56] Black --- tests/sample_qc/test_de_novo.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/sample_qc/test_de_novo.py b/tests/sample_qc/test_de_novo.py index e5bbc5a7c..c6a4facbc 100644 --- a/tests/sample_qc/test_de_novo.py +++ b/tests/sample_qc/test_de_novo.py @@ -8,6 +8,7 @@ default_get_de_novo_expr, ) + class TestDeNovoMutation: """Test suite for de novo mutation functions.""" From c2050d0c826192b534b655a59b5b38ffe98d4ecd Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Fri, 7 Feb 2025 13:01:22 -0500 Subject: [PATCH 34/56] extra space removal --- gnomad/sample_qc/relatedness.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gnomad/sample_qc/relatedness.py b/gnomad/sample_qc/relatedness.py index 6480bd7bb..e07c2aef4 100644 --- a/gnomad/sample_qc/relatedness.py +++ b/gnomad/sample_qc/relatedness.py @@ -1590,7 +1590,7 @@ def default_get_de_novo_expr( :param high_conf_dp_ratio: DP ratio threshold of proband DP to combined DP in parents for high confidence. Default is 0.2. :param dp_threshold_snp: Minimum depth for high-confidence SNPs. Default is 10. :param high_med_conf_ab: AB threshold for high/medium confidence. Default is 0.3. 
- :param low_conf_ab: AB threshold for low confidence. Default is 0.2. + :param low_conf_ab: AB threshold for low confidence. Default is 0.2. :param high_conf_p: P(*de novo*) threshold for high confidence. Default is 0.99. :param med_conf_p: P(*de novo*) threshold for medium confidence. Default is 0.5. :return: StructExpression with variant *de novo* status and confidence of *de novo* call. From 2742e266007e608a3dbf8616fc50e9d7db9c43fd Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Fri, 7 Feb 2025 13:04:14 -0500 Subject: [PATCH 35/56] Add warning block back --- gnomad/sample_qc/relatedness.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/gnomad/sample_qc/relatedness.py b/gnomad/sample_qc/relatedness.py index e07c2aef4..3fdeabb82 100644 --- a/gnomad/sample_qc/relatedness.py +++ b/gnomad/sample_qc/relatedness.py @@ -1571,6 +1571,20 @@ def default_get_de_novo_expr( the genotype quality (GQ) of the child, the depth (DP) of the child and parents, and the population frequency prior. + .. warning:: + + This method assumes that the PL and AD fields are present in the genotype fields + of the child and parents. If they are missing, this method will not work. + gnomAD v3 and v4 VDS have the PL and AD fields intentionally removed to + save storage space. If this is the reason that the PL and AD fields are + missing, the only way to use this method is to set them to their approximate + values: + + .. code-block:: python + + PL=hl.or_else(PL, [0, GQ, 2 * GQ]) + AD=hl.or_else(AD, [DP, 0]) + :param locus_expr: Variant locus. :param alleles_expr: Variant alleles. 
Function assumes all variants are biallelic, meaning that multiallelic variants in the input dataset should be From 285865c90a83c785c9c491bb21a8be89c583b50f Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Fri, 7 Feb 2025 14:49:05 -0500 Subject: [PATCH 36/56] Change wording --- gnomad/sample_qc/relatedness.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gnomad/sample_qc/relatedness.py b/gnomad/sample_qc/relatedness.py index 3fdeabb82..69f8cb578 100644 --- a/gnomad/sample_qc/relatedness.py +++ b/gnomad/sample_qc/relatedness.py @@ -1323,7 +1323,7 @@ def calculate_de_novo_post_prob( The method is adapted from Kaitlin Samocha's `de novo caller `_ and Hail's `de_novo `_ function. - However, neither approach explicitly defines how to compute *de novo* + However, neither approach explicitly documented how to compute *de novo* probabilities for hemizygous genotypes in XY individuals. To address this, we provide the full set of equations in this docstring. From ace2d37911dd479d217e1c290934116fe9f10d72 Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Sat, 8 Feb 2025 18:14:10 -0500 Subject: [PATCH 37/56] Apply suggestions from code review Co-authored-by: Katherine Chao --- gnomad/sample_qc/relatedness.py | 10 +++++----- tests/sample_qc/test_de_novo.py | 8 ++++---- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/gnomad/sample_qc/relatedness.py b/gnomad/sample_qc/relatedness.py index 69f8cb578..97be5c9a1 100644 --- a/gnomad/sample_qc/relatedness.py +++ b/gnomad/sample_qc/relatedness.py @@ -1328,7 +1328,7 @@ def calculate_de_novo_post_prob( we provide the full set of equations in this docstring. .. math:: - + The posterior probability of an event being truly *de novo* vs.
the probability it was a missed heterozygote call in one of the two parents is: P_{dn} = \frac{P(DN \mid \text{data})}{P(DN \mid \text{data}) + P(\text{missed het in parent(s)} \mid \text{data})} The terms are defined as follows: @@ -1353,7 +1353,7 @@ def calculate_de_novo_post_prob( P(\text{data} \mid DN) = P(\text{hom_ref in father}) \cdot P(\text{hom_ref in mother}) \cdot P(\text{het in proband}) - **Probability of a de novo mutation given the data for hemizygous calls in XY individuals** + Probability of observing a *de novo* mutation given the data specifically for hemizygous calls in XY individuals Note that hemizygous calls in XY individuals will be reported as homozygous alternate without any sex ploidy adjustments, which is why the formulas below use `P(hom_alt in proband)` @@ -1411,7 +1411,7 @@ def calculate_de_novo_post_prob( :param hemi_y_expr: Boolean expression indicating a hemizygous genotype on the Y chromosome. :param freq_prior_expr: Population frequency prior for the variant. :param min_pop_prior: Minimum population frequency prior (default: :math:`\text{100/3e7}`). - :param de_novo_prior: Prior probability of a *de novo* mutation (default: :math:`text{1/3e7}`). + :param de_novo_prior: Prior probability of a *de novo* mutation (default: :math:`\text{1/3e7}`). :return: Posterior probability of a de novo mutation (`P_dn`). """ @@ -1463,7 +1463,7 @@ def _transform_pl_to_pp( prior_one_parent_het = 1 - (1 - freq_prior_expr) ** 4 # Convert PL to probabilities - pp_proband, pp_father, pp_mother = [ + proband_pp_expr, father_pp_expr, mother_pp_expr = [ _transform_pl_to_pp(pl) for pl in [proband_pl_expr, father_pl_expr, mother_pl_expr] ] @@ -1530,7 +1530,7 @@ def default_get_de_novo_expr( """ Get the *de novo* status of a variant based on the proband and parent genotypes.
- Thresholds: + Confidence thresholds (from Kaitlin Samocha's [*de novo* caller](https://github.com/ksamocha/de_novo_scripts)): +----------------+--------------+-----------------------+------+-----+------+-----+ | Category | P(*de novo*) | AB | AD | DP | DR | GQ | diff --git a/tests/sample_qc/test_de_novo.py b/tests/sample_qc/test_de_novo.py index c6a4facbc..d43af6afb 100644 --- a/tests/sample_qc/test_de_novo.py +++ b/tests/sample_qc/test_de_novo.py @@ -15,7 +15,7 @@ class TestDeNovoMutation: @pytest.mark.parametrize( "proband_pl, father_pl, mother_pl, diploid, hemi_x, hemi_y, freq_prior, min_pop_prior, expected", [ - # ✅ Valid test cases (should return numeric values) + # Valid test cases (should return expected numeric values) ( [73, 0, 161], [0, 99, 198], @@ -38,7 +38,7 @@ class TestDeNovoMutation: 100 / 3e7, 0.198, ), - # ❌ Invalid `freq_prior` case (should raise `HailUserError`) + # Invalid `freq_prior` case (should raise `HailUserError`) ( [99, 50, 0], [0, 99, 198], @@ -65,7 +65,7 @@ def test_calculate_de_novo_post_prob( expected, ): """Test `calculate_de_novo_post_prob` function.""" - # Case where we expect an error (freq_prior is out of range) + # Case where we expect an error (`freq_prior` is out of range) if expected is None: with pytest.raises( hl.utils.HailUserError, @@ -79,7 +79,7 @@ def test_calculate_de_novo_post_prob( hl.literal(diploid), hl.literal(hemi_x), hl.literal(hemi_y), - hl.literal(freq_prior), # Invalid frequency prior + hl.literal(freq_prior), min_pop_prior, ) ) From 49f6249950a4aa7360e8c3c90de7cf588664d05c Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Sat, 8 Feb 2025 18:41:27 -0500 Subject: [PATCH 38/56] Address review comments --- gnomad/sample_qc/relatedness.py | 66 +++++++++++++++++---------------- tests/sample_qc/test_de_novo.py | 4 +- 2 files changed, 37 insertions(+), 33 deletions(-) diff --git a/gnomad/sample_qc/relatedness.py b/gnomad/sample_qc/relatedness.py index 97be5c9a1..cfd8eac71 
100644 --- a/gnomad/sample_qc/relatedness.py +++ b/gnomad/sample_qc/relatedness.py @@ -1318,17 +1318,17 @@ def calculate_de_novo_post_prob( This function computes the posterior probability of a *de novo* mutation (`P_dn`) based on the genotype likelihoods of the proband and parents, along with the - population frequency prior for the variant. - - The method is adapted from Kaitlin Samocha's `de novo caller `_ + population frequency prior for the variant. The method is adapted from Kaitlin + Samocha's `de novo caller `_ and Hail's `de_novo `_ function. - However, neither approach explicitly documented how to compute *de novo* probabilities for hemizygous genotypes in XY individuals. To address this, we provide the full set of equations in this docstring. - .. math:: The posterior probability of an even being truly *de novo* vs. the probability it was a missed heterozygote call in one of the two parents is: + + .. math:: + P_{dn} = \frac{P(DN \mid \text{data})}{P(DN \mid \text{data}) + P(\text{missed het in parent(s)} \mid \text{data})} The terms are defined as follows: @@ -1373,7 +1373,7 @@ def calculate_de_novo_post_prob( .. math:: - P(DN) = \frac{1}{3 \times 10^7} + P(DN) = \frac{1}{3 \times 10^7} (1 mutation per 30 million base pairs) - :math:`P(\text{data} \mid \text{missed het in parent(s)})`: Probability of observing the data under the assumption of a missed het in a parent. 
@@ -1471,9 +1471,9 @@ def _transform_pl_to_pp( # Compute `P(data | DN)` prob_data_given_dn_expr = ( hl.case() - .when(hemi_x_expr, pp_mother[0] * pp_proband[2]) - .when(hemi_y_expr, pp_father[0] * pp_proband[2]) - .when(diploid_expr, pp_father[0] * pp_mother[0] * pp_proband[1]) + .when(hemi_x_expr, mother_pp_expr[0] * proband_pp_expr[2]) + .when(hemi_y_expr, father_pp_expr[0] * proband_pp_expr[2]) + .when(diploid_expr, father_pp_expr[0] * mother_pp_expr[0] * proband_pp_expr[1]) .or_missing() ) @@ -1482,16 +1482,16 @@ def _transform_pl_to_pp( hl.case() .when( hemi_x_expr, - (pp_mother[1] + pp_mother[2]) * pp_proband[2] * prior_one_parent_het, + (mother_pp_expr[1] + mother_pp_expr[2]) * proband_pp_expr[2] * prior_one_parent_het, ) .when( hemi_y_expr, - (pp_father[1] + pp_father[2]) * pp_proband[2] * prior_one_parent_het, + (father_pp_expr[1] + father_pp_expr[2]) * proband_pp_expr[2] * prior_one_parent_het, ) .when( diploid_expr, - (pp_father[1] * pp_mother[0] + pp_father[0] * pp_mother[1]) - * pp_proband[1] + (father_pp_expr[1] * mother_pp_expr[0] + father_pp_expr[0] * mother_pp_expr[1]) + * proband_pp_expr[1] * prior_one_parent_het, ) .or_missing() @@ -1532,22 +1532,22 @@ def default_get_de_novo_expr( Confidence thresholds (from Kaitlin Samocha's [*de novo* caller](https://github.com/ksamocha/de_novo_scripts)): - +----------------+--------------+-----------------------+------+-----+------+-----+ - | Category | P(*de novo*) | AB | AD | DP | DR | GQ | - +================+==============+=======================+======+=====+======+=====+ - | FAIL | < 0.05 | AB(parents) > 0.05 OR | 0 | | <0.1 | <20 | - | | | AB(proband) < 0.2 | | | | | - +----------------+--------------+-----------------------+------+-----+------+-----+ - | HIGH (Indel) | > 0.99 | > 0.3 | | | | | - +----------------+--------------+-----------------------+------+-----+------+-----+ - | HIGH (SNV) 1 | > 0.99 | > 0.3 | | | >0.2 | | - 
+----------------+--------------+-----------------------+------+-----+------+-----+ - | HIGH (SNV) 2 | > 0.5 | > 0.3 | | >10 | | | - +----------------+--------------+-----------------------+------+-----+------+-----+ - | MEDIUM | > 0.5 | > 0.3 | | | | | - +----------------+--------------+-----------------------+------+-----+------+-----+ - | LOW | >= 0.05 | >= 0.2 | | | | | - +----------------+--------------+-----------------------+------+-----+------+-----+ + +----------------+--------------+-----------------------+------+------+-------+-----+ + | Category | P(*de novo*) | AB | AD | DP | DR | GQ | + +================+==============+=======================+======+======+=======+=====+ + | FAIL | < 0.05 | AB(parents) > 0.05 OR | 0 | | < 0.1 | <20 | + | | | AB(proband) < 0.2 | | | | | + +----------------+--------------+-----------------------+------+------+-------+-----+ + | HIGH (Indel) | > 0.99 | > 0.3 | | | > 0.2 | | + +----------------+--------------+-----------------------+------+------+-------+-----+ + | HIGH (SNV) 1 | > 0.99 | > 0.3 | | | > 0.2 | | + +----------------+--------------+-----------------------+------+------+-------+-----+ + | HIGH (SNV) 2 | > 0.5 | > 0.3 | | > 10 | | | + +----------------+--------------+-----------------------+------+------+-------+-----+ + | MEDIUM | > 0.5 | > 0.3 | | | | | + +----------------+--------------+-----------------------+------+------+-------+-----+ + | LOW | >= 0.05 | >= 0.2 | | | | | + +----------------+--------------+-----------------------+------+------+-------+-----+ * AB: Proband AB. FAIL criteria also includes threshold for parent(s). @@ -1561,6 +1561,11 @@ def default_get_de_novo_expr( .. note:: + The “LOW” confidence category differs slightly from the criteria in the + original code (P(*de novo) > 0.05 and AB > 0.2 ), as it is designed to fill + the gap for variants that do not meet the FAIL criteria but would otherwise + remain unclassified. 
+ The simplified version is the same as Hail's methods when using the `ignore_in_sample_allele_frequency` parameter. The main difference is that this mode should be used when families larger than a single trio are in the @@ -1727,10 +1732,9 @@ def default_get_de_novo_expr( is_de_novo=is_de_novo, p_de_novo=hl.if_else(~is_de_novo | fail, hl.missing(hl.tfloat64), p_de_novo), confidence=hl.if_else(~is_de_novo | fail, hl.missing(hl.tstr), confidence_expr), - fail_reason=hl.if_else( + fail_reason=hl.or_missing( is_de_novo & fail, add_filters_expr(filters=fail_checks_expr), - hl.empty_set(hl.tstr), ), ) return result_expr diff --git a/tests/sample_qc/test_de_novo.py b/tests/sample_qc/test_de_novo.py index d43af6afb..563db3418 100644 --- a/tests/sample_qc/test_de_novo.py +++ b/tests/sample_qc/test_de_novo.py @@ -154,7 +154,7 @@ def test_calculate_de_novo_post_prob( is_de_novo=False, p_de_novo=hl.missing(hl.tfloat64), confidence=hl.missing(hl.tstr), - fail_reason=hl.empty_set(hl.tstr), + fail_reason=hl.missing(hl.tset(hl.tstr)), ), ), # Case 4: A passing case (high confidence de novo) @@ -170,7 +170,7 @@ def test_calculate_de_novo_post_prob( is_de_novo=True, p_de_novo=0.999, # High confidence P(de novo) confidence="HIGH", - fail_reason=hl.empty_set(hl.tstr), + fail_reason=hl.missing(hl.tset(hl.tstr)), ), ), ], From 457cd9a761bdfd19c64167889effba287c12d80d Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Sat, 8 Feb 2025 18:42:03 -0500 Subject: [PATCH 39/56] Black --- gnomad/sample_qc/relatedness.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/gnomad/sample_qc/relatedness.py b/gnomad/sample_qc/relatedness.py index cfd8eac71..f3684faa3 100644 --- a/gnomad/sample_qc/relatedness.py +++ b/gnomad/sample_qc/relatedness.py @@ -1482,15 +1482,22 @@ def _transform_pl_to_pp( hl.case() .when( hemi_x_expr, - (mother_pp_expr[1] + mother_pp_expr[2]) * proband_pp_expr[2] * prior_one_parent_het, + (mother_pp_expr[1] + 
mother_pp_expr[2]) + * proband_pp_expr[2] + * prior_one_parent_het, ) .when( hemi_y_expr, - (father_pp_expr[1] + father_pp_expr[2]) * proband_pp_expr[2] * prior_one_parent_het, + (father_pp_expr[1] + father_pp_expr[2]) + * proband_pp_expr[2] + * prior_one_parent_het, ) .when( diploid_expr, - (father_pp_expr[1] * mother_pp_expr[0] + father_pp_expr[0] * mother_pp_expr[1]) + ( + father_pp_expr[1] * mother_pp_expr[0] + + father_pp_expr[0] * mother_pp_expr[1] + ) * proband_pp_expr[1] * prior_one_parent_het, ) From 3571265685de4a4693528dc3511540e8ec800c2b Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Sat, 8 Feb 2025 18:43:54 -0500 Subject: [PATCH 40/56] Change indel HIGH code --- gnomad/sample_qc/relatedness.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/gnomad/sample_qc/relatedness.py b/gnomad/sample_qc/relatedness.py index f3684faa3..9d771d568 100644 --- a/gnomad/sample_qc/relatedness.py +++ b/gnomad/sample_qc/relatedness.py @@ -1678,8 +1678,7 @@ def default_get_de_novo_expr( .when( ( ( - is_snp - & (p_de_novo > high_conf_p) + (p_de_novo > high_conf_p) & (proband_ab > high_med_conf_ab) & (dp_ratio > high_conf_dp_ratio) ) @@ -1689,11 +1688,6 @@ def default_get_de_novo_expr( & (proband_ab > high_med_conf_ab) & (proband_expr.DP > dp_threshold_snp) ) - | ( - ~is_snp - & (p_de_novo > high_conf_p) - & (proband_ab > high_med_conf_ab) - ) ), "HIGH", ) From 9b1515b5f2594e0b8af7165ce50a6a97dcff6d9e Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Sat, 8 Feb 2025 18:53:33 -0500 Subject: [PATCH 41/56] typo --- gnomad/sample_qc/relatedness.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gnomad/sample_qc/relatedness.py b/gnomad/sample_qc/relatedness.py index 9d771d568..94f776d81 100644 --- a/gnomad/sample_qc/relatedness.py +++ b/gnomad/sample_qc/relatedness.py @@ -1537,7 +1537,7 @@ def default_get_de_novo_expr( """ Get the *de novo* status of a 
variant based on the proband and parent genotypes. - Confidence thresholds (from Kaitlin Samocha's [*de novo* caller](https://github.com/ksamocha/de_novo_scripts)): + Confidence thresholds (from Kaitlin Samocha's `de novo caller <https://github.com/ksamocha/de_novo_scripts>`_): +----------------+--------------+-----------------------+------+------+-------+-----+ | Category | P(*de novo*) | AB | AD | DP | DR | GQ | @@ -1569,7 +1569,7 @@ def default_get_de_novo_expr( .. note:: The “LOW” confidence category differs slightly from the criteria in the - original code (P(*de novo) > 0.05 and AB > 0.2 ), as it is designed to fill + original code (P(*de novo*) > 0.05 and AB > 0.2 ), as it is designed to fill the gap for variants that do not meet the FAIL criteria but would otherwise remain unclassified. From b899e7b9bd1e9aaadc6e6091fd2d761a22b3e5f5 Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Sat, 8 Feb 2025 19:12:46 -0500 Subject: [PATCH 42/56] Remove extra --- gnomad/sample_qc/relatedness.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gnomad/sample_qc/relatedness.py b/gnomad/sample_qc/relatedness.py index 94f776d81..f376a060c 100644 --- a/gnomad/sample_qc/relatedness.py +++ b/gnomad/sample_qc/relatedness.py @@ -1373,7 +1373,7 @@ def calculate_de_novo_post_prob( .. math:: - P(DN) = \frac{1}{3 \times 10^7} (1 mutation per 30 million base pairs) + P(DN) = \frac{1}{3 \times 10^7} - :math:`P(\text{data} \mid \text{missed het in parent(s)})`: Probability of observing the data under the assumption of a missed het in a parent.
From 5631c62d9bdd7d8a7c279df1e800721b28454820 Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Sat, 8 Feb 2025 19:16:46 -0500 Subject: [PATCH 43/56] Adjust table --- gnomad/sample_qc/relatedness.py | 38 ++++++++++++++++----------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/gnomad/sample_qc/relatedness.py b/gnomad/sample_qc/relatedness.py index f376a060c..b8e5a76fb 100644 --- a/gnomad/sample_qc/relatedness.py +++ b/gnomad/sample_qc/relatedness.py @@ -1539,22 +1539,22 @@ def default_get_de_novo_expr( Confidence thresholds (from Kaitlin Samocha's `de novo caller `_): - +----------------+--------------+-----------------------+------+------+-------+-----+ - | Category | P(*de novo*) | AB | AD | DP | DR | GQ | - +================+==============+=======================+======+======+=======+=====+ - | FAIL | < 0.05 | AB(parents) > 0.05 OR | 0 | | < 0.1 | <20 | - | | | AB(proband) < 0.2 | | | | | - +----------------+--------------+-----------------------+------+------+-------+-----+ - | HIGH (Indel) | > 0.99 | > 0.3 | | | > 0.2 | | - +----------------+--------------+-----------------------+------+------+-------+-----+ - | HIGH (SNV) 1 | > 0.99 | > 0.3 | | | > 0.2 | | - +----------------+--------------+-----------------------+------+------+-------+-----+ - | HIGH (SNV) 2 | > 0.5 | > 0.3 | | > 10 | | | - +----------------+--------------+-----------------------+------+------+-------+-----+ - | MEDIUM | > 0.5 | > 0.3 | | | | | - +----------------+--------------+-----------------------+------+------+-------+-----+ - | LOW | >= 0.05 | >= 0.2 | | | | | - +----------------+--------------+-----------------------+------+------+-------+-----+ + +----------------+--------------+-----------------------+------+------+-------+------+ + | Category | P(*de novo*) | AB | AD | DP | DR | GQ | + +================+==============+=======================+======+======+=======+======+ + | FAIL | < 0.05 | AB(parents) > 0.05 OR | 0 | 
| < 0.1 | < 20 | + | | | AB(proband) < 0.2 | | | | | + +----------------+--------------+-----------------------+------+------+-------+------+ + | HIGH (Indel) | > 0.99 | > 0.3 | | | > 0.2 | | + +----------------+--------------+-----------------------+------+------+-------+------+ + | HIGH (SNV) 1 | > 0.99 | > 0.3 | | | > 0.2 | | + +----------------+--------------+-----------------------+------+------+-------+------+ + | HIGH (SNV) 2 | > 0.5 | > 0.3 | | > 10 | | | + +----------------+--------------+-----------------------+------+------+-------+------+ + | MEDIUM | > 0.5 | > 0.3 | | | | | + +----------------+--------------+-----------------------+------+------+-------+------+ + | LOW | >= 0.05 | >= 0.2 | | | | | + +----------------+--------------+-----------------------+------+------+-------+------+ * AB: Proband AB. FAIL criteria also includes threshold for parent(s). @@ -1569,9 +1569,9 @@ def default_get_de_novo_expr( .. note:: The “LOW” confidence category differs slightly from the criteria in the - original code (P(*de novo*) > 0.05 and AB > 0.2 ), as it is designed to fill - the gap for variants that do not meet the FAIL criteria but would otherwise - remain unclassified. + original code (P(*de novo*) > 0.05 and AB(proband > 0.2 ), as it is + designed to fill the gap for variants that do not meet the FAIL criteria but + would otherwise remain unclassified. The simplified version is the same as Hail's methods when using the `ignore_in_sample_allele_frequency` parameter. 
The main difference is that From 395a74dec85e5563e67f6ada083cea9c4cd17439 Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Mon, 10 Feb 2025 10:37:29 -0500 Subject: [PATCH 44/56] Put copy state function back --- gnomad/sample_qc/relatedness.py | 7 ++----- gnomad/utils/annotations.py | 25 +++++++++++++++++++++++++ 2 files changed, 27 insertions(+), 5 deletions(-) diff --git a/gnomad/sample_qc/relatedness.py b/gnomad/sample_qc/relatedness.py index b8e5a76fb..021ffeb44 100644 --- a/gnomad/sample_qc/relatedness.py +++ b/gnomad/sample_qc/relatedness.py @@ -8,6 +8,7 @@ import hail as hl import networkx as nx +from gnomad.utils.annotations import get_copy_state_by_sex from gnomad.utils.filtering import add_filters_expr logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s") @@ -1629,11 +1630,7 @@ def default_get_de_novo_expr( ) # Determine genomic context - diploid_expr = locus_expr.in_autosome_or_par() | ( - locus_expr.in_x_nonpar() & is_xx_expr - ) - hemi_x_expr = locus_expr.in_x_nonpar() & ~is_xx_expr - hemi_y_expr = locus_expr.in_y_nonpar() & ~is_xx_expr + diploid_expr, hemi_x_expr, hemi_y_expr = get_copy_state_by_sex(locus_expr, is_xx_expr) p_de_novo = calculate_de_novo_post_prob( proband_expr.PL, diff --git a/gnomad/utils/annotations.py b/gnomad/utils/annotations.py index 350735a37..6def6292d 100644 --- a/gnomad/utils/annotations.py +++ b/gnomad/utils/annotations.py @@ -2780,3 +2780,28 @@ def _create_group_dicts( final_freq_dict["subcohortFrequency"] = list_of_group_info_dicts return final_freq_dict + + +def get_copy_state_by_sex( + locus_expr: hl.expr.LocusExpression, + is_xx_expr: hl.expr.BooleanExpression, +) -> Tuple[ + hl.expr.BooleanExpression, hl.expr.BooleanExpression, hl.expr.BooleanExpression +]: + """ + Determine the copy state of a variant by its locus and the sex karyotype of a sample. + + :param locus_expr: LocusExpression of the variant.
+ :param is_xx_expr: BooleanExpression indicating whether the sample has an XX sex + karyotype. + :return: A tuple of BooleanExpressions: + - diploid_expr: True if the variant is in autosomes or PAR regions, or in the X non-PAR region for XX individuals. + - hemi_x_expr: True if the variant is in the X non-PAR region for XY individuals. + - hemi_y_expr: True if the variant is in the Y non-PAR region for XY individuals. + """ + diploid_expr = locus_expr.in_autosome_or_par() | ( + locus_expr.in_x_nonpar() & is_xx_expr + ) + hemi_x_expr = locus_expr.in_x_nonpar() & ~is_xx_expr + hemi_y_expr = locus_expr.in_y_nonpar() & ~is_xx_expr + return diploid_expr, hemi_x_expr, hemi_y_expr From 75bbd294b9599c86c45b7b6ff62a14dc2beed359 Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Mon, 10 Feb 2025 13:36:13 -0500 Subject: [PATCH 45/56] Add expected error case --- gnomad/sample_qc/relatedness.py | 4 +- tests/sample_qc/test_de_novo.py | 74 +++++++++++++++++++++++---------- 2 files changed, 56 insertions(+), 22 deletions(-) diff --git a/gnomad/sample_qc/relatedness.py b/gnomad/sample_qc/relatedness.py index 021ffeb44..3d61eb93d 100644 --- a/gnomad/sample_qc/relatedness.py +++ b/gnomad/sample_qc/relatedness.py @@ -1630,7 +1630,9 @@ def default_get_de_novo_expr( ) # Determine genomic context - diploid_expr, hemi_x_expr, hemi_y_expr = get_copy_state_by_sex(locus_expr, is_xx_expr) + diploid_expr, hemi_x_expr, hemi_y_expr = get_copy_state_by_sex( + locus_expr, is_xx_expr + ) p_de_novo = calculate_de_novo_post_prob( proband_expr.PL, diff --git a/tests/sample_qc/test_de_novo.py b/tests/sample_qc/test_de_novo.py index 563db3418..adc5c46cb 100644 --- a/tests/sample_qc/test_de_novo.py +++ b/tests/sample_qc/test_de_novo.py @@ -100,7 +100,7 @@ def test_calculate_de_novo_post_prob( assert round(hl.eval(p_dn_expr), 3) == expected @pytest.mark.parametrize( - "locus, alleles, proband_expr, father_expr, mother_expr, is_xx_expr, freq_prior_expr, expected", + 
"locus, alleles, proband_expr, father_expr, mother_expr, is_xx_expr, freq_prior_expr, expected_exception, expected_result", [ # Case 1: Multiple fail conditions ( @@ -111,6 +111,7 @@ def test_calculate_de_novo_post_prob( hl.struct(GT=hl.call(0, 0), AD=[0, 0], DP=0, GQ=0, PL=[0, 0, 0]), hl.literal(True), hl.literal(1e-5), + False, hl.struct( is_de_novo=True, p_de_novo=hl.missing(hl.tfloat64), @@ -134,6 +135,7 @@ def test_calculate_de_novo_post_prob( hl.struct(GT=hl.call(0, 0), AD=[10, 0], DP=100, GQ=99, PL=[0, 99, 198]), hl.literal(False), hl.literal(1e-5), + False, hl.struct( is_de_novo=True, p_de_novo=hl.missing(hl.tfloat64), @@ -150,6 +152,7 @@ def test_calculate_de_novo_post_prob( hl.struct(GT=hl.call(0, 0), AD=[20, 0], DP=20, GQ=50, PL=[0, 99, 198]), hl.literal(True), hl.literal(1e-5), + False, hl.struct( is_de_novo=False, p_de_novo=hl.missing(hl.tfloat64), @@ -166,6 +169,7 @@ def test_calculate_de_novo_post_prob( hl.struct(GT=hl.call(0, 0), AD=[25, 0], DP=25, GQ=80, PL=[0, 80, 150]), hl.literal(True), hl.literal(1e-5), + False, hl.struct( is_de_novo=True, p_de_novo=0.999, # High confidence P(de novo) @@ -173,6 +177,18 @@ def test_calculate_de_novo_post_prob( fail_reason=hl.missing(hl.tset(hl.tstr)), ), ), + # Case 5: Multi-allelic variant (should raise an error) + ( + hl.locus("chr1", 40000, reference_genome="GRCh38"), + hl.literal(["C", "G", "A"]), + hl.struct(GT=hl.call(0, 1), AD=[5, 30, 5], DP=40, GQ=99, PL=[99, 0, 1]), + hl.struct(GT=hl.call(0, 0), AD=[20, 0], DP=20, GQ=60, PL=[0, 60, 120]), + hl.struct(GT=hl.call(0, 0), AD=[25, 0], DP=25, GQ=80, PL=[0, 80, 150]), + hl.literal(True), + hl.literal(1e-5), + True, + None, + ), ], ) def test_default_get_de_novo_expr( @@ -184,25 +200,41 @@ def test_default_get_de_novo_expr( mother_expr, is_xx_expr, freq_prior_expr, - expected, + expected_exception, + expected_result, ): - """Test different scenarios of `default_get_de_novo_expr` in one function.""" - result_expr = default_get_de_novo_expr( - locus, - alleles, - 
proband_expr, - father_expr, - mother_expr, - is_xx_expr, - freq_prior_expr, - ) - - result = hl.eval(result_expr) - expected_result = hl.eval(expected) + """Test different scenarios of `default_get_de_novo_expr`.""" + if expected_exception: + with pytest.raises( + hl.utils.HailUserError, + match="Must split multiallelic variants prior to running this function.", + ): + result_expr = default_get_de_novo_expr( + locus, + alleles, + proband_expr, + father_expr, + mother_expr, + is_xx_expr, + freq_prior_expr, + ) + hl.eval(result_expr) + else: + result_expr = default_get_de_novo_expr( + locus, + alleles, + proband_expr, + father_expr, + mother_expr, + is_xx_expr, + freq_prior_expr, + ) + result = hl.eval(result_expr) + expected_result = hl.eval(expected_result) - assert result.is_de_novo == expected_result.is_de_novo - assert ( - None if result.p_de_novo is None else round(result.p_de_novo, 3) - ) == expected_result.p_de_novo - assert result.confidence == expected_result.confidence - assert result.fail_reason == expected_result.fail_reason + assert result.is_de_novo == expected_result.is_de_novo + assert ( + None if result.p_de_novo is None else round(result.p_de_novo, 3) + ) == expected_result.p_de_novo + assert result.confidence == expected_result.confidence + assert result.fail_reason == expected_result.fail_reason From cf41a677814950bd951ac4e09cbbc688274cb1f1 Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Mon, 10 Feb 2025 13:52:33 -0500 Subject: [PATCH 46/56] Minor number change --- tests/sample_qc/test_de_novo.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/sample_qc/test_de_novo.py b/tests/sample_qc/test_de_novo.py index adc5c46cb..fd94cca46 100644 --- a/tests/sample_qc/test_de_novo.py +++ b/tests/sample_qc/test_de_novo.py @@ -182,8 +182,12 @@ def test_calculate_de_novo_post_prob( hl.locus("chr1", 40000, reference_genome="GRCh38"), hl.literal(["C", "G", "A"]), hl.struct(GT=hl.call(0, 
1), AD=[5, 30, 5], DP=40, GQ=99, PL=[99, 0, 1]), - hl.struct(GT=hl.call(0, 0), AD=[20, 0], DP=20, GQ=60, PL=[0, 60, 120]), - hl.struct(GT=hl.call(0, 0), AD=[25, 0], DP=25, GQ=80, PL=[0, 80, 150]), + hl.struct( + GT=hl.call(0, 0), AD=[20, 0, 5], DP=25, GQ=60, PL=[0, 60, 120] + ), + hl.struct( + GT=hl.call(0, 0), AD=[25, 0, 5], DP=30, GQ=80, PL=[0, 80, 150] + ), hl.literal(True), hl.literal(1e-5), True, From 048b173e5190d283a917aa9b003788f77f9b5020 Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Mon, 10 Feb 2025 16:29:54 -0500 Subject: [PATCH 47/56] Apply suggestions from code review Co-authored-by: Katherine Chao --- gnomad/sample_qc/relatedness.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/gnomad/sample_qc/relatedness.py b/gnomad/sample_qc/relatedness.py index 3d61eb93d..f4ac8393b 100644 --- a/gnomad/sample_qc/relatedness.py +++ b/gnomad/sample_qc/relatedness.py @@ -1326,7 +1326,7 @@ def calculate_de_novo_post_prob( probabilities for hemizygous genotypes in XY individuals. To address this, we provide the full set of equations in this docstring. - The posterior probability of an even being truly *de novo* vs. the probability it was a missed heterozygote call in one of the two parents is: + The posterior probability of an event being truly *de novo* vs. the probability it was a missed heterozygote call in one of the two parents is: .. math:: @@ -1334,7 +1334,7 @@ def calculate_de_novo_post_prob( The terms are defined as follows: - - :math:`P(DN \mid \text{data})` is the probability that the variant is **de novo**, given the observed genotype data. + - :math:`P(DN \mid \text{data})` is the probability that the variant is *de novo*, given the observed genotype data. - :math:`P(\text{missed het in parent(s)} \mid \text{data})` is the probability that the heterozygous variant was **missed in at least one parent**. 
@@ -1356,7 +1356,7 @@ def calculate_de_novo_post_prob( Probability of a observing a *de novo* mutation given the data specifically for hemizygous calls in XY individuals - Note that hemizygous calls in XY individuals will be reported as homozygous alternate without any sex ploidy adjustments, which is why the formulas below use `P(hom_alt in proband)` + Note that hemizygous calls in XY individuals will be reported as homozygous alternate without any sex ploidy adjustments, which is why the formulas below use `P(hom_alt in proband)` - **X non-PAR regions (XY only)**: @@ -1413,7 +1413,7 @@ def calculate_de_novo_post_prob( :param freq_prior_expr: Population frequency prior for the variant. :param min_pop_prior: Minimum population frequency prior (default: :math:`\text{100/3e7}`). :param de_novo_prior: Prior probability of a *de novo* mutation (default: :math:`\text{1/3e7}`). - :return: Posterior probability of a de novo mutation (`P_dn`). + :return: Posterior probability of a *de novo* mutation (`P_dn`). """ def _get_freq_prior(freq_prior: hl.expr.Float64Expression, min_prior=100 / 3e7): @@ -1570,11 +1570,14 @@ def default_get_de_novo_expr( .. note:: The “LOW” confidence category differs slightly from the criteria in the - original code (P(*de novo*) > 0.05 and AB(proband > 0.2 ), as it is + original code (P(*de novo*) > 0.05 and AB(proband > 0.2), as it is designed to fill the gap for variants that do not meet the FAIL criteria but would otherwise remain unclassified. - The simplified version is the same as Hail's methods when using the + The *de novo* confidence is calculated as a simplified version of the one previously + described in Kaitlin Samocha's [*de novo* caller](https://github.com/ksamocha/de_novo_scripts) and + Hail's [*de_novo*](https://hail.is/docs/0.2/methods/genetics.html#hail.methods.de_novo) + method. This simplified version is the same as Hail's methods when using the `ignore_in_sample_allele_frequency` parameter. 
The main difference is that this mode should be used when families larger than a single trio are in the dataset, in which an allele might be *de novo* in a parent and transmitted to a @@ -1588,7 +1591,7 @@ def default_get_de_novo_expr( This method assumes that the PL and AD fields are present in the genotype fields of the child and parents. If they are missing, this method will not work. - gnomAD v3 and v4 VDS have the PL and AD fields intentionally removed to + Many of our larger datasets have the PL and AD fields intentionally removed to save storage space. If this is the reason that the PL and AD fields are missing, the only way to use this method is to set them to their approximate values: From 13d46571934bc3b2097df73bb06e0aeaebcdd300 Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Mon, 10 Feb 2025 16:41:10 -0500 Subject: [PATCH 48/56] Apply suggestions from code review Co-authored-by: Katherine Chao --- gnomad/utils/annotations.py | 2 +- tests/sample_qc/test_de_novo.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/gnomad/utils/annotations.py b/gnomad/utils/annotations.py index 6def6292d..6c4c8a0f3 100644 --- a/gnomad/utils/annotations.py +++ b/gnomad/utils/annotations.py @@ -2794,7 +2794,7 @@ def get_copy_state_by_sex( :param locus_expr: LocusExpression of the variant. :param is_xx_expr: BooleanExpression indicating whether the sample has an XX sex karyotype. - :return: A tuple of BooleanExpressions: + :return: Tuple of BooleanExpressions: - diploid_expr: True if the variant is in autosomes or PAR regions, or in the X non-PAR region for XX individuals. - hemi_x_expr: True if the variant is in the X non-PAR region for XY individuals. - hemi_y_expr: True if the variant is in the Y non-PAR region for XY individuals. 
diff --git a/tests/sample_qc/test_de_novo.py b/tests/sample_qc/test_de_novo.py index fd94cca46..ad0065b0f 100644 --- a/tests/sample_qc/test_de_novo.py +++ b/tests/sample_qc/test_de_novo.py @@ -172,7 +172,7 @@ def test_calculate_de_novo_post_prob( False, hl.struct( is_de_novo=True, - p_de_novo=0.999, # High confidence P(de novo) + p_de_novo=0.999, confidence="HIGH", fail_reason=hl.missing(hl.tset(hl.tstr)), ), From 34240d88836bd29719ab777bd538a13915e00b68 Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Tue, 11 Feb 2025 14:16:21 -0500 Subject: [PATCH 49/56] Make a fixture for pytest --- gnomad/sample_qc/relatedness.py | 10 +- tests/sample_qc/test_de_novo.py | 528 +++++++++++++++++++------------- 2 files changed, 316 insertions(+), 222 deletions(-) diff --git a/gnomad/sample_qc/relatedness.py b/gnomad/sample_qc/relatedness.py index f4ac8393b..8e9f2524e 100644 --- a/gnomad/sample_qc/relatedness.py +++ b/gnomad/sample_qc/relatedness.py @@ -1659,9 +1659,8 @@ def default_get_de_novo_expr( ) dp_ratio = proband_expr.DP / parent_dp - # Calculate proband AB and assign variant type + # Calculate proband AB proband_ab = proband_expr.AD[1] / hl.sum(proband_expr.AD) - is_snp = hl.is_snp(alleles_expr[0], alleles_expr[1]) is_de_novo = ( diploid_expr @@ -1685,7 +1684,7 @@ def default_get_de_novo_expr( & (dp_ratio > high_conf_dp_ratio) ) | ( - is_snp + hl.is_snp(alleles_expr[0], alleles_expr[1]) & (p_de_novo > med_conf_p) & (proband_ab > high_med_conf_ab) & (proband_expr.DP > dp_threshold_snp) @@ -1733,8 +1732,9 @@ def default_get_de_novo_expr( fail = hl.any(list(fail_checks_expr.values())) result_expr = hl.struct( is_de_novo=is_de_novo, - p_de_novo=hl.if_else(~is_de_novo | fail, hl.missing(hl.tfloat64), p_de_novo), - confidence=hl.if_else(~is_de_novo | fail, hl.missing(hl.tstr), confidence_expr), + p_de_novo=hl.or_missing(is_de_novo & ~fail, + p_de_novo), + confidence=hl.or_missing(is_de_novo & ~fail, confidence_expr), 
fail_reason=hl.or_missing( is_de_novo & fail, add_filters_expr(filters=fail_checks_expr), diff --git a/tests/sample_qc/test_de_novo.py b/tests/sample_qc/test_de_novo.py index ad0065b0f..aaecb2a88 100644 --- a/tests/sample_qc/test_de_novo.py +++ b/tests/sample_qc/test_de_novo.py @@ -8,237 +8,331 @@ default_get_de_novo_expr, ) +from gnomad.utils.annotations import get_copy_state_by_sex + +# I want to get a table with all the following cases: +# 1. autosomal locus with HIGH P +# 2. autosomal locus with medium P +# 3. autosomal locus with low P +# 4. autosomal locus with one FAIL condition +# 5. autosomal locus with multiple FAIL conditions +# 6. hemi X locus for XY individual with a HIGH P +# 7. hemi Y locus for XY individual with a HIGH P +# 8. autosomal locus that is not de novo +# 9. autosomal locus with PLs all [0,0,0] and no freq prior +# 10. autosomal locus with missing PLs +# 11. autosomal locus with a multi-allelic site +# 12. autosomal locus with frequency prior out of range + class TestDeNovoMutation: """Test suite for de novo mutation functions.""" - @pytest.mark.parametrize( - "proband_pl, father_pl, mother_pl, diploid, hemi_x, hemi_y, freq_prior, min_pop_prior, expected", - [ - # Valid test cases (should return expected numeric values) - ( - [73, 0, 161], - [0, 99, 198], - [0, 99, 198], - True, - False, - False, - 1.80e-02, - 100 / 3e7, - 0.999, - ), - ( - [152, 0, 283], - [0, 99, 198], - [0, 63, 126], - True, - False, - False, - 7.55e-02, - 100 / 3e7, - 0.198, - ), - # Invalid `freq_prior` case (should raise `HailUserError`) - ( - [99, 50, 0], - [0, 99, 198], - [0, 99, 198], - False, - True, - False, - 1.2, - 100 / 3e7, - None, - ), - ], - ) - def test_calculate_de_novo_post_prob( - self, - proband_pl, - father_pl, - mother_pl, - diploid, - hemi_x, - hemi_y, - freq_prior, - min_pop_prior, - expected, - ): - """Test `calculate_de_novo_post_prob` function.""" - # Case where we expect an error (`freq_prior` is out of range) - if expected is None: - with 
pytest.raises( - hl.utils.HailUserError, - match=r"de_novo: expect 0 <= freq_prior_expr <= 1, found .*", - ): - hl.eval( - calculate_de_novo_post_prob( - hl.literal(proband_pl), - hl.literal(father_pl), - hl.literal(mother_pl), - hl.literal(diploid), - hl.literal(hemi_x), - hl.literal(hemi_y), - hl.literal(freq_prior), - min_pop_prior, - ) - ) - else: - # Case where we expect a valid float result - p_dn_expr = calculate_de_novo_post_prob( - hl.literal(proband_pl), - hl.literal(father_pl), - hl.literal(mother_pl), - hl.literal(diploid), - hl.literal(hemi_x), - hl.literal(hemi_y), - hl.literal(freq_prior), - min_pop_prior, - ) - - # Assert with floating-point tolerance - assert round(hl.eval(p_dn_expr), 3) == expected - - @pytest.mark.parametrize( - "locus, alleles, proband_expr, father_expr, mother_expr, is_xx_expr, freq_prior_expr, expected_exception, expected_result", - [ - # Case 1: Multiple fail conditions - ( - hl.locus("chr1", 10000, reference_genome="GRCh38"), - hl.literal(["A", "C"]), - hl.struct(GT=hl.call(0, 1), AD=[9, 2], DP=11, GQ=2, PL=[2, 0, 230]), - hl.struct(GT=hl.call(0, 0), AD=[0, 0], DP=0, GQ=0, PL=[0, 0, 0]), - hl.struct(GT=hl.call(0, 0), AD=[0, 0], DP=0, GQ=0, PL=[0, 0, 0]), - hl.literal(True), - hl.literal(1e-5), - False, - hl.struct( - is_de_novo=True, - p_de_novo=hl.missing(hl.tfloat64), - confidence=hl.missing(hl.tstr), - fail_reason=hl.set( - { - "min_de_novo_p", + @pytest.fixture + def ht_de_novo_test_cases(self) -> hl.Table: + """Fixture to create a Hail Table with different de novo mutation test cases.""" + data = [ + # 1. 
Autosomal locus with HIGH confidence + { + "locus": hl.locus("chr1", 10000, reference_genome="GRCh38"), + "alleles": ["A", "C"], + "proband": hl.struct(GT=hl.call(0, 1), AD=[5, 30], DP=35, GQ=99, + PL=[99, 0, 1]), + "father": hl.struct(GT=hl.call(0, 0), AD=[20, 0], DP=20, GQ=60, + PL=[0, 60, 120]), + "mother": hl.struct(GT=hl.call(0, 0), AD=[25, 0], DP=25, GQ=80, + PL=[0, 80, 150]), + "is_xx": True, + "freq_prior": 1e-5, + "expected_error": False, + "expected_copy_state": (True, False, False), + "expected_p_de_novo": 0.999, + "expected_de_novo_expr": hl.struct(is_de_novo=True, p_de_novo=0.999, + confidence="HIGH", + fail_reason=hl.missing(hl.tset(hl.tstr))), + }, + # 2. Autosomal locus with MEDIUM confidence + { + "locus": hl.locus("chr1", 11000, reference_genome="GRCh38"), + "alleles": ["A", "T"], + "proband": hl.struct(GT=hl.call(0, 1), AD=[59,61], DP=120, GQ=37, + PL=[542,0,1940]), + "father": hl.struct(GT=hl.call(0, 0), AD=[32,0], DP=32, GQ=60, + PL=[0,60,120]), + "mother": hl.struct(GT=hl.call(0, 0), AD=[32,0], DP=37, GQ=60, + PL=[0,60,120]), + "is_xx": False, + "freq_prior": 2.62e-03, + "expected_error": False, + "expected_copy_state": (True, False, False), + "expected_p_de_novo": 0.615, + "expected_de_novo_expr": hl.struct(is_de_novo=True, p_de_novo=0.615, + confidence="MEDIUM", + fail_reason=hl.missing(hl.tset(hl.tstr))), + }, + # 3. 
Autosomal locus with LOW confidence + { + "locus": hl.locus("chr1", 12000, reference_genome="GRCh38"), + "alleles": ["G", "T"], + "proband": hl.struct(GT=hl.call(0, 1), AD=[7,2], DP=18, GQ=43, + PL=[43,0,387]), + "father": hl.struct(GT=hl.call(0, 0), AD=[25,0], DP=25, GQ=40, + PL=[0, 40, 80]), + "mother": hl.struct(GT=hl.call(0, 0), AD=[23,0], DP=23, GQ=40, + PL=[0,40,80]), + "is_xx": True, + "freq_prior": 0, + "expected_error": False, + "expected_copy_state": (True, False, False), + "expected_p_de_novo": 0.926, + "expected_de_novo_expr": hl.struct(is_de_novo=True, p_de_novo=0.926, + confidence="LOW", + fail_reason=hl.missing( + hl.tset(hl.tstr))), + }, + # 4. Autosomal locus with one FAIL condition + { + "locus": hl.locus("chr1", 13000, reference_genome="GRCh38"), + "alleles": ["C", "G"], + "proband": hl.struct(GT=hl.call(0, 1), AD=[20, 5], DP=10, GQ=50, PL=[10, 0, 100]), + "father": hl.struct(GT=hl.call(0, 0), AD=[10, 0], DP=100, GQ=99, PL=[0, 99, 198]), + "mother": hl.struct(GT=hl.call(0, 0), AD=[10, 0], DP=100, GQ=99, PL=[0, 99, 198]), + "is_xx": True, + "freq_prior": 1e-5, + "expected_error": False, + "expected_copy_state": (True, False, False), + "expected_p_de_novo": 1, + "expected_de_novo_expr": hl.struct(is_de_novo=True, + p_de_novo=hl.missing(hl.tfloat64), + confidence=hl.missing(hl.tstr), + fail_reason={"min_dp_ratio"}), + }, + # 5. 
Autosomal locus with multiple FAIL conditions + { + "locus": hl.locus("chr1", 14000, reference_genome="GRCh38"), + "alleles": ["A", "G"], + "proband": hl.struct(GT=hl.call(0, 1), AD=[9, 2], DP=11, GQ=2, PL=[2, 0, 230]), + "father": hl.struct(GT=hl.call(0, 0), AD=[0, 0], DP=0, GQ=0, PL=[0, 0, 0]), + "mother": hl.struct(GT=hl.call(0, 0), AD=[0, 0], DP=0, GQ=0, PL=[0, 0, 0]), + "is_xx": True, + "freq_prior": 1e-5, + "expected_error": False, + "expected_copy_state": (True, False, False), + "expected_p_de_novo": 0, + "expected_de_novo_expr": hl.struct(is_de_novo=True, + p_de_novo=hl.missing(hl.tfloat64), + confidence=hl.missing(hl.tstr), + fail_reason={"min_de_novo_p", "min_proband_ab", "min_proband_gq", - "parent_sum_ad_0", - } - ), - ), - ), - # Case 2: One fail condition (low DP ratio) - ( - hl.locus("chr1", 20000, reference_genome="GRCh38"), - hl.literal(["A", "T"]), - hl.struct(GT=hl.call(0, 1), AD=[20, 5], DP=10, GQ=50, PL=[10, 0, 100]), - hl.struct(GT=hl.call(0, 0), AD=[10, 0], DP=100, GQ=99, PL=[0, 99, 198]), - hl.struct(GT=hl.call(0, 0), AD=[10, 0], DP=100, GQ=99, PL=[0, 99, 198]), - hl.literal(False), - hl.literal(1e-5), - False, - hl.struct( - is_de_novo=True, - p_de_novo=hl.missing(hl.tfloat64), - confidence=hl.missing(hl.tstr), - fail_reason=hl.set({"min_dp_ratio"}), - ), - ), - # Case 3: Variant is inherited (not de novo) - ( - hl.locus("chr1", 30000, reference_genome="GRCh38"), - hl.literal(["G", "T"]), - hl.struct(GT=hl.call(0, 1), AD=[15, 10], DP=30, GQ=50, PL=[10, 0, 100]), - hl.struct(GT=hl.call(0, 1), AD=[10, 5], DP=20, GQ=40, PL=[0, 20, 80]), - hl.struct(GT=hl.call(0, 0), AD=[20, 0], DP=20, GQ=50, PL=[0, 99, 198]), - hl.literal(True), - hl.literal(1e-5), - False, - hl.struct( + "parent_sum_ad_0"}), + }, + # 6. 
Hemi X locus for XY individual with HIGH confidence + { + "locus": hl.locus("chrX", 8400000, reference_genome="GRCh38"), + "alleles": ["A", "G"], + "proband": hl.struct(GT=hl.call(1, 1), AD=[0,14], DP=14, GQ=42, + PL=[419,42,0]), + "father": hl.struct(GT=hl.call(0, 0), AD=[38, 0], DP=38, GQ=40, + PL=[0,40,80]), + "mother": hl.struct(GT=hl.call(0, 0), AD=[97,0], DP=110, GQ=99, + PL=[0,99,198]), + "is_xx": False, + "freq_prior": 3.74e-02, + "expected_error": False, + "expected_copy_state": (False, True, False), + "expected_p_de_novo": 0.999, + "expected_de_novo_expr": hl.struct(is_de_novo=True, p_de_novo=0.999, + confidence="HIGH", + fail_reason=hl.missing( + hl.tset(hl.tstr))), + }, + # 7. Hemi Y locus for XY individual with HIGH confidence + { + "locus": hl.locus("chrY", 9900000, reference_genome="GRCh38"), + "alleles": ["A", "G"], + "proband": hl.struct(GT=hl.call(1, 1), AD=[0, 43], DP=43, GQ=99, + PL=[1363,129,0]), + "father": hl.struct(GT=hl.call(0, 0), AD=[28, 0], DP=28, GQ=40, + PL=[0, 40, 80]), + "mother": hl.struct(GT=hl.call(0, 0), AD=[0, 0], DP=0, GQ=0, + PL=[0, 0, 0]), + "is_xx": False, + "freq_prior": hl.missing(hl.tfloat64), + "expected_error": False, + "expected_copy_state": (False, False, True), + "expected_p_de_novo": 0.962, + "expected_de_novo_expr": hl.struct(is_de_novo=True, p_de_novo=0.962, + confidence="HIGH", + fail_reason=hl.missing( + hl.tset(hl.tstr))), + }, + # 8. 
Autosomal locus that is not de novo + { + "locus": hl.locus("chr1", 15000, reference_genome="GRCh38"), + "alleles": ["G", "T"], + "proband": hl.struct(GT=hl.call(0, 1), AD=[15, 10], DP=30, GQ=50, PL=[10, 0, 100]), + "father": hl.struct(GT=hl.call(0, 1), AD=[10, 5], DP=20, GQ=40, PL=[0, 20, 80]), + "mother": hl.struct(GT=hl.call(0, 0), AD=[20, 0], DP=20, GQ=50, PL=[0, 99, 198]), + "is_xx": False, + "freq_prior": 1e-5, + "expected_error": False, + "expected_copy_state": (True, False, False), + "expected_p_de_novo": 0.077, + "expected_de_novo_expr": hl.struct( is_de_novo=False, p_de_novo=hl.missing(hl.tfloat64), confidence=hl.missing(hl.tstr), fail_reason=hl.missing(hl.tset(hl.tstr)), ), - ), - # Case 4: A passing case (high confidence de novo) - ( - hl.locus("chr1", 40000, reference_genome="GRCh38"), - hl.literal(["C", "G"]), - hl.struct(GT=hl.call(0, 1), AD=[5, 30], DP=35, GQ=99, PL=[99, 0, 1]), - hl.struct(GT=hl.call(0, 0), AD=[20, 0], DP=20, GQ=60, PL=[0, 60, 120]), - hl.struct(GT=hl.call(0, 0), AD=[25, 0], DP=25, GQ=80, PL=[0, 80, 150]), - hl.literal(True), - hl.literal(1e-5), - False, - hl.struct( + }, + # 9. 
Autosomal locus with PLs all [0,0,0] and no freq prior + { + "locus": hl.locus("chr1", 16000, reference_genome="GRCh38"), + "alleles": ["G", "T"], + "proband": hl.struct(GT=hl.call(0, 1), AD=[0, 2], DP=2, GQ=0, + PL=[0, 0, 0]), + "father": hl.struct(GT=hl.call(0, 0), AD=[2, 0], DP=2, GQ=0, + PL=[0, 0, 0]), + "mother": hl.struct(GT=hl.call(0, 0), AD=[2, 0], DP=2, GQ=0, + PL=[0, 0, 0]), + "is_xx": False, + "freq_prior": hl.missing(hl.tfloat64), + "expected_error": False, + "expected_copy_state": (True, False, False), + "expected_p_de_novo": 0.001, + "expected_de_novo_expr": hl.struct( is_de_novo=True, - p_de_novo=0.999, - confidence="HIGH", - fail_reason=hl.missing(hl.tset(hl.tstr)), - ), - ), - # Case 5: Multi-allelic variant (should raise an error) - ( - hl.locus("chr1", 40000, reference_genome="GRCh38"), - hl.literal(["C", "G", "A"]), - hl.struct(GT=hl.call(0, 1), AD=[5, 30, 5], DP=40, GQ=99, PL=[99, 0, 1]), - hl.struct( - GT=hl.call(0, 0), AD=[20, 0, 5], DP=25, GQ=60, PL=[0, 60, 120] + p_de_novo=hl.missing(hl.tfloat64), + confidence=hl.missing(hl.tstr), + fail_reason={"min_de_novo_p", + "min_proband_gq"} ), - hl.struct( - GT=hl.call(0, 0), AD=[25, 0, 5], DP=30, GQ=80, PL=[0, 80, 150] + } + ] + + # Convert list to a Hail Table + ht = hl.Table.parallelize( + data, + schema=hl.tstruct( + locus=hl.tlocus("GRCh38"), + alleles=hl.tarray(hl.tstr), + proband=hl.tstruct(GT=hl.tcall, AD=hl.tarray(hl.tint32), DP=hl.tint32, + GQ=hl.tint32, PL=hl.tarray(hl.tint32)), + father=hl.tstruct(GT=hl.tcall, AD=hl.tarray(hl.tint32), DP=hl.tint32, + GQ=hl.tint32, PL=hl.tarray(hl.tint32)), + mother=hl.tstruct(GT=hl.tcall, AD=hl.tarray(hl.tint32), DP=hl.tint32, + GQ=hl.tint32, PL=hl.tarray(hl.tint32)), + is_xx=hl.tbool, + freq_prior=hl.tfloat64, + expected_error=hl.tbool, + expected_copy_state=hl.ttuple(hl.tbool, hl.tbool, hl.tbool), + expected_p_de_novo=hl.tfloat64, + expected_de_novo_expr=hl.tstruct( + is_de_novo=hl.tbool, + p_de_novo=hl.tfloat64, + confidence=hl.tstr, + 
fail_reason=hl.tset(hl.tstr), ), - hl.literal(True), - hl.literal(1e-5), - True, - None, ), - ], - ) - def test_default_get_de_novo_expr( - self, - locus, - alleles, - proband_expr, - father_expr, - mother_expr, - is_xx_expr, - freq_prior_expr, - expected_exception, - expected_result, - ): - """Test different scenarios of `default_get_de_novo_expr`.""" - if expected_exception: - with pytest.raises( - hl.utils.HailUserError, - match="Must split multiallelic variants prior to running this function.", - ): - result_expr = default_get_de_novo_expr( - locus, - alleles, - proband_expr, - father_expr, - mother_expr, - is_xx_expr, - freq_prior_expr, + ) + + return ht + + def test_get_copy_state_by_sex(self, ht_de_novo_test_cases): + """Test `get_copy_state_by_sex` function using a Hail Table.""" + # 🔹 Compute actual copy state using `get_copy_state_by_sex` + ht = ht_de_novo_test_cases.annotate( + computed_copy_state=get_copy_state_by_sex(ht_de_novo_test_cases.locus, + ht_de_novo_test_cases.is_xx) + ) + + # 🔹 Evaluate computed and expected values + computed_values = hl.eval(ht.computed_copy_state.collect()) + expected_values = hl.eval(ht.expected_copy_state.collect()) + + # 🔹 Compare expected vs. 
actual results + for i, (computed, expected) in enumerate(zip(computed_values, expected_values)): + assert computed == expected, f"Copy state mismatch at index {i}: expected {expected}, got {computed}" + + def test_calculate_de_novo_post_prob(self, ht_de_novo_test_cases): + """Test `calculate_de_novo_post_prob` function using a Hail Table.""" + # 🔹 Store computed values and handle expected errors in Hail + ht = ht_de_novo_test_cases.annotate( + computed_p_de_novo=hl.case() + .when( + ht_de_novo_test_cases.expected_error, + # 🔹 If error expected, return missing + hl.missing(hl.tfloat64), + ) + .default( + calculate_de_novo_post_prob( + ht_de_novo_test_cases.proband.PL, + ht_de_novo_test_cases.father.PL, + ht_de_novo_test_cases.mother.PL, + ht_de_novo_test_cases.expected_copy_state[0], + ht_de_novo_test_cases.expected_copy_state[1], + ht_de_novo_test_cases.expected_copy_state[2], + ht_de_novo_test_cases.freq_prior, + min_pop_prior=100 / 3e7, ) - hl.eval(result_expr) - else: - result_expr = default_get_de_novo_expr( - locus, - alleles, - proband_expr, - father_expr, - mother_expr, - is_xx_expr, - freq_prior_expr, ) - result = hl.eval(result_expr) - expected_result = hl.eval(expected_result) - - assert result.is_de_novo == expected_result.is_de_novo - assert ( - None if result.p_de_novo is None else round(result.p_de_novo, 3) - ) == expected_result.p_de_novo - assert result.confidence == expected_result.confidence - assert result.fail_reason == expected_result.fail_reason + ) + + # 🔹 Collect the table + ht.select("computed_p_de_novo", "expected_p_de_novo", + "expected_error").show(-1) + results = ht.select("computed_p_de_novo", "expected_p_de_novo", + "expected_error").collect() + + for row in results: + if row.expected_error: + # 🔹 If an error was expected, assert the result is missing + assert hl.is_missing(row.computed_p_de_novo) + else: + # 🔹 Otherwise, compare expected values + assert round(row.computed_p_de_novo, 3) == row.expected_p_de_novo + + def 
test_default_get_de_novo_expr(self, ht_de_novo_test_cases): + """Test different scenarios of `default_get_de_novo_expr` using a Hail Table.""" + # 🔹 Store computed values and handle expected errors + ht = ht_de_novo_test_cases.annotate( + computed_de_novo_expr=hl.case() + .when( + ht_de_novo_test_cases.expected_error, + hl.missing(ht_de_novo_test_cases.expected_de_novo_expr.dtype), + ) + .default( + default_get_de_novo_expr( + ht_de_novo_test_cases.locus, + ht_de_novo_test_cases.alleles, + ht_de_novo_test_cases.proband, + ht_de_novo_test_cases.father, + ht_de_novo_test_cases.mother, + ht_de_novo_test_cases.is_xx, + ht_de_novo_test_cases.freq_prior, + ) + ) + ) + + # 🔹 Round `p_de_novo` within the struct before evaluation + ht = ht.annotate( + computed_de_novo_expr=hl.struct( + is_de_novo=ht.computed_de_novo_expr.is_de_novo, + p_de_novo=hl.or_missing( + hl.is_defined(ht.computed_de_novo_expr.p_de_novo), + hl.float64( + hl.int32(ht.computed_de_novo_expr.p_de_novo * 1000)) / 1000, + ), + confidence=ht.computed_de_novo_expr.confidence, + fail_reason=ht.computed_de_novo_expr.fail_reason, + ) + ) + + # 🔹 Evaluate computed and expected values + computed_values = hl.eval(ht.computed_de_novo_expr.collect()) + expected_values = hl.eval(ht.expected_de_novo_expr.collect()) + + # 🔹 Compare expected vs. 
actual results + for i, (computed, expected) in enumerate(zip(computed_values, expected_values)): + assert computed == expected, f"Copy state mismatch at index {i}: expected {expected}, got {computed}" From 8b359718889d55072f054a44f58ea74c07d9e1ae Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Tue, 11 Feb 2025 15:43:38 -0500 Subject: [PATCH 50/56] Add more cases --- tests/sample_qc/test_de_novo.py | 650 +++++++++++++++++--------------- 1 file changed, 341 insertions(+), 309 deletions(-) diff --git a/tests/sample_qc/test_de_novo.py b/tests/sample_qc/test_de_novo.py index aaecb2a88..d1cf2eb4f 100644 --- a/tests/sample_qc/test_de_novo.py +++ b/tests/sample_qc/test_de_novo.py @@ -10,329 +10,361 @@ from gnomad.utils.annotations import get_copy_state_by_sex -# I want to get a table with all the following cases: -# 1. autosomal locus with HIGH P -# 2. autosomal locus with medium P -# 3. autosomal locus with low P -# 4. autosomal locus with one FAIL condition -# 5. autosomal locus with multiple FAIL conditions -# 6. hemi X locus for XY individual with a HIGH P -# 7. hemi Y locus for XY individual with a HIGH P -# 8. autosomal locus that is not de novo -# 9. autosomal locus with PLs all [0,0,0] and no freq prior -# 10. autosomal locus with missing PLs -# 11. autosomal locus with a multi-allelic site -# 12. 
autosomal locus with frequency prior out of range - - class TestDeNovoMutation: """Test suite for de novo mutation functions.""" - @pytest.fixture - def ht_de_novo_test_cases(self) -> hl.Table: - """Fixture to create a Hail Table with different de novo mutation test cases.""" - data = [ + @pytest.mark.parametrize( + "locus, is_xx, expected_diploid, expected_hemi_x, expected_hemi_y", + [ + (hl.locus("chr1", 100000, reference_genome="GRCh38"), True, True, False, False), + (hl.locus("chrX", 2781479, reference_genome="GRCh38"), False, True, False, False), + (hl.locus("chrX", 3000000, reference_genome="GRCh38"), False, False, True, False), + (hl.locus("chrY", 10000000, reference_genome="GRCh38"), False, False, False, True), + ], + ) + def test_get_copy_state_by_sex( + self, locus, is_xx, expected_diploid, expected_hemi_x, expected_hemi_y + ) -> None: + """Test copy state determination based on locus type and sex.""" + is_xx_expr = hl.literal(is_xx) + + diploid, hemi_x, hemi_y = get_copy_state_by_sex(locus, is_xx_expr) + result = hl.eval([diploid, hemi_x, hemi_y]) + + assert result == [ + expected_diploid, + expected_hemi_x, + expected_hemi_y, + ], f"Failed for locus={locus}, is_xx={is_xx}. 
Expected {[expected_diploid, expected_hemi_x, expected_hemi_y]}, got {result}" + + @pytest.mark.parametrize( + "proband_pl, father_pl, mother_pl, diploid, hemi_x, hemi_y, freq_prior, min_pop_prior, expected", + [ + # Valid test cases (should return expected numeric values) + ( + [73, 0, 161], + [0, 99, 198], + [0, 99, 198], + True, + False, + False, + 1.80e-02, + 100 / 3e7, + 0.999, + ), + ( + [152, 0, 283], + [0, 99, 198], + [0, 63, 126], + True, + False, + False, + 7.55e-02, + 100 / 3e7, + 0.198, + ), + # Invalid `freq_prior` case (should raise `HailUserError`) + ( + [99, 50, 0], + [0, 99, 198], + [0, 99, 198], + False, + True, + False, + 1.2, + 100 / 3e7, + None, + ), + ], + ) + def test_calculate_de_novo_post_prob( + self, + proband_pl, + father_pl, + mother_pl, + diploid, + hemi_x, + hemi_y, + freq_prior, + min_pop_prior, + expected, + ): + """Test `calculate_de_novo_post_prob` function.""" + # Case where we expect an error (`freq_prior` is out of range) + if expected is None: + with pytest.raises( + hl.utils.HailUserError, + match=r"de_novo: expect 0 <= freq_prior_expr <= 1, found .*", + ): + hl.eval( + calculate_de_novo_post_prob( + hl.literal(proband_pl), + hl.literal(father_pl), + hl.literal(mother_pl), + hl.literal(diploid), + hl.literal(hemi_x), + hl.literal(hemi_y), + hl.literal(freq_prior), + min_pop_prior, + ) + ) + else: + # Case where we expect a valid float result + p_dn_expr = calculate_de_novo_post_prob( + hl.literal(proband_pl), + hl.literal(father_pl), + hl.literal(mother_pl), + hl.literal(diploid), + hl.literal(hemi_x), + hl.literal(hemi_y), + hl.literal(freq_prior), + min_pop_prior, + ) + + # Assert with floating-point tolerance + assert round(hl.eval(p_dn_expr), 3) == expected + + @pytest.mark.parametrize( + "locus, alleles, proband_expr, father_expr, mother_expr, is_xx_expr, freq_prior_expr, expected_exception, expected_result", + [ # 1. 
Autosomal locus with HIGH confidence - { - "locus": hl.locus("chr1", 10000, reference_genome="GRCh38"), - "alleles": ["A", "C"], - "proband": hl.struct(GT=hl.call(0, 1), AD=[5, 30], DP=35, GQ=99, - PL=[99, 0, 1]), - "father": hl.struct(GT=hl.call(0, 0), AD=[20, 0], DP=20, GQ=60, - PL=[0, 60, 120]), - "mother": hl.struct(GT=hl.call(0, 0), AD=[25, 0], DP=25, GQ=80, - PL=[0, 80, 150]), - "is_xx": True, - "freq_prior": 1e-5, - "expected_error": False, - "expected_copy_state": (True, False, False), - "expected_p_de_novo": 0.999, - "expected_de_novo_expr": hl.struct(is_de_novo=True, p_de_novo=0.999, - confidence="HIGH", - fail_reason=hl.missing(hl.tset(hl.tstr))), - }, + ( + hl.locus("chr1", 10000, reference_genome="GRCh38"), + hl.literal(["A", "C"]), + hl.struct(GT=hl.call(0, 1), AD=[5, 30], DP=35, GQ=99, + PL=[99, 0, 1]), + hl.struct(GT=hl.call(0, 0), AD=[20, 0], DP=20, GQ=60, + PL=[0, 60, 120]), + hl.struct(GT=hl.call(0, 0), AD=[25, 0], DP=25, GQ=80, + PL=[0, 80, 150]), + hl.literal(True), + hl.literal(1e-5), + False, + hl.struct( + is_de_novo=True, + p_de_novo=0.999, + confidence="HIGH", + fail_reason=hl.missing(hl.tset(hl.tstr)), + ), + ), # 2. 
Autosomal locus with MEDIUM confidence - { - "locus": hl.locus("chr1", 11000, reference_genome="GRCh38"), - "alleles": ["A", "T"], - "proband": hl.struct(GT=hl.call(0, 1), AD=[59,61], DP=120, GQ=37, - PL=[542,0,1940]), - "father": hl.struct(GT=hl.call(0, 0), AD=[32,0], DP=32, GQ=60, - PL=[0,60,120]), - "mother": hl.struct(GT=hl.call(0, 0), AD=[32,0], DP=37, GQ=60, - PL=[0,60,120]), - "is_xx": False, - "freq_prior": 2.62e-03, - "expected_error": False, - "expected_copy_state": (True, False, False), - "expected_p_de_novo": 0.615, - "expected_de_novo_expr": hl.struct(is_de_novo=True, p_de_novo=0.615, - confidence="MEDIUM", - fail_reason=hl.missing(hl.tset(hl.tstr))), - }, + ( + hl.locus("chr1", 11000, reference_genome="GRCh38"), + hl.literal(["CT","C"]), + hl.struct(GT=hl.call(0, 1), AD=[59, 61], DP=120, GQ=99, + PL=[542,0,1940]), + hl.struct(GT=hl.call(0, 0), AD=[32, 0], DP=32, GQ=60, + PL=[0, 60, 120]), + hl.struct(GT=hl.call(0, 0), AD=[37, 0], DP=37, GQ=60, + PL=[0, 60, 120]), + hl.literal(False), + hl.literal(2.62e-03), + False, + hl.struct( + is_de_novo=True, + p_de_novo=0.615, + confidence="MEDIUM", + fail_reason=hl.missing(hl.tset(hl.tstr)), + ), + ), # 3. 
Autosomal locus with LOW confidence - { - "locus": hl.locus("chr1", 12000, reference_genome="GRCh38"), - "alleles": ["G", "T"], - "proband": hl.struct(GT=hl.call(0, 1), AD=[7,2], DP=18, GQ=43, - PL=[43,0,387]), - "father": hl.struct(GT=hl.call(0, 0), AD=[25,0], DP=25, GQ=40, - PL=[0, 40, 80]), - "mother": hl.struct(GT=hl.call(0, 0), AD=[23,0], DP=23, GQ=40, - PL=[0,40,80]), - "is_xx": True, - "freq_prior": 0, - "expected_error": False, - "expected_copy_state": (True, False, False), - "expected_p_de_novo": 0.926, - "expected_de_novo_expr": hl.struct(is_de_novo=True, p_de_novo=0.926, - confidence="LOW", - fail_reason=hl.missing( - hl.tset(hl.tstr))), - }, + ( + hl.locus("chr1", 12000, reference_genome="GRCh38"), + hl.literal(["G", "T"]), + hl.struct(GT=hl.call(0, 1), AD=[7, 2], DP=18, GQ=43, + PL=[43, 0, 387]), + hl.struct(GT=hl.call(0, 0), AD=[25, 0], DP=25, GQ=40, + PL=[0, 40, 80]), + hl.struct(GT=hl.call(0, 0), AD=[23, 0], DP=23, GQ=40, + PL=[0, 40, 80]), + hl.literal(True), + hl.literal(0), + False, + hl.struct( + is_de_novo=True, + p_de_novo=0.926, + confidence="LOW", + fail_reason=hl.missing(hl.tset(hl.tstr)), + ), + ), # 4. 
Autosomal locus with one FAIL condition - { - "locus": hl.locus("chr1", 13000, reference_genome="GRCh38"), - "alleles": ["C", "G"], - "proband": hl.struct(GT=hl.call(0, 1), AD=[20, 5], DP=10, GQ=50, PL=[10, 0, 100]), - "father": hl.struct(GT=hl.call(0, 0), AD=[10, 0], DP=100, GQ=99, PL=[0, 99, 198]), - "mother": hl.struct(GT=hl.call(0, 0), AD=[10, 0], DP=100, GQ=99, PL=[0, 99, 198]), - "is_xx": True, - "freq_prior": 1e-5, - "expected_error": False, - "expected_copy_state": (True, False, False), - "expected_p_de_novo": 1, - "expected_de_novo_expr": hl.struct(is_de_novo=True, - p_de_novo=hl.missing(hl.tfloat64), - confidence=hl.missing(hl.tstr), - fail_reason={"min_dp_ratio"}), - }, + ( + hl.locus("chr1", 13000, reference_genome="GRCh38"), + hl.literal(["C", "G"]), + hl.struct(GT=hl.call(0, 1), AD=[20, 5], DP=10, GQ=50, + PL=[10, 0, 100]), + hl.struct(GT=hl.call(0, 0), AD=[10, 0], DP=100, GQ=99, + PL=[0, 99, 198]), + hl.struct(GT=hl.call(0, 0), AD=[10, 0], DP=100, GQ=99, + PL=[0, 99, 198]), + hl.literal(True), + hl.literal(1e-5), + False, + hl.struct( + is_de_novo=True, + p_de_novo=hl.missing(hl.tfloat64), + confidence=hl.missing(hl.tstr), + fail_reason={"min_dp_ratio"}, + ), + ), # 5. 
Autosomal locus with multiple FAIL conditions - { - "locus": hl.locus("chr1", 14000, reference_genome="GRCh38"), - "alleles": ["A", "G"], - "proband": hl.struct(GT=hl.call(0, 1), AD=[9, 2], DP=11, GQ=2, PL=[2, 0, 230]), - "father": hl.struct(GT=hl.call(0, 0), AD=[0, 0], DP=0, GQ=0, PL=[0, 0, 0]), - "mother": hl.struct(GT=hl.call(0, 0), AD=[0, 0], DP=0, GQ=0, PL=[0, 0, 0]), - "is_xx": True, - "freq_prior": 1e-5, - "expected_error": False, - "expected_copy_state": (True, False, False), - "expected_p_de_novo": 0, - "expected_de_novo_expr": hl.struct(is_de_novo=True, - p_de_novo=hl.missing(hl.tfloat64), - confidence=hl.missing(hl.tstr), - fail_reason={"min_de_novo_p", - "min_proband_ab", - "min_proband_gq", - "parent_sum_ad_0"}), - }, + ( + hl.locus("chr1", 14000, reference_genome="GRCh38"), + hl.literal(["A", "G"]), + hl.struct(GT=hl.call(0, 1), AD=[9, 2], DP=11, GQ=2, PL=[2, 0, 230]), + hl.struct(GT=hl.call(0, 0), AD=[0, 0], DP=0, GQ=0, PL=[0, 0, 0]), + hl.struct(GT=hl.call(0, 0), AD=[0, 0], DP=0, GQ=0, PL=[0, 0, 0]), + hl.literal(True), + hl.literal(1e-5), + False, + hl.struct( + is_de_novo=True, + p_de_novo=hl.missing(hl.tfloat64), + confidence=hl.missing(hl.tstr), + fail_reason={"min_de_novo_p", "min_proband_ab", + "min_proband_gq", "parent_sum_ad_0"}, + ), + ), # 6. 
Hemi X locus for XY individual with HIGH confidence - { - "locus": hl.locus("chrX", 8400000, reference_genome="GRCh38"), - "alleles": ["A", "G"], - "proband": hl.struct(GT=hl.call(1, 1), AD=[0,14], DP=14, GQ=42, - PL=[419,42,0]), - "father": hl.struct(GT=hl.call(0, 0), AD=[38, 0], DP=38, GQ=40, - PL=[0,40,80]), - "mother": hl.struct(GT=hl.call(0, 0), AD=[97,0], DP=110, GQ=99, - PL=[0,99,198]), - "is_xx": False, - "freq_prior": 3.74e-02, - "expected_error": False, - "expected_copy_state": (False, True, False), - "expected_p_de_novo": 0.999, - "expected_de_novo_expr": hl.struct(is_de_novo=True, p_de_novo=0.999, - confidence="HIGH", - fail_reason=hl.missing( - hl.tset(hl.tstr))), - }, + ( + hl.locus("chrX", 8400000, reference_genome="GRCh38"), + hl.literal(["A", "G"]), + hl.struct(GT=hl.call(1, 1), AD=[0, 14], DP=14, GQ=42, + PL=[419, 42, 0]), + hl.struct(GT=hl.call(0, 0), AD=[38, 0], DP=38, GQ=40, + PL=[0, 40, 80]), + hl.struct(GT=hl.call(0, 0), AD=[97, 0], DP=110, GQ=99, + PL=[0, 99, 198]), + hl.literal(False), + hl.literal(3.74e-02), + False, + hl.struct( + is_de_novo=True, + p_de_novo=0.999, + confidence="HIGH", + fail_reason=hl.missing(hl.tset(hl.tstr)), + ), + ), # 7. 
Hemi Y locus for XY individual with HIGH confidence - { - "locus": hl.locus("chrY", 9900000, reference_genome="GRCh38"), - "alleles": ["A", "G"], - "proband": hl.struct(GT=hl.call(1, 1), AD=[0, 43], DP=43, GQ=99, - PL=[1363,129,0]), - "father": hl.struct(GT=hl.call(0, 0), AD=[28, 0], DP=28, GQ=40, - PL=[0, 40, 80]), - "mother": hl.struct(GT=hl.call(0, 0), AD=[0, 0], DP=0, GQ=0, - PL=[0, 0, 0]), - "is_xx": False, - "freq_prior": hl.missing(hl.tfloat64), - "expected_error": False, - "expected_copy_state": (False, False, True), - "expected_p_de_novo": 0.962, - "expected_de_novo_expr": hl.struct(is_de_novo=True, p_de_novo=0.962, - confidence="HIGH", - fail_reason=hl.missing( - hl.tset(hl.tstr))), - }, + ( + hl.locus("chrY", 9900000, reference_genome="GRCh38"), + hl.literal(["A", "G"]), + hl.struct(GT=hl.call(1, 1), AD=[0, 43], DP=43, GQ=99, + PL=[1363, 129, 0]), + hl.struct(GT=hl.call(0, 0), AD=[28, 0], DP=28, GQ=40, + PL=[0, 40, 80]), + hl.struct(GT=hl.call(0, 0), AD=[0, 0], DP=0, GQ=0, PL=[0, 0, 0]), + hl.literal(False), + hl.missing(hl.tfloat64), + False, + hl.struct( + is_de_novo=True, + p_de_novo=0.962, + confidence="HIGH", + fail_reason=hl.missing(hl.tset(hl.tstr)), + ), + ), # 8. 
Autosomal locus that is not de novo - { - "locus": hl.locus("chr1", 15000, reference_genome="GRCh38"), - "alleles": ["G", "T"], - "proband": hl.struct(GT=hl.call(0, 1), AD=[15, 10], DP=30, GQ=50, PL=[10, 0, 100]), - "father": hl.struct(GT=hl.call(0, 1), AD=[10, 5], DP=20, GQ=40, PL=[0, 20, 80]), - "mother": hl.struct(GT=hl.call(0, 0), AD=[20, 0], DP=20, GQ=50, PL=[0, 99, 198]), - "is_xx": False, - "freq_prior": 1e-5, - "expected_error": False, - "expected_copy_state": (True, False, False), - "expected_p_de_novo": 0.077, - "expected_de_novo_expr": hl.struct( - is_de_novo=False, - p_de_novo=hl.missing(hl.tfloat64), - confidence=hl.missing(hl.tstr), - fail_reason=hl.missing(hl.tset(hl.tstr)), - ), - }, + ( + hl.locus("chr1", 15000, reference_genome="GRCh38"), + hl.literal(["G", "T"]), + hl.struct(GT=hl.call(0, 1), AD=[15, 10], DP=30, GQ=50, + PL=[10, 0, 100]), + hl.struct(GT=hl.call(0, 1), AD=[10, 5], DP=20, GQ=40, + PL=[0, 20, 80]), + hl.struct(GT=hl.call(0, 0), AD=[20, 0], DP=20, GQ=50, + PL=[0, 99, 198]), + hl.literal(False), + hl.literal(1e-5), + False, + hl.struct( + is_de_novo=False, + p_de_novo=hl.missing(hl.tfloat64), + confidence=hl.missing(hl.tstr), + fail_reason=hl.missing(hl.tset(hl.tstr)), + ), + ), # 9. 
Autosomal locus with PLs all [0,0,0] and no freq prior - { - "locus": hl.locus("chr1", 16000, reference_genome="GRCh38"), - "alleles": ["G", "T"], - "proband": hl.struct(GT=hl.call(0, 1), AD=[0, 2], DP=2, GQ=0, - PL=[0, 0, 0]), - "father": hl.struct(GT=hl.call(0, 0), AD=[2, 0], DP=2, GQ=0, - PL=[0, 0, 0]), - "mother": hl.struct(GT=hl.call(0, 0), AD=[2, 0], DP=2, GQ=0, - PL=[0, 0, 0]), - "is_xx": False, - "freq_prior": hl.missing(hl.tfloat64), - "expected_error": False, - "expected_copy_state": (True, False, False), - "expected_p_de_novo": 0.001, - "expected_de_novo_expr": hl.struct( - is_de_novo=True, - p_de_novo=hl.missing(hl.tfloat64), - confidence=hl.missing(hl.tstr), - fail_reason={"min_de_novo_p", - "min_proband_gq"} - ), - } - ] - - # Convert list to a Hail Table - ht = hl.Table.parallelize( - data, - schema=hl.tstruct( - locus=hl.tlocus("GRCh38"), - alleles=hl.tarray(hl.tstr), - proband=hl.tstruct(GT=hl.tcall, AD=hl.tarray(hl.tint32), DP=hl.tint32, - GQ=hl.tint32, PL=hl.tarray(hl.tint32)), - father=hl.tstruct(GT=hl.tcall, AD=hl.tarray(hl.tint32), DP=hl.tint32, - GQ=hl.tint32, PL=hl.tarray(hl.tint32)), - mother=hl.tstruct(GT=hl.tcall, AD=hl.tarray(hl.tint32), DP=hl.tint32, - GQ=hl.tint32, PL=hl.tarray(hl.tint32)), - is_xx=hl.tbool, - freq_prior=hl.tfloat64, - expected_error=hl.tbool, - expected_copy_state=hl.ttuple(hl.tbool, hl.tbool, hl.tbool), - expected_p_de_novo=hl.tfloat64, - expected_de_novo_expr=hl.tstruct( - is_de_novo=hl.tbool, - p_de_novo=hl.tfloat64, - confidence=hl.tstr, - fail_reason=hl.tset(hl.tstr), - ), + ( + hl.locus("chr1", 16000, reference_genome="GRCh38"), + hl.literal(["G", "T"]), + hl.struct(GT=hl.call(0, 1), AD=[0, 2], DP=2, GQ=0, PL=[0, 0, 0]), + hl.struct(GT=hl.call(0, 0), AD=[2, 0], DP=2, GQ=0, PL=[0, 0, 0]), + hl.struct(GT=hl.call(0, 0), AD=[2, 0], DP=2, GQ=0, PL=[0, 0, 0]), + hl.literal(False), + hl.missing(hl.tfloat64), + False, + hl.struct( + is_de_novo=True, + p_de_novo=hl.missing(hl.tfloat64), + confidence=hl.missing(hl.tstr), 
+ fail_reason={"min_de_novo_p", "min_proband_gq"}, + ), ), - ) - - return ht - - def test_get_copy_state_by_sex(self, ht_de_novo_test_cases): - """Test `get_copy_state_by_sex` function using a Hail Table.""" - # 🔹 Compute actual copy state using `get_copy_state_by_sex` - ht = ht_de_novo_test_cases.annotate( - computed_copy_state=get_copy_state_by_sex(ht_de_novo_test_cases.locus, - ht_de_novo_test_cases.is_xx) - ) - - # 🔹 Evaluate computed and expected values - computed_values = hl.eval(ht.computed_copy_state.collect()) - expected_values = hl.eval(ht.expected_copy_state.collect()) - - # 🔹 Compare expected vs. actual results - for i, (computed, expected) in enumerate(zip(computed_values, expected_values)): - assert computed == expected, f"Copy state mismatch at index {i}: expected {expected}, got {computed}" - - def test_calculate_de_novo_post_prob(self, ht_de_novo_test_cases): - """Test `calculate_de_novo_post_prob` function using a Hail Table.""" - # 🔹 Store computed values and handle expected errors in Hail - ht = ht_de_novo_test_cases.annotate( - computed_p_de_novo=hl.case() - .when( - ht_de_novo_test_cases.expected_error, - # 🔹 If error expected, return missing - hl.missing(hl.tfloat64), - ) - .default( - calculate_de_novo_post_prob( - ht_de_novo_test_cases.proband.PL, - ht_de_novo_test_cases.father.PL, - ht_de_novo_test_cases.mother.PL, - ht_de_novo_test_cases.expected_copy_state[0], - ht_de_novo_test_cases.expected_copy_state[1], - ht_de_novo_test_cases.expected_copy_state[2], - ht_de_novo_test_cases.freq_prior, - min_pop_prior=100 / 3e7, - ) - ) - ) - - # 🔹 Collect the table - ht.select("computed_p_de_novo", "expected_p_de_novo", - "expected_error").show(-1) - results = ht.select("computed_p_de_novo", "expected_p_de_novo", - "expected_error").collect() - - for row in results: - if row.expected_error: - # 🔹 If an error was expected, assert the result is missing - assert hl.is_missing(row.computed_p_de_novo) - else: - # 🔹 Otherwise, compare expected values - 
assert round(row.computed_p_de_novo, 3) == row.expected_p_de_novo + # 10. Autosomal locus with multi-allelic + ( + hl.locus("chr1", 40000, reference_genome="GRCh38"), + hl.literal(["C", "G", "A"]), + hl.struct(GT=hl.call(0, 1), AD=[5, 30, 5], DP=40, GQ=99, + PL=[99, 0, 1]), + hl.struct( + GT=hl.call(0, 0), AD=[20, 0, 5], DP=25, GQ=60, PL=[0, 60, 120] + ), + hl.struct( + GT=hl.call(0, 0), AD=[25, 0, 5], DP=30, GQ=80, PL=[0, 80, 150] + ), + hl.literal(True), + hl.literal(1e-5), + True, + None, + ), + ], + ) - def test_default_get_de_novo_expr(self, ht_de_novo_test_cases): - """Test different scenarios of `default_get_de_novo_expr` using a Hail Table.""" - # 🔹 Store computed values and handle expected errors - ht = ht_de_novo_test_cases.annotate( - computed_de_novo_expr=hl.case() - .when( - ht_de_novo_test_cases.expected_error, - hl.missing(ht_de_novo_test_cases.expected_de_novo_expr.dtype), - ) - .default( - default_get_de_novo_expr( - ht_de_novo_test_cases.locus, - ht_de_novo_test_cases.alleles, - ht_de_novo_test_cases.proband, - ht_de_novo_test_cases.father, - ht_de_novo_test_cases.mother, - ht_de_novo_test_cases.is_xx, - ht_de_novo_test_cases.freq_prior, + def test_default_get_de_novo_expr( + self, + locus, + alleles, + proband_expr, + father_expr, + mother_expr, + is_xx_expr, + freq_prior_expr, + expected_exception, + expected_result, + ): + """Test different scenarios of `default_get_de_novo_expr`.""" + if expected_exception: + with pytest.raises( + hl.utils.HailUserError, + match="Must split multiallelic variants prior to running this function.", + ): + result_expr = default_get_de_novo_expr( + locus, + alleles, + proband_expr, + father_expr, + mother_expr, + is_xx_expr, + freq_prior_expr, ) + hl.eval(result_expr) + else: + result_expr = default_get_de_novo_expr( + locus, + alleles, + proband_expr, + father_expr, + mother_expr, + is_xx_expr, + freq_prior_expr, ) - ) - - # 🔹 Round `p_de_novo` within the struct before evaluation - ht = ht.annotate( - 
computed_de_novo_expr=hl.struct( - is_de_novo=ht.computed_de_novo_expr.is_de_novo, - p_de_novo=hl.or_missing( - hl.is_defined(ht.computed_de_novo_expr.p_de_novo), - hl.float64( - hl.int32(ht.computed_de_novo_expr.p_de_novo * 1000)) / 1000, - ), - confidence=ht.computed_de_novo_expr.confidence, - fail_reason=ht.computed_de_novo_expr.fail_reason, - ) - ) - - # 🔹 Evaluate computed and expected values - computed_values = hl.eval(ht.computed_de_novo_expr.collect()) - expected_values = hl.eval(ht.expected_de_novo_expr.collect()) + result = hl.eval(result_expr) + expected_result = hl.eval(expected_result) - # 🔹 Compare expected vs. actual results - for i, (computed, expected) in enumerate(zip(computed_values, expected_values)): - assert computed == expected, f"Copy state mismatch at index {i}: expected {expected}, got {computed}" + assert result.is_de_novo == expected_result.is_de_novo + assert ( + None if result.p_de_novo is None else round(result.p_de_novo, 3) + ) == expected_result.p_de_novo + assert result.confidence == expected_result.confidence + assert result.fail_reason == expected_result.fail_reason From 6698674b1de271a06f7e1106c7c9f99906c5d196 Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Tue, 11 Feb 2025 15:46:01 -0500 Subject: [PATCH 51/56] pre-commit --- tests/sample_qc/test_de_novo.py | 346 ++++++++++++++++---------------- 1 file changed, 178 insertions(+), 168 deletions(-) diff --git a/tests/sample_qc/test_de_novo.py b/tests/sample_qc/test_de_novo.py index d1cf2eb4f..096a8c53d 100644 --- a/tests/sample_qc/test_de_novo.py +++ b/tests/sample_qc/test_de_novo.py @@ -7,23 +7,47 @@ calculate_de_novo_post_prob, default_get_de_novo_expr, ) - from gnomad.utils.annotations import get_copy_state_by_sex + class TestDeNovoMutation: """Test suite for de novo mutation functions.""" @pytest.mark.parametrize( "locus, is_xx, expected_diploid, expected_hemi_x, expected_hemi_y", [ - (hl.locus("chr1", 100000, 
reference_genome="GRCh38"), True, True, False, False), - (hl.locus("chrX", 2781479, reference_genome="GRCh38"), False, True, False, False), - (hl.locus("chrX", 3000000, reference_genome="GRCh38"), False, False, True, False), - (hl.locus("chrY", 10000000, reference_genome="GRCh38"), False, False, False, True), + ( + hl.locus("chr1", 100000, reference_genome="GRCh38"), + True, + True, + False, + False, + ), + ( + hl.locus("chrX", 2781479, reference_genome="GRCh38"), + False, + True, + False, + False, + ), + ( + hl.locus("chrX", 3000000, reference_genome="GRCh38"), + False, + False, + True, + False, + ), + ( + hl.locus("chrY", 10000000, reference_genome="GRCh38"), + False, + False, + False, + True, + ), ], ) def test_get_copy_state_by_sex( - self, locus, is_xx, expected_diploid, expected_hemi_x, expected_hemi_y + self, locus, is_xx, expected_diploid, expected_hemi_x, expected_hemi_y ) -> None: """Test copy state determination based on locus type and sex.""" is_xx_expr = hl.literal(is_xx) @@ -129,198 +153,184 @@ def test_calculate_de_novo_post_prob( [ # 1. 
Autosomal locus with HIGH confidence ( - hl.locus("chr1", 10000, reference_genome="GRCh38"), - hl.literal(["A", "C"]), - hl.struct(GT=hl.call(0, 1), AD=[5, 30], DP=35, GQ=99, - PL=[99, 0, 1]), - hl.struct(GT=hl.call(0, 0), AD=[20, 0], DP=20, GQ=60, - PL=[0, 60, 120]), - hl.struct(GT=hl.call(0, 0), AD=[25, 0], DP=25, GQ=80, - PL=[0, 80, 150]), - hl.literal(True), - hl.literal(1e-5), - False, - hl.struct( - is_de_novo=True, - p_de_novo=0.999, - confidence="HIGH", - fail_reason=hl.missing(hl.tset(hl.tstr)), - ), + hl.locus("chr1", 10000, reference_genome="GRCh38"), + hl.literal(["A", "C"]), + hl.struct(GT=hl.call(0, 1), AD=[5, 30], DP=35, GQ=99, PL=[99, 0, 1]), + hl.struct(GT=hl.call(0, 0), AD=[20, 0], DP=20, GQ=60, PL=[0, 60, 120]), + hl.struct(GT=hl.call(0, 0), AD=[25, 0], DP=25, GQ=80, PL=[0, 80, 150]), + hl.literal(True), + hl.literal(1e-5), + False, + hl.struct( + is_de_novo=True, + p_de_novo=0.999, + confidence="HIGH", + fail_reason=hl.missing(hl.tset(hl.tstr)), + ), ), # 2. Autosomal locus with MEDIUM confidence ( - hl.locus("chr1", 11000, reference_genome="GRCh38"), - hl.literal(["CT","C"]), - hl.struct(GT=hl.call(0, 1), AD=[59, 61], DP=120, GQ=99, - PL=[542,0,1940]), - hl.struct(GT=hl.call(0, 0), AD=[32, 0], DP=32, GQ=60, - PL=[0, 60, 120]), - hl.struct(GT=hl.call(0, 0), AD=[37, 0], DP=37, GQ=60, - PL=[0, 60, 120]), - hl.literal(False), - hl.literal(2.62e-03), - False, - hl.struct( - is_de_novo=True, - p_de_novo=0.615, - confidence="MEDIUM", - fail_reason=hl.missing(hl.tset(hl.tstr)), - ), + hl.locus("chr1", 11000, reference_genome="GRCh38"), + hl.literal(["CT", "C"]), + hl.struct( + GT=hl.call(0, 1), AD=[59, 61], DP=120, GQ=99, PL=[542, 0, 1940] + ), + hl.struct(GT=hl.call(0, 0), AD=[32, 0], DP=32, GQ=60, PL=[0, 60, 120]), + hl.struct(GT=hl.call(0, 0), AD=[37, 0], DP=37, GQ=60, PL=[0, 60, 120]), + hl.literal(False), + hl.literal(2.62e-03), + False, + hl.struct( + is_de_novo=True, + p_de_novo=0.615, + confidence="MEDIUM", + 
fail_reason=hl.missing(hl.tset(hl.tstr)), + ), ), # 3. Autosomal locus with LOW confidence ( - hl.locus("chr1", 12000, reference_genome="GRCh38"), - hl.literal(["G", "T"]), - hl.struct(GT=hl.call(0, 1), AD=[7, 2], DP=18, GQ=43, - PL=[43, 0, 387]), - hl.struct(GT=hl.call(0, 0), AD=[25, 0], DP=25, GQ=40, - PL=[0, 40, 80]), - hl.struct(GT=hl.call(0, 0), AD=[23, 0], DP=23, GQ=40, - PL=[0, 40, 80]), - hl.literal(True), - hl.literal(0), - False, - hl.struct( - is_de_novo=True, - p_de_novo=0.926, - confidence="LOW", - fail_reason=hl.missing(hl.tset(hl.tstr)), - ), + hl.locus("chr1", 12000, reference_genome="GRCh38"), + hl.literal(["G", "T"]), + hl.struct(GT=hl.call(0, 1), AD=[7, 2], DP=18, GQ=43, PL=[43, 0, 387]), + hl.struct(GT=hl.call(0, 0), AD=[25, 0], DP=25, GQ=40, PL=[0, 40, 80]), + hl.struct(GT=hl.call(0, 0), AD=[23, 0], DP=23, GQ=40, PL=[0, 40, 80]), + hl.literal(True), + hl.literal(0), + False, + hl.struct( + is_de_novo=True, + p_de_novo=0.926, + confidence="LOW", + fail_reason=hl.missing(hl.tset(hl.tstr)), + ), ), # 4. 
Autosomal locus with one FAIL condition ( - hl.locus("chr1", 13000, reference_genome="GRCh38"), - hl.literal(["C", "G"]), - hl.struct(GT=hl.call(0, 1), AD=[20, 5], DP=10, GQ=50, - PL=[10, 0, 100]), - hl.struct(GT=hl.call(0, 0), AD=[10, 0], DP=100, GQ=99, - PL=[0, 99, 198]), - hl.struct(GT=hl.call(0, 0), AD=[10, 0], DP=100, GQ=99, - PL=[0, 99, 198]), - hl.literal(True), - hl.literal(1e-5), - False, - hl.struct( - is_de_novo=True, - p_de_novo=hl.missing(hl.tfloat64), - confidence=hl.missing(hl.tstr), - fail_reason={"min_dp_ratio"}, - ), + hl.locus("chr1", 13000, reference_genome="GRCh38"), + hl.literal(["C", "G"]), + hl.struct(GT=hl.call(0, 1), AD=[20, 5], DP=10, GQ=50, PL=[10, 0, 100]), + hl.struct(GT=hl.call(0, 0), AD=[10, 0], DP=100, GQ=99, PL=[0, 99, 198]), + hl.struct(GT=hl.call(0, 0), AD=[10, 0], DP=100, GQ=99, PL=[0, 99, 198]), + hl.literal(True), + hl.literal(1e-5), + False, + hl.struct( + is_de_novo=True, + p_de_novo=hl.missing(hl.tfloat64), + confidence=hl.missing(hl.tstr), + fail_reason={"min_dp_ratio"}, + ), ), # 5. 
Autosomal locus with multiple FAIL conditions ( - hl.locus("chr1", 14000, reference_genome="GRCh38"), - hl.literal(["A", "G"]), - hl.struct(GT=hl.call(0, 1), AD=[9, 2], DP=11, GQ=2, PL=[2, 0, 230]), - hl.struct(GT=hl.call(0, 0), AD=[0, 0], DP=0, GQ=0, PL=[0, 0, 0]), - hl.struct(GT=hl.call(0, 0), AD=[0, 0], DP=0, GQ=0, PL=[0, 0, 0]), - hl.literal(True), - hl.literal(1e-5), - False, - hl.struct( - is_de_novo=True, - p_de_novo=hl.missing(hl.tfloat64), - confidence=hl.missing(hl.tstr), - fail_reason={"min_de_novo_p", "min_proband_ab", - "min_proband_gq", "parent_sum_ad_0"}, - ), + hl.locus("chr1", 14000, reference_genome="GRCh38"), + hl.literal(["A", "G"]), + hl.struct(GT=hl.call(0, 1), AD=[9, 2], DP=11, GQ=2, PL=[2, 0, 230]), + hl.struct(GT=hl.call(0, 0), AD=[0, 0], DP=0, GQ=0, PL=[0, 0, 0]), + hl.struct(GT=hl.call(0, 0), AD=[0, 0], DP=0, GQ=0, PL=[0, 0, 0]), + hl.literal(True), + hl.literal(1e-5), + False, + hl.struct( + is_de_novo=True, + p_de_novo=hl.missing(hl.tfloat64), + confidence=hl.missing(hl.tstr), + fail_reason={ + "min_de_novo_p", + "min_proband_ab", + "min_proband_gq", + "parent_sum_ad_0", + }, + ), ), # 6. 
Hemi X locus for XY individual with HIGH confidence ( - hl.locus("chrX", 8400000, reference_genome="GRCh38"), - hl.literal(["A", "G"]), - hl.struct(GT=hl.call(1, 1), AD=[0, 14], DP=14, GQ=42, - PL=[419, 42, 0]), - hl.struct(GT=hl.call(0, 0), AD=[38, 0], DP=38, GQ=40, - PL=[0, 40, 80]), - hl.struct(GT=hl.call(0, 0), AD=[97, 0], DP=110, GQ=99, - PL=[0, 99, 198]), - hl.literal(False), - hl.literal(3.74e-02), - False, - hl.struct( - is_de_novo=True, - p_de_novo=0.999, - confidence="HIGH", - fail_reason=hl.missing(hl.tset(hl.tstr)), - ), + hl.locus("chrX", 8400000, reference_genome="GRCh38"), + hl.literal(["A", "G"]), + hl.struct(GT=hl.call(1, 1), AD=[0, 14], DP=14, GQ=42, PL=[419, 42, 0]), + hl.struct(GT=hl.call(0, 0), AD=[38, 0], DP=38, GQ=40, PL=[0, 40, 80]), + hl.struct(GT=hl.call(0, 0), AD=[97, 0], DP=110, GQ=99, PL=[0, 99, 198]), + hl.literal(False), + hl.literal(3.74e-02), + False, + hl.struct( + is_de_novo=True, + p_de_novo=0.999, + confidence="HIGH", + fail_reason=hl.missing(hl.tset(hl.tstr)), + ), ), # 7. 
Hemi Y locus for XY individual with HIGH confidence ( - hl.locus("chrY", 9900000, reference_genome="GRCh38"), - hl.literal(["A", "G"]), - hl.struct(GT=hl.call(1, 1), AD=[0, 43], DP=43, GQ=99, - PL=[1363, 129, 0]), - hl.struct(GT=hl.call(0, 0), AD=[28, 0], DP=28, GQ=40, - PL=[0, 40, 80]), - hl.struct(GT=hl.call(0, 0), AD=[0, 0], DP=0, GQ=0, PL=[0, 0, 0]), - hl.literal(False), - hl.missing(hl.tfloat64), - False, - hl.struct( - is_de_novo=True, - p_de_novo=0.962, - confidence="HIGH", - fail_reason=hl.missing(hl.tset(hl.tstr)), - ), + hl.locus("chrY", 9900000, reference_genome="GRCh38"), + hl.literal(["A", "G"]), + hl.struct( + GT=hl.call(1, 1), AD=[0, 43], DP=43, GQ=99, PL=[1363, 129, 0] + ), + hl.struct(GT=hl.call(0, 0), AD=[28, 0], DP=28, GQ=40, PL=[0, 40, 80]), + hl.struct(GT=hl.call(0, 0), AD=[0, 0], DP=0, GQ=0, PL=[0, 0, 0]), + hl.literal(False), + hl.missing(hl.tfloat64), + False, + hl.struct( + is_de_novo=True, + p_de_novo=0.962, + confidence="HIGH", + fail_reason=hl.missing(hl.tset(hl.tstr)), + ), ), # 8. 
Autosomal locus that is not de novo ( - hl.locus("chr1", 15000, reference_genome="GRCh38"), - hl.literal(["G", "T"]), - hl.struct(GT=hl.call(0, 1), AD=[15, 10], DP=30, GQ=50, - PL=[10, 0, 100]), - hl.struct(GT=hl.call(0, 1), AD=[10, 5], DP=20, GQ=40, - PL=[0, 20, 80]), - hl.struct(GT=hl.call(0, 0), AD=[20, 0], DP=20, GQ=50, - PL=[0, 99, 198]), - hl.literal(False), - hl.literal(1e-5), - False, - hl.struct( - is_de_novo=False, - p_de_novo=hl.missing(hl.tfloat64), - confidence=hl.missing(hl.tstr), - fail_reason=hl.missing(hl.tset(hl.tstr)), - ), + hl.locus("chr1", 15000, reference_genome="GRCh38"), + hl.literal(["G", "T"]), + hl.struct(GT=hl.call(0, 1), AD=[15, 10], DP=30, GQ=50, PL=[10, 0, 100]), + hl.struct(GT=hl.call(0, 1), AD=[10, 5], DP=20, GQ=40, PL=[0, 20, 80]), + hl.struct(GT=hl.call(0, 0), AD=[20, 0], DP=20, GQ=50, PL=[0, 99, 198]), + hl.literal(False), + hl.literal(1e-5), + False, + hl.struct( + is_de_novo=False, + p_de_novo=hl.missing(hl.tfloat64), + confidence=hl.missing(hl.tstr), + fail_reason=hl.missing(hl.tset(hl.tstr)), + ), ), # 9. 
Autosomal locus with PLs all [0,0,0] and no freq prior ( - hl.locus("chr1", 16000, reference_genome="GRCh38"), - hl.literal(["G", "T"]), - hl.struct(GT=hl.call(0, 1), AD=[0, 2], DP=2, GQ=0, PL=[0, 0, 0]), - hl.struct(GT=hl.call(0, 0), AD=[2, 0], DP=2, GQ=0, PL=[0, 0, 0]), - hl.struct(GT=hl.call(0, 0), AD=[2, 0], DP=2, GQ=0, PL=[0, 0, 0]), - hl.literal(False), - hl.missing(hl.tfloat64), - False, - hl.struct( - is_de_novo=True, - p_de_novo=hl.missing(hl.tfloat64), - confidence=hl.missing(hl.tstr), - fail_reason={"min_de_novo_p", "min_proband_gq"}, - ), + hl.locus("chr1", 16000, reference_genome="GRCh38"), + hl.literal(["G", "T"]), + hl.struct(GT=hl.call(0, 1), AD=[0, 2], DP=2, GQ=0, PL=[0, 0, 0]), + hl.struct(GT=hl.call(0, 0), AD=[2, 0], DP=2, GQ=0, PL=[0, 0, 0]), + hl.struct(GT=hl.call(0, 0), AD=[2, 0], DP=2, GQ=0, PL=[0, 0, 0]), + hl.literal(False), + hl.missing(hl.tfloat64), + False, + hl.struct( + is_de_novo=True, + p_de_novo=hl.missing(hl.tfloat64), + confidence=hl.missing(hl.tstr), + fail_reason={"min_de_novo_p", "min_proband_gq"}, + ), ), # 10. 
Autosomal locus with multi-allelic ( - hl.locus("chr1", 40000, reference_genome="GRCh38"), - hl.literal(["C", "G", "A"]), - hl.struct(GT=hl.call(0, 1), AD=[5, 30, 5], DP=40, GQ=99, - PL=[99, 0, 1]), - hl.struct( - GT=hl.call(0, 0), AD=[20, 0, 5], DP=25, GQ=60, PL=[0, 60, 120] - ), - hl.struct( - GT=hl.call(0, 0), AD=[25, 0, 5], DP=30, GQ=80, PL=[0, 80, 150] - ), - hl.literal(True), - hl.literal(1e-5), - True, - None, + hl.locus("chr1", 40000, reference_genome="GRCh38"), + hl.literal(["C", "G", "A"]), + hl.struct(GT=hl.call(0, 1), AD=[5, 30, 5], DP=40, GQ=99, PL=[99, 0, 1]), + hl.struct( + GT=hl.call(0, 0), AD=[20, 0, 5], DP=25, GQ=60, PL=[0, 60, 120] + ), + hl.struct( + GT=hl.call(0, 0), AD=[25, 0, 5], DP=30, GQ=80, PL=[0, 80, 150] + ), + hl.literal(True), + hl.literal(1e-5), + True, + None, ), ], ) - def test_default_get_de_novo_expr( self, locus, From 80683f36c2219d09967c9a38b851e8a189b66bd4 Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Tue, 11 Feb 2025 15:49:52 -0500 Subject: [PATCH 52/56] Black --- gnomad/sample_qc/relatedness.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/gnomad/sample_qc/relatedness.py b/gnomad/sample_qc/relatedness.py index 8e9f2524e..96be3335d 100644 --- a/gnomad/sample_qc/relatedness.py +++ b/gnomad/sample_qc/relatedness.py @@ -1732,8 +1732,7 @@ def default_get_de_novo_expr( fail = hl.any(list(fail_checks_expr.values())) result_expr = hl.struct( is_de_novo=is_de_novo, - p_de_novo=hl.or_missing(is_de_novo & ~fail, - p_de_novo), + p_de_novo=hl.or_missing(is_de_novo & ~fail, p_de_novo), confidence=hl.or_missing(is_de_novo & ~fail, confidence_expr), fail_reason=hl.or_missing( is_de_novo & fail, From a8db4ff73f5b0acf056793532bf315125d8551ed Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Tue, 11 Feb 2025 15:58:50 -0500 Subject: [PATCH 53/56] Docstring format --- gnomad/sample_qc/relatedness.py | 8 ++++---- 1 file changed, 4 insertions(+), 
4 deletions(-) diff --git a/gnomad/sample_qc/relatedness.py b/gnomad/sample_qc/relatedness.py index 96be3335d..18be412b7 100644 --- a/gnomad/sample_qc/relatedness.py +++ b/gnomad/sample_qc/relatedness.py @@ -1574,10 +1574,10 @@ def default_get_de_novo_expr( designed to fill the gap for variants that do not meet the FAIL criteria but would otherwise remain unclassified. - The *de novo* confidence is calculated as a simplified version of the one previously - described in Kaitlin Samocha's [*de novo* caller](https://github.com/ksamocha/de_novo_scripts) and - Hail's [*de_novo*](https://hail.is/docs/0.2/methods/genetics.html#hail.methods.de_novo) - method. This simplified version is the same as Hail's methods when using the + The *de novo* confidence is calculated as a simplified version of the one + previously described in Kaitlin Samocha's [*de novo* caller](https://github.com/ksamocha/de_novo_scripts) + and Hail's [*de_novo*](https://hail.is/docs/0.2/methods/genetics.html#hail.methods.de_novo) + method. This simplified version is the same as Hail's methods when using the `ignore_in_sample_allele_frequency` parameter. 
The main difference is that this mode should be used when families larger than a single trio are in the dataset, in which an allele might be *de novo* in a parent and transmitted to a From 3d894fa0f0c41bd212cc37ca8b03a70d36c51caa Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Wed, 12 Feb 2025 10:56:31 -0500 Subject: [PATCH 54/56] Address review comments --- gnomad/utils/annotations.py | 3 ++ tests/sample_qc/test_de_novo.py | 60 ++++----------------------------- tests/utils/test_annotations.py | 58 ++++++++++++++++++++++++++++++- 3 files changed, 66 insertions(+), 55 deletions(-) diff --git a/gnomad/utils/annotations.py b/gnomad/utils/annotations.py index 6c4c8a0f3..e61c83cff 100644 --- a/gnomad/utils/annotations.py +++ b/gnomad/utils/annotations.py @@ -2791,6 +2791,9 @@ def get_copy_state_by_sex( """ Determine the copy state of a variant by its locus and the sex karotype of a sample. + This function assumes that the sample contains only XX and XY karyotypes. It does + not account for ambiguous sex or aneuploidies (e.g., XXY, XYY). + :param locus_expr: LocusExpression of the variant. :param is_xx_expr: BooleanExpression indicating whether the sample has an XX sex karyotype. 
diff --git a/tests/sample_qc/test_de_novo.py b/tests/sample_qc/test_de_novo.py index 096a8c53d..bdccd2591 100644 --- a/tests/sample_qc/test_de_novo.py +++ b/tests/sample_qc/test_de_novo.py @@ -13,54 +13,6 @@ class TestDeNovoMutation: """Test suite for de novo mutation functions.""" - @pytest.mark.parametrize( - "locus, is_xx, expected_diploid, expected_hemi_x, expected_hemi_y", - [ - ( - hl.locus("chr1", 100000, reference_genome="GRCh38"), - True, - True, - False, - False, - ), - ( - hl.locus("chrX", 2781479, reference_genome="GRCh38"), - False, - True, - False, - False, - ), - ( - hl.locus("chrX", 3000000, reference_genome="GRCh38"), - False, - False, - True, - False, - ), - ( - hl.locus("chrY", 10000000, reference_genome="GRCh38"), - False, - False, - False, - True, - ), - ], - ) - def test_get_copy_state_by_sex( - self, locus, is_xx, expected_diploid, expected_hemi_x, expected_hemi_y - ) -> None: - """Test copy state determination based on locus type and sex.""" - is_xx_expr = hl.literal(is_xx) - - diploid, hemi_x, hemi_y = get_copy_state_by_sex(locus, is_xx_expr) - result = hl.eval([diploid, hemi_x, hemi_y]) - - assert result == [ - expected_diploid, - expected_hemi_x, - expected_hemi_y, - ], f"Failed for locus={locus}, is_xx={is_xx}. Expected {[expected_diploid, expected_hemi_x, expected_hemi_y]}, got {result}" - @pytest.mark.parametrize( "proband_pl, father_pl, mother_pl, diploid, hemi_x, hemi_y, freq_prior, min_pop_prior, expected", [ @@ -207,18 +159,18 @@ def test_calculate_de_novo_post_prob( # 4. 
Autosomal locus with one FAIL condition ( hl.locus("chr1", 13000, reference_genome="GRCh38"), - hl.literal(["C", "G"]), - hl.struct(GT=hl.call(0, 1), AD=[20, 5], DP=10, GQ=50, PL=[10, 0, 100]), - hl.struct(GT=hl.call(0, 0), AD=[10, 0], DP=100, GQ=99, PL=[0, 99, 198]), - hl.struct(GT=hl.call(0, 0), AD=[10, 0], DP=100, GQ=99, PL=[0, 99, 198]), + hl.literal(["T", "TA"]), + hl.struct(GT=hl.call(0, 1), AD=[4, 2], DP=8, GQ=30, PL=[30, 0, 103]), + hl.struct(GT=hl.call(0, 0), AD=[7, 0], DP=7, GQ=24, PL=[0, 24, 48]), + hl.struct(GT=hl.call(0, 0), AD=[2, 0], DP=2, GQ=6, PL=[0, 6, 12]), hl.literal(True), - hl.literal(1e-5), + hl.literal(2.00e-01), False, hl.struct( is_de_novo=True, p_de_novo=hl.missing(hl.tfloat64), confidence=hl.missing(hl.tstr), - fail_reason={"min_dp_ratio"}, + fail_reason={"min_de_novo_p"}, ), ), # 5. Autosomal locus with multiple FAIL conditions diff --git a/tests/utils/test_annotations.py b/tests/utils/test_annotations.py index 705765a3d..82172c425 100644 --- a/tests/utils/test_annotations.py +++ b/tests/utils/test_annotations.py @@ -5,7 +5,11 @@ import hail as hl import pytest -from gnomad.utils.annotations import fill_missing_key_combinations, missing_struct_expr +from gnomad.utils.annotations import ( + fill_missing_key_combinations, + get_copy_state_by_sex, + missing_struct_expr, +) class TestFillMissingKeyCombinations: @@ -90,3 +94,55 @@ def test_missing_struct_expr() -> None: # Verify the result. 
assert hl.eval(result == expected) + + +class TestGetCopyStateBySex: + """Test the `get_copy_state_by_sex` function.""" + + @pytest.mark.parametrize( + "locus, is_xx, expected_diploid, expected_hemi_x, expected_hemi_y", + [ + ( + hl.locus("chr1", 100000, reference_genome="GRCh38"), + True, + True, + False, + False, + ), + ( + hl.locus("chrX", 2781479, reference_genome="GRCh38"), + False, + True, + False, + False, + ), + ( + hl.locus("chrX", 3000000, reference_genome="GRCh38"), + False, + False, + True, + False, + ), + ( + hl.locus("chrY", 10000000, reference_genome="GRCh38"), + False, + False, + False, + True, + ), + ], + ) + def test_get_copy_state_by_sex( + self, locus, is_xx, expected_diploid, expected_hemi_x, expected_hemi_y + ) -> None: + """Test copy state determination based on locus type and sex.""" + is_xx_expr = hl.literal(is_xx) + + diploid, hemi_x, hemi_y = get_copy_state_by_sex(locus, is_xx_expr) + result = hl.eval([diploid, hemi_x, hemi_y]) + + assert result == [ + expected_diploid, + expected_hemi_x, + expected_hemi_y, + ], f"Failed for locus={locus}, is_xx={is_xx}. 
Expected {[expected_diploid, expected_hemi_x, expected_hemi_y]}, got {result}" From 20837f335c90d5eca17e1225c8d1777fe210c4f3 Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Wed, 12 Feb 2025 11:05:17 -0500 Subject: [PATCH 55/56] Remove unused imports --- tests/sample_qc/test_de_novo.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/sample_qc/test_de_novo.py b/tests/sample_qc/test_de_novo.py index bdccd2591..6352dabb2 100644 --- a/tests/sample_qc/test_de_novo.py +++ b/tests/sample_qc/test_de_novo.py @@ -7,7 +7,6 @@ calculate_de_novo_post_prob, default_get_de_novo_expr, ) -from gnomad.utils.annotations import get_copy_state_by_sex class TestDeNovoMutation: From 7e5be9157d80ad96de2a9600cc0873e2b349acf9 Mon Sep 17 00:00:00 2001 From: Qin He <44242118+KoalaQin@users.noreply.github.com> Date: Wed, 12 Feb 2025 12:42:30 -0500 Subject: [PATCH 56/56] specify XY only --- gnomad/sample_qc/relatedness.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gnomad/sample_qc/relatedness.py b/gnomad/sample_qc/relatedness.py index 18be412b7..d4610a610 100644 --- a/gnomad/sample_qc/relatedness.py +++ b/gnomad/sample_qc/relatedness.py @@ -1384,13 +1384,13 @@ def calculate_de_novo_post_prob( P(\text{data} \mid \text{missed het in parents}) = ( P(\text{het in father}) \cdot P(\text{hom_ref in mother}) + P(\text{hom_ref in father}) \cdot P(\text{het in mother})) \cdot P(\text{het in proband}) - - **X non-PAR regions**: + - **X non-PAR regions (XY only)**: .. math:: P(\text{data} \mid \text{missed het in mother}) = (P(\text{het in mother}) + P(\text{hom_alt in mother})) \cdot P(\text{hom_alt in proband}) - - **Y non-PAR regions**: + - **Y non-PAR regions (XY only)**: .. math::