diff --git a/.nojekyll b/.nojekyll
new file mode 100644
index 000000000..e69de29bb
diff --git a/_modules/gnomad/assessment/summary_stats.html b/_modules/gnomad/assessment/summary_stats.html
new file mode 100644
index 000000000..b51da13fa
--- /dev/null
+++ b/_modules/gnomad/assessment/summary_stats.html
@@ -0,0 +1,773 @@
+gnomad.assessment.summary_stats — gnomad master documentation

Source code for gnomad.assessment.summary_stats

+# noqa: D100
+
+import logging
+from typing import Dict, Optional, Set
+
+import hail as hl
+
+from gnomad.utils.filtering import filter_low_conf_regions
+from gnomad.utils.vep import (
+    LOF_CSQ_SET,
+    add_most_severe_consequence_to_consequence,
+    filter_vep_to_canonical_transcripts,
+    filter_vep_to_mane_select_transcripts,
+    get_most_severe_consequence_for_summary,
+    process_consequences,
+)
+
+logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s")
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+
+
[docs]def freq_bin_expr( + freq_expr: hl.expr.ArrayExpression, index: int = 0 +) -> hl.expr.StringExpression: + """ + Return frequency string annotations based on input AC or AF. + + .. note:: + + - Default index is 0 because function assumes freq_expr was calculated with `annotate_freq`. + - Frequency index 0 from `annotate_freq` is frequency for all pops calculated on adj genotypes only. + + :param freq_expr: Array of structs containing frequency information. + :param index: Which index of freq_expr to use for annotation. Default is 0. + :return: StringExpression containing bin name based on input AC or AF. + """ + return ( + hl.case(missing_false=True) + .when(freq_expr[index].AC == 0, "Not found") + .when(freq_expr[index].AC == 1, "Singleton") + .when(freq_expr[index].AC == 2, "Doubleton") + .when(freq_expr[index].AC <= 5, "AC 3 - 5") + .when(freq_expr[index].AF < 1e-4, "AC 6 - 0.01%") + .when(freq_expr[index].AF < 1e-3, "0.01% - 0.1%") + .when(freq_expr[index].AF < 1e-2, "0.1% - 1%") + .when(freq_expr[index].AF < 1e-1, "1% - 10%") + .when(freq_expr[index].AF > 0.95, ">95%") + .default("10% - 95%") + )
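+
+# Editor's usage sketch (not part of the module): annotate a sites Table with
+# a frequency bin label and tally variants per bin. Assumes `ht` carries a
+# `freq` array of structs with AC/AF fields, as produced by `annotate_freq`.
+def _example_freq_bin_usage(ht: hl.Table) -> Dict[str, int]:
+    ht = ht.annotate(freq_bin=freq_bin_expr(ht.freq, index=0))
+    # Counter of variants per frequency bin, e.g. {"Singleton": 1234, ...}.
+    return ht.aggregate(hl.agg.counter(ht.freq_bin))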
+ + +
[docs]def get_summary_counts_dict(
+    locus_expr: hl.expr.LocusExpression,
+    allele_expr: hl.expr.ArrayExpression,
+    lof_expr: hl.expr.StringExpression,
+    no_lof_flags_expr: hl.expr.BooleanExpression,
+    most_severe_csq_expr: hl.expr.StringExpression,
+    prefix_str: str = "",
+) -> Dict[str, hl.expr.Int64Expression]:
+    """
+    Return dictionary containing counts of multiple variant categories.
+
+    Categories are:
+        - Number of variants
+        - Number of indels
+        - Number of SNVs
+        - Number of LoF variants
+        - Number of LoF variants that pass LOFTEE
+        - Number of LoF variants that pass LOFTEE without any flags
+        - Number of LoF variants annotated as 'other splice' (OS) by LOFTEE
+        - Number of LoF variants that fail LOFTEE
+        - Number of missense variants
+        - Number of synonymous variants
+        - Number of autosomal variants
+        - Number of allosomal variants
+
+    .. warning::
+        Assumes `allele_expr` contains only two alleles (multi-allelics have been split).
+
+    :param locus_expr: LocusExpression.
+    :param allele_expr: ArrayExpression containing alleles.
+    :param lof_expr: StringExpression containing LOFTEE annotation.
+    :param no_lof_flags_expr: BooleanExpression indicating whether LoF variant has any flags.
+    :param most_severe_csq_expr: StringExpression containing most severe consequence annotation.
+    :param prefix_str: Desired prefix string for category names. Default is empty str.
+    :return: Dict of categories and counts per category.
+    """
+    logger.warning("This function expects that multi-allelic variants have been split!")
+    return {
+        f"{prefix_str}num_variants": hl.agg.count(),
+        f"{prefix_str}indels": hl.agg.count_where(
+            hl.is_indel(allele_expr[0], allele_expr[1])
+        ),
+        f"{prefix_str}snps": hl.agg.count_where(
+            hl.is_snp(allele_expr[0], allele_expr[1])
+        ),
+        f"{prefix_str}LOF": hl.agg.count_where(hl.is_defined(lof_expr)),
+        f"{prefix_str}pass_loftee": hl.agg.count_where(lof_expr == "HC"),
+        f"{prefix_str}pass_loftee_no_flag": hl.agg.count_where(
+            (lof_expr == "HC") & (no_lof_flags_expr)
+        ),
+        f"{prefix_str}loftee_os": hl.agg.count_where(lof_expr == "OS"),
+        f"{prefix_str}fail_loftee": hl.agg.count_where(lof_expr == "LC"),
+        f"{prefix_str}num_missense": hl.agg.count_where(
+            most_severe_csq_expr == "missense_variant"
+        ),
+        f"{prefix_str}num_synonymous": hl.agg.count_where(
+            most_severe_csq_expr == "synonymous_variant"
+        ),
+        f"{prefix_str}num_autosomal_variants": hl.agg.filter(
+            locus_expr.in_autosome_or_par(), hl.agg.count()
+        ),
+        f"{prefix_str}num_allosomal_variants": hl.agg.filter(
+            locus_expr.in_x_nonpar() | locus_expr.in_y_nonpar(), hl.agg.count()
+        ),
+    }
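+
+# Editor's usage sketch (not part of the module): aggregate the category
+# counts over a split, VEP-summarized sites Table. Assumes `ht` has `lof`,
+# `no_lof_flags`, and `most_severe_csq` row annotations (e.g. added by
+# `get_most_severe_consequence_for_summary`).
+def _example_summary_counts_agg(ht: hl.Table) -> hl.Struct:
+    return ht.aggregate(
+        hl.struct(
+            **get_summary_counts_dict(
+                ht.locus, ht.alleles, ht.lof, ht.no_lof_flags, ht.most_severe_csq
+            )
+        )
+    )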
+ + +
[docs]def get_summary_ac_dict(
+    ac_expr: hl.expr.Int64Expression,
+    lof_expr: hl.expr.StringExpression,
+    no_lof_flags_expr: hl.expr.BooleanExpression,
+    most_severe_csq_expr: hl.expr.StringExpression,
+) -> Dict[str, hl.expr.Int64Expression]:
+    """
+    Return dictionary containing total allele counts for variant categories.
+
+    Categories are:
+        - All variants
+        - LoF variants
+        - LoF variants that pass LOFTEE
+        - LoF variants that pass LOFTEE without any flags
+        - LoF variants that are annotated as 'other splice' (OS) by LOFTEE
+        - LoF variants that fail LOFTEE
+        - Missense variants
+        - Synonymous variants
+
+    .. warning::
+        Assumes that multi-allelic variants have been split.
+
+    :param ac_expr: Int64Expression containing allele counts.
+    :param lof_expr: StringExpression containing LOFTEE annotation.
+    :param no_lof_flags_expr: BooleanExpression indicating whether LoF variant has any flags.
+    :param most_severe_csq_expr: StringExpression containing most severe consequence annotation.
+    :return: Dict of variant categories and their total allele counts.
+    """
+    logger.warning("This function expects that multi-allelic variants have been split!")
+    return {
+        "total_ac": hl.agg.sum(ac_expr),
+        "total_ac_LOF": hl.agg.filter(hl.is_defined(lof_expr), hl.agg.sum(ac_expr)),
+        "total_ac_pass_loftee": hl.agg.filter(lof_expr == "HC", hl.agg.sum(ac_expr)),
+        "total_ac_pass_loftee_no_flag": hl.agg.filter(
+            (lof_expr == "HC") & (no_lof_flags_expr), hl.agg.sum(ac_expr)
+        ),
+        "total_ac_loftee_os": hl.agg.filter(lof_expr == "OS", hl.agg.sum(ac_expr)),
+        "total_ac_fail_loftee": hl.agg.filter(lof_expr == "LC", hl.agg.sum(ac_expr)),
+        "total_ac_missense": hl.agg.filter(
+            most_severe_csq_expr == "missense_variant", hl.agg.sum(ac_expr)
+        ),
+        "total_ac_synonymous": hl.agg.filter(
+            most_severe_csq_expr == "synonymous_variant", hl.agg.sum(ac_expr)
+        ),
+    }
+ + +
[docs]def get_summary_counts(
+    ht: hl.Table,
+    freq_field: str = "freq",
+    filter_field: str = "filters",
+    filter_decoy: bool = False,
+    canonical_only: bool = True,
+    mane_select_only: bool = False,
+    index: int = 0,
+) -> hl.Table:
+    """
+    Generate a struct with summary counts across variant categories.
+
+    Summary counts:
+        - Number of variants
+        - Number of indels
+        - Number of SNVs
+        - Number of LoF variants
+        - Number of LoF variants that pass LOFTEE (including with LoF flags)
+        - Number of LoF variants that pass LOFTEE without LoF flags
+        - Number of OS (other splice) variants annotated by LOFTEE
+        - Number of LoF variants that fail LOFTEE filters
+
+    Also annotates Table's globals with total variant counts.
+
+    Before calculating summary counts, function:
+        - Filters out low confidence regions
+        - Uses the most severe consequence
+        - Filters to canonical transcripts (if `canonical_only` is True) or MANE Select
+          transcripts (if `mane_select_only` is True)
+
+    Assumes that:
+        - Input HT is annotated with VEP.
+        - Multiallelic variants have been split and/or input HT contains bi-allelic variants only.
+        - freq_expr was calculated with `annotate_freq`.
+        - Frequency index 0 from `annotate_freq` is the frequency for all pops calculated on adj genotypes only.
+
+    :param ht: Input Table.
+    :param freq_field: Name of field in HT containing frequency annotation (array of structs). Default is "freq".
+    :param filter_field: Name of field in HT containing variant filter information. Default is "filters".
+    :param filter_decoy: Whether to filter decoy regions. Default is False.
+    :param canonical_only: Whether to filter to canonical transcripts. Default is True.
+    :param mane_select_only: Whether to filter to MANE Select transcripts. Default is False.
+    :param index: Which index of freq_expr to use for annotation. Default is 0.
+    :return: Table grouped by frequency bin and aggregated across summary count categories.
+    """
+    if canonical_only and mane_select_only:
+        raise ValueError(
+            "Only one of `canonical_only` and `mane_select_only` can be True."
+        )
+
+    logger.info("Checking if multi-allelic variants have been split...")
+    max_alleles = ht.aggregate(hl.agg.max(hl.len(ht.alleles)))
+    if max_alleles > 2:
+        logger.info("Splitting multi-allelics and VEP transcript consequences...")
+        ht = hl.split_multi_hts(ht)
+
+    logger.info("Filtering to PASS variants in high confidence regions...")
+    ht = ht.filter(hl.len(ht[filter_field]) == 0)
+    ht = filter_low_conf_regions(ht, filter_decoy=filter_decoy)
+
+    if canonical_only:
+        logger.info("Filtering to canonical transcripts...")
+        ht = filter_vep_to_canonical_transcripts(ht)
+    elif mane_select_only:
+        logger.info("Filtering to MANE Select transcripts...")
+        ht = filter_vep_to_mane_select_transcripts(ht)
+
+    logger.info("Getting VEP summary annotations...")
+    ht = get_most_severe_consequence_for_summary(ht)
+
+    logger.info("Annotating with frequency bin information...")
+    ht = ht.annotate(freq_bin=freq_bin_expr(ht[freq_field], index))
+
+    logger.info(
+        "Annotating HT globals with total counts/total allele counts per variant"
+        " category..."
+ ) + summary_counts = ht.aggregate( + hl.struct( + **get_summary_counts_dict( + ht.locus, + ht.alleles, + ht.lof, + ht.no_lof_flags, + ht.most_severe_csq, + prefix_str="total_", + ) + ) + ) + summary_ac_counts = ht.aggregate( + hl.struct( + **get_summary_ac_dict( + ht[freq_field][index].AC, + ht.lof, + ht.no_lof_flags, + ht.most_severe_csq, + ) + ) + ) + ht = ht.annotate_globals( + summary_counts=summary_counts.annotate(**summary_ac_counts) + ) + return ht.group_by("freq_bin").aggregate( + **get_summary_counts_dict( + ht.locus, + ht.alleles, + ht.lof, + ht.no_lof_flags, + ht.most_severe_csq, + ) + )
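+
+# Editor's usage sketch (not part of the module): run the full summary on a
+# VEP-annotated release Table, also filtering decoy regions, and show the
+# per-frequency-bin counts. Assumes the Table carries `freq` and `filters`
+# annotations as described above.
+def _example_get_summary_counts(ht: hl.Table) -> hl.Table:
+    counts_ht = get_summary_counts(ht, filter_decoy=True)
+    counts_ht.show()
+    return counts_ht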
+ + +
[docs]def get_an_criteria(
+    mt: hl.MatrixTable,
+    samples_by_sex: Optional[Dict[str, int]] = None,
+    meta_root: str = "meta",
+    sex_field: str = "sex_imputation.sex_karyotype",
+    xy_str: str = "XY",
+    xx_str: str = "XX",
+    freq_field: str = "freq",
+    freq_index: int = 0,
+    an_proportion_cutoff: float = 0.8,
+) -> hl.expr.BooleanExpression:
+    """
+    Generate criteria to filter samples based on allele number (AN).
+
+    Uses allele number as a proxy for call rate.
+
+    :param mt: Input MatrixTable.
+    :param samples_by_sex: Optional dictionary containing number of samples (value) for each sample sex (key).
+    :param meta_root: Name of field in MatrixTable containing sample metadata information. Default is 'meta'.
+    :param sex_field: Name of field in MatrixTable containing sample sex assignment. Default is 'sex_imputation.sex_karyotype'.
+    :param xy_str: String marking whether a sample has XY sex. Default is 'XY'.
+    :param xx_str: String marking whether a sample has XX sex. Default is 'XX'.
+    :param freq_field: Name of field in MT that contains frequency information. Default is 'freq'.
+    :param freq_index: Which index of frequency struct to use. Default is 0.
+    :param an_proportion_cutoff: Desired allele number proportion cutoff. Default is 0.8.
+    :return: BooleanExpression used to filter variants based on allele number.
+    """
+    if samples_by_sex is None:
+        samples_by_sex = mt.aggregate_cols(hl.agg.counter(mt[meta_root][sex_field]))
+    return (
+        hl.case()
+        .when(
+            mt.locus.in_autosome_or_par(),
+            mt[freq_field][freq_index].AN
+            >= an_proportion_cutoff * 2 * sum(samples_by_sex.values()),
+        )
+        .when(
+            mt.locus.in_x_nonpar(),
+            mt[freq_field][freq_index].AN
+            >= an_proportion_cutoff
+            * (samples_by_sex[xy_str] + samples_by_sex[xx_str] * 2),
+        )
+        .when(
+            mt.locus.in_y_nonpar(),
+            mt[freq_field][freq_index].AN
+            >= an_proportion_cutoff * samples_by_sex[xy_str],
+        )
+        .or_missing()
+    )
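+
+# Editor's usage sketch (not part of the module): keep only variants meeting
+# the default 80% AN-based call-rate proxy. The sample counts passed here are
+# hypothetical; omitting `samples_by_sex` makes the function aggregate them
+# from the column metadata instead.
+def _example_an_filter(mt: hl.MatrixTable) -> hl.MatrixTable:
+    return mt.filter_rows(get_an_criteria(mt, samples_by_sex={"XY": 500, "XX": 500}))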
+ + +
[docs]def get_tx_expression_expr(
+    key_expr: hl.expr.StructExpression,
+    tx_ht: hl.Table,
+    csq_expr: hl.expr.StructExpression,
+    gene_field: str = "ensg",
+    csq_field: str = "csq",
+    tx_struct: str = "tx_annotation",
+) -> hl.expr.StructExpression:
+    """
+    Pull appropriate transcript expression annotation struct given a specific locus and alleles (provided in `key_expr`).
+
+    Assumes that `key_expr` contains a locus and alleles.
+    Assumes that multi-allelic variants have been split in both `tx_ht` and `key_expr`.
+
+    :param key_expr: StructExpression containing locus and alleles to search in `tx_ht`.
+    :param tx_ht: Input Table containing transcript expression information.
+    :param csq_expr: Input StructExpression that contains VEP consequence information.
+    :param gene_field: Field in `csq_expr` that contains gene ID.
+    :param csq_field: Field in `csq_expr` that contains `most_severe_consequence` annotation.
+    :param tx_struct: Name of field in `tx_ht` that contains transcript expression information.
+    :return: StructExpression that contains transcript expression information for given gene ID in `csq_expr`.
+    """
+    return hl.find(
+        lambda csq: (csq[gene_field] == csq_expr.gene_id)
+        & (csq[csq_field] == csq_expr.most_severe_consequence),
+        tx_ht[key_expr][tx_struct],
+    )
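+
+# Editor's usage sketch (not part of the module): annotate each row with the
+# mean proportion expressed for its consequence. Assumes `mt.csqs` is a single
+# VEP transcript-consequence struct and `tx_ht` is keyed by locus and alleles
+# with a `tx_annotation` array, mirroring the call in
+# `default_generate_gene_lof_matrix` below.
+def _example_tx_expression(mt: hl.MatrixTable, tx_ht: hl.Table) -> hl.MatrixTable:
+    return mt.annotate_rows(
+        mean_proportion=get_tx_expression_expr(mt.row_key, tx_ht, mt.csqs).mean_proportion
+    )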
+ + +
[docs]def default_generate_gene_lof_matrix(
+    mt: hl.MatrixTable,
+    tx_ht: Optional[hl.Table],
+    high_expression_cutoff: float = 0.9,
+    low_expression_cutoff: float = 0.1,
+    filter_field: str = "filters",
+    freq_field: str = "freq",
+    freq_index: int = 0,
+    additional_csq_set: Set[str] = {"missense_variant", "synonymous_variant"},
+    all_transcripts: bool = False,
+    filter_an: bool = False,
+    filter_to_rare: bool = False,
+    pre_loftee: bool = False,
+    lof_csq_set: Set[str] = LOF_CSQ_SET,
+    remove_ultra_common: bool = False,
+) -> hl.MatrixTable:
+    """
+    Generate loss-of-function gene matrix.
+
+    Used to generate summary metrics on LoF variants.
+
+    :param mt: Input MatrixTable.
+    :param tx_ht: Optional Table containing expression levels per transcript.
+    :param high_expression_cutoff: Minimum mean proportion expressed cutoff for a transcript to be considered highly expressed. Default is 0.9.
+    :param low_expression_cutoff: Upper mean proportion expressed cutoff for a transcript to be considered lowly expressed. Default is 0.1.
+    :param filter_field: Name of field in MT that contains variant filters. Default is 'filters'.
+    :param freq_field: Name of field in MT that contains frequency information. Default is 'freq'.
+    :param freq_index: Which index of frequency struct to use. Default is 0.
+    :param additional_csq_set: Set of additional consequences to keep. Default is {'missense_variant', 'synonymous_variant'}.
+    :param all_transcripts: Whether to use all transcripts instead of just the transcript with most severe consequence. Default is False.
+    :param filter_an: Whether to filter using allele number as proxy for call rate. Default is False.
+    :param filter_to_rare: Whether to filter to rare (AF < 5%) variants. Default is False.
+    :param pre_loftee: Whether LoF consequences have been annotated with LOFTEE. Default is False.
+    :param lof_csq_set: Set of LoF consequence strings. Default is {"splice_acceptor_variant", "splice_donor_variant", "stop_gained", "frameshift_variant"}.
+    :param remove_ultra_common: Whether to remove ultra common (AF > 95%) variants. Default is False.
+    :return: Loss-of-function gene matrix.
+    """
+    logger.info("Filtering to PASS variants...")
+    filt_criteria = hl.len(mt[filter_field]) == 0
+    if filter_an:
+        logger.info(
+            "Using AN (as a call rate proxy) to filter to variants that meet a minimum"
+            " call rate..."
+        )
+        mt = mt.filter_rows(get_an_criteria(mt))
+    if remove_ultra_common:
+        logger.info("Removing ultra common (AF > 95%) variants...")
+        filt_criteria &= mt[freq_field][freq_index].AF < 0.95
+    if filter_to_rare:
+        logger.info("Filtering to rare (AF < 5%) variants...")
+        filt_criteria &= mt[freq_field][freq_index].AF < 0.05
+    mt = mt.filter_rows(filt_criteria)
+
+    if all_transcripts:
+        logger.info("Exploding transcript_consequences field...")
+        explode_field = "transcript_consequences"
+    else:
+        logger.info(
+            "Adding most severe (worst) consequence and exploding worst_csq_by_gene"
+            " field..."
+ ) + mt = process_consequences(mt) + explode_field = "worst_csq_by_gene" + + if additional_csq_set: + logger.info("Including these consequences: %s", additional_csq_set) + additional_cats = hl.literal(additional_csq_set) + + if pre_loftee: + logger.info("Filtering to LoF consequences: %s", lof_csq_set) + lof_cats = hl.literal(lof_csq_set) + criteria = lambda x: lof_cats.contains( + add_most_severe_consequence_to_consequence(x).most_severe_consequence + ) + if additional_csq_set: + criteria = lambda x: lof_cats.contains( + add_most_severe_consequence_to_consequence(x).most_severe_consequence + ) | additional_cats.contains( + add_most_severe_consequence_to_consequence(x).most_severe_consequence + ) + + else: + logger.info("Filtering to LoF variants that pass LOFTEE with no LoF flags...") + criteria = lambda x: (x.lof == "HC") & hl.is_missing(x.lof_flags) + if additional_csq_set: + criteria = lambda x: (x.lof == "HC") & hl.is_missing( + x.lof_flags + ) | additional_cats.contains( + add_most_severe_consequence_to_consequence(x).most_severe_consequence + ) + + csqs = mt.vep[explode_field].filter(criteria) + mt = mt.select_rows(mt[freq_field], csqs=csqs) + mt = mt.explode_rows(mt.csqs) + annotation_expr = { + "gene_id": mt.csqs.gene_id, + "gene": mt.csqs.gene_symbol, + "indel": hl.is_indel(mt.alleles[0], mt.alleles[1]), + "most_severe_consequence": mt.csqs.most_severe_consequence, + } + + if tx_ht: + logger.info("Adding transcript expression annotation...") + tx_annotation = get_tx_expression_expr( + mt.row_key, + tx_ht, + mt.csqs, + ).mean_proportion + annotation_expr["expressed"] = ( + hl.case() + .when(tx_annotation >= high_expression_cutoff, "high") + .when(tx_annotation > low_expression_cutoff, "medium") + .when(hl.is_defined(tx_annotation), "low") + .default("missing") + ) + else: + annotation_expr["transcript_id"] = mt.csqs.transcript_id + annotation_expr["canonical"] = hl.is_defined(mt.csqs.canonical) + + mt = mt.annotate_rows(**annotation_expr) + return ( + mt.group_rows_by(*list(annotation_expr.keys())) + .aggregate_rows( + n_sites=hl.agg.count(), + n_sites_array=hl.agg.array_sum(mt.freq.map(lambda x: hl.int(x.AC > 0))), + classic_caf=hl.agg.sum(mt[freq_field][freq_index].AF), + max_af=hl.agg.max(mt[freq_field][freq_index].AF), + classic_caf_array=hl.agg.array_sum(mt[freq_field].map(lambda x: x.AF)), + ) + .aggregate_entries( + num_homs=hl.agg.count_where(mt.GT.is_hom_var()), + num_hets=hl.agg.count_where(mt.GT.is_het()), + defined_sites=hl.agg.count_where(hl.is_defined(mt.GT)), + ) + .result() + )
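+
+# Editor's usage sketch (not part of the module): build a gene-level LoF
+# matrix from a VEP-annotated MatrixTable without transcript expression data,
+# keeping only rare (AF < 5%) LOFTEE-pass variants.
+def _example_gene_lof_matrix(mt: hl.MatrixTable) -> hl.MatrixTable:
+    return default_generate_gene_lof_matrix(mt, tx_ht=None, filter_to_rare=True)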
+ + +
[docs]def get_het_hom_summary_dict( + csq_set: Set[str], + most_severe_csq_expr: hl.expr.StringExpression, + defined_sites_expr: hl.expr.Int64Expression, + num_homs_expr: hl.expr.Int64Expression, + num_hets_expr: hl.expr.Int64Expression, + pop_expr: hl.expr.StringExpression, +) -> Dict[str, hl.expr.Int64Expression]: + """ + Generate dictionary containing summary counts. + + Summary counts are: + - Number of sites with defined genotype calls + - Number of samples with heterozygous calls + - Number of samples with homozygous calls + + Function has option to generate counts by population. + + :param csq_set: Set containing transcript consequence string(s). + :param most_severe_csq_expr: StringExpression containing most severe consequence. + :param defined_sites_expr: Int64Expression containing number of sites with defined genotype calls. + :param num_homs_expr: Int64Expression containing number of samples with homozygous genotype calls. + :param num_hets_expr: Int64Expression containing number of samples with heterozygous genotype calls. + :param pop_expr: StringExpression containing sample population labels. + :return: Dictionary of summary annotation names and their values. + """ + csq_filter_expr = hl.literal(csq_set).contains(most_severe_csq_expr) + return { + "no_alt_calls": hl.agg.count_where( + (csq_filter_expr) + & (defined_sites_expr > 0) + & (num_homs_expr + num_hets_expr == 0) + ), + "obs_het": hl.agg.count_where( + (csq_filter_expr) & (num_homs_expr == 0) & (num_hets_expr > 0) + ), + "obs_hom": hl.agg.count_where((csq_filter_expr) & (num_homs_expr > 0)), + "defined": hl.agg.count_where((csq_filter_expr) & (defined_sites_expr > 0)), + "pop_no_alt_calls": hl.agg.group_by( + pop_expr, + hl.agg.count_where( + (csq_filter_expr) + & (defined_sites_expr > 0) + & (num_homs_expr + num_hets_expr == 0) + ), + ), + "pop_obs_het": hl.agg.group_by( + pop_expr, + hl.agg.count_where( + (csq_filter_expr) & (num_homs_expr == 0) & (num_hets_expr > 0) + ), + ), + "pop_obs_hom": hl.agg.group_by( + pop_expr, + hl.agg.count_where((csq_filter_expr) & (num_homs_expr > 0)), + ), + "pop_defined": hl.agg.group_by( + pop_expr, + hl.agg.count_where((csq_filter_expr) & (defined_sites_expr > 0)), + ), + }
+ + +
[docs]def default_generate_gene_lof_summary(
+    mt: hl.MatrixTable,
+    collapse_indels: bool = False,
+    tx: bool = False,
+    lof_csq_set: Set[str] = LOF_CSQ_SET,
+    meta_root: str = "meta",
+    pop_field: str = "pop",
+    filter_loftee: bool = False,
+) -> hl.Table:
+    """
+    Generate summary counts for loss-of-function (LoF), missense, and synonymous variants.
+
+    Also calculates p, the proportion of haplotypes carrying a putative LoF (pLoF) variant,
+    and the observed/expected (OE) ratio of samples with homozygous pLoF variant calls.
+
+    Summary counts are (all per gene):
+        - Number of samples with no pLoF variants.
+        - Number of samples with heterozygous pLoF variants.
+        - Number of samples with homozygous pLoF variants.
+        - Total number of sites with genotype calls.
+        - All of the above stats grouped by population.
+
+    Assumes MT was created using `default_generate_gene_lof_matrix`.
+
+    .. note::
+        Assumes LoF variants in MT were filtered (LOFTEE pass and no LoF flag only).
+        If LoF variants have not been filtered and `filter_loftee` is True,
+        expects MT has the row annotation `vep`.
+
+    :param mt: Input MatrixTable.
+    :param collapse_indels: Whether to collapse indels. Default is False.
+    :param tx: Whether input MT has transcript expression data. Default is False.
+    :param lof_csq_set: Set containing LoF transcript consequence strings. Default is LOF_CSQ_SET.
+    :param meta_root: String indicating top level name for sample metadata. Default is 'meta'.
+    :param pop_field: String indicating field with sample population assignment information. Default is 'pop'.
+    :param filter_loftee: Filters to LOFTEE pass variants (and no LoF flags) only. Default is False.
+    :return: Table with het/hom summary counts.
+    """
+    if collapse_indels:
+        grouping = ["gene_id", "gene", "most_severe_consequence"]
+        if tx:
+            grouping.append("expressed")
+        else:
+            grouping.extend(["transcript_id", "canonical"])
+        mt = (
+            mt.group_rows_by(*grouping)
+            .aggregate_rows(
+                n_sites=hl.agg.sum(mt.n_sites),
+                n_sites_array=hl.agg.array_sum(mt.n_sites_array),
+                classic_caf=hl.agg.sum(mt.classic_caf),
+                max_af=hl.agg.max(mt.max_af),
+                classic_caf_array=hl.agg.array_sum(mt.classic_caf_array),
+            )
+            .aggregate_entries(
+                num_homs=hl.agg.sum(mt.num_homs),
+                num_hets=hl.agg.sum(mt.num_hets),
+                defined_sites=hl.agg.sum(mt.defined_sites),
+            )
+            .result()
+        )
+
+    if filter_loftee:
+        lof_ht = get_most_severe_consequence_for_summary(mt.rows())
+        mt = mt.filter_rows(
+            hl.is_defined(lof_ht[mt.row_key].lof)
+            & (lof_ht[mt.row_key].lof == "HC")
+            & (lof_ht[mt.row_key].no_lof_flags)
+        )
+
+    ht = mt.annotate_rows(
+        lof=hl.struct(
+            **get_het_hom_summary_dict(
+                csq_set=lof_csq_set,
+                most_severe_csq_expr=mt.most_severe_consequence,
+                defined_sites_expr=mt.defined_sites,
+                num_homs_expr=mt.num_homs,
+                num_hets_expr=mt.num_hets,
+                pop_expr=mt[meta_root][pop_field],
+            ),
+        ),
+        missense=hl.struct(
+            **get_het_hom_summary_dict(
+                csq_set={"missense_variant"},
+                most_severe_csq_expr=mt.most_severe_consequence,
+                defined_sites_expr=mt.defined_sites,
+                num_homs_expr=mt.num_homs,
+                num_hets_expr=mt.num_hets,
+                pop_expr=mt[meta_root][pop_field],
+            ),
+        ),
+        synonymous=hl.struct(
+            **get_het_hom_summary_dict(
+                csq_set={"synonymous_variant"},
+                most_severe_csq_expr=mt.most_severe_consequence,
+                defined_sites_expr=mt.defined_sites,
+                num_homs_expr=mt.num_homs,
+                num_hets_expr=mt.num_hets,
+                pop_expr=mt[meta_root][pop_field],
+            ),
+        ),
+    ).rows()
+    ht = ht.annotate(
+        p=(1 - hl.sqrt(hl.float64(ht.lof.no_alt_calls) / ht.lof.defined)),
+        pop_p=hl.dict(
hl.array(ht.lof.pop_defined).map( + lambda x: ( + x[0], + 1 - hl.sqrt(hl.float64(ht.lof.pop_no_alt_calls.get(x[0])) / x[1]), + ) + ) + ), + ) + ht = ht.annotate(exp_hom_lof=ht.lof.defined * ht.p * ht.p) + return ht.annotate(oe=ht.lof.obs_hom / ht.exp_hom_lof)
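+
+# Editor's usage sketch (not part of the module): collapse the LoF matrix by
+# indel status and compute per-gene het/hom carrier counts plus p and O/E.
+def _example_gene_lof_summary(lof_mt: hl.MatrixTable) -> hl.Table:
+    return default_generate_gene_lof_summary(lof_mt, collapse_indels=True)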
\ No newline at end of file
diff --git a/_modules/gnomad/assessment/validity_checks.html b/_modules/gnomad/assessment/validity_checks.html
new file mode 100644
index 000000000..c85e722e9
--- /dev/null
+++ b/_modules/gnomad/assessment/validity_checks.html
@@ -0,0 +1,1290 @@
+gnomad.assessment.validity_checks — gnomad master documentation

Source code for gnomad.assessment.validity_checks

+# noqa: D100
+
+import logging
+from pprint import pprint
+from typing import Any, Dict, List, Optional, Union
+
+import hail as hl
+from hail.utils.misc import new_temp_file
+
+from gnomad.resources.grch38.gnomad import CURRENT_MAJOR_RELEASE, POPS, SEXES
+from gnomad.utils.vcf import HISTS, SORT_ORDER, make_label_combos
+
+logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s")
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+
+
[docs]def generic_field_check(
+    ht: hl.Table,
+    check_description: str,
+    display_fields: hl.expr.StructExpression,
+    cond_expr: hl.expr.BooleanExpression = None,
+    verbose: bool = False,
+    show_percent_sites: bool = False,
+    n_fail: Optional[int] = None,
+    ht_count: Optional[int] = None,
+) -> None:
+    """
+    Check a generic logical condition `cond_expr` involving annotations in a Hail Table and print the results to stdout.
+
+    Displays the number of rows (and the percent of rows, if `show_percent_sites` is True) in the Table that fail the check: either the precomputed `n_fail`, or the count of rows matching `cond_expr`. If that number is 0, the Table passes the check described by `check_description`; otherwise, it fails.
+
+    .. note::
+
+        `cond_expr` and `check_description` are opposites and should never be the same.
+        E.g., if `cond_expr` filters for instances where the raw AC is less than adj AC,
+        then it is checking sites that fail to be the desired condition (`check_description`)
+        of having a raw AC greater than or equal to the adj AC.
+
+    :param ht: Table containing annotations to be checked.
+    :param check_description: String describing the condition being checked; displayed in the stdout summary message.
+    :param display_fields: StructExpression containing annotations to be displayed in case of failure (for troubleshooting purposes); these fields are also displayed if verbose is True.
+    :param cond_expr: Optional logical expression referring to annotations in ht to be checked.
+    :param verbose: If True, show top values of annotations being checked, including checks that pass; if False, show only top values of annotations that fail checks.
+    :param show_percent_sites: Show percentage of sites that fail checks. Default is False.
+    :param n_fail: Optional number of sites that fail the conditional checks (previously computed). If not supplied, `cond_expr` is used to filter the Table and obtain the count of sites that fail the checks.
+    :param ht_count: Optional number of sites within the Hail Table (previously computed). If not supplied, a count of sites in the Table is performed.
+    :return: None
+    """
+    if n_fail is None and cond_expr is None:
+        raise ValueError("At least one of n_fail or cond_expr must be defined!")
+
+    if n_fail is None and cond_expr is not None:
+        n_fail = ht.filter(cond_expr).count()
+
+    if show_percent_sites and (ht_count is None):
+        ht_count = ht.count()
+
+    if n_fail > 0:
+        logger.info("Found %d sites that fail %s check:", n_fail, check_description)
+        if show_percent_sites:
+            logger.info(
+                "Percentage of sites that fail: %.2f %%", 100 * (n_fail / ht_count)
+            )
+        if cond_expr is not None:
+            ht = ht.select(_fail=cond_expr, **display_fields)
+            ht.filter(ht._fail).drop("_fail").show()
+    else:
+        logger.info("PASSED %s check", check_description)
+        if verbose:
+            ht.select(**display_fields).show()
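+
+# Editor's usage sketch (not part of the module): confirm that raw AC is never
+# smaller than adj AC on a release Table. The "AC-raw"/"AC-adj" info fields
+# follow the delimiter convention used elsewhere in this module and are
+# assumptions about the input Table.
+def _example_raw_vs_adj_check(ht: hl.Table) -> None:
+    generic_field_check(
+        ht,
+        check_description="AC-raw >= AC-adj",
+        display_fields=hl.struct(AC_raw=ht.info["AC-raw"], AC_adj=ht.info["AC-adj"]),
+        cond_expr=ht.info["AC-raw"] < ht.info["AC-adj"],
+        show_percent_sites=True,
+    )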
+ + +
[docs]def make_filters_expr_dict(
+    ht: hl.Table,
+    extra_filter_checks: Optional[Dict[str, hl.expr.Expression]] = None,
+    variant_filter_field: str = "RF",
+) -> Dict[str, hl.expr.Expression]:
+    """
+    Make Hail expressions to measure % variants filtered under varying conditions of interest.
+
+    Checks for:
+        - Total number of variants
+        - Fraction of variants removed due to:
+            - Any filter
+            - Inbreeding coefficient filter in combination with any other filter
+            - AC0 filter in combination with any other filter
+            - `variant_filter_field` filtering in combination with any other filter
+            - Only inbreeding coefficient filter
+            - Only AC0 filter
+            - Only filtering defined by `variant_filter_field`
+
+    :param ht: Table containing 'filter' annotation to be examined.
+    :param extra_filter_checks: Optional dictionary containing filter condition name (key) and extra filter expressions (value) to be examined.
+    :param variant_filter_field: String of variant filtration used in the filters annotation on `ht` (e.g. RF, VQSR, AS_VQSR). Default is "RF".
+    :return: Dictionary containing Hail aggregation expressions to examine filter flags.
+    """
+    filters_dict = {
+        "n": hl.agg.count(),
+        "frac_any_filter": hl.agg.fraction(hl.len(ht.filters) != 0),
+        "frac_inbreed_coeff": hl.agg.fraction(ht.filters.contains("InbreedingCoeff")),
+        "frac_ac0": hl.agg.fraction(ht.filters.contains("AC0")),
+        f"frac_{variant_filter_field.lower()}": hl.agg.fraction(
+            ht.filters.contains(variant_filter_field)
+        ),
+        "frac_inbreed_coeff_only": hl.agg.fraction(
+            ht.filters.contains("InbreedingCoeff") & (ht.filters.length() == 1)
+        ),
+        "frac_ac0_only": hl.agg.fraction(
+            ht.filters.contains("AC0") & (ht.filters.length() == 1)
+        ),
+        f"frac_{variant_filter_field.lower()}_only": hl.agg.fraction(
+            ht.filters.contains(variant_filter_field) & (ht.filters.length() == 1)
+        ),
+    }
+    if extra_filter_checks:
+        filters_dict.update(extra_filter_checks)
+
+    return filters_dict
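+
+# Editor's usage sketch (not part of the module): aggregate the filter
+# fractions over a whole Table whose `filters` annotation was produced by
+# AS_VQSR filtering.
+def _example_filter_fractions(ht: hl.Table) -> hl.Struct:
+    return ht.aggregate(
+        hl.struct(**make_filters_expr_dict(ht, variant_filter_field="AS_VQSR"))
+    )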
+ + +
[docs]def make_group_sum_expr_dict(
+    t: Union[hl.MatrixTable, hl.Table],
+    subset: str,
+    label_groups: Dict[str, List[str]],
+    sort_order: List[str] = SORT_ORDER,
+    delimiter: str = "-",
+    metric_first_field: bool = True,
+    metrics: List[str] = ["AC", "AN", "nhomalt"],
+) -> Dict[str, Dict[str, Union[hl.expr.Int64Expression, hl.expr.StructExpression]]]:
+    """
+    Compute the sum of call stats annotations for a specified group of annotations, compare to the annotated version, and display the result in stdout.
+
+    For example, if subset1 consists of pop1, pop2, and pop3, check that t.info.AC-subset1 == sum(t.info.AC-subset1-pop1, t.info.AC-subset1-pop2, t.info.AC-subset1-pop3).
+
+    :param t: Input MatrixTable or Table containing call stats annotations to be summed.
+    :param subset: String indicating sample subset.
+    :param label_groups: Dictionary containing an entry for each label group, where key is the name of the grouping, e.g. "sex" or "pop", and value is a list of all possible values for that grouping (e.g. ["XY", "XX"] or ["afr", "nfe", "amr"]).
+    :param sort_order: List containing order to sort label group combinations. Default is SORT_ORDER.
+    :param delimiter: String to use as delimiter when making group label combinations. Default is "-".
+    :param metric_first_field: If True, metric precedes subset in the Table's fields, e.g. AC-hgdp. If False, subset precedes metric, e.g. hgdp-AC. Default is True.
+    :param metrics: List of metrics to sum and compare to annotated versions. Default is ["AC", "AN", "nhomalt"].
+    :return: Dictionary of sample sum field check expressions and display fields.
+    """
+    t = t.rows() if isinstance(t, hl.MatrixTable) else t
+
+    # Check if subset string is provided to avoid adding a delimiter to empty string
+    # (An empty string is passed to run this check on the entire callset)
+    if subset:
+        subset += delimiter
+
+    label_combos = make_label_combos(label_groups, label_delimiter=delimiter)
+    # Grab the first group for the check and remove it from the label_groups
+    # dictionary. In gnomAD, this is 'adj', as we do not retain the raw metric
+    # counts for all sample groups, so we do not check raw sample sums.
+    group = label_groups.pop("group")[0]
+    # sum_group is the type of high-level annotation to sum,
+    # e.g. 'pop', 'pop-sex', 'sex'.
+    sum_group = delimiter.join(
+        sorted(label_groups.keys(), key=lambda x: sort_order.index(x))
+    )
+    info_fields = t.info.keys()
+
+    # Loop through the metrics and the label combos to build a dictionary
+    # where the key is a string representing the sum_group annotations and the value is the sum of these annotations.
+ # If metric_first_field is True, metric is AC, subset is tgp, group is adj, sum_group is pop, then the values below are: + # sum_group_exprs = ["AC-tgp-pop1", "AC-tgp-pop2", "AC-tgp-pop3"] + # annot_dict = {'sum-AC-tgp-adj-pop': hl.sum(["AC-tgp-adj-pop1", + # "AC-tgp-adj-pop2", "AC-tgp-adj-pop3"])} + annot_dict = {} + for metric in metrics: + if metric_first_field: + field_prefix = f"{metric}{delimiter}{subset}" + else: + field_prefix = f"{subset}{metric}{delimiter}" + + sum_group_exprs = [] + for label in label_combos: + field = f"{field_prefix}{label}" + if field in info_fields: + sum_group_exprs.append(t.info[field]) + else: + logger.warning("%s is not in table's info field", field) + + annot_dict[f"sum{delimiter}{field_prefix}{group}{delimiter}{sum_group}"] = ( + hl.sum(sum_group_exprs) + ) + + # If metric_first_field is True, metric is AC, subset is tgp, sum_group is pop, and group is adj, then the values below are: + # check_field_left = "AC-tgp-adj" + # check_field_right = "sum-AC-tgp-adj-pop" to match the annotation dict + # key from above + field_check_expr = {} + for metric in metrics: + if metric_first_field: + check_field_left = f"{metric}{delimiter}{subset}{group}" + else: + check_field_left = f"{subset}{metric}{delimiter}{group}" + check_field_right = f"sum{delimiter}{check_field_left}{delimiter}{sum_group}" + field_check_expr[f"{check_field_left} = {check_field_right}"] = { + "expr": t.info[check_field_left] != annot_dict[check_field_right], + "agg_func": hl.agg.count_where, + "display_fields": hl.struct( + **{ + check_field_left: t.info[check_field_left], + check_field_right: annot_dict[check_field_right], + } + ), + } + return field_check_expr
+ + +
[docs]def compare_row_counts(ht1: hl.Table, ht2: hl.Table) -> bool: + """ + Check if the row counts in two Tables are the same. + + :param ht1: First Table to be checked. + :param ht2: Second Table to be checked. + :return: Whether the row counts are the same. + """ + r_count1 = ht1.count() + r_count2 = ht2.count() + logger.info("%d rows in left table; %d rows in right table", r_count1, r_count2) + return r_count1 == r_count2
+ + +
[docs]def summarize_variant_filters(
+    t: Union[hl.MatrixTable, hl.Table],
+    variant_filter_field: str = "RF",
+    problematic_regions: List[str] = ["lcr", "segdup", "nonpar"],
+    single_filter_count: bool = False,
+    site_gt_check_expr: Dict[str, hl.expr.BooleanExpression] = None,
+    extra_filter_checks: Optional[Dict[str, hl.expr.Expression]] = None,
+    n_rows: int = 50,
+    n_cols: int = 140,
+) -> None:
+    """
+    Summarize variants filtered under various conditions in input MatrixTable or Table.
+
+    Summarize counts for:
+        - Total number of variants
+        - Fraction of variants removed due to:
+            - Any filter
+            - Inbreeding coefficient filter in combination with any other filter
+            - AC0 filter in combination with any other filter
+            - `variant_filter_field` filtering in combination with any other filter
+            - Only inbreeding coefficient filter
+            - Only AC0 filter
+            - Only `variant_filter_field` filtering
+
+    :param t: Input MatrixTable or Table to be checked.
+    :param variant_filter_field: String of variant filtration used in the filters annotation on `ht` (e.g. RF, VQSR, AS_VQSR). Default is "RF".
+    :param problematic_regions: List of regions considered problematic to run filter check in. Default is ["lcr", "segdup", "nonpar"].
+    :param single_filter_count: If True, explode the Table's filter column and give a supplemental total count of each filter. Default is False.
+    :param site_gt_check_expr: Optional dictionary of strings and boolean expressions typically used to log how many monoallelic or 100% heterozygous sites are in the Table.
+    :param extra_filter_checks: Optional dictionary containing filter condition name (key) and extra filter expressions (value) to be examined.
+    :param n_rows: Number of rows to display only when showing percentages of filtered variants grouped by multiple conditions. Default is 50.
+    :param n_cols: Number of columns to display only when showing percentages of filtered variants grouped by multiple conditions. Default is 140.
+    :return: None
+    """
+    t = t.rows() if isinstance(t, hl.MatrixTable) else t
+
+    filters = t.aggregate(hl.agg.counter(t.filters))
+    logger.info("Variant filter counts: %s", filters)
+
+    if single_filter_count:
+        exp_t = t.explode(t.filters)
+        filters = exp_t.aggregate(hl.agg.counter(exp_t.filters))
+        logger.info("Exploded variant filter counts: %s", filters)
+
+    if site_gt_check_expr is not None:
+        for k, m_expr in site_gt_check_expr.items():
+            if isinstance(t, hl.MatrixTable):
+                gt_check_sites = t.filter_rows(m_expr).count_rows()
+            else:
+                gt_check_sites = t.filter(m_expr).count()
+            logger.info("There are %d %s sites in the dataset.", gt_check_sites, k)
+
+    filtered_expr = hl.len(t.filters) > 0
+    problematic_region_expr = hl.any(
+        lambda x: x, [t.info[region] for region in problematic_regions]
+    )
+
+    t = t.annotate(
+        is_filtered=filtered_expr, in_problematic_region=problematic_region_expr
+    )
+
+    def _filter_agg_order(
+        t: Union[hl.MatrixTable, hl.Table],
+        group_exprs: Dict[str, hl.expr.Expression],
+        n_rows: Optional[int] = None,
+        n_cols: Optional[int] = None,
+    ) -> None:
+        """
+        Perform validity checks to measure percentages of variants filtered under different conditions.
+
+        :param t: Input MatrixTable or Table.
+        :param group_exprs: Dictionary of expressions to group the Table by.
+        :param n_rows: Number of rows to show. Default is None (to display 10 rows).
+ :param n_cols: Number of columns to show. Default is None (to display 10 cols). + :return: None + """ + t = t.rows() if isinstance(t, hl.MatrixTable) else t + # NOTE: make_filters_expr_dict returns a dict with %ages of variants filtered + t.group_by(**group_exprs).aggregate( + **make_filters_expr_dict(t, extra_filter_checks, variant_filter_field) + ).order_by(hl.desc("n")).show(n_rows, n_cols) + + logger.info( + "Checking distributions of filtered variants amongst variant filters..." + ) + _filter_agg_order(t, {"is_filtered": t.is_filtered}, n_rows, n_cols) + + add_agg_expr = {} + if "allele_type" in t.info: + logger.info("Checking distributions of variant type amongst variant filters...") + add_agg_expr["allele_type"] = t.info.allele_type + _filter_agg_order(t, add_agg_expr, n_rows, n_cols) + + if "in_problematic_region" in t.row: + logger.info( + "Checking distributions of variant type and region type amongst variant" + " filters..." + ) + add_agg_expr["in_problematic_region"] = t.in_problematic_region + _filter_agg_order(t, add_agg_expr, n_rows, n_cols) + + if "n_alt_alleles" in t.info: + logger.info( + "Checking distributions of variant type, region type, and number of alt alleles" + " amongst variant filters..." + ) + add_agg_expr["n_alt_alleles"] = t.info.n_alt_alleles + _filter_agg_order(t, add_agg_expr, n_rows, n_cols)
+ + +
[docs]def generic_field_check_loop( + ht: hl.Table, + field_check_expr: Dict[str, Dict[str, Any]], + verbose: bool, + show_percent_sites: bool = False, + ht_count: int = None, +) -> None: + """ + Loop through all conditional checks for a given hail Table. + + This loop allows aggregation across the hail Table once, as opposed to aggregating during every conditional check. + + :param ht: Table containing annotations to be checked. + :param field_check_expr: Dictionary whose keys are conditions being checked and values are the expressions for filtering to condition. + :param verbose: If True, show top values of annotations being checked, including checks that pass; if False, show only top values of annotations that fail checks. + :param show_percent_sites: Show percentage of sites that fail checks. Default is False. + :param ht_count: Previously computed sum of sites within hail Table. Default is None. + :return: None + """ + ht_field_check_counts = ht.aggregate( + hl.struct(**{k: v["agg_func"](v["expr"]) for k, v in field_check_expr.items()}) + ) + for check_description, n_fail in ht_field_check_counts.items(): + generic_field_check( + ht, + check_description=check_description, + n_fail=n_fail, + display_fields=field_check_expr[check_description]["display_fields"], + cond_expr=field_check_expr[check_description]["expr"], + verbose=verbose, + show_percent_sites=show_percent_sites, + ht_count=ht_count, + )
+ + +
[docs]def compare_subset_freqs( + t: Union[hl.MatrixTable, hl.Table], + subsets: List[str], + verbose: bool, + show_percent_sites: bool = True, + delimiter: str = "-", + metric_first_field: bool = True, + metrics: List[str] = ["AC", "AN", "nhomalt"], +) -> None: + """ + Perform validity checks on frequency data in input Table. + + Check: + - Number of sites where callset frequency is equal to a subset frequency (raw and adj) + - eg. t.info.AC-adj != t.info.AC-subset1-adj + - Total number of sites where the raw allele count annotation is defined + + :param t: Input MatrixTable or Table. + :param subsets: List of sample subsets. + :param verbose: If True, show top values of annotations being checked, including checks that pass; if False, show only top values of annotations that fail checks. + :param show_percent_sites: If True, show the percentage and count of overall sites that fail; if False, only show the number of sites that fail. + :param delimiter: String to use as delimiter when making group label combinations. Default is "-". + :param metric_first_field: If True, metric precedes subset, e.g. AC-non_v2-. If False, subset precedes metric, non_v2-AC-XY. Default is True. + :param metrics: List of metrics to compare between subset and entire callset. Default is ["AC", "AN", "nhomalt"]. + :return: None + """ + t = t.rows() if isinstance(t, hl.MatrixTable) else t + + field_check_expr = {} + for subset in subsets: + if subset: + for metric in metrics: + for group in ["adj", "raw"]: + logger.info( + "Comparing the %s subset's %s %s to entire callset's %s %s", + subset, + group, + metric, + group, + metric, + ) + check_field_left = f"{metric}{delimiter}{group}" + if metric_first_field: + check_field_right = ( + f"{metric}{delimiter}{subset}{delimiter}{group}" + ) + else: + check_field_right = ( + f"{subset}{delimiter}{metric}{delimiter}{group}" + ) + + field_check_expr[f"{check_field_left} != {check_field_right}"] = { + "expr": t.info[check_field_left] == t.info[check_field_right], + "agg_func": hl.agg.count_where, + "display_fields": hl.struct( + **{ + check_field_left: t.info[check_field_left], + check_field_right: t.info[check_field_right], + } + ), + } + + generic_field_check_loop( + t, + field_check_expr, + verbose, + show_percent_sites=show_percent_sites, + ) + + # Spot check the raw AC counts + total_defined_raw_ac = t.aggregate( + hl.agg.count_where(hl.is_defined(t.info[f"AC{delimiter}raw"])) + ) + logger.info("Total defined raw AC count: %s", total_defined_raw_ac)
+ + +
[docs]def sum_group_callstats(
+    t: Union[hl.MatrixTable, hl.Table],
+    sexes: List[str] = SEXES,
+    subsets: List[str] = [""],
+    pops: List[str] = POPS[CURRENT_MAJOR_RELEASE]["exomes"],
+    groups: List[str] = ["adj"],
+    additional_subsets_and_pops: Dict[str, List[str]] = None,
+    verbose: bool = False,
+    sort_order: List[str] = SORT_ORDER,
+    delimiter: str = "-",
+    metric_first_field: bool = True,
+    metrics: List[str] = ["AC", "AN", "nhomalt"],
+) -> None:
+    """
+    Compute the sum of annotations for a specified group of annotations, and compare to the annotated version.
+
+    Displays results from checking the sum of the specified annotations in stdout.
+    Also checks that annotations for all expected sample populations are present.
+
+    :param t: Input MatrixTable or Table.
+    :param sexes: List of sexes in table.
+    :param subsets: List of sample subsets that contain pops passed in pops parameter. An empty string, e.g. "", should be passed to test entire callset. Default is [""].
+    :param pops: List of pops contained within the subsets. Default is POPS[CURRENT_MAJOR_RELEASE]["exomes"].
+    :param groups: List of callstat groups, e.g. "adj" and "raw" contained within the callset. gnomAD does not store the raw callstats for the pop or sex groupings of any subset. Default is ["adj"].
+    :param additional_subsets_and_pops: Dict with subset (keys) and list of the subset's specific populations (values). Default is None.
+    :param verbose: If True, show top values of annotations being checked, including checks that pass; if False, show only top values of annotations that fail checks. Default is False.
+    :param sort_order: List containing order to sort label group combinations. Default is SORT_ORDER.
+    :param delimiter: String to use as delimiter when making group label combinations. Default is "-".
+    :param metric_first_field: If True, metric precedes label group, e.g. AC-afr-male. If False, label group precedes metric, e.g. afr-male-AC. Default is True.
+    :param metrics: List of metrics to sum and compare to annotated versions. Default is ["AC", "AN", "nhomalt"].
+    :return: None
+    """
+    # TODO: Add support for subpop sums
+    t = t.rows() if isinstance(t, hl.MatrixTable) else t
+
+    field_check_expr = {}
+    default_pop_subset = {subset: pops for subset in subsets}
+    sample_sum_sets_and_pops = (
+        {**default_pop_subset, **additional_subsets_and_pops}
+        if additional_subsets_and_pops
+        else default_pop_subset
+    )
+    for subset, pops in sample_sum_sets_and_pops.items():
+        for group in groups:
+            field_check_expr_s = make_group_sum_expr_dict(
+                t,
+                subset,
+                dict(group=[group], pop=pops),
+                sort_order,
+                delimiter,
+                metric_first_field,
+                metrics,
+            )
+            field_check_expr.update(field_check_expr_s)
+            field_check_expr_s = make_group_sum_expr_dict(
+                t,
+                subset,
+                dict(group=[group], sex=sexes),
+                sort_order,
+                delimiter,
+                metric_first_field,
+                metrics,
+            )
+            field_check_expr.update(field_check_expr_s)
+            field_check_expr_s = make_group_sum_expr_dict(
+                t,
+                subset,
+                dict(group=[group], pop=pops, sex=sexes),
+                sort_order,
+                delimiter,
+                metric_first_field,
+                metrics,
+            )
+            field_check_expr.update(field_check_expr_s)
+
+    generic_field_check_loop(t, field_check_expr, verbose)
+ + +
[docs]def summarize_variants(
+    t: Union[hl.MatrixTable, hl.Table],
+) -> hl.Struct:
+    """
+    Get summary of variants in a MatrixTable or Table.
+
+    Print the number of variants to stdout and check that each chromosome has variant calls.
+
+    :param t: Input MatrixTable or Table to be checked.
+    :return: Struct of variant summary.
+    """
+    if isinstance(t, hl.MatrixTable):
+        logger.info("Dataset has %d samples.", t.count_cols())
+
+    var_summary = hl.summarize_variants(t, show=False)
+    logger.info(
+        "Dataset has %d variants distributed across the following contigs: %s",
+        var_summary.n_variants,
+        var_summary.contigs,
+    )
+
+    for contig in var_summary.contigs:
+        if var_summary.contigs[contig] == 0:
+            logger.warning("%s has no variants called", contig)
+
+    return var_summary
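+
+# Editor's usage sketch (not part of the module): log sample and variant
+# counts and reuse the returned summary downstream.
+def _example_summarize_variants(mt: hl.MatrixTable) -> int:
+    var_summary = summarize_variants(mt)
+    return var_summary.n_variants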
+ + +
[docs]def check_raw_and_adj_callstats( + t: Union[hl.MatrixTable, hl.Table], + subsets: List[str], + verbose: bool, + delimiter: str = "-", + metric_first_field: bool = True, +) -> None: + """ + Perform validity checks on raw and adj data in input Table/MatrixTable. + + Check that: + - Raw AC and AF are not 0 + - AC and AF are not negative + - Raw values for AC, AN, nhomalt in each sample subset are greater than or equal to their corresponding adj values + + Raw and adj call stat annotations must be in an info struct annotation on the Table/MatrixTable, e.g. t.info.AC-raw. + + :param t: Input MatrixTable or Table to check. + :param subsets: List of sample subsets. + :param verbose: If True, show top values of annotations being checked, including checks that pass; if False, show only top values of annotations that fail checks. + :param delimiter: String to use as delimiter when making group label combinations. Default is "-". + :param metric_first_field: If True, metric precedes label group, e.g. AC-afr-male. If False, label group precedes metric, afr-male-AC. Default is True. + :return: None + """ + t = t.rows() if isinstance(t, hl.MatrixTable) else t + + field_check_expr = {} + + for group in ["raw", "adj"]: + # Check AC and nhomalt missing if AN is missing and defined if AN is defined. + for subfield in ["AC", "nhomalt"]: + check_field = f"{subfield}{delimiter}{group}" + an_field = f"AN{delimiter}{group}" + field_check_expr[ + f"{check_field} defined when AN defined and missing when AN missing" + ] = { + "expr": hl.if_else( + hl.is_missing(t.info[an_field]), + hl.is_defined(t.info[check_field]), + hl.is_missing(t.info[check_field]), + ), + "agg_func": hl.agg.count_where, + "display_fields": hl.struct( + **{an_field: t.info[an_field], check_field: t.info[check_field]} + ), + } + + # Check AF missing if AN is missing and defined if AN is defined and > 0. + check_field = f"AF{delimiter}{group}" + an_field = f"AN{delimiter}{group}" + field_check_expr[ + f"{check_field} defined when AN defined (and > 0) and missing when AN missing" + ] = { + "expr": hl.if_else( + hl.is_missing(t.info[an_field]), + hl.is_defined(t.info[check_field]), + (t.info[an_field] > 0) & hl.is_missing(t.info[check_field]), + ), + "agg_func": hl.agg.count_where, + "display_fields": hl.struct( + **{an_field: t.info[an_field], check_field: t.info[check_field]} + ), + } + + # Check raw and adj AF missing if AN is 0. 
+ check_field = f"AF{delimiter}{group}" + an_field = f"AN{delimiter}{group}" + field_check_expr[f"{check_field} missing when AN 0"] = { + "expr": (t.info[an_field] == 0) & hl.is_defined(t.info[check_field]), + "agg_func": hl.agg.count_where, + "display_fields": hl.struct( + **{an_field: t.info[an_field], check_field: t.info[check_field]} + ), + } + + for subfield in ["AC", "AF"]: + # Check raw AC, AF > 0 + check_field = f"{subfield}{delimiter}raw" + field_check_expr[f"{check_field} > 0"] = { + "expr": t.info[check_field] <= 0, + "agg_func": hl.agg.count_where, + "display_fields": hl.struct(**{check_field: t.info[check_field]}), + } + + # Check adj AC, AF > 0 + check_field = f"{subfield}{delimiter}adj" + field_check_expr[f"{check_field} >= 0"] = { + "expr": t.info[check_field] < 0, + "agg_func": hl.agg.count_where, + "display_fields": hl.struct( + **{check_field: t.info[check_field], "filters": t.filters} + ), + } + + # Check overall gnomad's raw subfields >= adj + for subfield in ["AC", "AN", "nhomalt"]: + check_field_left = f"{subfield}{delimiter}raw" + check_field_right = f"{subfield}{delimiter}adj" + + field_check_expr[f"{check_field_left} >= {check_field_right}"] = { + "expr": t.info[check_field_left] < t.info[check_field_right], + "agg_func": hl.agg.count_where, + "display_fields": hl.struct( + **{ + check_field_left: t.info[check_field_left], + check_field_right: t.info[check_field_right], + } + ), + } + + for subset in subsets: + # Add delimiter for subsets but not "" representing entire callset + if subset: + subset += delimiter + field_check_label = ( + f"{subfield}{delimiter}{subset}" + if metric_first_field + else f"{subset}{subfield}{delimiter}" + ) + check_field_left = f"{field_check_label}raw" + check_field_right = f"{field_check_label}adj" + + field_check_expr[f"{check_field_left} >= {check_field_right}"] = { + "expr": t.info[check_field_left] < t.info[check_field_right], + "agg_func": hl.agg.count_where, + "display_fields": hl.struct( + **{ + check_field_left: t.info[check_field_left], + check_field_right: t.info[check_field_right], + } + ), + } + + generic_field_check_loop(t, field_check_expr, verbose)
+ + +
[docs]def check_sex_chr_metrics( + t: Union[hl.MatrixTable, hl.Table], + info_metrics: List[str], + contigs: List[str], + verbose: bool, + delimiter: str = "-", +) -> None: + """ + Perform validity checks for annotations on the sex chromosomes. + + Check: + - That metrics for chrY variants in XX samples are NA and not 0 + - That nhomalt counts are equal to XX nhomalt counts for all non-PAR chrX variants + + :param t: Input MatrixTable or Table. + :param info_metrics: List of metrics in info struct of input Table. + :param contigs: List of contigs present in input Table. + :param verbose: If True, show top values of annotations being checked, including checks that pass; if False, show only top values of annotations that fail checks. + :param delimiter: String to use as the delimiter in XX metrics. Default is "-". + :return: None + """ + t = t.rows() if isinstance(t, hl.MatrixTable) else t + + xx_metrics = [x for x in info_metrics if f"{delimiter}XX" in x] + + if "chrY" in contigs: + logger.info("Check values of XX metrics for Y variants are NA:") + t_y = hl.filter_intervals(t, [hl.parse_locus_interval("chrY")]) + metrics_values = {} + for metric in xx_metrics: + metrics_values[metric] = hl.agg.any(hl.is_defined(t_y.info[metric])) + output = dict(t_y.aggregate(hl.struct(**metrics_values))) + for metric, value in output.items(): + if value: + values_found = t_y.aggregate( + hl.agg.filter( + hl.is_defined(t_y.info[metric]), + hl.agg.take(t_y.info[metric], 1), + ) + ) + logger.info( + "FAILED %s = %s check for Y variants. Values found: %s", + metric, + None, + values_found, + ) + else: + logger.info("PASSED %s = %s check for Y variants", metric, None) + + t_x = hl.filter_intervals(t, [hl.parse_locus_interval("chrX")]) + t_xnonpar = t_x.filter(t_x.locus.in_x_nonpar()) + n = t_xnonpar.count() + logger.info("Found %d X nonpar sites", n) + + logger.info("Check (nhomalt == nhomalt_xx) for X nonpar variants:") + xx_metrics = [x for x in xx_metrics if "nhomalt" in x] + + field_check_expr = {} + for metric in xx_metrics: + standard_field = metric.replace(f"{delimiter}XX", "") + check_field_left = f"{metric}" + check_field_right = f"{standard_field}" + field_check_expr[f"{check_field_left} == {check_field_right}"] = { + "expr": t_xnonpar.info[check_field_left] + != t_xnonpar.info[check_field_right], + "agg_func": hl.agg.count_where, + "display_fields": hl.struct( + **{ + check_field_left: t_xnonpar.info[check_field_left], + check_field_right: t_xnonpar.info[check_field_right], + } + ), + } + + generic_field_check_loop(t_xnonpar, field_check_expr, verbose)
+ + +
[docs]def compute_missingness(
+    t: Union[hl.MatrixTable, hl.Table],
+    info_metrics: List[str],
+    non_info_metrics: List[str],
+    n_sites: int,
+    missingness_threshold: float,
+) -> None:
+    """
+    Check amount of missingness in all row annotations.
+
+    Print a metric to stdout if the fraction of missing values for that metric exceeds `missingness_threshold`.
+
+    :param t: Input MatrixTable or Table.
+    :param info_metrics: List of metrics in info struct of input Table.
+    :param non_info_metrics: List of row annotations minus info struct from input Table.
+    :param n_sites: Number of sites in input Table.
+    :param missingness_threshold: Upper cutoff for allowed amount of missingness.
+    :return: None
+    """
+    t = t.rows() if isinstance(t, hl.MatrixTable) else t
+
+    logger.info(
+        "Missingness threshold (upper cutoff for what is allowed for missingness"
+        " checks): %.2f",
+        missingness_threshold,
+    )
+    metrics_missing = {}
+    for x in info_metrics:
+        metrics_missing[x] = hl.agg.sum(hl.is_missing(t.info[x]))
+    for x in non_info_metrics:
+        metrics_missing[x] = hl.agg.sum(hl.is_missing(t[x]))
+    output = dict(t.aggregate(hl.struct(**metrics_missing)))
+
+    n_fail = 0
+    for metric, n_missing in output.items():
+        if n_missing / n_sites > missingness_threshold:
+            logger.info(
+                "FAILED missingness check for %s: %d sites or %.2f%% missing",
+                metric,
+                n_missing,
+                (100 * n_missing / n_sites),
+            )
+            n_fail += 1
+        else:
+            logger.info(
+                "Passed missingness check for %s: %d sites or %.2f%% missing",
+                metric,
+                n_missing,
+                (100 * n_missing / n_sites),
+            )
+    logger.info("%d missing metrics checks failed", n_fail)
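+
+# Editor's usage sketch (not part of the module): flag any info or top-level
+# row annotation missing at more than 50% of sites. Metric lists are derived
+# from the Table schema; key fields are excluded from the non-info list.
+def _example_missingness(ht: hl.Table) -> None:
+    compute_missingness(
+        ht,
+        info_metrics=list(ht.info),
+        non_info_metrics=[f for f in ht.row if f not in ("locus", "alleles", "info")],
+        n_sites=ht.count(),
+        missingness_threshold=0.5,
+    )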
+ + +
[docs]def vcf_field_check( + t: Union[hl.MatrixTable, hl.Table], + header_dict: Dict[str, Dict[str, Dict[str, str]]], + row_annotations: List[str] = None, + entry_annotations: List[str] = None, + hists: List[str] = HISTS, +) -> bool: + """ + Check that all VCF fields and descriptions are present in input Table and VCF header dictionary. + + :param t: Input MatrixTable or Table to be exported to VCF. + :param header_dict: VCF header dictionary. + :param row_annotations: List of row annotations in MatrixTable or Table. + :param entry_annotations: List of entry annotations to use if running this check on a MatrixTable. + :param hists: List of variant histogram annotations. Default is HISTS. + :return: Boolean with whether all expected fields and descriptions are present. + """ + hist_fields = [] + for hist in hists: + hist_fields.append(f"{hist}_bin_freq") + if "dp" in hist: + hist_fields.append(f"{hist}_n_larger") + + missing_fields = [] + missing_descriptions = [] + items = ["info", "filter"] + if entry_annotations: + items.append("format") + for item in items: + if item == "info": + annots = row_annotations + elif item == "format": + annots = entry_annotations + else: + annot_t = ( + t.explode_rows(t.filters) + if isinstance(t, hl.MatrixTable) + else t.explode(t.filters) + ) + annots = ( + list(annot_t.aggregate_rows(hl.agg.collect_as_set(annot_t.filters))) + if isinstance(t, hl.MatrixTable) + else list(annot_t.aggregate(hl.agg.collect_as_set(annot_t.filters))) + ) + + temp_missing_fields = [] + temp_missing_descriptions = [] + for field in annots: + try: + description = header_dict[item][field] + if len(description) == 0: + logger.warning( + "%s in %s field has empty description in VCF header!", field, item + ) + temp_missing_descriptions.append(field) + except KeyError: + logger.warning("%s in %s field does not exist in VCF header!", field, item) + # NOTE: END entry is not exported (removed during densify) + if isinstance(t, hl.MatrixTable) and (field != "END"): + temp_missing_fields.append(field) + + missing_fields.extend(temp_missing_fields) + missing_descriptions.extend(temp_missing_descriptions) + + if len(missing_fields) != 0 or len(missing_descriptions) != 0: + logger.error( + "Some fields are either missing or missing descriptions in the VCF header!" + " Please reconcile." + ) + logger.error("Missing fields: %s", missing_fields) + logger.error("Missing descriptions: %s", missing_descriptions) + return False + + logger.info("Passed VCF fields check!") + return True
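A hedged sketch of the expected `header_dict` shape, which matches the metadata dictionaries used by Hail's VCF export (e.g. as returned by `hl.get_vcf_metadata`); the Table path and annotations are hypothetical, and the Table is assumed to carry a `filters` field:

    import hail as hl

    ht = hl.read_table("gs://my-bucket/release.ht")  # hypothetical release Table
    header_dict = {
        "info": {"AC": {"Description": "Allele count", "Number": "A", "Type": "Integer"}},
        "filter": {"AC0": {"Description": "No sample had a high quality genotype"}},
    }
    passed = vcf_field_check(ht, header_dict, row_annotations=["AC"])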
+ + +
[docs]def check_global_and_row_annot_lengths( + t: Union[hl.MatrixTable, hl.Table], + row_to_globals_check: Dict[str, List[str]], + check_all_rows: bool = False, +) -> None: + """ + Check that the lengths of row annotations match the lengths of associated global annotations. + + :param t: Input MatrixTable or Table. + :param row_to_globals_check: Dictionary with row annotation (key) and list of associated global annotations (value) to compare. + :param check_all_rows: If True, check all rows in `t`; if False, check only the first row. Default is False. + :return: None + """ + t = t.rows() if isinstance(t, hl.MatrixTable) else t + if not check_all_rows: + t = t.head(1) + for row_field, global_fields in row_to_globals_check.items(): + if not check_all_rows: + logger.info( + "Checking length of %s in first row against length of globals: %s", + row_field, + global_fields, + ) + for global_field in global_fields: + global_len = hl.eval(hl.len(t[global_field])) + row_len_expr = hl.len(t[row_field]) + failed_rows = t.aggregate( + hl.struct( + n_fail=hl.agg.count_where(row_len_expr != global_len), + row_len=hl.agg.counter(row_len_expr), + ) + ) + outcome = "Failed" if failed_rows["n_fail"] > 0 else "Passed" + n_rows = t.count() + logger.info( + "%s global and row lengths comparison: Length of %s in" + " globals (%d) does %smatch length of %s in %d out of %d rows (%s)", + outcome, + global_field, + global_len, + "NOT " if outcome == "Failed" else "", + row_field, + failed_rows["n_fail"] if outcome == "Failed" else n_rows, + n_rows, + failed_rows["row_len"], + )
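A short sketch; `freq`, `freq_meta`, and `freq_index_dict` are conventional gnomAD annotation names and are assumed to exist on the hypothetical Table below:

    import hail as hl

    ht = hl.read_table("gs://my-bucket/release.ht")  # hypothetical release Table
    check_global_and_row_annot_lengths(
        ht,
        row_to_globals_check={"freq": ["freq_meta", "freq_index_dict"]},
        check_all_rows=False,
    )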
+ + +
[docs]def pprint_global_anns(t: Union[hl.MatrixTable, hl.Table]) -> None: + """ + Pretty print global annotations. + + :param t: Input MatrixTable or Table. + """ + global_pprint = {g: hl.eval(t[g]) for g in t.globals} + pprint(global_pprint, sort_dicts=False)
+ + +
[docs]def validate_release_t( + t: Union[hl.MatrixTable, hl.Table], + subsets: List[str] = [""], + pops: List[str] = POPS[CURRENT_MAJOR_RELEASE]["exomes"], + missingness_threshold: float = 0.5, + site_gt_check_expr: Dict[str, hl.expr.BooleanExpression] = None, + verbose: bool = False, + show_percent_sites: bool = True, + delimiter: str = "-", + metric_first_field: bool = True, + sum_metrics: List[str] = ["AC", "AN", "nhomalt"], + sexes: List[str] = SEXES, + groups: List[str] = ["adj"], + sample_sum_sets_and_pops: Dict[str, List[str]] = None, + sort_order: List[str] = SORT_ORDER, + variant_filter_field: str = "RF", + problematic_regions: List[str] = ["lcr", "segdup", "nonpar"], + single_filter_count: bool = False, + summarize_variants_check: bool = True, + filters_check: bool = True, + raw_adj_check: bool = True, + subset_freq_check: bool = True, + samples_sum_check: bool = True, + sex_chr_check: bool = True, + missingness_check: bool = True, + pprint_globals: bool = False, + row_to_globals_check: Optional[Dict[str, List[str]]] = None, + check_all_rows_in_row_to_global_check: bool = False, +) -> None: + """ + Perform a battery of validity checks on a specified group of subsets in a MatrixTable containing variant annotations. + + Includes: + - Summaries of % filter status for different partitions of variants + - Histogram outlier bin checks + - Checks on AC, AN, and AF annotations + - Checks that subgroup annotation values add up to the supergroup annotation values + - Checks on sex chromosome annotations + - Summaries of % missingness in variant annotations + + All annotations must be within an info struct, e.g. t.info.AC-raw. + + :param t: Input MatrixTable or Table containing variant annotations to check. + :param subsets: List of subsets to be checked. + :param pops: List of pops within main callset. Default is POPS[CURRENT_MAJOR_RELEASE]["exomes"]. + :param missingness_threshold: Upper cutoff for allowed amount of missingness. Default is 0.5. + :param site_gt_check_expr: Optional dictionary of strings and boolean expressions typically used to log how many monoallelic or 100% heterozygous sites are in the Table. + :param verbose: If True, display top values of relevant annotations being checked, regardless of whether check conditions are violated; if False, display only top values of relevant annotations if check conditions are violated. + :param show_percent_sites: Show percentage of sites that fail checks. Default is True. + :param delimiter: String to use as delimiter when making group label combinations. Default is "-". + :param metric_first_field: If True, metric precedes label group, e.g. AC-afr-male. If False, label group precedes metric, e.g. afr-male-AC. Default is True. + :param sum_metrics: List of metrics to sum and compare to annotated versions and between subsets and entire callset. Default is ["AC", "AN", "nhomalt"]. + :param sexes: List of sexes in table. Default is SEXES. + :param groups: List of callstat groups, e.g. "adj" and "raw" contained within the callset. gnomAD does not store the raw callstats for the pop or sex groupings of any subset. Default is ["adj"]. + :param sample_sum_sets_and_pops: Dict with subset (keys) and populations within subset (values) for sample sum check. + :param sort_order: List containing order to sort label group combinations. Default is SORT_ORDER. + :param variant_filter_field: String of variant filtration used in the filters annotation on `t` (e.g. RF, VQSR, AS_VQSR). Default is "RF". 
+ :param problematic_regions: List of regions considered problematic to run filter check in. Default is ["lcr", "segdup", "nonpar"]. + :param single_filter_count: If True, explode the Table's filter column and give a supplemental total count of each filter. Default is False. + :param summarize_variants_check: When True, runs the summarize_variants method. Default is True. + :param filters_check: When True, runs the summarize_variant_filters method. Default is True. + :param raw_adj_check: When True, runs the check_raw_and_adj_callstats method. Default is True. + :param subset_freq_check: When True, runs the compare_subset_freqs method. Default is True. + :param samples_sum_check: When True, runs the sum_group_callstats method. Default is True. + :param sex_chr_check: When True, runs the check_sex_chr_metrics method. Default is True. + :param missingness_check: When True, runs the compute_missingness method. Default is True. + :param pprint_globals: When True, pretty print the globals of the input Table. Default is False. + :param row_to_globals_check: Optional dictionary of row annotations (keys) and lists of associated global annotations (values) to be checked. When passed, function checks that the lengths of the global and row annotations are equal. + :param check_all_rows_in_row_to_global_check: If True, check all rows in `t` in `row_to_globals_check`; if False, check only the first row. Default is False. + :return: None (stdout display of results from the battery of validity checks). + """ + if pprint_globals: + logger.info("GLOBALS OF INPUT TABLE:") + pprint_global_anns(t) + + if row_to_globals_check is not None: + logger.info("COMPARE GLOBAL ANNOTATIONS' LENGTHS TO ROW ANNOTATIONS:") + check_global_and_row_annot_lengths( + t, row_to_globals_check, check_all_rows_in_row_to_global_check + ) + + if summarize_variants_check: + logger.info("BASIC SUMMARY OF INPUT TABLE:") + summarize_variants(t) + + if filters_check: + logger.info("VARIANT FILTER SUMMARIES:") + summarize_variant_filters( + t, + variant_filter_field, + problematic_regions, + single_filter_count, + site_gt_check_expr, + ) + + if raw_adj_check: + logger.info("RAW AND ADJ CHECKS:") + check_raw_and_adj_callstats(t, subsets, verbose, delimiter, metric_first_field) + + if subset_freq_check: + logger.info("SUBSET FREQUENCY CHECKS:") + compare_subset_freqs( + t, + subsets, + verbose, + show_percent_sites, + delimiter, + metric_first_field, + sum_metrics, + ) + + if samples_sum_check: + logger.info("CALLSET ANNOTATIONS TO SUM GROUP CHECKS:") + sum_group_callstats( + t, + sexes, + subsets, + pops, + groups, + sample_sum_sets_and_pops, + verbose, + sort_order, + delimiter, + metric_first_field, + sum_metrics, + ) + + info_metrics = list(t.row.info) + + if sex_chr_check: + logger.info("SEX CHROMOSOME ANNOTATION CHECKS:") + contigs = t.aggregate(hl.agg.collect_as_set(t.locus.contig)) + check_sex_chr_metrics(t, info_metrics, contigs, verbose, delimiter) + + if missingness_check: + logger.info("MISSINGNESS CHECKS:") + non_info_metrics = list(t.row) + non_info_metrics.remove("info") + n_sites = t.count() + compute_missingness( + t, info_metrics, non_info_metrics, n_sites, missingness_threshold + ) + logger.info("VALIDITY CHECKS COMPLETE")
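A minimal invocation sketch (hypothetical path and subset name); the defaults run the full battery of checks:

    import hail as hl

    ht = hl.read_table("gs://my-bucket/gnomad_release.ht")  # hypothetical release Table
    validate_release_t(
        ht,
        subsets=["non_ukb"],
        missingness_threshold=0.5,
        variant_filter_field="AS_VQSR",
        verbose=False,
    )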
+ + +
[docs]def count_vep_annotated_variants_per_interval( + vep_ht: hl.Table, interval_ht: hl.Table +) -> hl.Table: + """ + Calculate the count of VEP annotated variants in `vep_ht` per interval defined by `interval_ht`. + + .. note:: + + - `vep_ht` must contain the 'vep.transcript_consequences' array field, which + contains a 'biotype' field to determine whether a variant is in a + "protein-coding" gene. + - `interval_ht` should be indexed by 'locus' and contain a 'gene_stable_ID' + field. For example, an interval Table containing the intervals of + protein-coding genes of a specific Ensembl release. + + The returned Table will have the following fields added: + - all_variants: The number of total variants in the interval. + - variants_in_pcg: The number of variants in the interval that are annotated as + "protein-coding". + + :param vep_ht: VEP-annotated Table. + :param interval_ht: Interval Table. + :return: Interval Table with annotations for the counts of total variants and + variants annotated as "protein-coding" in biotype. + """ + logger.info( + "Counting the number of total variants and protein-coding variants in each" + " interval..." + ) + + # Select the vep_ht and annotate genes that have a matched interval from + # the interval_ht and are protein-coding. + vep_ht = vep_ht.select( + gene_stable_ID=interval_ht.index(vep_ht.locus, all_matches=True).gene_stable_ID, + in_pcg=vep_ht.vep.transcript_consequences.biotype.contains("protein_coding"), + ) + + vep_ht = vep_ht.filter(hl.is_defined(vep_ht.gene_stable_ID)) + + # Explode the vep_ht by gene_stable_ID. + vep_ht = vep_ht.explode(vep_ht.gene_stable_ID) + + # Count the number of total variants and "protein-coding" variants in each interval. + count_ht = vep_ht.group_by(vep_ht.gene_stable_ID).aggregate( + all_variants=hl.agg.count(), + variants_in_pcg=hl.agg.count_where(vep_ht.in_pcg), + ) + + interval_ht = interval_ht.annotate(**count_ht[interval_ht.gene_stable_ID]) + + logger.info("Checkpointing the counts per interval...") + interval_ht = interval_ht.checkpoint( + new_temp_file("validity_checks.vep_count_per_interval", extension="ht"), + overwrite=True, + ) + + logger.info("Genes without variants annotated: ") + gene_sets = interval_ht.aggregate( + hl.struct( + na_genes=hl.agg.filter( + hl.is_missing(interval_ht.variants_in_pcg) + | (interval_ht.variants_in_pcg == 0), + hl.agg.collect_as_set(interval_ht.gene_stable_ID), + ), + partial_pcg_genes=hl.agg.filter( + (interval_ht.all_variants != 0) + & (interval_ht.variants_in_pcg != 0) + & (interval_ht.all_variants != interval_ht.variants_in_pcg), + hl.agg.collect_as_set(interval_ht.gene_stable_ID), + ), + ) + ) + + logger.info( + "%s gene(s) have no variants annotated as protein-coding in biotype. It is" + " likely these genes are not covered by the variants in 'vep_ht'. These" + " genes are: %s", + len(gene_sets.na_genes), + gene_sets.na_genes, + ) + + logger.info( + "%s gene(s) have a subset of variants annotated as protein-coding biotype" + " in their defined intervals", + len(gene_sets.partial_pcg_genes), + ) + + return interval_ht
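A usage sketch with hypothetical paths; `vep_ht` and `interval_ht` must satisfy the field requirements in the note above:

    import hail as hl

    vep_ht = hl.read_table("gs://my-bucket/sites_vep.ht")  # hypothetical VEP-annotated Table
    interval_ht = hl.read_table("gs://my-bucket/gene_intervals.ht")  # hypothetical interval Table
    interval_ht = count_vep_annotated_variants_per_interval(vep_ht, interval_ht)
    interval_ht.show()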
+
+ +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/_modules/gnomad/resources/config.html b/_modules/gnomad/resources/config.html new file mode 100644 index 000000000..d088cea39 --- /dev/null +++ b/_modules/gnomad/resources/config.html @@ -0,0 +1,209 @@ + + + + + + gnomad.resources.config — gnomad master documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for gnomad.resources.config

+"""Configuration for loading resources."""
+
+import logging
+import os
+from enum import Enum
+from typing import Union
+
+logger = logging.getLogger(__name__)
+
+
+
[docs]class GnomadPublicResourceSource(Enum): + """Sources for public gnomAD resources.""" + + GNOMAD = "gnomAD" + GOOGLE_CLOUD_PUBLIC_DATASETS = "Google Cloud Public Datasets" + REGISTRY_OF_OPEN_DATA_ON_AWS = "Registry of Open Data on AWS" + AZURE_OPEN_DATASETS = "Azure Open Datasets"
+ + +
[docs]def get_default_public_resource_source() -> Union[GnomadPublicResourceSource, str]: + """ + Get the default source for public gnomAD resources. + + The default source is determined as follows: + + - If the ``GNOMAD_DEFAULT_PUBLIC_RESOURCE_SOURCE`` environment variable is set, use the source configured there. + - Otherwise, if Hail determines that it is running in a cloud provider's Spark environment, use the source from that cloud provider. + For example, use Azure Open Datasets if running on an Azure HDInsight cluster. + - Otherwise, use Google Cloud Public Datasets. + + :returns: Default resource source + """ + default_source_from_env = os.getenv("GNOMAD_DEFAULT_PUBLIC_RESOURCE_SOURCE", None) + if default_source_from_env: + # Convert to a GnomadPublicResourceSource enum if possible + try: + default_source = GnomadPublicResourceSource(default_source_from_env) + logger.info( + "Using configured source for gnomAD resources: %s", default_source.value + ) + return default_source + except ValueError: + logger.info( + "Using configured custom source for gnomAD resources: %s", + default_source_from_env, + ) + return default_source_from_env + + try: + from hail.utils import guess_cloud_spark_provider + except ImportError: + pass + else: + cloud_spark_provider = guess_cloud_spark_provider() + default_resource_sources_by_provider = { + "dataproc": GnomadPublicResourceSource.GOOGLE_CLOUD_PUBLIC_DATASETS, + "hdinsight": GnomadPublicResourceSource.AZURE_OPEN_DATASETS, + } + if cloud_spark_provider: + try: + default_source_from_provider = default_resource_sources_by_provider[ + cloud_spark_provider + ] + logger.info( + "Using default source for gnomAD resources based on cloud" + " provider: %s", + default_source_from_provider, + ) + return default_source_from_provider + except KeyError: + pass + + return GnomadPublicResourceSource.GOOGLE_CLOUD_PUBLIC_DATASETS
+ + +class _GnomadPublicResourceConfiguration: + """Configuration for public gnomAD resources.""" + + _source: Union[GnomadPublicResourceSource, str, None] = None + + @property + def source(self) -> Union[GnomadPublicResourceSource, str]: + """ + Get the source for public gnomAD resource files. + + This is used to determine which URLs gnomAD resources will be loaded from. + + :returns: Source name or path to root of resources directory + """ + if self._source is None: + self._source = get_default_public_resource_source() + + return self._source + + @source.setter + def source(self, source: Union[GnomadPublicResourceSource, str]) -> None: + """ + Set the default source for resource files. + + This is used to determine which URLs gnomAD resources will be loaded from. + + :param source: Source name or path to root of resources directory + """ + self._source = source + + +gnomad_public_resource_configuration = _GnomadPublicResourceConfiguration() +
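A configuration sketch showing both supported forms of `source` (an enum member or a plain string root path; the mirror path below is hypothetical):

    from gnomad.resources.config import (
        GnomadPublicResourceSource,
        gnomad_public_resource_configuration,
    )

    # Load subsequent gnomAD resources from the Registry of Open Data on AWS.
    gnomad_public_resource_configuration.source = (
        GnomadPublicResourceSource.REGISTRY_OF_OPEN_DATA_ON_AWS
    )

    # Or point at a custom mirror (hypothetical path).
    gnomad_public_resource_configuration.source = "gs://my-mirror/gnomad-resources"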
+ +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/_modules/gnomad/resources/grch37/gnomad.html b/_modules/gnomad/resources/grch37/gnomad.html new file mode 100644 index 000000000..b4cec58ef --- /dev/null +++ b/_modules/gnomad/resources/grch37/gnomad.html @@ -0,0 +1,323 @@ + + + + + + gnomad.resources.grch37.gnomad — gnomad master documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+
    +
  • + + +
  • +
  • +
+
+
+
+
+ +

Source code for gnomad.resources.grch37.gnomad

+# noqa: D100
+
+from gnomad.resources.resource_utils import (
+    DataException,
+    GnomadPublicTableResource,
+    VersionedTableResource,
+)
+
+DATA_TYPES = ["exomes", "genomes"]
+
+CURRENT_EXOME_RELEASE = "2.1.1"
+CURRENT_GENOME_RELEASE = "2.1.1"
+
+EXOME_RELEASES = ["2.1", "2.1.1"]
+GENOME_RELEASES = ["2.1", "2.1.1"]
+
+SUBPOPS = {
+    "NFE": ["BGR", "EST", "NWE", "SEU", "SWE", "ONF"],
+    "EAS": ["KOR", "JPN", "OEA"],
+}
+GENOME_POPS = ["AFR", "AMR", "ASJ", "EAS", "FIN", "NFE", "OTH"]
+EXOME_POPS = ["AFR", "AMR", "ASJ", "EAS", "FIN", "NFE", "OTH", "SAS"]
+EXAC_POPS = ["AFR", "AMR", "EAS", "FIN", "NFE", "OTH", "SAS"]
+
+POP_NAMES = {
+    "oth": "Other",
+    "afr": "African-American/African",
+    "ami": "Amish",
+    "amr": "Latino",
+    "eas": "East Asian",
+    "fin": "Finnish",
+    "eur": "European",
+    "nfe": "Non-Finnish European",
+    "sas": "South Asian",
+    "mde": "Middle Eastern",
+    "asj": "Ashkenazi Jewish",
+    "uniform": "Uniform",
+    "sas_non_consang": "South Asian (F < 0.05)",
+    "consanguineous": "South Asian (F > 0.05)",
+    "exac": "ExAC",
+    "bgr": "Bulgarian (Eastern European)",
+    "deu": "German",
+    "est": "Estonian",
+    "esp": "Spanish",
+    "gbr": "British",
+    "nwe": "North-Western European",
+    "seu": "Southern European",
+    "ita": "Italian",
+    "swe": "Swedish",
+    "chn": "Chinese",
+    "kor": "Korean",
+    "hkg": "Hong Kong",
+    "sgp": "Singaporean",
+    "twn": "Taiwanese",
+    "jpn": "Japanese",
+    "oea": "Other East Asian",
+    "oeu": "Other European",
+    "onf": "Other Non-Finnish European",
+    "unk": "Unknown",
+}
+
+
+def _public_release_ht_path(data_type: str, version: str) -> str:
+    """
+    Get public release table path.
+
+    :param data_type: One of "exomes" or "genomes"
+    :param version: One of the release versions of gnomAD on GRCh37
+    :return: Path to release Table
+    """
+    return f"gs://gnomad-public-requester-pays/release/{version}/ht/{data_type}/gnomad.{data_type}.r{version}.sites.ht"
+
+
+def _public_coverage_ht_path(data_type: str, version: str) -> str:
+    """
+    Get public coverage Hail Table path.
+
+    :param data_type: One of "exomes" or "genomes"
+    :param version: One of the release versions of gnomAD on GRCh37
+    :return: Path to coverage Table
+    """
+    return f"gs://gnomad-public-requester-pays/release/{version}/coverage/{data_type}/gnomad.{data_type}.r{version}.coverage.ht"
+
+
+def _public_pca_ht_path(subpop: str) -> str:
+    """
+    Get public PCA loadings path.
+
+    :param subpop: Can be empty ("") -> global, "eas" or "nfe"
+    :return: Path to release Table
+    """
+    subpop = f".{subpop}" if subpop else ""
+    return f"gs://gnomad-public-requester-pays/release/2.1/pca/gnomad.r2.1.pca_loadings{subpop}.ht"
+
+
+def _liftover_data_path(data_type: str, version: str) -> str:
+    """
+    Get path to the GRCh38 liftover of a gnomAD Table.
+
+    :param data_type: One of `exomes` or `genomes`
+    :param version: One of the release versions of gnomAD on GRCh37
+    :return: Path to chosen Table
+    """
+    return f"gs://gnomad-public-requester-pays/release/{version}/liftover_grch38/ht/{data_type}/gnomad.{data_type}.r{version}.sites.liftover_grch38.ht"
+
+
+
[docs]def public_release(data_type: str) -> VersionedTableResource: + """ + Retrieve publicly released versioned table resource. + + :param data_type: One of "exomes" or "genomes" + :return: Release Table + """ + if data_type not in DATA_TYPES: + raise DataException(f"{data_type} not in {DATA_TYPES}") + + if data_type == "exomes": + current_release = CURRENT_EXOME_RELEASE + releases = EXOME_RELEASES + else: + current_release = CURRENT_GENOME_RELEASE + releases = GENOME_RELEASES + + return VersionedTableResource( + current_release, + { + release: GnomadPublicTableResource( + path=_public_release_ht_path(data_type, release) + ) + for release in releases + }, + )
+ + +
[docs]def coverage(data_type: str) -> VersionedTableResource: + """ + Retrieve gnomAD's coverage table by data_type. + + :param data_type: One of "exomes" or "genomes" + :return: Coverage Table + """ + if data_type not in DATA_TYPES: + raise DataException(f"{data_type} not in {DATA_TYPES}") + + if data_type == "exomes": + current_release = "2.1" + releases = [r for r in EXOME_RELEASES if r != "2.1.1"] + else: + current_release = "2.1" + releases = [r for r in GENOME_RELEASES if r != "2.1.1"] + + return VersionedTableResource( + current_release, + { + release: GnomadPublicTableResource( + path=_public_coverage_ht_path(data_type, release) + ) + for release in releases + }, + )
+ + +
[docs]def liftover(data_type: str) -> VersionedTableResource: + """ + Get the GRCh38 liftover of gnomAD v2.1.1. + + :param data_type: One of "exomes" or "genomes" + :return: Release Table + """ + if data_type not in DATA_TYPES: + raise DataException(f"{data_type} not in {DATA_TYPES}") + + if data_type == "exomes": + current_release = CURRENT_EXOME_RELEASE + releases = [r for r in EXOME_RELEASES if r != "2.1"] + else: + current_release = CURRENT_GENOME_RELEASE + releases = [r for r in GENOME_RELEASES if r != "2.1"] + + return VersionedTableResource( + current_release, + { + release: GnomadPublicTableResource( + path=_liftover_data_path(data_type, release) + ) + for release in releases + }, + )
+ + +
[docs]def public_pca_loadings(subpop: str = "") -> GnomadPublicTableResource: + """ + Return the TableResource containing sites and loadings from population PCA. + + :param subpop: Can be empty ("") -> global, "eas" or "nfe" + :return: gnomAD public PCA loadings TableResource + """ + if subpop not in ["", "eas", "nfe"]: + raise DataException( + 'Available subpops are "eas" or "nfe", default value "" for global' + ) + + return GnomadPublicTableResource(path=_public_pca_ht_path(subpop))
+ + +
[docs]def release_vcf_path(data_type: str, version: str, contig: str) -> str: + """ + Publicly released VCF. Provide a specific contig, e.g. "20", to retrieve a contig-specific VCF. + + :param data_type: One of "exomes" or "genomes" + :param version: One of the release versions of gnomAD on GRCh37 + :param contig: Single contig "1" to "Y" + :return: Path to VCF + """ + if not version.startswith("2"): + raise DataException( + f"gnomAD version {version} is not available on reference genome GRCh37" + ) + + contig = f".{contig}" if contig else "" + return f"gs://gcp-public-data--gnomad/release/{version}/vcf/{data_type}/gnomad.{data_type}.r{version}.sites{contig}.vcf.bgz"
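A short sketch tying the GRCh37 resource helpers together; `.ht()` reads the underlying Hail Table for the resource's default version:

    from gnomad.resources.grch37.gnomad import coverage, public_release, release_vcf_path

    exomes_ht = public_release("exomes").ht()  # current (2.1.1) exome sites Table
    coverage_ht = coverage("genomes").ht()     # genome coverage Table
    vcf_path = release_vcf_path("exomes", version="2.1.1", contig="20")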
+
+ +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/_modules/gnomad/resources/grch37/gnomad_ld.html b/_modules/gnomad/resources/grch37/gnomad_ld.html new file mode 100644 index 000000000..962ce854f --- /dev/null +++ b/_modules/gnomad/resources/grch37/gnomad_ld.html @@ -0,0 +1,188 @@ + + + + + + gnomad.resources.grch37.gnomad_ld — gnomad master documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+
    +
  • + + +
  • +
  • +
+
+
+
+
+ +

Source code for gnomad.resources.grch37.gnomad_ld

+# noqa: D100
+
+from typing import Optional
+
+from gnomad.resources.grch37.gnomad import CURRENT_EXOME_RELEASE, CURRENT_GENOME_RELEASE
+from gnomad.resources.resource_utils import (
+    GnomadPublicBlockMatrixResource,
+    GnomadPublicTableResource,
+)
+
+
+def _ld_matrix_path(
+    data_type: str,
+    pop: str,
+    common_only: bool = True,
+    adj: bool = True,
+    version: Optional[str] = None,
+):
+    if version is None:
+        version = (
+            CURRENT_EXOME_RELEASE if data_type == "exomes" else CURRENT_GENOME_RELEASE
+        )
+    subdir = "sv/" if data_type == "genomes_snv_sv" else ""
+    return f'gs://gnomad-public-requester-pays/release/{version}/ld/{subdir}gnomad.{data_type}.r{version}.{pop}.{"common." if common_only else ""}{"adj." if adj else ""}ld.bm'
+
+
+def _ld_index_path(
+    data_type: str,
+    pop: str,
+    common_only: bool = True,
+    adj: bool = True,
+    version: Optional[str] = None,
+):
+    if version is None:
+        version = (
+            CURRENT_EXOME_RELEASE if data_type == "exomes" else CURRENT_GENOME_RELEASE
+        )
+    subdir = "sv/" if data_type == "genomes_snv_sv" else ""
+    return f'gs://gnomad-public-requester-pays/release/{version}/ld/{subdir}gnomad.{data_type}.r{version}.{pop}.{"common." if common_only else ""}{"adj." if adj else ""}ld.variant_indices.ht'
+
+
+def _ld_snv_sv_path(pop):
+    return f"gs://gnomad-public-requester-pays/release/2.1.1/ld/sv/gnomad.genomes_snv_sv.r2.1.1.{pop}.snv_sv.ld.ht"
+
+
+def _ld_snv_sv_index_path(pop, type):
+    return f"gs://gnomad-public-requester-pays/release/2.1.1/ld/sv/gnomad.genomes_snv_sv.r2.1.1.{pop}.snv_sv.ld.{type}.txt.bgz"
+
+
+def _cross_pop_ld_scores_path(
+    data_type: str,
+    pop1: str,
+    pop2: str,
+    adj: bool = True,
+    version: Optional[str] = None,
+):
+    if version is None:
+        version = (
+            CURRENT_EXOME_RELEASE if data_type == "exomes" else CURRENT_GENOME_RELEASE
+        )
+    return f'gs://gnomad-public-requester-pays/release/{version}/ld/scores/gnomad.{data_type}.r{version}.{pop1}.{pop2}.{"adj." if adj else ""}ld_scores.ht'
+
+
+def _ld_scores_path(
+    data_type: str, pop: str, adj: bool = True, version: Optional[str] = None
+):
+    if version is None:
+        version = (
+            CURRENT_EXOME_RELEASE if data_type == "exomes" else CURRENT_GENOME_RELEASE
+        )
+    return f'gs://gnomad-public-requester-pays/release/{version}/ld/scores/gnomad.{data_type}.r{version}.{pop}.{"adj." if adj else ""}ld_scores.ht'
+
+
+
[docs]def ld_matrix(pop: str) -> GnomadPublicBlockMatrixResource: + """Get resource for the LD matrix for the given population.""" + return GnomadPublicBlockMatrixResource(path=_ld_matrix_path("genomes", pop))
+ + +
[docs]def ld_index(pop: str) -> GnomadPublicTableResource: + """Get resource for the LD indices for the given population.""" + return GnomadPublicTableResource(path=_ld_index_path("genomes", pop))
+ + +
[docs]def ld_scores(pop: str) -> GnomadPublicTableResource: + """Get resource for the LD scores for the given population.""" + return GnomadPublicTableResource(path=_ld_scores_path("genomes", pop))
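A usage sketch, assuming the resource classes expose the usual `.bm()`/`.ht()` readers:

    from gnomad.resources.grch37.gnomad_ld import ld_index, ld_matrix, ld_scores

    nfe_ld_bm = ld_matrix("nfe").bm()         # Hail BlockMatrix of pairwise LD
    nfe_ld_index_ht = ld_index("nfe").ht()    # maps variants to matrix indices
    nfe_ld_scores_ht = ld_scores("nfe").ht()  # per-variant LD scores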
+
+ +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/_modules/gnomad/resources/grch37/reference_data.html b/_modules/gnomad/resources/grch37/reference_data.html new file mode 100644 index 000000000..c857fd4e2 --- /dev/null +++ b/_modules/gnomad/resources/grch37/reference_data.html @@ -0,0 +1,474 @@ + + + + + + gnomad.resources.grch37.reference_data — gnomad master documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+
    +
  • + + +
  • +
  • +
+
+
+
+
+ +

Source code for gnomad.resources.grch37.reference_data

+# noqa: D100
+
+import hail as hl
+
+from gnomad.resources.resource_utils import (
+    GnomadPublicMatrixTableResource,
+    GnomadPublicTableResource,
+    VersionedMatrixTableResource,
+    VersionedTableResource,
+    import_gencode,
+    import_sites_vcf,
+)
+
+
+def _import_gtex_rsem(gtex_path: str, meta_path: str, **kwargs) -> hl.MatrixTable:
+    """
+    Import GTEx RSEM data from expression data and sample attributes file.
+
+    .. note::
+
+        Files are downloaded from https://www.gtexportal.org/home/downloads/adult-gtex.
+        We get the transcript TPM under Bulk tissue expression and sample attributes
+        under Metadata. The transcript TPM file is expected to have transcript
+        expression data, with transcript IDs as the first column and gene IDs as the
+        second column.
+
+    :param gtex_path: Path to the GTEx RSEM file.
+    :param meta_path: Path to the GTEx sample attributes file.
+    :param kwargs: Any additional parameters to be passed to Hail's `import_matrix_table`.
+    :return: Matrix Table with GTEx RSEM data with tissue information.
+    """
+    meta_ht = hl.import_table(meta_path, force_bgz=True, impute=True)
+    meta_ht = meta_ht.key_by("SAMPID")
+
+    mt = hl.import_matrix_table(
+        gtex_path,
+        row_fields={"transcript_id": hl.tstr, "gene_id": hl.tstr},
+        entry_type=hl.tfloat64,
+        force_bgz=True,
+        **kwargs,
+    )
+
+    mt = mt.rename({"x": "transcript_tpm", "col_id": "s"})
+
+    # GTEx data has gene IDs and transcript IDs with version numbers; we need
+    # to remove the version numbers so the table can later be joined with VEP
+    # transcript consequences on transcript_id.
+    mt = mt.annotate_cols(
+        tissue=meta_ht[mt.s]
+        .SMTSD.replace(" ", "")
+        .replace("-", "_")
+        .replace("\\(", "_")
+        .replace("\\)", "")
+    )
+    mt = mt.annotate_rows(
+        transcript_id=mt.transcript_id.split("\\.")[0],
+        gene_id=mt.gene_id.split("\\.")[0],
+    )
+    mt = mt.key_rows_by("transcript_id").drop("row_id")
+
+    return mt
+
+
+na12878_giab = GnomadPublicMatrixTableResource(
+    path="gs://gnomad-public-requester-pays/resources/grch37/na12878/NA12878_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-Solid-10X_CHROM1-X_v3.3_highconf.mt",
+    import_func=hl.import_vcf,
+    import_args={
+        "path": "gs://gcp-public-data--gnomad/resources/grch37/na12878/NA12878_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-Solid-10X_CHROM1-X_v3.3_highconf.vcf.bgz",
+        "force_bgz": True,
+        "min_partitions": 100,
+        "reference_genome": "GRCh37",
+    },
+)
+
+hapmap = GnomadPublicTableResource(
+    path="gs://gnomad-public-requester-pays/resources/grch37/hapmap/hapmap_3.3.b37.ht",
+    import_func=import_sites_vcf,
+    import_args={
+        "path": "gs://gcp-public-data--gnomad/resources/grch37/hapmap/hapmap_3.3.b37.vcf.bgz",
+        "force_bgz": True,
+        "min_partitions": 100,
+        "reference_genome": "GRCh37",
+    },
+)
+
+kgp_omni = GnomadPublicTableResource(
+    path="gs://gnomad-public-requester-pays/resources/grch37/kgp/1000G_omni2.5.b37.ht",
+    import_func=import_sites_vcf,
+    import_args={
+        "path": "gs://gcp-public-data--gnomad/resources/grch37/kgp/1000G_omni2.5.b37.vcf.bgz",
+        "force_bgz": True,
+        "min_partitions": 100,
+        "reference_genome": "GRCh37",
+    },
+)
+
+mills = GnomadPublicTableResource(
+    path="gs://gnomad-public-requester-pays/resources/grch37/mills/Mills_and_1000G_gold_standard.indels.b37.ht",
+    import_func=import_sites_vcf,
+    import_args={
+        "path": "gs://gcp-public-data--gnomad/resources/grch37/mills/Mills_and_1000G_gold_standard.indels.b37.vcf.bgz",
+        "force_bgz": True,
+        "min_partitions": 100,
+        "reference_genome": "GRCh37",
+    },
+)
+
+syndip = GnomadPublicMatrixTableResource(
+    path="gs://gnomad-public-requester-pays/resources/grch37/syndip/hybrid.m37m.mt",
+    import_func=hl.import_vcf,
+    import_args={
+        "path": (
+            "gs://gcp-public-data--gnomad/resources/grch37/syndip/hybrid.m37m.vcf.bgz"
+        ),
+        "min_partitions": 100,
+        "reference_genome": "GRCh37",
+    },
+)
+
+# Versioned resources: versions should be listed from most recent to oldest
+vep_context = VersionedTableResource(
+    default_version="85",
+    versions={
+        "85": GnomadPublicTableResource(
+            path="gs://gnomad-public-requester-pays/resources/context/grch37_context_vep_annotated.ht",
+        )
+    },
+)
+
+dbsnp = VersionedTableResource(
+    default_version="20180423",
+    versions={
+        "20180423": GnomadPublicTableResource(
+            path="gs://gnomad-public-requester-pays/resources/grch37/dbsnp/All_20180423.ht",
+            import_func=import_sites_vcf,
+            import_args={
+                "path": "gs://gcp-public-data--gnomad/resources/grch37/dbsnp/All_20180423.vcf.bgz",
+                "force_bgz": True,
+                "skip_invalid_loci": True,
+                "min_partitions": 100,
+                "reference_genome": "GRCh37",
+            },
+        )
+    },
+)
+
+clinvar = VersionedTableResource(
+    default_version="20181028",
+    versions={
+        "20181028": GnomadPublicTableResource(
+            path="gs://gnomad-public-requester-pays/resources/grch37/clinvar/clinvar_20181028.vep.ht",
+            import_func=import_sites_vcf,
+            import_args={
+                "path": "gs://gcp-public-data--gnomad/resources/grch37/clinvar/clinvar_20181028.vcf.bgz",
+                "force_bgz": True,
+                "skip_invalid_loci": True,
+                "min_partitions": 100,
+                "reference_genome": "GRCh37",
+            },
+        )
+    },
+)
+
+kgp_phase_3 = VersionedMatrixTableResource(
+    default_version="phase_3_split",
+    versions={
+        "phase_3_split": GnomadPublicMatrixTableResource(
+            path="gs://gnomad-public-requester-pays/resources/grch37/kgp/1000Genomes_phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.split.mt",
+            import_func=hl.import_vcf,
+            import_args={
+                "path": "gs://genomics-public-data/1000-genomes-phase-3/vcf-20150220/ALL.chr*.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf",
+                "force_bgz": True,
+                "skip_invalid_loci": True,
+                "min_partitions": 300,
+                "reference_genome": "GRCh37",
+            },
+        ),
+        "phase_3": GnomadPublicMatrixTableResource(
+            path="gs://gnomad-public-requester-pays/resources/grch37/kgp/1000Genomes_phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.mt",
+            import_func=hl.import_vcf,
+            import_args={
+                "path": "gs://genomics-public-data/1000-genomes-phase-3/vcf-20150220/ALL.chr*.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf",
+                "force_bgz": True,
+                "skip_invalid_loci": True,
+                "min_partitions": 300,
+                "reference_genome": "GRCh37",
+            },
+        ),
+    },
+)
+
+kgp = VersionedTableResource(
+    default_version="phase_1_hc",
+    versions={
+        "phase_1_hc": GnomadPublicTableResource(
+            path="gs://gnomad-public-requester-pays/resources/grch37/kgp/1000G_phase1.snps.high_confidence.b37.ht",
+            import_func=import_sites_vcf,
+            import_args={
+                "path": "gs://gcp-public-data--gnomad/resources/grch37/kgp/1000G_phase1.snps.high_confidence.b37.vcf.bgz",
+                "force_bgz": True,
+                "skip_invalid_loci": True,
+                "min_partitions": 100,
+                "reference_genome": "GRCh37",
+            },
+        ),
+    },
+)
+
+cpg_sites = GnomadPublicTableResource(
+    path="gs://gnomad-public-requester-pays/resources/grch37/cpg_sites/cpg.ht"
+)
+
+methylation_sites = GnomadPublicTableResource(
+    path="gs://gnomad-public-requester-pays/resources/grch37/methylation_sites/methylation.ht"
+)
+
+lcr_intervals = GnomadPublicTableResource(
+    path="gs://gnomad-public-requester-pays/resources/grch37/lcr_intervals/LCR.GRCh37_compliant.interval_list.ht",
+    import_func=hl.import_locus_intervals,
+    import_args={
+        "path": "gs://gcp-public-data--gnomad/resources/grch37/lcr_intervals/LCR.GRCh37_compliant.interval_list",
+        "reference_genome": "GRCh37",
+    },
+)
+
+decoy_intervals = GnomadPublicTableResource(
+    path="gs://gnomad-public-requester-pays/resources/grch37/decoy_intervals/mm-2-merged.GRCh37_compliant.ht",
+    import_func=hl.import_bed,
+    import_args={
+        "path": "gs://gcp-public-data--gnomad/resources/grch37/decoy_intervals/mm-2-merged.GRCh37_compliant.bed",
+        "reference_genome": "GRCh37",
+    },
+)
+
+purcell_5k_intervals = GnomadPublicTableResource(
+    path="gs://gnomad-public-requester-pays/resources/grch37/purcell_5k_intervals/purcell5k.ht",
+    import_func=hl.import_locus_intervals,
+    import_args={
+        "path": "gs://gcp-public-data--gnomad/resources/grch37/purcell_5k_intervals/purcell5k.interval_list",
+        "reference_genome": "GRCh37",
+    },
+)
+
+seg_dup_intervals = GnomadPublicTableResource(
+    path="gs://gnomad-public-requester-pays/resources/grch37/seg_dup_intervals/hg19_self_chain_split_both.ht",
+    import_func=hl.import_bed,
+    import_args={
+        "path": "gs://gcp-public-data--gnomad/resources/grch37/seg_dup_intervals/hg19_self_chain_split_both.bed",
+        "reference_genome": "GRCh37",
+    },
+)
+
+exome_hc_intervals = GnomadPublicTableResource(
+    path="gs://gnomad-public-requester-pays/resources/grch37/broad_intervals/exomes_high_coverage.auto.interval_list.ht",
+    import_func=hl.import_locus_intervals,
+    import_args={
+        "path": "gs://gcp-public-data--gnomad/resources/grch37/broad_intervals/exomes_high_coverage.auto.interval_list",
+        "reference_genome": "GRCh37",
+    },
+)
+
+high_coverage_intervals = GnomadPublicTableResource(
+    path="gs://gnomad-public-requester-pays/resources/grch37/broad_intervals/high_coverage.auto.interval_list.ht",
+    import_func=hl.import_locus_intervals,
+    import_args={
+        "path": "gs://gcp-public-data--gnomad/resources/grch37/broad_intervals/high_coverage.auto.interval_list",
+        "reference_genome": "GRCh37",
+    },
+)
+
+exome_calling_intervals = GnomadPublicTableResource(
+    path="gs://gnomad-public-requester-pays/resources/grch37/broad_intervals/exome_calling_regions.v1.interval_list.ht",
+    import_func=hl.import_locus_intervals,
+    import_args={
+        "path": "gs://gcp-public-data--gnomad/resources/grch37/broad_intervals/exome_calling_regions.v1.interval_list",
+        "reference_genome": "GRCh37",
+    },
+)
+
+exome_evaluation_intervals = GnomadPublicTableResource(
+    path="gs://gnomad-public-requester-pays/resources/grch37/broad_intervals/exome_evaluation_regions.v1.noheader.interval_list.ht",
+    import_func=hl.import_locus_intervals,
+    import_args={
+        "path": "gs://gcp-public-data--gnomad/resources/grch37/broad_intervals/exome_evaluation_regions.v1.noheader.interval_list",
+        "reference_genome": "GRCh37",
+    },
+)
+
+genome_evaluation_intervals = GnomadPublicTableResource(
+    path="gs://gnomad-public-requester-pays/resources/grch37/broad_intervals/hg19-v0-wgs_evaluation_regions.v1.interval_list.ht",
+    import_func=hl.import_locus_intervals,
+    import_args={
+        "path": "gs://gcp-public-data--gnomad/resources/grch37/broad_intervals/hg19-v0-wgs_evaluation_regions.v1.interval_list",
+        "reference_genome": "GRCh37",
+    },
+)
+
+na12878_hc_intervals = GnomadPublicTableResource(
+    path="gs://gnomad-public-requester-pays/resources/grch37/na12878/NA12878_GIAB_highconf_intervals.ht",
+    import_func=hl.import_bed,
+    import_args={
+        "path": "gs://gcp-public-data--gnomad/resources/grch37/na12878/NA12878_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-Solid-10X_CHROM1-X_v3.3_highconf.bed",
+        "reference_genome": "GRCh37",
+    },
+)
+
+syndip_hc_intervals = GnomadPublicTableResource(
+    path="gs://gnomad-public-requester-pays/resources/grch37/syndip/syndip_highconf_genome_intervals.ht",
+    import_func=hl.import_bed,
+    import_args={
+        "path": "gs://gcp-public-data--gnomad/resources/grch37/syndip/hybrid.m37m.bed",
+        "reference_genome": "GRCh37",
+    },
+)
+
+
+
[docs]def get_truth_ht() -> hl.Table: + """ + Return a table with annotations from the latest version of the corresponding truth data. + + The following annotations are included: + - hapmap + - kgp_omni (1000 Genomes intersection Omni 2.5M array) + - kgp_phase_1_hc (high confidence sites in 1000 Genomes) + - mills (Mills & Devine indels) + + :return: A table with the latest version of popular truth data annotations + """ + return ( + hapmap.ht() + .select(hapmap=True) + .join(kgp_omni.ht().select(omni=True), how="outer") + .join( + kgp.versions["phase_1_hc"].ht().select(kgp_phase1_hc=True), + how="outer", + ) + .join(mills.ht().select(mills=True), how="outer") + .repartition(200, shuffle=False) + .persist() + )
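A sketch annotating a hypothetical variant Table (keyed by locus and alleles) with the truth-set membership flags:

    import hail as hl

    truth_ht = get_truth_ht()
    ht = hl.read_table("gs://my-bucket/variants.ht")  # hypothetical variant Table
    ht = ht.annotate(**truth_ht[ht.key])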
+ + +gtex_rsem = VersionedMatrixTableResource( + default_version="v7", + versions={ + "v7": GnomadPublicMatrixTableResource( + path="gs://gnomad-public-requester-pays/resources/grch37/gtex_rsem/gtex_rsem_v7.mt", + import_func=_import_gtex_rsem, + import_args={ + "gtex_path": "gs://gcp-public-data--gnomad/resources/grch37/gtex/bulk-gex_v7_rna-seq_GTEx_Analysis_2016-01-15_v7_RSEMv1.2.22_transcript_tpm.txt.gz", + "meta_path": "gs://gcp-public-data--gnomad/resources/grch37/gtex/annotations_v7_GTEx_v7_Annotations_SampleAttributesDS.txt.gz", + "min_partitions": 1000, + }, + ), + }, +) + +gencode = VersionedTableResource( + default_version="v19", + versions={ + "v19": GnomadPublicTableResource( + path="gs://gnomad-public-requester-pays/resources/grch37/gencode/gencode.v19.annotation.ht", + import_func=import_gencode, + import_args={ + "gtf_path": "gs://gcp-public-data--gnomad/resources/grch37/gencode/gencode.v19.annotation.gtf.gz", + "reference_genome": "GRCh37", + "force_bgz": True, + "min_partitions": 10, + }, + ), + }, +) +
+ +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/_modules/gnomad/resources/grch38/gnomad.html b/_modules/gnomad/resources/grch38/gnomad.html new file mode 100644 index 000000000..a45d8a79c --- /dev/null +++ b/_modules/gnomad/resources/grch38/gnomad.html @@ -0,0 +1,798 @@ + + + + + + gnomad.resources.grch38.gnomad — gnomad master documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+
    +
  • + + +
  • +
  • +
+
+
+
+
+ +

Source code for gnomad.resources.grch38.gnomad

+# noqa: D100
+
+import logging
+from typing import Optional
+
+import hail as hl
+
+from gnomad.resources.resource_utils import (
+    DataException,
+    GnomadPublicMatrixTableResource,
+    GnomadPublicTableResource,
+    VersionedMatrixTableResource,
+    VersionedTableResource,
+)
+from gnomad.sample_qc.ancestry import POP_NAMES
+from gnomad.utils.annotations import add_gks_va, add_gks_vrs
+
+logging.basicConfig(
+    format="%(asctime)s (%(name)s %(lineno)s): %(message)s",
+    datefmt="%m/%d/%Y %I:%M:%S %p",
+)
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+CURRENT_EXOME_RELEASE = "4.0"
+CURRENT_GENOME_RELEASE = "4.0"
+
+CURRENT_EXOME_COVERAGE_RELEASE = "4.0"
+CURRENT_GENOME_COVERAGE_RELEASE = "3.0.1"
+
+CURRENT_EXOME_AN_RELEASE = "4.1"
+CURRENT_GENOME_AN_RELEASE = "4.1"
+
+EXOME_RELEASES = ["4.0"]
+GENOME_RELEASES = ["3.0", "3.1", "3.1.1", "3.1.2", "4.0"]
+
+EXOME_COVERAGE_RELEASES = ["4.0"]
+GENOME_COVERAGE_RELEASES = ["3.0", "3.0.1"]
+
+EXOME_AN_RELEASES = ["4.1"]
+GENOME_AN_RELEASES = ["4.1"]
+
+DATA_TYPES = ["exomes", "genomes"]
+MAJOR_RELEASES = ["v3", "v4"]
+CURRENT_MAJOR_RELEASE = MAJOR_RELEASES[-1]
+
+
+GENOME_POPS = ["AFR", "AMI", "AMR", "ASJ", "EAS", "FIN", "NFE", "SAS", "OTH"]
+SUBSETS = {
+    "v3": [
+        "non_v2",
+        "non_topmed",
+        "non_cancer",
+        "controls_and_biobanks",
+        "non_neuro",
+        "tgp",
+        "hgdp",
+    ],
+    "v4": ["non_ukb"],
+}
+"""
+gnomAD subsets by major release version.
+
+Used in VCF export to generate subset-specific INFO labels (e.g., tgp_raw_AC_esn_XX).
+"""
+
+GROUPS = ["adj", "raw"]
+"""
+Group names used to generate labels for high quality genotypes and all raw genotypes.
+
+Used in VCF export.
+"""
+
+SEXES = ["XX", "XY"]
+"""
+Sample sexes used in VCF export.
+
+Used to stratify frequency annotations (AC, AN, AF) for each sex.
+"""
+
+POPS = {
+    "v3": {
+        "genomes": [
+            "afr",
+            "ami",
+            "amr",
+            "asj",
+            "eas",
+            "fin",
+            "nfe",
+            "oth",
+            "sas",
+            "mid",
+        ]
+    },
+    "v4": {
+        "exomes": [
+            "afr",
+            "amr",
+            "asj",
+            "eas",
+            "fin",
+            "mid",
+            "nfe",
+            "remaining",
+            "sas",
+        ],
+        "genomes": [
+            "afr",
+            "ami",
+            "amr",
+            "asj",
+            "eas",
+            "fin",
+            "mid",
+            "nfe",
+            "remaining",
+            "sas",
+        ],
+    },
+}
+"""
+Global ancestry groups in gnomAD by version.
+"""
+
+COHORTS_WITH_POP_STORED_AS_SUBPOP = ["tgp", "hgdp"]
+"""
+Subsets in gnomAD v3.1 that are broken down by their known subpops instead of global pops in the frequency struct.
+"""
+
+TGP_POPS = [
+    "esn",
+    "pur",
+    "pjl",
+    "clm",
+    "jpt",
+    "chb",
+    "stu",
+    "itu",
+    "tsi",
+    "mxl",
+    "ceu",
+    "msl",
+    "yri",
+    "beb",
+    "fin",
+    "khv",
+    "cdx",
+    "lwk",
+    "acb",
+    "asw",
+    "ibs",
+    "gbr",
+    "pel",
+    "gih",
+    "chs",
+    "gwd",
+]
+"""
+1000 Genomes Project (1KG/TGP) subpops.
+"""
+
+HGDP_POPS = [
+    "japanese",
+    "papuanhighlands",
+    "papuansepik",
+    "adygei",
+    "orcadian",
+    "biaka",
+    "yakut",
+    "han",
+    "northernhan",
+    "uygur",
+    "miao",
+    "mongolian",
+    "balochi",
+    "bedouin",
+    "russian",
+    "daur",
+    "pima",
+    "hezhen",
+    "sindhi",
+    "yi",
+    "oroqen",
+    "san",
+    "tuscan",
+    "tu",
+    "palestinian",
+    "tujia",
+    "druze",
+    "pathan",
+    "basque",
+    "makrani",
+    "bergamoitalian",
+    "naxi",
+    "karitiana",
+    "sardinian",
+    "mbuti",
+    "mozabite",
+    "yoruba",
+    "lahu",
+    "dai",
+    "cambodian",
+    "bougainville",
+    "french",
+    "brahui",
+    "hazara",
+    "bantusouthafrica",
+    "surui",
+    "mandenka",
+    "kalash",
+    "xibo",
+    "colombian",
+    "bantukenya",
+    "she",
+    "burusho",
+    "maya",
+]
+"""
+Human Genome Diversity Project (HGDP) subpops.
+"""
+
+TGP_POP_NAMES = {
+    "chb": "Han Chinese",
+    "jpt": "Japanese",
+    "chs": "Southern Han Chinese",
+    "cdx": "Chinese Dai",
+    "khv": "Kinh",
+    "ceu": "Utah Residents (European Ancestry)",
+    "tsi": "Toscani",
+    "fin": "Finnish",
+    "gbr": "British",
+    "ibs": "Iberian",
+    "yri": "Yoruba",
+    "lwk": "Luhya",
+    "gwd": "Gambian",
+    "msl": "Mende",
+    "esn": "Esan",
+    "asw": "African-American",
+    "acb": "African Caribbean",
+    "mxl": "Mexican-American",
+    "pur": "Puerto Rican",
+    "clm": "Colombian",
+    "pel": "Peruvian",
+    "gih": "Gujarati",
+    "pjl": "Punjabi",
+    "beb": "Bengali",
+    "stu": "Sri Lankan Tamil",
+    "itu": "Indian Telugu",
+}
+"""
+1000 Genomes Project (1KG/TGP) pop label map.
+"""
+
+POPS_STORED_AS_SUBPOPS = TGP_POPS + HGDP_POPS
+POPS_TO_REMOVE_FOR_POPMAX = {
+    "v3": {"asj", "fin", "mid", "oth", "ami", "remaining"},
+    "v4": {"asj", "fin", "oth", "ami", "remaining"},
+}
+"""
+Populations that are removed before popmax calculations.
+"""
+
+DOWNSAMPLINGS = {
+    "v3": [
+        10,
+        20,
+        50,
+        100,
+        200,
+        500,
+        1000,
+        2000,
+        5000,
+        10000,
+        15000,
+        20000,
+        25000,
+        30000,
+        40000,
+        50000,
+        60000,
+        70000,
+        75000,
+        80000,
+        85000,
+        90000,
+        95000,
+        100000,
+        110000,
+        120000,
+    ],
+    "v4": [
+        10,
+        100,
+        500,
+        1000,
+        2000,
+        5000,
+        10000,
+        20000,
+        50000,
+        100000,
+        200000,
+        500000,
+    ],
+}
+"""
+List of the downsampling numbers to use for frequency calculations by version.
+"""
+
+gnomad_syndip = VersionedMatrixTableResource(
+    default_version="3.0",
+    versions={
+        "3.0": GnomadPublicMatrixTableResource(
+            path="gs://gnomad-public-requester-pays/truth-sets/hail-0.2/gnomad_v3_syndip.b38.mt"
+        )
+    },
+)
+
+na12878 = VersionedMatrixTableResource(
+    default_version="3.0",
+    versions={
+        "3.0": GnomadPublicMatrixTableResource(
+            path="gs://gnomad-public-requester-pays/truth-sets/hail-0.2/gnomad_v3_na12878.mt"
+        )
+    },
+)
+
+
+def _public_release_ht_path(data_type: str, version: str) -> str:
+    """
+    Get public release table path.
+
+    :param data_type: One of "exomes" or "genomes"
+    :param version: One of the release versions of gnomAD on GRCh38
+    :return: Path to release Table
+    """
+    version_prefix = "r" if version.startswith("3.0") else "v"
+    return f"gs://gnomad-public-requester-pays/release/{version}/ht/{data_type}/gnomad.{data_type}.{version_prefix}{version}.sites.ht"
+
+
+def _public_coverage_ht_path(
+    data_type: str, version: str, coverage_type="coverage"
+) -> str:
+    """
+    Get public coverage Hail Table path.
+
+    :param data_type: One of "exomes" or "genomes"
+    :param version: One of the release versions of gnomAD on GRCh38
+    :param coverage_type: One of "coverage" or "allele_number"
+    :return: Path to coverage Table
+    """
+    if coverage_type not in ["coverage", "allele_number"]:
+        raise ValueError(
+            "coverage_type must be one of 'coverage' or 'allele_number', not"
+            f" {coverage_type}!"
+        )
+
+    version_prefix = "r" if version.startswith("3.0") else "v"
+    return f"gs://gnomad-public-requester-pays/release/{version}/coverage/{data_type}/gnomad.{data_type}.{version_prefix}{version}.{coverage_type}.ht"
+
+
+
[docs]def public_release(data_type: str) -> VersionedTableResource: + """ + Retrieve publicly released versioned table resource. + + :param data_type: One of "exomes" or "genomes" + :return: Release Table + """ + if data_type not in DATA_TYPES: + raise DataException( + f"{data_type} not in {DATA_TYPES}, please select a data type from" + f" {DATA_TYPES}" + ) + + if data_type == "exomes": + current_release = CURRENT_EXOME_RELEASE + releases = EXOME_RELEASES + else: + current_release = CURRENT_GENOME_RELEASE + releases = GENOME_RELEASES + + return VersionedTableResource( + current_release, + { + release: GnomadPublicTableResource( + path=_public_release_ht_path(data_type, release) + ) + for release in releases + }, + )
+ + +
[docs]def coverage(data_type: str) -> VersionedTableResource: + """ + Retrieve gnomAD's coverage table by data_type. + + :param data_type: One of "exomes" or "genomes" + :return: Coverage Table + """ + if data_type not in DATA_TYPES: + raise DataException( + f"{data_type} not in {DATA_TYPES}, please select a data type from" + f" {DATA_TYPES}" + ) + + if data_type == "exomes": + current_release = CURRENT_EXOME_COVERAGE_RELEASE + releases = EXOME_COVERAGE_RELEASES + else: + current_release = CURRENT_GENOME_COVERAGE_RELEASE + releases = GENOME_COVERAGE_RELEASES + + return VersionedTableResource( + current_release, + { + release: GnomadPublicTableResource( + path=_public_coverage_ht_path(data_type, release) + ) + for release in releases + }, + )
+ + +
[docs]def all_sites_an(data_type: str) -> VersionedTableResource: + """ + Retrieve gnomAD's all sites allele number table by data_type. + + :param data_type: One of "exomes" or "genomes" + :return: All sites allele number VersionedTableResource + """ + if data_type not in DATA_TYPES: + raise DataException( + f"{data_type} not in {DATA_TYPES}, please select a data type from" + f" {DATA_TYPES}" + ) + + if data_type == "exomes": + current_release = CURRENT_EXOME_AN_RELEASE + releases = EXOME_AN_RELEASES + else: + current_release = CURRENT_GENOME_AN_RELEASE + releases = GENOME_AN_RELEASES + + return VersionedTableResource( + current_release, + { + release: GnomadPublicTableResource( + path=_public_coverage_ht_path(data_type, release, "allele_number") + ) + for release in releases + }, + )
+ + +
[docs]def coverage_tsv_path(data_type: str, version: Optional[str] = None) -> str: + """ + Retrieve the path to gnomAD's coverage summary TSV by data_type. + + :param data_type: One of "exomes" or "genomes" + :param version: One of the coverage release versions of gnomAD on GRCh38. Defaults to the current coverage release for the data type. + :return: Path to coverage summary TSV + """ + if data_type not in DATA_TYPES: + raise DataException( + f"{data_type} not in {DATA_TYPES}, please select a data type from" + f" {DATA_TYPES}" + ) + + if data_type == "exomes": + if version is None: + version = CURRENT_EXOME_COVERAGE_RELEASE + elif version not in EXOME_COVERAGE_RELEASES: + raise DataException( + f"Version {version} of gnomAD exomes for GRCh38 does not exist" + ) + else: + if version is None: + version = CURRENT_GENOME_COVERAGE_RELEASE + elif version not in GENOME_COVERAGE_RELEASES: + raise DataException( + f"Version {version} of gnomAD genomes for GRCh38 does not exist" + ) + + version_prefix = "r" if version.startswith("3.0") else "v" + return f"gs://gcp-public-data--gnomad/release/{version}/coverage/{data_type}/gnomad.{data_type}.{version_prefix}{version}.coverage.summary.tsv.bgz"
+ + +
[docs]def release_vcf_path(data_type: str, version: str, contig: str) -> str: + """ + Publicly released VCF. Provide a specific contig, e.g. "chr20", to retrieve a contig-specific VCF. + + :param data_type: One of "exomes" or "genomes" + :param version: One of the release versions of gnomAD on GRCh38 + :param contig: Single contig "chr1" to "chrY" + :return: Path to VCF + """ + if version.startswith("2"): + raise DataException( + f"gnomAD version {version} is not available on reference genome GRCh38" + ) + + contig = f".{contig}" if contig else "" + version_prefix = "r" if version.startswith("3.0") else "v" + return f"gs://gcp-public-data--gnomad/release/{version}/vcf/{data_type}/gnomad.{data_type}.{version_prefix}{version}.sites{contig}.vcf.bgz"
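A short sketch of the GRCh38 resource helpers; `.ht()` reads the Table for the resource's default version:

    from gnomad.resources.grch38.gnomad import all_sites_an, public_release, release_vcf_path

    genomes_ht = public_release("genomes").ht()  # current (4.0) genome sites Table
    an_ht = all_sites_an("exomes").ht()          # 4.1 all-sites allele number Table
    vcf_path = release_vcf_path("genomes", version="4.0", contig="chr20")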
+ + +
[docs]def add_grpMaxFAF95_v4(ht: hl.Table) -> hl.Table: + """ + Add a grpMaxFAF95 struct with 'popmax' and 'popmax_population'. + + Also includes a jointGrpMaxFAF95 annotation using the v4 fafmax and joint_fafmax structures. + + :param ht: Input hail table. + :return: Annotated hail table. + """ + if "gnomad" in ht.fafmax: + fafmax_field = ht.fafmax.gnomad + else: + fafmax_field = ht.fafmax + ht = ht.annotate( + grpMaxFAF95=hl.struct( + popmax=fafmax_field.faf95_max, + popmax_population=fafmax_field.faf95_max_gen_anc, + ), + jointGrpMaxFAF95=hl.struct( + popmax=ht.joint_fafmax.faf95_max, + popmax_population=ht.joint_fafmax.faf95_max_gen_anc, + ), + ) + return ht
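A sketch, assuming the input is a v4 release Table carrying the `fafmax` and `joint_fafmax` structs:

    import hail as hl

    ht = hl.read_table(public_release("exomes").versions["4.0"].path)
    ht = add_grpMaxFAF95_v4(ht)  # adds grpMaxFAF95 and jointGrpMaxFAF95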
+ + +
[docs]def gnomad_gks( + locus_interval: hl.IntervalExpression, + version: str, + data_type: str = "genomes", + by_ancestry_group: bool = False, + by_sex: bool = False, + vrs_only: bool = False, + custom_ht: hl.Table = None, + skip_checkpoint: bool = False, + skip_coverage: bool = False, + custom_coverage_ht: hl.Table = None, +) -> list: + """ + Perform gnomad GKS annotations on a range of variants at once. + + :param locus_interval: Hail IntervalExpression of locus<reference_genome>. + e.g. hl.locus_interval('chr1', 6424776, 6461367, reference_genome="GRCh38") + :param version: String of version of gnomAD release to use. + :param data_type: String of either "exomes" or "genomes" for the type of reads that are desired. + :param by_ancestry_group: Boolean to pass to obtain frequency information for each cohort. + :param by_sex: Boolean to pass to return frequency information for each cohort split by chromosomal sex. + :param vrs_only: Boolean to pass for only VRS info to be returned + (will not include allele frequency information). + :param custom_ht: Table to use instead of what public_release() method would return for the version. + :param skip_checkpoint: Bool to pass to skip checkpointing selected fields + (checkpointing may be desirable for large datasets by reducing data copies across the cluster). + :param skip_coverage: Bool to pass to skip adding coverage statistics. + :param custom_coverage_ht: Custom table to use for coverage statistics instead of the release coverage table. + :return: List of dictionaries containing VRS information + (and freq info split by ancestry groups and sex if desired) for specified variant. + """ + # Obtain the high level version number and verify that it is 4. + high_level_version = f"v{version.split('.')[0]}" + if high_level_version != "v4": + raise NotImplementedError( + "gnomad_gks() is currently only implemented for gnomAD v4." + ) + + # Read public_release table if no custom table provided. + if custom_ht: + ht = custom_ht + else: + ht = hl.read_table(public_release(data_type).versions[version].path) + + # Read coverage statistics if requested. + coverage_version_3_0_1 = "3.0.1" # v4 genomes coverage + coverage_version_4_0 = "4.0" # v4 exomes coverage + + # In v4, exomes have coverage in v4 coverage table, + # genomes have coverage in v3 coverage table. + if data_type == "genomes": + coverage_version = coverage_version_3_0_1 + else: + coverage_version = coverage_version_4_0 + + coverage_ht = None + + if not skip_coverage: + if custom_coverage_ht: + coverage_ht = custom_coverage_ht + else: + coverage_ht = hl.read_table( + coverage(data_type).versions[coverage_version].path + ) + ht = ht.annotate(mean_depth=coverage_ht[ht.locus].mean) + ht = ht.annotate(fraction_cov_over_20=coverage_ht[ht.locus].over_20) + + # Retrieve ancestry groups from the imported POPS dictionary. + pops_list = list(POPS[high_level_version][data_type]) if by_ancestry_group else None + + # Throw warnings if contradictory arguments are passed. + if by_ancestry_group and vrs_only: + logger.warning( + "Both 'vrs_only' and 'by_ancestry_groups' have been specified. Ignoring" + " 'by_ancestry_groups' list and returning only VRS information." + ) + elif by_sex and not by_ancestry_group: + logger.warning( + "Splitting whole database by sex is not yet supported. If using 'by_sex'," + " please also specify 'by_ancestry_group' to stratify by." + ) + + # Select relevant fields, checkpoint, and filter to interval before adding + # annotations. 
+ + # Pull up the LCR flag so it can be referenced as a top-level field. + ht = ht.annotate(lcr=ht.region_flags.lcr) + + # Pull up allele balance histogram arrays. + ht = ht.annotate(ab_hist_alt=ht.histograms.qual_hists.ab_hist_alt) + + ht = add_grpMaxFAF95_v4(ht) + + ht = ht.annotate(in_autosome_or_par=ht.locus.in_autosome_or_par()) + + keep_fields = [ + ht.freq, + ht.info.vrs, + ht.info.monoallelic, + ht.grpMaxFAF95, + ht.filters, + ht.lcr, + ht.ab_hist_alt, + ht.in_autosome_or_par, + ] + + if not skip_coverage: + keep_fields.append(ht.mean_depth) + keep_fields.append(ht.fraction_cov_over_20) + + if "jointGrpMaxFAF95" in ht.row: + keep_fields.append(ht.jointGrpMaxFAF95) + + ht = ht.select(*keep_fields) + + # Filter to the interval, then checkpoint the narrower set of fields if not skipped. + ht = hl.filter_intervals(ht, [locus_interval]) + if not skip_checkpoint: + ht = ht.checkpoint(hl.utils.new_temp_file("vrs_checkpoint", extension="ht")) + + # Collect all variants as structs, so all dictionary construction can be + # done in native Python. + variant_list = ht.collect() + ht_freq_index_dict = ht.freq_index_dict.collect()[0] + # gnomAD v4 renamed freq_index_dict keys to use underscores instead of dashes. + # Use underscores for v3 as well. + ht_freq_index_dict = {k.replace("-", "_"): v for k, v in ht_freq_index_dict.items()} + + # Assemble output dicts with VRS and optionally frequency, append to list, + # then return the list. + outputs = [] + for variant in variant_list: + vrs_variant = add_gks_vrs(variant.locus, variant.vrs) + + out = { + "locus": { + "contig": variant.locus.contig, + "position": variant.locus.position, + "reference_genome": variant.locus.reference_genome.name, + }, + "alleles": variant.alleles, + "gks_vrs_variant": vrs_variant, + } + + if not vrs_only: + va_freq_dict = add_gks_va( + input_struct=variant, + label_name="gnomAD", + label_version=version, + ancestry_groups=pops_list, + ancestry_groups_dict=POP_NAMES, + by_sex=by_sex, + freq_index_dict=ht_freq_index_dict, + ) + + # Assign existing VRS information to "focusAllele" key + va_freq_dict["focusAllele"] = vrs_variant + out["gks_va_freq"] = va_freq_dict + + # Append variant dictionary to list of outputs + outputs.append(out) + + return outputs
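+# A minimal usage sketch: returns one dict per variant in the interval, with VRS
+# information and, when requested, GKS frequency annotations split by ancestry
+# group and sex (the version string is an assumption):
+#
+#     interval = hl.locus_interval(
+#         "chr1", 6424776, 6461367, reference_genome="GRCh38"
+#     )
+#     records = gnomad_gks(
+#         interval, version="4.0", data_type="exomes",
+#         by_ancestry_group=True, by_sex=True,
+#     )
+#     records[0]["gks_vrs_variant"]  # VRS allele dict for the first variant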
\ No newline at end of file
diff --git a/_modules/gnomad/resources/grch38/reference_data.html b/_modules/gnomad/resources/grch38/reference_data.html
new file mode 100644
index 000000000..702b7f3bc
--- /dev/null
+++ b/_modules/gnomad/resources/grch38/reference_data.html
@@ -0,0 +1,505 @@

Source code for gnomad.resources.grch38.reference_data

+# noqa: D100
+
+import hail as hl
+from hail import Table
+
+from gnomad.resources.resource_utils import (
+    DBSNP_B154_CHR_CONTIG_RECODING,
+    NO_CHR_TO_CHR_CONTIG_RECODING,
+    GnomadPublicMatrixTableResource,
+    GnomadPublicTableResource,
+    VersionedMatrixTableResource,
+    VersionedTableResource,
+    import_gencode,
+    import_sites_vcf,
+)
+from gnomad.utils.vep import vep_or_lookup_vep
+
+
+def _import_purcell_5k(path) -> hl.Table:
+    p5k = hl.import_locus_intervals(path, reference_genome="GRCh37")
+    rg37 = hl.get_reference("GRCh37")
+    rg38 = hl.get_reference("GRCh38")
+    if not rg37.has_liftover("GRCh38"):
+        rg37.add_liftover(
+            "gs://hail-common/references/grch37_to_grch38.over.chain.gz", rg38
+        )
+    p5k = p5k.annotate(
+        start=hl.liftover(p5k.interval.start, "GRCh38"),
+        end=hl.liftover(p5k.interval.end, "GRCh38"),
+    )
+    p5k = p5k.filter(
+        (p5k.start.contig == "chr" + p5k.interval.start.contig)
+        & (p5k.end.contig == "chr" + p5k.interval.end.contig)
+    )
+    p5k = p5k.key_by()
+    p5k = p5k.select(locus=p5k.start, locus_b37=p5k.interval.start)
+    return p5k.key_by("locus")
+
+
+def _import_clinvar(**kwargs) -> hl.Table:
+    clinvar = import_sites_vcf(**kwargs)
+    clinvar = clinvar.filter(
+        hl.len(clinvar.alleles) > 1
+    )  # Get around problematic single entry in alleles array in the clinvar vcf
+    clinvar = vep_or_lookup_vep(clinvar, reference="GRCh38")
+    return clinvar
+
+
+def _import_dbsnp(**kwargs) -> hl.Table:
+    dbsnp = import_sites_vcf(**kwargs)
+    # Note: permit_shuffle is set because the dbsnp vcf has duplicate loci,
+    # so rows may be out of order after splitting. Duplicate rsids for a
+    # locus/alleles pair are collected into a set below.
+    dbsnp = hl.split_multi(dbsnp, permit_shuffle=True)
+    dbsnp = dbsnp.group_by(dbsnp.locus, dbsnp.alleles).aggregate(
+        rsid=hl.agg.collect_as_set(dbsnp.rsid)
+    )
+
+    return dbsnp
+
+
+def _import_methylation_sites(path) -> hl.Table:
+    """
+    Import methylation data from bed file.
+
+    :param path: Path to bed file containing methylation scores.
+    :return: Table with methylation data.
+    """
+    ht = hl.import_bed(path, min_partitions=100, reference_genome="GRCh38")
+    ht = ht.select(
+        locus=ht.interval.start,
+        methylation_level=hl.int32(ht.target),
+    )
+
+    return ht.key_by("locus").drop("interval")
+
+
+def _import_ensembl_interval(path) -> hl.Table:
+    """
+    Import and parse Ensembl intervals of protein-coding genes to a Hail Table.
+
+    File is expected to include only the following fields: gene_stable_ID, chr, start, end, source_gene, gene_name, and type.
+
+    :param path: Path to the interval Table file.
+    :return: Table keyed by the gene interval.
+    """
+    ensembl = hl.import_table(
+        path,
+        delimiter="\t",
+        min_partitions=100,
+        impute=True,
+    )
+
+    ensembl = ensembl.key_by(
+        interval=hl.locus_interval(
+            "chr" + ensembl.chr,
+            ensembl.start,
+            ensembl.end,
+            reference_genome="GRCh38",
+        )
+    )
+    return ensembl
+
+
+# Resources with no versioning needed
+purcell_5k_intervals = GnomadPublicTableResource(
+    path="gs://gnomad-public-requester-pays/resources/grch38/purcell_5k_intervals/purcell5k.ht",
+    import_func=_import_purcell_5k,
+    import_args={
+        "path": "gs://gcp-public-data--gnomad/resources/grch38/purcell_5k_intervals/purcell5k.interval_list",
+    },
+)
+
+na12878_giab = GnomadPublicMatrixTableResource(
+    path="gs://gnomad-public-requester-pays/resources/grch38/na12878/HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_PGandRTGphasetransfer.mt",
+    import_func=hl.import_vcf,
+    import_args={
+        "path": "gs://gcp-public-data--gnomad/resources/grch38/na12878/HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_PGandRTGphasetransfer.vcf.gz",
+        "force_bgz": True,
+        "min_partitions": 100,
+        "reference_genome": "GRCh38",
+    },
+)
+
+na12878_giab_hc_intervals = GnomadPublicTableResource(
+    path="gs://gnomad-public-requester-pays/resources/grch38/na12878/HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_nosomaticdel_noCENorHET7_hc_regions.ht",
+    import_func=hl.import_bed,
+    import_args={
+        "path": "gs://gcp-public-data--gnomad/resources/grch38/na12878/HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_nosomaticdel_noCENorHET7.bed",
+        "reference_genome": "GRCh38",
+        "skip_invalid_intervals": True,
+    },
+)
+
+# Versioned resources: versions should be listed from most recent to oldest
+vep_context = VersionedTableResource(
+    default_version="95",
+    versions={
+        "95": GnomadPublicTableResource(
+            path="gs://gnomad-public-requester-pays/resources/context/grch38_context_vep_annotated.ht",
+        ),
+        "101": GnomadPublicTableResource(
+            path="gs://gnomad-public-requester-pays/resources/context/grch38_context_vep_annotated.v101.ht",
+        ),
+        "105": GnomadPublicTableResource(
+            path="gs://gnomad-public-requester-pays/resources/context/grch38_context_vep_annotated.v105.ht",
+        ),
+    },
+)
+
+syndip = VersionedMatrixTableResource(
+    default_version="20180222",
+    versions={
+        "20180222": GnomadPublicMatrixTableResource(
+            path="gs://gnomad-public-requester-pays/resources/grch38/syndip/syndip.b38_20180222.mt",
+            import_func=hl.import_vcf,
+            import_args={
+                "path": "gs://gcp-public-data--gnomad/resources/grch38/syndip/full.38.20180222.vcf.gz",
+                "force_bgz": True,
+                "min_partitions": 100,
+                "reference_genome": "GRCh38",
+            },
+        )
+    },
+)
+
+syndip_hc_intervals = VersionedTableResource(
+    default_version="20180222",
+    versions={
+        "20180222": GnomadPublicTableResource(
+            path="gs://gnomad-public-requester-pays/resources/grch38/syndip/syndip_b38_20180222_hc_regions.ht",
+            import_func=hl.import_bed,
+            import_args={
+                "path": "gs://gcp-public-data--gnomad/resources/grch38/syndip/syndip.b38_20180222.bed",
+                "reference_genome": "GRCh38",
+                "skip_invalid_intervals": True,
+                "min_partitions": 10,
+            },
+        )
+    },
+)
+
+# These Ensembl Interval Tables are focused on protein-coding genes on chr1-22,X,Y.
+# Downloaded from the biomart of Ensembl Archive (https://useast.ensembl.org/info/website/archives/index.html)
+# Ensembl 101 & 105 are included, since 101 was used to annotate gnomAD v3 and 105 to gnomAD v4.
+# Basic stats: 19924 protein-coding genes in Ensembl 101, and 19951
+# protein-coding genes in Ensembl 105.
+ensembl_interval = VersionedTableResource(
+    default_version="105",
+    versions={
+        "105": GnomadPublicTableResource(
+            path="gs://gnomad-public-requester-pays/resources/grch38/ensembl/ensembl_105_pc_genes_grch38.ht",
+            import_func=_import_ensembl_interval,
+            import_args={
+                "path": "gs://gcp-public-data--gnomad/resources/grch38/ensembl/ensembl_105_pc_genes_grch38.tsv",
+            },
+        ),
+        "101": GnomadPublicTableResource(
+            path="gs://gnomad-public-requester-pays/resources/grch38/ensembl/ensembl_101_pc_genes_grch38.ht",
+            import_func=_import_ensembl_interval,
+            import_args={
+                "path": "gs://gcp-public-data--gnomad/resources/grch38/ensembl/ensembl_101_pc_genes_grch38.tsv",
+            },
+        ),
+    },
+)
+
+clinvar = VersionedTableResource(
+    default_version="20190923",
+    versions={
+        "20190923": GnomadPublicTableResource(
+            path="gs://gnomad-public-requester-pays/resources/grch38/clinvar/clinvar_20190923.ht",
+            import_func=_import_clinvar,
+            import_args={
+                "path": "gs://gcp-public-data--gnomad/resources/grch38/clinvar/clinvar_20190923.vcf.gz",
+                "force_bgz": True,
+                "contig_recoding": NO_CHR_TO_CHR_CONTIG_RECODING,
+                "skip_invalid_loci": True,
+                "min_partitions": 100,
+                "reference_genome": "GRCh38",
+            },
+        )
+    },
+)
+
+dbsnp = VersionedTableResource(
+    default_version="b156",
+    versions={
+        "b156": GnomadPublicTableResource(
+            path="gs://gnomad-public-requester-pays/resources/grch38/dbsnp/dbsnp_b156_grch38_all_20221116.ht",
+            import_func=_import_dbsnp,
+            import_args={
+                "path": "gs://gcp-public-data--gnomad/resources/grch38/dbsnp/dbsnp_b156_grch38_all_GCF_000001405.40_20221116.vcf.bgz",
+                "header_file": "gs://gcp-public-data--gnomad/resources/grch38/dbsnp/dbsnp_b156_grch38_all_GCF_000001405.40_20221116.vcf.header",
+                "force_bgz": True,
+                "contig_recoding": DBSNP_B154_CHR_CONTIG_RECODING,
+                "skip_invalid_loci": True,
+                "min_partitions": 400,
+                "reference_genome": "GRCh38",
+            },
+        ),
+        "b154": GnomadPublicTableResource(
+            path="gs://gnomad-public-requester-pays/resources/grch38/dbsnp/dbsnp_b154_grch38_all_20200514.ht",
+            import_func=_import_dbsnp,
+            import_args={
+                "path": "gs://gcp-public-data--gnomad/resources/grch38/dbsnp/dbsnp_b154_grch38_all_GCF_000001405.38_20200514.vcf.bgz",
+                "header_file": "gs://gcp-public-data--gnomad/resources/grch38/dbsnp/dbsnp_b154_grch38_all_GCF_000001405.38_20200514.vcf.header",
+                "force_bgz": True,
+                "contig_recoding": DBSNP_B154_CHR_CONTIG_RECODING,
+                "skip_invalid_loci": True,
+                "min_partitions": 400,
+                "reference_genome": "GRCh38",
+            },
+        ),
+        "b151": GnomadPublicTableResource(
+            path="gs://gnomad-public-requester-pays/resources/grch38/dbsnp/dbsnp_b151_grch38_all_20180418.ht",
+            import_func=import_sites_vcf,
+            import_args={
+                "path": "gs://gcp-public-data--gnomad/resources/grch38/dbsnp/dbsnp_b151_grch38_all_20180418.vcf.bgz",
+                "header_file": "gs://gcp-public-data--gnomad/resources/grch38/dbsnp/dbsnp_b151_grch38_all_20180418.vcf.header",
+                "force_bgz": True,
+                "contig_recoding": NO_CHR_TO_CHR_CONTIG_RECODING,
+                "skip_invalid_loci": True,
+                "min_partitions": 400,
+                "reference_genome": "GRCh38",
+            },
+        ),
+    },
+)
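+# A usage sketch: a VersionedTableResource forwards attribute access to its
+# default version, and older versions stay addressable through `versions`.
+#
+#     dbsnp_ht = dbsnp.ht()                        # default version ("b156")
+#     dbsnp_b154_ht = dbsnp.versions["b154"].ht()  # a specific version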
+
+hapmap = GnomadPublicTableResource(
+    path="gs://gnomad-public-requester-pays/resources/grch38/hapmap/hapmap_3.3.hg38.ht",
+    import_func=import_sites_vcf,
+    import_args={
+        "path": (
+            "gs://genomics-public-data/resources/broad/hg38/v0/hapmap_3.3.hg38.vcf.gz"
+        ),
+        "force_bgz": True,
+        "reference_genome": "GRCh38",
+    },
+)
+
+kgp_omni = GnomadPublicTableResource(
+    path="gs://gnomad-public-requester-pays/resources/grch38/kgp/1000G_omni2.5.hg38.ht",
+    import_func=import_sites_vcf,
+    import_args={
+        "path": "gs://genomics-public-data/resources/broad/hg38/v0/1000G_omni2.5.hg38.vcf.gz",
+        "force_bgz": True,
+        "reference_genome": "GRCh38",
+    },
+)
+
+kgp = VersionedTableResource(
+    default_version="phase_1_hc",
+    versions={
+        "phase_1_hc": GnomadPublicTableResource(
+            path="gs://gnomad-public-requester-pays/resources/grch38/kgp/1000G_phase1.snps.high_confidence.hg38.ht",
+            import_func=import_sites_vcf,
+            import_args={
+                "path": "gs://genomics-public-data/resources/broad/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz",
+                "force_bgz": True,
+                "reference_genome": "GRCh38",
+            },
+        )
+    },
+)
+
+mills = GnomadPublicTableResource(
+    path="gs://gnomad-public-requester-pays/resources/grch38/mills/Mills_and_1000G_gold_standard.indels.hg38.ht",
+    import_func=import_sites_vcf,
+    import_args={
+        "path": "gs://genomics-public-data/resources/broad/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz",
+        "force_bgz": True,
+        "reference_genome": "GRCh38",
+    },
+)
+
+# Methylation scores range from 0-15 and are described in Chen et al
+# (https://www.biorxiv.org/content/10.1101/2022.03.20.485034v2.full).
+methylation_sites = GnomadPublicTableResource(
+    path="gs://gnomad-public-requester-pays/resources/grch38/methylation_sites/methylation.ht",
+    import_func=_import_methylation_sites,
+    import_args={
+        "path": "gs://gnomad-public-requester-pays/resources/grch38/methylation_sites/methylation.bed",
+    },
+)
+
+# Methylation scores for chromosome X range from 0-12 and are described in Chen et al
+# (https://www.biorxiv.org/content/10.1101/2022.03.20.485034v2.full).
+methylation_sites_chrx = GnomadPublicTableResource(
+    path="gs://gnomad-public-requester-pays/resources/grch38/methylation_sites/methylation_chrX.ht",
+    import_func=_import_methylation_sites,
+    import_args={
+        "path": "gs://gnomad-public-requester-pays/resources/grch38/methylation_sites/methylation_chrX.bed",
+    },
+)
+
+lcr_intervals = GnomadPublicTableResource(
+    path="gs://gnomad-public-requester-pays/resources/grch38/lcr_intervals/LCRFromHengHg38.ht",
+    import_func=hl.import_locus_intervals,
+    import_args={
+        "path": "gs://gcp-public-data--gnomad/resources/grch38/lcr_intervals/LCRFromHengHg38.txt",
+        "reference_genome": "GRCh38",
+        "skip_invalid_intervals": True,
+    },
+)
+
+seg_dup_intervals = GnomadPublicTableResource(
+    path="gs://gnomad-public-requester-pays/resources/grch38/seg_dup_intervals/GRCh38_segdups.ht",
+    import_func=hl.import_bed,
+    import_args={
+        "path": "gs://gcp-public-data--gnomad/resources/grch38/seg_dup_intervals/GRCh38_segdups.bed",
+        "reference_genome": "GRCh38",
+    },
+)
+
+telomeres_and_centromeres = GnomadPublicTableResource(
+    path="gs://gnomad-public-requester-pays/resources/grch38/telomeres_and_centromeres/hg38.telomeresAndMergedCentromeres.ht",
+    import_func=hl.import_bed,
+    import_args={
+        "path": "gs://gcp-public-data--gnomad/resources/grch38/telomeres_and_centromeres/hg38.telomeresAndMergedCentromeres.bed",
+        "reference_genome": "GRCh38",
+        "skip_invalid_intervals": True,
+    },
+)
+
+
+
[docs]def get_truth_ht() -> Table: + """ + Return a table with annotations from the latest version of the corresponding truth data. + + The following annotations are included: + - hapmap + - kgp_omni (1000 Genomes intersection Omni 2.5M array) + - kgp_phase_1_hc (high confidence sites in 1000 Genomes) + - mills (Mills & Devine indels) + + :return: A table with the latest version of popular truth data annotations. + """ + return ( + hapmap.ht() + .select(hapmap=True) + .join(kgp_omni.ht().select(omni=True), how="outer") + .join(kgp.versions["phase_1_hc"].ht().select(kgp_phase1_hc=True), how="outer") + .join(mills.ht().select(mills=True), how="outer") + .repartition(200, shuffle=False) + .persist() + )
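+# A usage sketch: the joined truth Table is keyed by locus and alleles, with one
+# boolean flag per truth set (missing where a variant is absent from that set).
+#
+#     truth_ht = get_truth_ht()
+#     n_hapmap_mills = truth_ht.filter(truth_ht.hapmap & truth_ht.mills).count()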
+ + +gencode = VersionedTableResource( + default_version="v39", + versions={ + "v39": GnomadPublicTableResource( + path="gs://gnomad-public-requester-pays/resources/grch38/gencode/gencode.v39.annotation.ht", + import_func=import_gencode, + import_args={ + "gtf_path": "gs://gcp-public-data--gnomad/resources/grch38/gencode/gencode.v39.annotation.gtf.gz", + "reference_genome": "GRCh38", + "force_bgz": True, + "min_partitions": 100, + }, + ), + }, +) +
\ No newline at end of file
diff --git a/_modules/gnomad/resources/import_resources.html b/_modules/gnomad/resources/import_resources.html
new file mode 100644
index 000000000..da8c77f50
--- /dev/null
+++ b/_modules/gnomad/resources/import_resources.html
@@ -0,0 +1,214 @@

Source code for gnomad.resources.import_resources

+# noqa: D100
+
+import argparse
+import itertools
+import textwrap
+from inspect import getmembers
+from typing import Dict, Optional, Tuple
+
+import gnomad.resources.grch37 as grch37
+import gnomad.resources.grch38 as grch38
+from gnomad.resources.config import (
+    GnomadPublicResourceSource,
+    gnomad_public_resource_configuration,
+)
+from gnomad.resources.resource_utils import BaseResource, BaseVersionedResource
+
+
+# Generate a dictionary of resources available for import for a given genome build
+
[docs]def get_module_importable_resources( + module, prefix: Optional[str] = None +) -> Dict[str, Tuple[str, BaseResource]]: + """ + Take an imported module and generate a dictionary of all resources in the module that can be imported (i.e., those with a path and import_func). + + The dict produced is as follows: + - keys: {prefix}.{resource_name}.{version} (with prefix only present if `prefix` is set, and `version` only present for versioned resources) + - values: ({resource_name}[ version {version}], resource) with resource_name set to the variable name in the module and the version present for versioned resources. + + The following example will generate a dict with all the resources in gnomad.resources.grch37 that can be imported: + + .. code-block:: python + + import gnomad.resources.grch37 as grch37 + grch37_resources = get_module_importable_resources(grch37, prefix='grch37') + + :param module: Input module. + :param prefix: Optional prefix to prepend to the resource keys. + :return: Dict mapping each importable resource key to a tuple of (resource name, resource). + """ + _prefix = f"{prefix}." if prefix else "" + resources = {} + for name, obj in getmembers(module): + if isinstance(obj, BaseResource) and obj.path and obj.import_func: + resources[f"{_prefix}{name}"] = (name, obj) + + if isinstance(obj, BaseVersionedResource): + for version_name, version_resource in obj.versions.items(): + if version_resource.path and version_resource.import_func: + resources[f"{_prefix}{name}.{version_name}"] = ( + f"{name}.{version_name}", + version_resource, + ) + + return resources
+ + +
[docs]def get_resources_descriptions( + resources: Dict[str, Tuple[str, BaseResource]], width: Optional[int] = 100 +) -> str: + """ + Return a string listing all resources in the input dict along with the path from which they are imported and the path at which they are stored. + + :param resources: A dict returned from get_module_importable_resources + :param width: Maximum width of lines in the returned string + """ + wrapper = textwrap.TextWrapper( + width=width, initial_indent=" " * 2, subsequent_indent=" " * 4 + ) + return "\n".join( + itertools.chain.from_iterable( + [ + f"{resource_arg}:", + wrapper.fill( + f"import {getattr(resource, 'import_args', {}).get('path', '???')}" + ), + wrapper.fill(f"to {resource.path}"), + "", + ] + for resource_arg, (resource_name, resource) in resources.items() + ) + )
+ + +grch37_resources = get_module_importable_resources(grch37, "grch37") +grch38_resources = get_module_importable_resources(grch38, "grch38") +all_resources = {**grch37_resources, **grch38_resources} + + +
[docs]def main(args): + """Import selected resources.""" + gnomad_public_resource_configuration.source = GnomadPublicResourceSource.GNOMAD + + for resource_arg in args.resources: + resource_name, resource = all_resources[resource_arg] + print(f"Importing {resource_name}...") + resource.import_resource(args.overwrite)
+ + +if __name__ == "__main__": + parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter) + parser.add_argument( + "resources", + choices=list(all_resources.keys()), + metavar="resource", + nargs="+", + help="Resource to import. Choices are:\n\n" + + get_resources_descriptions(all_resources), + ) + parser.add_argument( + "--overwrite", help="Overwrites existing files", action="store_true" + ) + main(parser.parse_args()) +
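+# An invocation sketch (the module path is an assumption): resource keys follow
+# the "{build}.{name}[.{version}]" pattern produced by
+# get_module_importable_resources and are listed in the generated help text.
+#
+#     python -m gnomad.resources.import_resources grch38.dbsnp.b156 --overwrite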
\ No newline at end of file
diff --git a/_modules/gnomad/resources/resource_utils.html b/_modules/gnomad/resources/resource_utils.html
new file mode 100644
index 000000000..8514b3348
--- /dev/null
+++ b/_modules/gnomad/resources/resource_utils.html
@@ -0,0 +1,838 @@

Source code for gnomad.resources.resource_utils

+# noqa: D100
+
+import logging
+from abc import ABC, abstractmethod
+from functools import reduce, wraps
+from typing import Any, Callable, Dict, Iterable, List, Optional
+
+import hail as hl
+from hail.linalg import BlockMatrix
+
+from gnomad.resources.config import (
+    GnomadPublicResourceSource,
+    gnomad_public_resource_configuration,
+)
+
+logger = logging.getLogger("gnomad.resources")
+
+
+GNOMAD_PUBLIC_BUCKETS = ("gnomad-public", "gnomad-public-requester-pays")
+"""
+Public buckets used to stage gnomAD data.
+
+`gnomad-public` is a legacy bucket and contains one readme text file.
+
+The gnomAD Production Team writes output data to `gnomad-public-requester-pays`, and all data in this bucket
+syncs to the public bucket `gcp-public-data--gnomad`.
+"""
+
+# Resource classes
+
+
+
[docs]class BaseResource(ABC): + """ + Generic abstract resource class. + + :param path: The resource path + :param import_args: Any sources that are required for the import and need to be kept track of (e.g. .vcf path for an imported VCF) + :param import_func: A function used to import the resource. `import_func` will be passed the `import_args` dictionary as kwargs. + """ + + expected_file_extensions: List[str] = [] + """Expected file extensions for this resource type. If path doesn't end with one of these, a warning is logged.""" + + def __init__( + self, + path: Optional[str] = None, + import_args: Optional[Dict[str, Any]] = None, + import_func: Optional[Callable] = None, + ): + if path is None and import_func is None: + raise ValueError( + f"{self.__class__.__name__} requires at least one of path or" + " import_func arguments." + ) + + self.path = path + self.import_args = import_args + self.import_func = import_func + + if ( + path is not None + and self.expected_file_extensions + and not any(path.endswith(ext) for ext in self.expected_file_extensions) + ): + logger.warning( + "Created the following %s with a path that doesn't end with %s: %s", + self.__class__.__name__, + " or ".join(self.expected_file_extensions), + self, + ) + + def __repr__(self): + attr_str = [f"path={self._path}"] + if self.import_args is not None: + attr_str.append(f"import_args={self.import_args}") + return f'{self.__class__.__name__}({",".join(attr_str)})' + + def _get_path(self): + return self._path + + def _set_path(self, path): + self._path = path # pylint: disable=attribute-defined-outside-init + + # Defining path property this way instead of using a decorator allows _get_path and _set_path + # to be overridden in subclasses without having to reconfigure the property. + path = property( + fget=lambda self: self._get_path(), + fset=lambda self, path: self._set_path(path), + ) + +
[docs] @abstractmethod + def import_resource(self, overwrite: bool = True, **kwargs) -> None: + """ + Abstract method to import the resource using its import_func and write it to its path. + + :param overwrite: If ``True``, overwrite an existing file at the destination. + :param kwargs: Any other parameters to be passed to the underlying hail write function (acceptable parameters depend on specific resource types) + """
+ + +
[docs]class TableResource(BaseResource): + """ + A Hail Table resource. + + :param path: The Table path (typically ending in .ht) + :param import_args: Any sources that are required for the import and need to be kept track of and/or passed to the import_func (e.g. .vcf path for an imported VCF) + :param import_func: A function used to import the Table. `import_func` will be passed the `import_args` dictionary as kwargs. + """ + + expected_file_extensions: List[str] = [".ht"] + +
[docs] def ht( + self, + force_import: bool = False, + read_args: Optional[Dict[str, Any]] = None, + ) -> hl.Table: + """ + Read and return the Hail Table resource. + + :param force_import: If ``True``, force the import of the resource even if it + already exists. + :param read_args: Any additional arguments to pass to hl.read_table. + :return: Hail Table resource + """ + if self.path is None or force_import: + return self.import_func(**self.import_args) + else: + return hl.read_table(self.path, **(read_args or {}))
+ +
[docs] def import_resource(self, overwrite: bool = True, **kwargs) -> None: + """ + Import the TableResource using its import_func and write it to its path. + + :param overwrite: If ``True``, overwrite an existing file at the destination. + :param kwargs: Any other parameters to be passed to hl.Table.write + :return: Nothing + """ + self.import_func(**self.import_args).write( + self.path, overwrite=overwrite, **kwargs + )
+ + +
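+# A usage sketch: a TableResource reads from `path` when one is set, and falls
+# back to `import_func(**import_args)` when forced; the paths below are
+# hypothetical.
+#
+#     res = TableResource(
+#         path="gs://my-bucket/my_sites.ht",
+#         import_func=import_sites_vcf,
+#         import_args={"path": "gs://my-bucket/my_sites.vcf.bgz"},
+#     )
+#     ht = res.ht()                    # hl.read_table("gs://my-bucket/my_sites.ht")
+#     ht = res.ht(force_import=True)   # import_sites_vcf(path="gs://my-bucket/my_sites.vcf.bgz")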
[docs]class MatrixTableResource(BaseResource): + """ + A Hail MatrixTable resource. + + :param path: The MatrixTable path (typically ending in .mt) + :param import_args: Any sources that are required for the import and need to be kept track of and/or passed to the import_func (e.g. .vcf path for an imported VCF) + :param import_func: A function used to import the MatrixTable. `import_func` will be passed the `import_args` dictionary as kwargs. + """ + + expected_file_extensions: List[str] = [".mt"] + +
[docs] def mt( + self, + force_import: bool = False, + read_args: Optional[Dict[str, Any]] = None, + ) -> hl.MatrixTable: + """ + Read and return the Hail MatrixTable resource. + + :param force_import: If ``True``, force the import of the resource even if it + already exists. + :param read_args: Any additional arguments to pass to hl.read_matrix_table. + :return: Hail MatrixTable resource + """ + if self.path is None or force_import: + return self.import_func(**self.import_args) + else: + return hl.read_matrix_table(self.path, **(read_args or {}))
+ +
[docs] def import_resource(self, overwrite: bool = True, **kwargs) -> None: + """ + Import the MatrixTable resource using its import_func and write it to its path. + + :param overwrite: If set, existing file(s) will be overwritten + :param kwargs: Any other parameters to be passed to hl.MatrixTable.write + :return: Nothing + """ + self.import_func(**self.import_args).write( + self.path, overwrite=overwrite, **kwargs + )
+ + +
[docs]class VariantDatasetResource(BaseResource): + """ + A Hail VariantDataset resource. + + :param path: The VariantDataset path (typically ending in .vds) + :param import_args: Any sources that are required for the import and need to be kept track of and/or passed to the import_func (e.g. .vcf path for an imported VCF) + :param import_func: A function used to import the VariantDataset. `import_func` will be passed the `import_args` dictionary as kwargs. + """ + + expected_file_extensions: List[str] = [".vds"] + +
[docs] def vds( + self, + force_import: bool = False, + read_args: Optional[Dict[str, Any]] = None, + ) -> hl.vds.VariantDataset: + """ + Read and return the Hail VariantDataset resource. + + :param force_import: If ``True``, force the import of the resource even if it + already exists. + :param read_args: Any additional arguments to pass to hl.vds.read_vds. + :return: Hail VariantDataset resource + """ + if self.path is None or force_import: + return self.import_func(**self.import_args) + else: + return hl.vds.read_vds(self.path, **(read_args or {}))
+ +
[docs] def import_resource(self, overwrite: bool = True, **kwargs) -> None: + """ + Import the VariantDataset resource using its import_func and write it to its path. + + :param overwrite: If set, existing file(s) will be overwritten + :param kwargs: Any other parameters to be passed to hl.vds.VariantDataset.write + :return: Nothing + """ + self.import_func(**self.import_args).write( + self.path, overwrite=overwrite, **kwargs + )
+ + +
[docs]class PedigreeResource(BaseResource): + """ + A pedigree resource. + + :param path: The Pedigree path (typically ending in .fam or .ped) + :param import_args: Any sources that are required for the import and need to be kept track of and/or passed to the import_func (e.g. .vcf path for an imported VCF) + :param import_func: A function used to import the Pedigree. `import_func` will be passed the `import_args` dictionary as kwargs. + :param quant_pheno: If ``True``, phenotype is interpreted as quantitative. + :param delimiter: Field delimiter regex. + :param missing: The string used to denote missing values. For case-control, 0, -9, and non-numeric are also treated as missing. + """ + + expected_file_extensions: List[str] = [".fam", ".ped"] + + def __init__( + self, + path: Optional[str] = None, + import_args: Optional[Dict[str, Any]] = None, + import_func: Optional[Callable[..., hl.Pedigree]] = None, + quant_pheno: bool = False, + delimiter: str = r"\\s+", + missing: str = "NA", + ): + super().__init__( + path=path, + import_args=import_args, + import_func=import_func, + ) + + self.quant_pheno = quant_pheno + self.delimiter = delimiter + self.missing = missing + +
[docs] def ht(self) -> hl.Table: + """ + Read the pedigree into a family HT using hl.import_fam(). + + :return: Family table + """ + return hl.import_fam( + self.path, + quant_pheno=self.quant_pheno, + delimiter=self.delimiter, + missing=self.missing, + )
+ +
[docs] def pedigree(self) -> hl.Pedigree: + """ + Read the pedigree into an hl.Pedigree using hl.Pedigree.read(). + + :return: Pedigree + """ + return hl.Pedigree.read(self.path, delimiter=self.delimiter)
+ +
[docs] def import_resource(self, overwrite: bool = True, **kwargs) -> None: + """ + Import the Pedigree resource using its import_func and write it to its path. + + :param overwrite: If set, existing file(s) will be overwritten. IMPORTANT: Currently there is no implementation of this method when `overwrite` is set to `False` + :param kwargs: Any other parameters to be passed to hl.Pedigree.write + :return: Nothing + """ + if not overwrite: + raise NotImplementedError + + self.import_func(**self.import_args).write(self.path)
+ + +
[docs]class BlockMatrixResource(BaseResource): + """ + A Hail BlockMatrix resource. + + :param path: The BlockMatrix path (typically ending in .bm) + :param import_args: Any sources that are required for the import and need to be kept track of and/or passed to the import_func. + :param import_func: A function used to import the BlockMatrix. `import_func` will be passed the `import_args` dictionary as kwargs. + """ + + expected_file_extensions: List[str] = [".bm"] + +
[docs] def bm(self, read_args: Optional[Dict[str, Any]] = None) -> BlockMatrix: + """ + Read and return the Hail BlockMatrix resource. + + :param read_args: Any additional arguments to pass to BlockMatrix.read. + :return: Hail BlockMatrix resource + """ + return BlockMatrix.read(self.path, **(read_args or {}))
+ +
[docs] def import_resource(self, overwrite: bool = True, **kwargs) -> None: + """ + Import the BlockMatrixResource using its import_func and write it to its path. + + :param overwrite: If ``True``, overwrite an existing file at the destination. + :param kwargs: Any additional parameters to be passed to BlockMatrix.write + :return: Nothing + """ + self.import_func(**self.import_args).write( + self.path, overwrite=overwrite, **kwargs + )
+ + +
[docs]class ExpressionResource(BaseResource): + """ + A Hail Expression resource. + + :param path: The Expression path (typically ending in .he). + :param import_args: Any sources that are required for the import and need to be + kept track of and/or passed to the import_func (e.g. .vcf path for an imported + VCF). + :param import_func: A function used to import the Expression. `import_func` will be + passed the `import_args` dictionary as kwargs. + """ + + expected_file_extensions: List[str] = [".he"] + +
[docs] def he( + self, + force_import: bool = False, + read_args: Optional[Dict[str, Any]] = None, + ) -> hl.expr.Expression: + """ + Read and return the Hail Expression resource. + + :param force_import: If ``True``, force the import of the resource even if it + already exists. + :param read_args: Any additional arguments to pass to hl.experimental.read_expression. + :return: Hail Expression resource. + """ + if self.path is None or force_import: + return self.import_func(**self.import_args) + else: + return hl.experimental.read_expression(self.path, **(read_args or {}))
+ +
[docs] def import_resource(self, overwrite: bool = True, **kwargs) -> None: + """ + Import the Expression resource using its import_func and write it to its path. + + :param overwrite: If set, existing file(s) will be overwritten. + :param kwargs: Any other parameters to be passed to hl.experimental.write_expression. + :return: Nothing. + """ + self.import_func(**self.import_args).write( + self.path, overwrite=overwrite, **kwargs + )
+ + +
[docs]class BaseVersionedResource: + """ + Class for a versioned resource. + + The attributes and methods of the versioned resource are those of the default version of the resource. + In addition, all versions of the resource are stored in the `versions` attribute. + + :param default_version: The default version of this resource (must be in the `versions` dict) + :param versions: A dict of version name -> resource. + """ + + resource_class = BaseResource + + __slots__ = {"default_version", "versions"} + + def __init__(self, default_version: str, versions: Dict[str, BaseResource]): + default_resource = versions[default_version] + + for version_resource in versions.values(): + if not isinstance(version_resource, self.resource_class): + raise TypeError( + f"{self.__class__.__name__} requires all versions to be of type" + f" {self.resource_class.__name__}" + ) + + if version_resource.__class__ is not default_resource.__class__: + raise TypeError( + f"{self.__class__.__name__} requires all versions to be of the same" + " type" + ) + + self.default_version = default_version + self.versions = versions + + def __repr__(self): + return ( + "{cls}(default_version={default_version}, versions={{{versions}}})".format( + cls=self.__class__.__name__, + default_version=self.default_version, + versions=", ".join( + f'"{k}": {repr(v)}' for k, v in self.versions.items() + ), + ) + ) + + def __getattr__(self, name): + # If __getattr__ is called for 'default_version', 'version', etc. then + # something has gone wrong. + if name in self.__slots__: + raise ValueError("VersionedResource has not been initialized") + + return getattr(self.versions[self.default_version], name)
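+# A sketch of the forwarding behavior: attribute access on a versioned resource
+# resolves against the default version via __getattr__, while specific versions
+# remain reachable through `versions` (the resource and version names below are
+# hypothetical).
+#
+#     resource.path                    # path of the default version
+#     resource.versions["v1"].path     # path of a specific version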
+ + +
[docs]class VersionedTableResource(BaseVersionedResource): + """ + Versioned Table resource. + + The attributes (path, import_args and import_func) of the versioned resource are those of the default version of the resource. + In addition, all versions of the resource are stored in the `versions` attribute. + + :param default_version: The default version of this Table resource (must be in the `versions` dict) + :param versions: A dict of version name -> TableResource. + """ + + resource_class = TableResource + + def __init__(self, default_version: str, versions: Dict[str, TableResource]): + super().__init__(default_version, versions)
+ + +
[docs]class VersionedMatrixTableResource(BaseVersionedResource): + """ + Versioned MatrixTable resource. + + The attributes (path, import_args and import_func) of the versioned resource are those of the default version of the resource. + In addition, all versions of the resource are stored in the `versions` attribute. + + :param default_version: The default version of this MatrixTable resource (must be in the `versions` dict) + :param versions: A dict of version name -> MatrixTableResource. + """ + + resource_class = MatrixTableResource + + def __init__(self, default_version: str, versions: Dict[str, MatrixTableResource]): + super().__init__(default_version, versions)
+ + +
[docs]class VersionedVariantDatasetResource(BaseVersionedResource): + """ + Versioned VariantDataset resource. + + The attributes (path, import_args and import_func) of the versioned resource are those of the default version of the resource. + In addition, all versions of the resource are stored in the `versions` attribute. + + :param default_version: The default version of this VariantDataset resource (must be in the `versions` dict) + :param versions: A dict of version name -> VariantDatasetResource. + """ + + resource_class = VariantDatasetResource + + def __init__( + self, default_version: str, versions: Dict[str, VariantDatasetResource] + ): + super().__init__(default_version, versions)
+ + +
[docs]class VersionedPedigreeResource(BaseVersionedResource, PedigreeResource): + """ + Versioned Pedigree resource. + + The attributes (path, import_args and import_func) of the versioned resource are those of the default version of the resource. + In addition, all versions of the resource are stored in the `versions` attribute. + + :param default_version: The default version of this Pedigree resource (must be in the `versions` dict) + :param versions: A dict of version name -> PedigreeResource. + """ + + resource_class = PedigreeResource + + def __init__(self, default_version: str, versions: Dict[str, PedigreeResource]): + super().__init__(default_version, versions)
+ + +
[docs]class VersionedBlockMatrixResource(BaseVersionedResource, BlockMatrixResource): + """ + Versioned BlockMatrix resource. + + The attributes (path, import_args and import_func) of the versioned resource are those of the default version of the resource. + In addition, all versions of the resource are stored in the `versions` attribute. + + :param default_version: The default version of this BlockMatrix resource (must be in the `versions` dict) + :param versions: A dict of version name -> BlockMatrixResource. + """ + + resource_class = BlockMatrixResource + + def __init__(self, default_version: str, versions: Dict[str, BlockMatrixResource]): + super().__init__(default_version, versions)
+ + +
[docs]class ResourceNotAvailable(Exception): + """Exception raised if a resource is not available from the selected source."""
+ + +
[docs]class GnomadPublicResource(BaseResource, ABC): + """Base class for the gnomAD project's public resources.""" + + def __init_subclass__(cls, *, read_resource_methods: Iterable[str] = []) -> None: + super().__init_subclass__() + + # Some resources may not be available from all sources due to delays in syncing, etc. + # This wraps all methods that read the resource and adds a check for if the resource + # is available from the selected source. If the resource is not available, this + # throws a more helpful error than if the read were simply allowed to fail. + def _wrap_read_resource_method(method_name): + original_method = getattr(cls, method_name) + + @wraps(original_method) + def read_resource(self, *args, **kwargs): + # If one of the known sources is selected, check if the resource is available. + # For custom sources, skip the check and attempt to read the resource. + resource_source = gnomad_public_resource_configuration.source + if not self.is_resource_available(): + if resource_source == GnomadPublicResourceSource.GNOMAD: + message = ( + "This resource is not currently available from the gnomAD" + " project public buckets." + ) + elif isinstance(resource_source, GnomadPublicResourceSource): + message = ( + "This resource is not currently available from" + f" {resource_source.value}." + ) + else: + message = ( + "This resource is not currently available from" + f" {resource_source}." + ) + + raise ResourceNotAvailable( + f"{message}\n\nTo load resources from a different source (for" + " example, Google Cloud Public Datasets) instead, use:\n\n>>>" + " from gnomad.resources.config import" + " gnomad_public_resource_configuration," + " GnomadPublicResourceSource\n>>>" + " gnomad_public_resource_configuration.source =" + " GnomadPublicResourceSource.GOOGLE_CLOUD_PUBLIC_DATASETS\n\nTo" + " get all available sources for gnomAD resources, use:\n\n>>>" + " from gnomad.resources.config import" + " GnomadPublicResourceSource\n>>>" + " list(GnomadPublicResourceSource)" + ) + + return original_method(self, *args, **kwargs) + + setattr(cls, method_name, read_resource) + + for method_name in read_resource_methods: + _wrap_read_resource_method(method_name) + + def _get_path(self) -> str: + resource_source = gnomad_public_resource_configuration.source + if resource_source == GnomadPublicResourceSource.GNOMAD: + return self._path + + relative_path = reduce( + lambda path, bucket: ( + path[5 + len(bucket) :] if path.startswith(f"gs://{bucket}/") else path + ), + GNOMAD_PUBLIC_BUCKETS, + self._path, + ) + + if resource_source == GnomadPublicResourceSource.GOOGLE_CLOUD_PUBLIC_DATASETS: + return f"gs://gcp-public-data--gnomad{relative_path}" + + if resource_source == GnomadPublicResourceSource.REGISTRY_OF_OPEN_DATA_ON_AWS: + return f"s3a://gnomad-public-us-east-1{relative_path}" + + if resource_source == GnomadPublicResourceSource.AZURE_OPEN_DATASETS: + return f"wasbs://dataset@datasetgnomad.blob.core.windows.net{relative_path}" + + return ( + f"{resource_source.rstrip('/')}{relative_path}" # pylint: disable=no-member + ) + + def _set_path(self, path): + if not any( + path.startswith(f"gs://{bucket}/") for bucket in GNOMAD_PUBLIC_BUCKETS + ): + raise ValueError( + "GnomadPublicResource requires a path to a file in one of the public" + f" gnomAD buckets ({', '.join(GNOMAD_PUBLIC_BUCKETS)})" + ) + + return super()._set_path(path) + +
[docs] def is_resource_available(self) -> bool: + """ + Check if this resource is available from the selected source. + + :return: True if the resource is available. + """ + path = self.path + + # Hail Tables, MatrixTables, and BlockMatrices are directories. + # For those, check for the existence of the _SUCCESS object. + path_to_test = ( + f"{path}/_SUCCESS" + if any(path.endswith(ext) for ext in (".ht", ".mt", ".bm")) + else path + ) + + return hl.current_backend().fs.exists(path_to_test)
+ + +
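+# A usage sketch, following the configuration shown in the error message above:
+# switching the source remaps paths in the public gnomAD buckets onto the
+# selected mirror (Google Cloud Public Datasets, the Registry of Open Data on
+# AWS, or Azure Open Datasets).
+#
+#     from gnomad.resources.config import (
+#         GnomadPublicResourceSource,
+#         gnomad_public_resource_configuration,
+#     )
+#     gnomad_public_resource_configuration.source = (
+#         GnomadPublicResourceSource.GOOGLE_CLOUD_PUBLIC_DATASETS
+#     )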
[docs]class GnomadPublicTableResource( + TableResource, GnomadPublicResource, read_resource_methods=("ht",) +): + """Resource class for a public Hail Table published by the gnomAD project."""
+ + +
[docs]class GnomadPublicMatrixTableResource( + MatrixTableResource, GnomadPublicResource, read_resource_methods=("mt",) +): + """Resource class for a public Hail MatrixTable published by the gnomAD project."""
+ + +
[docs]class GnomadPublicPedigreeResource( + PedigreeResource, GnomadPublicResource, read_resource_methods=("ht", "pedigree") +): + """Resource class for a public pedigree published by the gnomAD project."""
+ + +
[docs]class GnomadPublicBlockMatrixResource( + BlockMatrixResource, GnomadPublicResource, read_resource_methods=("bm",) +): + """Resource class for a public Hail BlockMatrix published by the gnomAD project."""
+ + +
[docs]class DataException(Exception): # noqa: D101 + pass
+ + +NO_CHR_TO_CHR_CONTIG_RECODING = { + "1": "chr1", + "2": "chr2", + "3": "chr3", + "4": "chr4", + "5": "chr5", + "6": "chr6", + "7": "chr7", + "8": "chr8", + "9": "chr9", + "10": "chr10", + "11": "chr11", + "12": "chr12", + "13": "chr13", + "14": "chr14", + "15": "chr15", + "16": "chr16", + "17": "chr17", + "18": "chr18", + "19": "chr19", + "20": "chr20", + "21": "chr21", + "22": "chr22", + "X": "chrX", + "Y": "chrY", + "MT": "chrM", +} + +DBSNP_B154_CHR_CONTIG_RECODING = { + "NC_000001.11": "chr1", + "NC_000002.12": "chr2", + "NC_000003.12": "chr3", + "NC_000004.12": "chr4", + "NC_000005.10": "chr5", + "NC_000006.12": "chr6", + "NC_000007.14": "chr7", + "NC_000008.11": "chr8", + "NC_000009.12": "chr9", + "NC_000010.11": "chr10", + "NC_000011.10": "chr11", + "NC_000012.12": "chr12", + "NC_000013.11": "chr13", + "NC_000014.9": "chr14", + "NC_000015.10": "chr15", + "NC_000016.10": "chr16", + "NC_000017.11": "chr17", + "NC_000018.10": "chr18", + "NC_000019.10": "chr19", + "NC_000020.11": "chr20", + "NC_000021.9": "chr21", + "NC_000022.11": "chr22", + "NC_000023.11": "chrX", + "NC_000024.10": "chrY", +} + + +
[docs]def import_sites_vcf(**kwargs) -> hl.Table: + """Import site-level data from a VCF into a Hail Table.""" + return hl.import_vcf(**kwargs).rows()
+ + +
[docs]def import_gencode(gtf_path: str, **kwargs) -> hl.Table: + """ + Import a GENCODE annotation GTF file as a Hail Table. + + :param gtf_path: Path to GENCODE GTF file. + :return: Table with GENCODE annotation information. + """ + ht = hl.experimental.import_gtf(gtf_path, **kwargs) + + # Only get gene and transcript stable IDs (without version numbers if they + # exist); early versions of GENCODE have no version numbers but later ones do. + ht = ht.annotate( + gene_id=ht.gene_id.split("\\.")[0], + transcript_id=ht.transcript_id.split("\\.")[0], + ) + return ht
+
\ No newline at end of file
diff --git a/_modules/gnomad/sample_qc/ancestry.html b/_modules/gnomad/sample_qc/ancestry.html
new file mode 100644
index 000000000..47fe463d0
--- /dev/null
+++ b/_modules/gnomad/sample_qc/ancestry.html
@@ -0,0 +1,564 @@

Source code for gnomad.sample_qc.ancestry

+# noqa: D100
+
+import logging
+import random
+from collections import Counter
+from typing import Any, Callable, List, Optional, Tuple, Union
+
+import hail as hl
+import numpy as np
+import onnx
+import onnxruntime as rt
+import pandas as pd
+from skl2onnx import convert_sklearn
+from skl2onnx.common.data_types import FloatTensorType
+
+from gnomad.utils.filtering import filter_to_autosomes
+
+logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s")
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+POP_NAMES = {
+    "afr": "African/African-American",
+    "ami": "Amish",
+    "amr": "Admixed American",
+    "asj": "Ashkenazi Jewish",
+    "eas": "East Asian",
+    "eur": "European",
+    "fin": "Finnish",
+    # NOTE: mde is kept for historical purposes; in gnomAD v3.1, mid was used instead
+    "mde": "Middle Eastern",
+    "mid": "Middle Eastern",
+    "nfe": "Non-Finnish European",
+    "oth": "Other",
+    "remaining": "Remaining individuals",
+    "sas": "South Asian",
+    "uniform": "Uniform",
+    "sas_non_consang": "South Asian (F < 0.05)",
+    "consanguineous": "South Asian (F > 0.05)",
+    "exac": "ExAC",
+    "bgr": "Bulgarian (Eastern European)",
+    "est": "Estonian",
+    "gbr": "British",
+    "nwe": "North-Western European",
+    "seu": "Southern European",
+    "swe": "Swedish",
+    "kor": "Korean",
+    "sgp": "Singaporean",
+    "jpn": "Japanese",
+    "oea": "Other East Asian",
+    "oeu": "Other European",
+    "onf": "Other Non-Finnish European",
+    "unk": "Unknown",
+}
+
+POP_COLORS = {
+    "afr": "#941494",
+    "ami": "#FFC0CB",
+    "amr": "#ED1E24",
+    "asj": "#FF7F50",
+    "eas": "#108C44",
+    "eur": "#6AA5CD",
+    "fin": "#002F6C",
+    "mde": "#33CC33",
+    "nfe": "#6AA5CD",
+    "oth": "#ABB9B9",
+    "sas": "#FF9912",
+    "uniform": "pink",
+    "consanguineous": "pink",
+    "sas_non_consang": "orange",
+    "exac": "gray",
+    "bgr": "#66C2A5",
+    "est": "black",
+    "gbr": "#C60C30",
+    "nwe": "#C60C30",
+    "seu": "#3CA021",
+    "swe": "purple",
+    "kor": "#4891D9",
+    "sgp": "darkred",
+    "jpn": "#BC002D",
+    "oea": "#108C44",
+    "oeu": "#6AA5CD",
+    "onf": "#6AA5CD",
+    "unk": "#ABB9B9",
+    "remaining": "#ABB9B9",
+    "": "#ABB9B9",
+}
+
+
+
[docs]def pc_project( + mt: hl.MatrixTable, + loadings_ht: hl.Table, + loading_location: str = "loadings", + af_location: str = "pca_af", +) -> hl.Table: + """ + Project samples in `mt` on pre-computed PCs. + + :param mt: MT containing the samples to project + :param loadings_ht: HT containing the PCA loadings and allele frequencies used for the PCA + :param loading_location: Location of expression for loadings in `loadings_ht` + :param af_location: Location of expression for allele frequency in `loadings_ht` + :return: Table with scores calculated from loadings in column `scores` + """ + n_variants = loadings_ht.count() + + mt = mt.annotate_rows( + pca_loadings=loadings_ht[mt.row_key][loading_location], + pca_af=loadings_ht[mt.row_key][af_location], + ) + + mt = mt.filter_rows( + hl.is_defined(mt.pca_loadings) + & hl.is_defined(mt.pca_af) + & (mt.pca_af > 0) + & (mt.pca_af < 1) + ) + + gt_norm = (mt.GT.n_alt_alleles() - 2 * mt.pca_af) / hl.sqrt( + n_variants * 2 * mt.pca_af * (1 - mt.pca_af) + ) + + mt = mt.annotate_cols(scores=hl.agg.array_sum(mt.pca_loadings * gt_norm)) + + return mt.cols().select("scores")
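+# A usage sketch: project new samples onto reference PCs. `loadings_ht` is
+# typically the loadings output of hl.hwe_normalized_pca (run with
+# compute_loadings=True) annotated with the allele frequencies (`pca_af`) used
+# in the PCA; `new_mt` is a hypothetical MatrixTable of the samples to project.
+#
+#     scores_ht = pc_project(new_mt, loadings_ht)
+#     scores_ht.show()  # one `scores` array per projected sample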
+ + +
[docs]def apply_onnx_classification_model( + data_pd: pd.DataFrame, fit: onnx.ModelProto +) -> Tuple[np.ndarray, pd.DataFrame]: + """ + Apply an ONNX classification model `fit` to a pandas dataframe `data_pd`. + + :param data_pd: Pandas dataframe containing the data to be classified. + :param fit: ONNX model to be applied. + :return: Tuple of classification and probabilities. + """ + if not isinstance(fit, onnx.ModelProto): + raise TypeError("The model supplied is not an onnx model!") + + sess = rt.InferenceSession( + fit.SerializeToString(), providers=["CPUExecutionProvider"] + ) + input_name = sess.get_inputs()[0].name + label_name = sess.get_outputs()[0].name + prob_name = sess.get_outputs()[1].name + classification = sess.run([label_name], {input_name: data_pd.astype(np.float32)})[0] + probs = sess.run([prob_name], {input_name: data_pd.astype(np.float32)})[0] + probs = pd.DataFrame.from_dict(probs) + probs = probs.add_prefix("prob_") + + return classification, probs
+ + +
[docs]def apply_sklearn_classification_model( + data_pd: pd.DataFrame, fit: Any +) -> Tuple[np.ndarray, pd.DataFrame]: + """ + Apply an sklearn classification model `fit` to a pandas dataframe `data_pd`. + + :param data_pd: Pandas dataframe containing the data to be classified. + :param fit: Sklearn model to be applied. + :return: Tuple of classification and probabilities. + """ + from sklearn.utils.validation import check_is_fitted + + try: + check_is_fitted(fit) + except TypeError: + raise TypeError("The supplied model is not an sklearn model!") + + classification = fit.predict(data_pd) + probs = fit.predict_proba(data_pd) + probs = pd.DataFrame(probs, columns=[f"prob_{p}" for p in fit.classes_]) + + return classification, probs
+ + +
[docs]def convert_sklearn_rf_to_onnx( + fit: Any, target_opset: Optional[int] = None +) -> onnx.ModelProto: + """ + Convert a sklearn random forest model to ONNX. + + :param fit: Sklearn random forest model to be converted. + :param target_opset: An optional target ONNX opset version to convert the model to. + :return: ONNX model. + """ + from sklearn.utils.validation import check_is_fitted + + try: + check_is_fitted(fit) + except TypeError: + raise TypeError("The supplied model is not an sklearn model!") + + initial_type = [("float_input", FloatTensorType([None, fit.n_features_in_]))] + onx = convert_sklearn(fit, initial_types=initial_type, target_opset=target_opset) + + domains = onx.opset_import + opset_version = "" + for dom in domains: + opset_version += f"domain: {dom.domain}, version: {dom.version}\n" + + logger.info( + "sklearn model converted to onnx model with the following opset version: \n%s", + opset_version, + ) + + return onx
+ + +
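+# A usage sketch: convert a fitted sklearn random forest to ONNX and apply it
+# with the ONNX helper above; `train_pd`, `labels`, and `test_pd` are
+# hypothetical pandas objects.
+#
+#     from sklearn.ensemble import RandomForestClassifier
+#
+#     rf = RandomForestClassifier(n_estimators=100, random_state=42)
+#     rf.fit(train_pd.values, labels)
+#     onx = convert_sklearn_rf_to_onnx(rf)
+#     classifications, probs = apply_onnx_classification_model(test_pd, onx)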
[docs]def assign_population_pcs( + pop_pca_scores: Union[hl.Table, pd.DataFrame], + pc_cols: Union[hl.expr.ArrayExpression, List[int], List[str]], + known_col: str = "known_pop", + fit: Any = None, # Type should be RandomForestClassifier but we do not want to import sklearn.RandomForestClassifier outside + seed: int = 42, + prop_train: float = 0.8, + n_estimators: int = 100, + min_prob: float = 0.9, + output_col: str = "pop", + missing_label: str = "oth", + pc_expr: Union[hl.expr.ArrayExpression, str] = "scores", + convert_model_func: Optional[Callable[[Any], Any]] = None, + apply_model_func: Callable[ + [pd.DataFrame, Any], Any + ] = apply_sklearn_classification_model, +) -> Tuple[ + Union[hl.Table, pd.DataFrame], Any +]: # 2nd element of the tuple should be RandomForestClassifier but we do not want to import sklearn.RandomForestClassifier outside + """ + Use a random forest model to assign population labels based on the results of PCA. + + Default values for model and assignment parameters are those used in gnomAD. + + As input, this function can take either: + - A Hail Table (typically the output of `hwe_normalized_pca`). In this case: + - `pc_cols` should be one of: + - A list of integers where each element is one of the PCs to use. + - A list of strings where each element is one of the PCs to use. + - An ArrayExpression of Floats where each element is one of the PCs to use. + - A Hail Table will be returned as output. + - A Pandas DataFrame. In this case: + - Each PC should be in a separate column and `pc_cols` is the list of all + the columns containing the PCs to use. + - A pandas DataFrame is returned as output. + + .. note:: + + If you have a Pandas Dataframe and have all PCs as an array in a single column, + the `expand_pd_array_col` function can be used to expand this column into multiple `PC` + columns. + + :param pop_pca_scores: Input Hail Table or Pandas Dataframe. + :param pc_cols: List of which PCs to use/columns storing the PCs to use. Values + provided should be 1-based and should be a list of integers when passing in a + Hail Table (i.e. [1, 2, 4, 5]) or a list of strings when passing in a Pandas + Dataframe (i.e. ["PC1", "PC2", "PC4", "PC5"]). When passing a HT this can also + be an ArrayExpression containing all the PCs to use. + :param known_col: Column storing the known population labels. + :param fit: Fit from a previously trained random forest model (i.e., the output + from a previous RandomForestClassifier() call). + :param seed: Random seed. + :param prop_train: Proportion of known data used for training. + :param n_estimators: Number of trees to use in the RF model. + :param min_prob: Minimum probability of belonging to a given population for the + population to be set (otherwise set to `None`). + :param output_col: Output column storing the assigned population. + :param missing_label: Label for samples for which the assignment probability is + smaller than `min_prob`. + :param pc_expr: Column storing the list of PCs. Only used if `pc_cols` is a List of + integers. Default is scores. + :param convert_model_func: Optional function to convert the model to ONNX format. + Default is no conversion. + :param apply_model_func: Function to apply the model to the data. Default is + `apply_sklearn_classification_model`, which will apply a sklearn classification + model to the data. This default will work if no `fit` is set, or the supplied + `fit` is a sklearn classification model. 
+ :return: Hail Table or Pandas Dataframe (depending on input) containing sample IDs + and imputed population labels, trained random forest model. + """ + from sklearn.ensemble import RandomForestClassifier + + hail_input = isinstance(pop_pca_scores, hl.Table) + if hail_input: + if isinstance(pc_cols, list): + if not all(isinstance(n, int) for n in pc_cols): + raise TypeError( + "Using a Hail Table with a list of PC cols to use (pc_cols) " + "requires all values of the pc_cols list to be integers." + ) + if isinstance(pc_expr, str): + pc_expr = pop_pca_scores[pc_expr] + pcs_to_pull = [pc_expr[i - 1] for i in pc_cols] + else: + pc_col_len = list( + filter( + None, + pop_pca_scores.aggregate(hl.agg.collect_as_set(hl.len(pc_cols))), + ) + ) + if len(pc_col_len) > 1: + raise ValueError( + "More than one length was found among the 'pc_cols' " + "ArrayExpression values. The length must be consistent!" + ) + pcs_to_pull = pc_cols + pc_cols = list(range(1, pc_col_len[0] + 1)) + if not fit: + pop_pca_scores = pop_pca_scores.select(known_col, pca_scores=pcs_to_pull) + else: + pop_pca_scores = pop_pca_scores.select(pca_scores=pcs_to_pull) + + pop_pc_pd = pop_pca_scores.to_pandas() + + # Explode the PC array + pc_cols = [f"PC{i}" for i in pc_cols] + pop_pc_pd[pc_cols] = pd.DataFrame(pop_pc_pd["pca_scores"].values.tolist()) + + else: + if not all(isinstance(n, str) for n in pc_cols): + raise TypeError( + "Using a Pandas DataFrame with pc_cols requires all values of the" + " pc_cols list to be strings." + ) + pop_pc_pd = pop_pca_scores + + # Split training data into subsamples for fitting and evaluating. + if not fit: + train_data = pop_pc_pd.loc[~pop_pc_pd[known_col].isnull()] + N = len(train_data) + random.seed(seed) + train_subsample_ridx = random.sample(list(range(0, N)), int(N * prop_train)) + train_fit = train_data.iloc[train_subsample_ridx] + fit_samples = [x for x in train_fit["s"]] + evaluate_fit = train_data.loc[~train_data["s"].isin(fit_samples)] + + # Train RF. + training_set_known_labels = train_fit[known_col].values + training_set_pcs = train_fit[pc_cols].values + evaluation_set_pcs = evaluate_fit[pc_cols].values + + pop_clf = RandomForestClassifier(n_estimators=n_estimators, random_state=seed) + pop_clf.fit(training_set_pcs, training_set_known_labels) + logger.info( + "Random forest feature importances are as follows: %s", + pop_clf.feature_importances_, + ) + + # Evaluate RF. + predictions = pop_clf.predict(evaluation_set_pcs) + error_rate = 1 - sum(evaluate_fit[known_col] == predictions) / float( + len(predictions) + ) + logger.info("Estimated error rate for RF model is %.4f", error_rate) + else: + pop_clf = fit + + # Classify data. 
+ classifications, probs = apply_model_func(pop_pc_pd[pc_cols].values, pop_clf) + + pop_pc_pd[output_col] = classifications + pop_pc_pd = pd.concat( + [pop_pc_pd.reset_index(drop=True), probs.reset_index(drop=True)], axis=1 + ) + probs["max"] = probs.max(axis=1) + pop_pc_pd.loc[probs["max"] < min_prob, output_col] = missing_label + pop_pc_pd = pop_pc_pd.drop(pc_cols, axis="columns") + + logger.info( + "Found the following sample count after population assignment: %s", + ", ".join( + f"{pop}: {count}" for pop, count in Counter(pop_pc_pd[output_col]).items() + ), + ) + + if convert_model_func is not None: + pop_clf = convert_model_func(pop_clf) + + if hail_input: + pops_ht = hl.Table.from_pandas(pop_pc_pd, key=list(pop_pca_scores.key)) + pops_ht = pops_ht.annotate_globals( + assign_pops_from_pc_params=hl.struct(min_assignment_prob=min_prob) + ) + + if not fit: + pops_ht = pops_ht.annotate_globals( + assign_pops_from_pc_params=pops_ht.assign_pops_from_pc_params.annotate( + error_rate=error_rate + ) + ) + + pops_ht = pops_ht.annotate( + evaluation_sample=hl.literal(list(evaluate_fit.s)).contains(pops_ht.s), + training_sample=hl.literal(list(train_fit.s)).contains(pops_ht.s), + ) + return pops_ht, pop_clf + else: + return pop_pc_pd, pop_clf
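A hedged sketch of the Pandas code path (the toy DataFrame below is invented; note that sample IDs must live in a column named "s", since the training/evaluation split tracks samples through that column):

import pandas as pd

df = pd.DataFrame({
    "s": [f"s{i}" for i in range(10)],
    "PC1": [0.10, 0.12, 0.09, 0.11, -0.30, -0.28, -0.31, -0.29, 0.08, -0.27],
    "PC2": [0.20, 0.21, 0.19, 0.22, -0.10, -0.12, -0.09, -0.11, 0.18, -0.13],
    # Known labels for eight samples; the last two are left for imputation.
    "known_pop": ["pop1"] * 4 + ["pop2"] * 4 + [None, None],
})

pop_df, rf_model = assign_population_pcs(
    df, pc_cols=["PC1", "PC2"], known_col="known_pop", min_prob=0.5
)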
+ + +
[docs]def run_pca_with_relateds( + qc_mt: hl.MatrixTable, + related_samples_to_drop: Optional[hl.Table] = None, + additional_samples_to_drop: Optional[hl.Table] = None, + n_pcs: int = 10, + autosomes_only: bool = True, +) -> Tuple[List[float], hl.Table, hl.Table]: + """ + Run PCA excluding the given related or additional samples, and project those samples in the PC space to return scores for all samples. + + The `related_samples_to_drop` and `additional_samples_to_drop` Tables have to be keyed by the sample ID and all samples present in these + tables will be excluded from the PCA. + + The loadings Table returned also contains a `pca_af` annotation which is the allele frequency + used for PCA. This is useful to project other samples in the PC space. + + :param qc_mt: Input QC MT + :param related_samples_to_drop: Optional table of related samples to drop when generating the PCs, these samples will be projected in the PC space + :param additional_samples_to_drop: Optional table of additional samples to drop when generating the PCs, these samples will be projected in the PC space + :param n_pcs: Number of PCs to compute + :param autosomes_only: Whether to run the analysis on autosomes only + :return: eigenvalues, scores and loadings + """ + if autosomes_only: + qc_mt = filter_to_autosomes(qc_mt) + + # 'pca_mt' is the MatrixTable to use for generating the PCs + # If samples to drop are provided in 'related_samples_to_drop' or + # 'additional_samples_to_drop', 'project_pca_mt' will also be generated + # and will contain the samples to project in the PC space + pca_mt = qc_mt + + if related_samples_to_drop: + pca_mt = pca_mt.filter_cols( + hl.is_missing(related_samples_to_drop[pca_mt.col_key]) + ) + if additional_samples_to_drop: + pca_mt = pca_mt.filter_cols( + hl.is_missing(additional_samples_to_drop[pca_mt.col_key]) + ) + + pca_evals, pca_scores, pca_loadings = hl.hwe_normalized_pca( + pca_mt.GT, k=n_pcs, compute_loadings=True + ) + pca_af_ht = pca_mt.annotate_rows( + pca_af=hl.agg.mean(pca_mt.GT.n_alt_alleles()) / 2 + ).rows() + pca_loadings = pca_loadings.annotate( + pca_af=pca_af_ht[pca_loadings.key].pca_af + ) # TODO: Evaluate if needed to write results at this point if relateds or not + + if not related_samples_to_drop and not additional_samples_to_drop: + return pca_evals, pca_scores, pca_loadings + else: + pca_loadings = pca_loadings.persist() + pca_scores = pca_scores.persist() + project_pca_mt = qc_mt.filter_cols(hl.is_missing(pca_mt.cols()[qc_mt.col_key])) + projected_scores = pc_project(project_pca_mt, pca_loadings) + pca_scores = pca_scores.union(projected_scores) + return pca_evals, pca_scores, pca_loadings
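A usage sketch with hypothetical paths (the QC MatrixTable would typically come from `get_qc_mt`, and the relateds Table from a relatedness analysis; both paths are made up):

import hail as hl

qc_mt = hl.read_matrix_table("gs://my-bucket/qc.mt")
relateds_ht = hl.read_table("gs://my-bucket/related_samples.ht")  # keyed by sample ID

evals, scores_ht, loadings_ht = run_pca_with_relateds(
    qc_mt, related_samples_to_drop=relateds_ht, n_pcs=10
)
# scores_ht holds PCs for all samples, with relateds projected rather than fitted.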
+
+ +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/_modules/gnomad/sample_qc/filtering.html b/_modules/gnomad/sample_qc/filtering.html new file mode 100644 index 000000000..f68d2efcf --- /dev/null +++ b/_modules/gnomad/sample_qc/filtering.html @@ -0,0 +1,768 @@ + + + + + + gnomad.sample_qc.filtering — gnomad master documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for gnomad.sample_qc.filtering

+# noqa: D100
+
+import logging
+from typing import Dict, List, Optional, Tuple, Union
+
+import hail as hl
+import pandas as pd
+from annoy import AnnoyIndex
+from hail.utils.misc import divide_null, new_temp_file
+from sklearn.neighbors import NearestNeighbors
+
+from gnomad.utils.gen_stats import get_median_and_mad_expr, merge_stats_counters_expr
+
+logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s")
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+
+
[docs]def compute_qc_metrics_residuals( + ht: hl.Table, + pc_scores: hl.expr.ArrayNumericExpression, + qc_metrics: Dict[str, hl.expr.NumericExpression], + use_pc_square: bool = True, + n_pcs: Optional[int] = None, + regression_sample_inclusion_expr: hl.expr.BooleanExpression = hl.bool(True), + strata: Optional[Dict[str, hl.expr.Expression]] = None, +) -> hl.Table: + """ + Compute QC metrics residuals after regressing out PCs (and optionally PC^2). + + .. note:: + + The `regression_sample_inclusion_expr` can be used to select a subset of the + samples to include in the regression calculation. Residuals are always computed + for all samples. + + :param ht: Input sample QC metrics HT. + :param pc_scores: The expression in the input HT that stores the PC scores. + :param qc_metrics: A dictionary with the name of each QC metric to compute + residuals for and their corresponding expression in the input HT. + :param use_pc_square: Whether to use PC^2 in the regression or not. + :param n_pcs: Number of PCs to use. If not set, then all PCs in `pc_scores` are used. + :param regression_sample_inclusion_expr: An optional expression to select samples + to include in the regression calculation. + :param strata: Optional dictionary used for stratification. Keys are strata names + and values are filtering expressions. These expressions should refer to + data with discrete types! + :return: Table with QC metrics residuals. + """ + if strata is None: + strata = {"all": True} + collapse_lms = True + else: + collapse_lms = False + + # Annotate QC HT with fields necessary for computation + _sample_qc_ht = ht.select( + **qc_metrics, + scores=pc_scores, + _keep=regression_sample_inclusion_expr, + _strata=hl.tuple([strata[x] for x in strata]), + ) + + # If n_pcs wasn't provided, use all PCs + if n_pcs is None: + n_pcs = _sample_qc_ht.aggregate(hl.agg.min(hl.len(_sample_qc_ht.scores))) + + logger.info( + "Computing regressed QC metrics filters using %d PCs for metrics: %s", + n_pcs, + ", ".join(qc_metrics), + ) + + # Prepare regression variables, adding 1.0 first for the intercept + # Adds square of variables if use_pc_square is true + x_expr = [1.0] + [_sample_qc_ht.scores[i] for i in range(0, n_pcs)] + if use_pc_square: + x_expr.extend( + [_sample_qc_ht.scores[i] * _sample_qc_ht.scores[i] for i in range(0, n_pcs)] + ) + + # Compute linear regressions + lms = _sample_qc_ht.aggregate( + hl.agg.group_by( + _sample_qc_ht._strata, + hl.struct( + **{ + metric: hl.agg.filter( + _sample_qc_ht._keep, + hl.agg.linreg(y=_sample_qc_ht[metric], x=x_expr), + ) + for metric in qc_metrics + } + ), + ), + _localize=False, + ) + + _sample_qc_ht = _sample_qc_ht.annotate_globals(lms=lms) + _sample_qc_ht = _sample_qc_ht.checkpoint( + new_temp_file("compute_qc_metrics_residuals.lms", extension="ht") + ) + + # Compute residuals + def get_lm_prediction_expr(metric: str): + lm_pred_expr = _sample_qc_ht.lms[_sample_qc_ht._strata][metric].beta[ + 0 + ] + hl.sum( + hl.range(n_pcs).map( + lambda i: _sample_qc_ht.lms[_sample_qc_ht._strata][metric].beta[i + 1] + * _sample_qc_ht.scores[i] + ) + ) + if use_pc_square: + lm_pred_expr = lm_pred_expr + hl.sum( + hl.range(n_pcs).map( + lambda i: _sample_qc_ht.lms[_sample_qc_ht._strata][metric].beta[ + i + n_pcs + 1 + ] + * _sample_qc_ht.scores[i] + * _sample_qc_ht.scores[i] + ) + ) + return lm_pred_expr + + residuals_ht = _sample_qc_ht.select( + **{ + f"{metric}_residual": _sample_qc_ht[metric] - get_lm_prediction_expr(metric) + for metric in qc_metrics + } + ) + if collapse_lms: + residuals_ht = 
residuals_ht.annotate_globals( + lms=residuals_ht.lms[hl.tuple([True])] + ) + residuals_ht = residuals_ht.checkpoint( + new_temp_file("compute_qc_metrics_residuals.residuals", extension="ht") + ) + + return residuals_ht
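A hedged sketch of a typical call (the path and the field names `scores`, `releasable`, and `sample_qc.*` are assumptions about the input Table, not requirements of the function beyond their types):

ht = hl.read_table("gs://my-bucket/sample_qc.ht")

residuals_ht = compute_qc_metrics_residuals(
    ht,
    pc_scores=ht.scores,
    qc_metrics={
        "r_ti_tv": ht.sample_qc.r_ti_tv,
        "r_het_hom_var": ht.sample_qc.r_het_hom_var,
    },
    n_pcs=10,
    # Fit the regressions on a subset, e.g. releasable samples only.
    regression_sample_inclusion_expr=ht.releasable,
)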
+ + +
[docs]def compute_stratified_metrics_filter( + ht: hl.Table, + qc_metrics: Dict[str, hl.expr.NumericExpression], + strata: Optional[Dict[str, hl.expr.Expression]] = None, + lower_threshold: float = 4.0, + upper_threshold: float = 4.0, + metric_threshold: Optional[Dict[str, Tuple[float, float]]] = None, + filter_name: str = "qc_metrics_filters", + comparison_sample_expr: Optional[ + Union[hl.expr.BooleanExpression, hl.expr.CollectionExpression] + ] = None, +) -> hl.Table: + """ + Compute median, MAD, and upper and lower thresholds for each metric used in outlier filtering. + + :param ht: HT containing relevant sample QC metric annotations. + :param qc_metrics: Dictionary of metrics (name and expr) for which to compute the + critical values for filtering outliers. + :param strata: Dictionary of annotations used for stratification. These annotations + should be of discrete types! + :param lower_threshold: Lower MAD threshold. + :param upper_threshold: Upper MAD threshold. + :param metric_threshold: Can be used to specify different (lower, upper) thresholds + for one or more metrics. + :param filter_name: Name of resulting filters annotation. + :param comparison_sample_expr: Optional BooleanExpression or CollectionExpression + of sample IDs to use for computation of the metric median, MAD, and upper and + lower thresholds to use for each sample. For instance, this works well with the + output of `determine_nearest_neighbors` or a boolean expression defining + releasable samples. + :return: Table annotated with the stratified metric thresholds + (`qc_metrics_stats`), a `fail_<metric>` boolean for each metric, and the set of + failing metrics per sample (`filter_name`). + """ + _metric_threshold = { + metric: (lower_threshold, upper_threshold) for metric in qc_metrics + } + if metric_threshold is not None: + _metric_threshold.update(metric_threshold) + + no_strata = False + if strata is None: + no_strata = True + strata = {"all": True} + + strata = list(strata.items()) + select_expr = { + "_qc_metrics": qc_metrics, + "_strata": hl.tuple([x[1] for x in strata]), + } + + sample_explode = False + if comparison_sample_expr is not None: + if isinstance(comparison_sample_expr, hl.expr.BooleanExpression): + select_expr["_comparison_qc_metrics"] = hl.or_missing( + comparison_sample_expr, qc_metrics + ) + ht = ht.select(**select_expr) + metric_ann = "_comparison_qc_metrics" + strata_ann = "_strata" + else: + sample_explode = True + select_expr["_comparison_sample"] = comparison_sample_expr + pre_explode_ht = ht.select(**select_expr) + ht = pre_explode_ht.explode(pre_explode_ht._comparison_sample) + ht = ht.annotate( + _comparison_qc_metrics=ht[ht._comparison_sample]._qc_metrics, + _comparison_strata=ht[ht._comparison_sample]._strata, + ) + metric_ann = "_comparison_qc_metrics" + strata_ann = "_comparison_strata" + else: + ht = ht.select(**select_expr) + metric_ann = "_qc_metrics" + strata_ann = "_strata" + + ht = ht.checkpoint( + new_temp_file("compute_stratified_metrics_filter", extension="ht") + ) + + agg_expr = hl.agg.group_by( + ht[strata_ann], + hl.struct( + **{ + metric: hl.bind( + lambda x: x.annotate( + lower=x.median - _metric_threshold[metric][0] * x.mad, + upper=x.median + _metric_threshold[metric][1] * x.mad, + ), + get_median_and_mad_expr(ht[metric_ann][metric]), + ) + for metric in qc_metrics + } + ), + ) + + select_expr = {} + if sample_explode: + ht = pre_explode_ht.annotate( + **ht.group_by(ht.s).aggregate(qc_metrics_stats=agg_expr)[pre_explode_ht.key] + ) + select_expr = {"qc_metrics_stats": ht.qc_metrics_stats} + else: + ht = ht.annotate_globals( + 
qc_metrics_stats=ht.aggregate(agg_expr, _localize=False) + ) + + metrics_stats_expr = ht.qc_metrics_stats[ht._strata] + select_expr.update( + **{ + f"fail_{metric}": ( + ht._qc_metrics[metric] <= metrics_stats_expr[metric].lower + ) + | (ht._qc_metrics[metric] >= metrics_stats_expr[metric].upper) + for metric in qc_metrics + } + ) + ht = ht.select(**select_expr) + + stratified_filters = hl.set( + hl.filter( + lambda x: hl.is_defined(x), + [hl.or_missing(ht[f"fail_{metric}"], metric) for metric in qc_metrics], + ) + ) + ht = ht.annotate(**{filter_name: stratified_filters}) + + if no_strata: + ann_expr = {"qc_metrics_stats": ht.qc_metrics_stats[(True,)]} + if sample_explode: + ht = ht.annotate(**ann_expr) + else: + ht = ht.annotate_globals(**ann_expr) + + else: + ht = ht.annotate_globals(strata=hl.tuple([x[0] for x in strata])) + ht = ht.annotate_globals(qc_metrics=list(qc_metrics.keys())) + + return ht
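Continuing the hypothetical residuals example above, outliers can then be flagged at 4 MADs on both sides (8 on the upper side for one metric):

filter_ht = compute_stratified_metrics_filter(
    residuals_ht,
    qc_metrics={
        m: residuals_ht[m] for m in ["r_ti_tv_residual", "r_het_hom_var_residual"]
    },
    metric_threshold={"r_ti_tv_residual": (4.0, 8.0)},
)

# Samples failing at least one metric.
outliers_ht = filter_ht.filter(hl.len(filter_ht.qc_metrics_filters) > 0)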
+ + +
[docs]def compute_stratified_sample_qc( + mtds: Union[hl.MatrixTable, hl.vds.VariantDataset], + strata: Dict[str, hl.expr.BooleanExpression], + tmp_ht_prefix: Optional[str], + gt_col: Optional[str] = None, +) -> hl.Table: + """ + Run hl.sample_qc on different strata and then also merge the results into a single expression. + + .. note:: + + Strata should be non-overlapping, e.g. SNV vs indels or bi-allelic vs multi-allelic + + :param mtds: Input MatrixTable or VariantDataset + :param strata: Strata names and filtering expressions + :param tmp_ht_prefix: Optional path prefix to write the intermediate strata results to (recommended for larger datasets) + :param gt_col: Name of entry field storing the genotype. Default: 'GT' + :return: Sample QC table, including strat-specific numbers + """ + is_vds = isinstance(mtds, hl.vds.VariantDataset) + if is_vds: + mt = mtds.variant_data + else: + mt = mtds + + mt = mt.select_rows(**strata) + + if gt_col is not None: + mt = mt.select_entries(GT=mt[gt_col]) + else: + mt = mt.select_entries("GT") + + strat_hts = {} + for strat in strata: + if is_vds: + ht = mt.filter_rows(mt[strat]).rows() + strat_sample_qc_ht = hl.vds.sample_qc(hl.vds.filter_variants(mtds, ht)) + else: + strat_sample_qc_ht = hl.sample_qc(mt.filter_rows(mt[strat])).cols() + if tmp_ht_prefix is not None: + strat_sample_qc_ht = strat_sample_qc_ht.checkpoint( + tmp_ht_prefix + f"_{strat}.ht", overwrite=True + ) + else: + strat_sample_qc_ht = strat_sample_qc_ht.persist() + strat_hts[strat] = strat_sample_qc_ht + + sample_qc_ht = strat_hts.pop(list(strata)[0]) + if is_vds: + sample_qc_ht = sample_qc_ht.select( + **{ + f"{list(strata)[0]}_sample_qc": hl.struct( + **{ + f"{field}": sample_qc_ht[field] + for field in sample_qc_ht.row_value + } + ) + }, + **{ + f"{strat}_sample_qc": hl.struct( + **{ + f"{field}": strat_hts[strat][sample_qc_ht.key][field] + for field in sample_qc_ht.row_value + } + ) + for strat in list(strata)[1:] + }, + ) + else: + sample_qc_ht = sample_qc_ht.select( + **{f"{list(strata)[0]}_sample_qc": sample_qc_ht.sample_qc}, + **{ + f"{strat}_sample_qc": strat_hts[strat][sample_qc_ht.key].sample_qc + for strat in list(strata)[1:] + }, + ) + sample_qc_ht = sample_qc_ht.annotate( + sample_qc=merge_sample_qc_expr(list(sample_qc_ht.row_value.values())) + ) + + return sample_qc_ht
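A sketch stratifying by allelic type (the input path and temporary prefix are made up; `bi_allelic_expr` is the helper from gnomad.utils.annotations):

from gnomad.utils.annotations import bi_allelic_expr

mt = hl.read_matrix_table("gs://my-bucket/raw.mt")
sample_qc_ht = compute_stratified_sample_qc(
    mt,
    strata={
        "bi_allelic": bi_allelic_expr(mt),
        "multi_allelic": ~bi_allelic_expr(mt),
    },
    tmp_ht_prefix="gs://my-bucket/tmp/sample_qc",
)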
+ + +
[docs]def merge_sample_qc_expr( + sample_qc_exprs: List[hl.expr.StructExpression], +) -> hl.expr.StructExpression: + """ + Create an expression that merges results from non-overlapping strata of hail.sample_qc. + + E.g.: + + - Compute autosomes and sex chromosomes metrics separately, then merge results + - Compute bi-allelic and multi-allelic metrics separately, then merge results + + Note regarding the merging of ``dp_stats`` and ``gq_stats``: + Because ``n`` is needed to aggregate ``stdev``, ``n_called`` is used for this purpose. + This should work very well on a standard GATK VCF and it essentially assumes that: + + - samples that are called have `DP` and `GQ` fields + - samples that are not called do not have `DP` and `GQ` fields + + Even if these assumptions are broken for some genotypes, it shouldn't matter too much. + + :param sample_qc_exprs: List of sample QC struct expressions for each stratification + :return: Combined sample QC results + """ + # List of metrics that can be aggregated by summing + additive_metrics = ( + [ + "n_called", + "n_not_called", + "n_filtered", + "n_hom_ref", + "n_het", + "n_hom_var", + "n_non_ref", + "n_snp", + "n_insertion", + "n_deletion", + "n_singleton", + "n_transition", + "n_transversion", + "n_star", + "n_singleton_ti", + "n_singleton_tv", + ] + + ["gq_over_" + f"{GQ}" for GQ in range(0, 70, 10)] + + ["dp_over_" + f"{DP}" for DP in range(0, 40, 10)] + ) + + # List of metrics that are ratio of summed metrics (name, numerator, denominator) + ratio_metrics = [ + ("call_rate", "n_called", "n_not_called"), + ("r_ti_tv", "n_transition", "n_transversion"), + ("r_ti_tv_singleton", "n_singleton_ti", "n_singleton_tv"), + ("r_het_hom_var", "n_het", "n_hom_var"), + ("r_insertion_deletion", "n_insertion", "n_deletion"), + ] + + # List of metrics that are struct generated by a stats counter + stats_metrics = ["gq_stats", "dp_stats"] + + # Gather metrics present in sample qc fields + sample_qc_fields = set(sample_qc_exprs[0]) + for sample_qc_expr in sample_qc_exprs[1:]: + sample_qc_fields = sample_qc_fields.union(set(sample_qc_expr)) + + # Merge additive metrics in sample qc fields + merged_exprs = { + metric: hl.sum([sample_qc_expr[metric] for sample_qc_expr in sample_qc_exprs]) + for metric in additive_metrics + if metric in sample_qc_fields + } + + # Merge ratio metrics in sample qc fields + merged_exprs.update( + { + metric: hl.float64(divide_null(merged_exprs[nom], merged_exprs[denom])) + for metric, nom, denom in ratio_metrics + if nom in sample_qc_fields and denom in sample_qc_fields + } + ) + + # Merge stats counter metrics in sample qc fields + # Use n_called as n for DP and GQ stats + if "n_called" in sample_qc_fields: + merged_exprs.update( + { + metric: merge_stats_counters_expr( + [ + sample_qc_expr[metric].annotate(n=sample_qc_expr.n_called) + for sample_qc_expr in sample_qc_exprs + ] + ).drop("n") + for metric in stats_metrics + if metric in sample_qc_fields + } + ) + + return hl.struct(**merged_exprs)
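For example (with hypothetical field names), per-stratum structs computed on autosomes and sex chromosomes can be merged back into one:

merged_ht = ht.annotate(
    sample_qc=merge_sample_qc_expr([ht.autosome_sample_qc, ht.sex_chr_sample_qc])
)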
+ + +
[docs]def determine_nearest_neighbors( + ht: hl.Table, + scores_expr: hl.expr.ArrayNumericExpression, + strata: Optional[Dict[str, hl.expr.Expression]] = None, + n_pcs: Optional[int] = None, + n_neighbors: int = 50, + n_jobs: int = -1, + add_neighbor_distances: bool = False, + distance_metric: str = "euclidean", + use_approximation: bool = False, + n_trees: int = 10, +) -> hl.Table: + """ + Determine the nearest neighbors of each sample with information in `scores_expr`. + + .. note:: + + If strata is provided, the nearest neighbors for each sample are limited to the + other samples with the same strata values. If `n_neighbors` is greater than the + number of samples in a stratification grouping, all samples within the + stratification are returned and a warning is raised indicating that every sample + within the stratification group has fewer than the expected `n_neighbors`. + + The following annotations are in the returned Table: + - nearest_neighbors + - nearest_neighbor_dists (if `add_neighbor_distances` is True) + + :param ht: Input Table. + :param scores_expr: Expression in the input HT that stores the PC scores. + :param strata: Optional dictionary used for stratification. Keys are strata names + and values are filtering expressions. These expressions should refer to + data with discrete types! + :param n_pcs: Number of PCs to use. If not set, then all PCs in `scores_expr` are + used. + :param n_neighbors: Number of nearest neighbors to identify for each sample. + Default is 50. + :param n_jobs: Number of threads to use when finding the nearest neighbors. Default + is -1, which uses all available CPUs. + :param add_neighbor_distances: Whether to return an annotation for the nearest + neighbor distances. + :param distance_metric: Distance metric to use. Default is euclidean. Options + using scikit-learn are: "euclidean", "cityblock", "cosine", "haversine", "l1", + "l2", and "manhattan". Options using Annoy: "angular", "euclidean", "manhattan", + "hamming", and "dot". + :param use_approximation: Whether to use the package Annoy to determine approximate + nearest neighbors instead of using scikit-learn's `NearestNeighbors`. This + method is faster, but only needed for very large datasets, for instance + > 500,000 samples. + :param n_trees: Number of trees to use in the annoy approximation approach. + `n_trees` is provided during build time and affects the build time and the + index size. A larger value will give more accurate results, but larger indexes. + Default is 10. + :return: Table with an annotation for the nearest neighbors and optionally their + distances. + """ + # Get spark session for conversion of pandas DataFrame to a spark DataFrame. + # This method is faster and uses less memory than hl.Table.from_pandas. + spark = hl.utils.java.Env.spark_session() + + # Annotate HT with fields necessary for nearest neighbors computation. + # Checkpoint before filtering and exporting to pandas dataframes. + ann_expr = {"scores": scores_expr} + if strata is not None: + ann_expr["strata"] = hl.tuple([strata[x] for x in strata]) + else: + ann_expr["strata"] = True + + # If `n_pcs` wasn't provided, use all PCs. 
+ if n_pcs is None: + n_pcs = ht.aggregate(hl.agg.min(hl.len(scores_expr))) + + _ht = ht.select(**ann_expr) + _ht = _ht.filter(hl.is_defined(_ht.scores)) + _ht = _ht.transmute(**{f"PC{i + 1}": _ht.scores[i] for i in range(n_pcs)}) + logger.info("Checkpointing intermediate Table before converting to pandas...") + _ht = _ht.checkpoint(new_temp_file("determine_nearest_neighbors", extension="ht")) + + all_strata_vals = _ht.aggregate(hl.agg.collect_as_set(_ht.strata)) + all_nbr_hts = [] + for group in all_strata_vals: + logger_str = "" + if strata is not None: + logger_str += f", for the following stratification group: {group}" + logger.info( + "Finding %d %snearest neighbors, using the %s distance metric%s.", + n_neighbors, + "approximate " if use_approximation else "", + distance_metric, + logger_str, + ) + scores_pd = _ht.filter(_ht.strata == group).to_pandas() + scores_pd_s = scores_pd.s + scores_pd = scores_pd[[f"PC{i + 1}" for i in range(n_pcs)]] + # Get the number of rows/samples in the stratification group. + group_n = scores_pd.shape[0] + group_n_neighbors = min(n_neighbors, group_n) + if n_neighbors > group_n: + logger.warning( + "The requested number of nearest neighbors (%d) is larger than the " + "number of samples in the %s stratification group (%d). Only %d " + "neighbors will be returned for all samples in this group.", + n_neighbors, + group, + group_n, + group_n, + ) + if use_approximation: + nbrs = AnnoyIndex(n_pcs, distance_metric) + for i, row in scores_pd.iterrows(): + nbrs.add_item(i, row) + nbrs.build(n_trees, n_jobs=n_jobs) + + indexes = [] + for i in range(group_n): + indexes.append( + nbrs.get_nns_by_item( + i, group_n_neighbors, include_distances=add_neighbor_distances + ) + ) + if add_neighbor_distances: + distances = [d for i, d in indexes] + indexes = [i for i, d in indexes] + else: + scores = scores_pd.values + nbrs = NearestNeighbors( + n_neighbors=group_n_neighbors, n_jobs=n_jobs, metric=distance_metric + ) + nbrs.fit(scores) + indexes = nbrs.kneighbors(scores, return_distance=add_neighbor_distances) + if add_neighbor_distances: + distances, indexes = indexes + + # Format neighbor indexes as a Hail Table. + indexes_pd = pd.DataFrame(indexes) + indexes_pd = pd.concat([scores_pd_s, indexes_pd], axis=1) + indexes_pd = indexes_pd.rename( + columns={i: f"nbrs_index_{i}" for i in range(group_n_neighbors)} + ) + indexes_ht = hl.Table.from_spark(spark.createDataFrame(indexes_pd), key=["s"]) + indexes_ht = indexes_ht.transmute( + nearest_neighbor_idxs=hl.array( + [indexes_ht[f"nbrs_index_{i}"] for i in range(group_n_neighbors)] + ) + ) + + if add_neighbor_distances: + # Format neighbor distances as a Hail Table. + distances_pd = pd.DataFrame(distances) + distances_pd = distances_pd.rename( + columns={i: f"nbrs_{i}" for i in range(group_n_neighbors)} + ) + distances_pd = pd.concat([scores_pd_s, distances_pd], axis=1) + distances_ht = hl.Table.from_spark( + spark.createDataFrame(distances_pd), key=["s"] + ) + distances_indexed = distances_ht[indexes_ht.key] + nbrs_ht = indexes_ht.annotate( + nearest_neighbor_dists=hl.array( + [ + distances_indexed[f"nbrs_{str(i)}"] + for i in range(group_n_neighbors) + ] + ) + ) + else: + nbrs_ht = indexes_ht + + # Add nearest_neighbors annotation to use instead of indexes. 
+ nbrs_ht = nbrs_ht.add_index() + nbrs_ht = nbrs_ht.annotate( + _nearest_neighbor_idxs=hl.enumerate(nbrs_ht.nearest_neighbor_idxs) + ) + explode_nbrs_ht = nbrs_ht.key_by("idx").explode("_nearest_neighbor_idxs") + nbrs_idx_expr = explode_nbrs_ht._nearest_neighbor_idxs + explode_nbrs_ht = explode_nbrs_ht.transmute( + nbr=(nbrs_idx_expr[0], explode_nbrs_ht[hl.int64(nbrs_idx_expr[1])].s) + ) + explode_nbrs_ht = explode_nbrs_ht.group_by("s").aggregate( + nearest_neighbors=hl.sorted(hl.agg.collect(explode_nbrs_ht.nbr)).map( + lambda x: x[1] + ) + ) + nbrs_ht = nbrs_ht.annotate( + nearest_neighbors=explode_nbrs_ht[nbrs_ht.key].nearest_neighbors + ) + nbrs_ht = nbrs_ht.drop("_nearest_neighbor_idxs") + logger.info( + "Checkpointing intermediate Table with nearest neighbor information..." + ) + nbrs_ht = nbrs_ht.checkpoint( + new_temp_file("determine_nearest_neighbors.strata", extension="ht") + ) + all_nbr_hts.append(nbrs_ht) + + nbrs_ht = all_nbr_hts[0] + if len(all_nbr_hts) > 1: + logger.info("Combining all nearest neighbor stratification Tables...") + nbrs_ht = nbrs_ht.union(*all_nbr_hts[1:]) + + nbrs_ht = nbrs_ht.annotate_globals(n_pcs=n_pcs, n_neighbors=n_neighbors) + + return ht.annotate(**nbrs_ht[ht.key])
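A hedged sketch (the `scores` and `platform` fields are assumptions about the input Table):

nn_ht = determine_nearest_neighbors(
    ht,
    scores_expr=ht.scores,
    strata={"platform": ht.platform},  # restrict neighbors to the same platform
    n_pcs=16,
    n_neighbors=50,
    add_neighbor_distances=True,
)
# nn_ht.nearest_neighbors can feed compute_stratified_metrics_filter via its
# comparison_sample_expr parameter.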
+
+ +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/_modules/gnomad/sample_qc/pipeline.html b/_modules/gnomad/sample_qc/pipeline.html new file mode 100644 index 000000000..ef533e2c9 --- /dev/null +++ b/_modules/gnomad/sample_qc/pipeline.html @@ -0,0 +1,829 @@ + + + + + + gnomad.sample_qc.pipeline — gnomad master documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for gnomad.sample_qc.pipeline

+# noqa: D100
+
+import functools
+import logging
+import operator
+from typing import List, Optional, Union
+
+import hail as hl
+
+from gnomad.sample_qc.sex import (
+    gaussian_mixture_model_karyotype_assignment,
+    get_chr_x_hom_alt_cutoffs,
+    get_ploidy_cutoffs,
+    get_sex_expr,
+)
+from gnomad.utils.annotations import (
+    bi_allelic_expr,
+    bi_allelic_site_inbreeding_expr,
+    get_adj_expr,
+)
+from gnomad.utils.filtering import filter_low_conf_regions, filter_to_adj
+from gnomad.utils.reference_genome import get_reference_genome
+from gnomad.utils.sparse_mt import impute_sex_ploidy
+
+logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s")
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+
+
[docs]def filter_rows_for_qc( + mt: hl.MatrixTable, + min_af: Optional[float] = 0.001, + min_callrate: Optional[float] = 0.99, + min_inbreeding_coeff_threshold: Optional[float] = -0.8, + min_hardy_weinberg_threshold: Optional[float] = 1e-8, + apply_hard_filters: bool = True, + bi_allelic_only: bool = True, + snv_only: bool = True, +) -> hl.MatrixTable: + """ + Annotate rows with `site_callrate`, `site_inbreeding_coeff` and `af`, then apply thresholds. + + AF and callrate thresholds are taken from gnomAD QC; inbreeding coeff, MQ, FS and QD filters are taken from + GATK best practices. + + .. note:: + + This function expects the typical ``info`` annotation of type struct with fields ``MQ``, ``FS`` and ``QD`` + if applying hard filters. + + :param mt: Input MT + :param min_af: Minimum site AF to keep. Not applied if set to ``None``. + :param min_callrate: Minimum site call rate to keep. Not applied if set to ``None``. + :param min_inbreeding_coeff_threshold: Minimum site inbreeding coefficient to keep. Not applied if set to ``None``. + :param min_hardy_weinberg_threshold: Minimum site HW test p-value to keep. Not applied if set to ``None``. + :param apply_hard_filters: Whether to apply standard GATK default site hard filters: QD >= 2, FS <= 60 and MQ >= 30. + :param bi_allelic_only: Whether to only keep bi-allelic sites or include multi-allelic sites too. + :param snv_only: Whether to only keep SNVs or include other variant types. + :return: Annotated and filtered MT + """ + annotation_expr = {} + + if min_af is not None: + annotation_expr["af"] = hl.agg.mean(mt.GT.n_alt_alleles()) / 2 + if min_callrate is not None: + annotation_expr["site_callrate"] = hl.agg.fraction(hl.is_defined(mt.GT)) + if min_inbreeding_coeff_threshold is not None: + annotation_expr["site_inbreeding_coeff"] = bi_allelic_site_inbreeding_expr( + mt.GT + ) + if min_hardy_weinberg_threshold is not None: + annotation_expr["hwe"] = hl.agg.hardy_weinberg_test(mt.GT) + + if annotation_expr: + mt = mt.annotate_rows(**annotation_expr) + + filter_expr = [] + if min_af is not None: + filter_expr.append((mt.af > min_af)) + if min_callrate is not None: + filter_expr.append((mt.site_callrate > min_callrate)) + if min_inbreeding_coeff_threshold is not None: + filter_expr.append((mt.site_inbreeding_coeff > min_inbreeding_coeff_threshold)) + if min_hardy_weinberg_threshold is not None: + filter_expr.append((mt.hwe.p_value > min_hardy_weinberg_threshold)) + if snv_only: + filter_expr.append(hl.is_snp(mt.alleles[0], mt.alleles[1])) + if bi_allelic_only: + filter_expr.append(bi_allelic_expr(mt)) + + if apply_hard_filters: + if "info" in mt.row_value: + if "QD" in mt.info: + filter_expr.append((mt.info.QD >= 2)) + else: + logger.warning( + "Could not apply QD hard filter, as `info.QD` not found in schema." + ) + if "FS" in mt.info: + filter_expr.append((mt.info.FS <= 60)) + else: + logger.warning( + "Could not apply FS hard filter, as `info.FS` not found in schema." + ) + if "MQ" in mt.info: + filter_expr.append((mt.info.MQ >= 30)) + else: + logger.warning( + "Could not apply MQ hard filter, as `info.MQ` not found in schema." + ) + else: + logger.warning( + "Could not apply hard filters as `info` not found in schema." + ) + + return mt.filter_rows(functools.reduce(operator.iand, filter_expr))
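A usage sketch (hypothetical path; hard filters are disabled here because this toy MT is assumed to lack an `info` struct):

mt = hl.read_matrix_table("gs://my-bucket/dense.mt")
qc_sites_mt = filter_rows_for_qc(
    mt,
    min_af=0.001,
    min_callrate=0.99,
    apply_hard_filters=False,
)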
+ + +
[docs]def get_qc_mt( + mt: hl.MatrixTable, + bi_allelic_only: bool = True, + snv_only: bool = True, + adj_only: bool = True, + min_af: Optional[float] = 0.001, + min_callrate: Optional[float] = 0.99, + min_inbreeding_coeff_threshold: Optional[float] = -0.8, + min_hardy_weinberg_threshold: Optional[float] = 1e-8, + apply_hard_filters: bool = True, + ld_r2: Optional[float] = 0.1, + filter_lcr: bool = True, + filter_decoy: bool = True, + filter_segdup: bool = True, + filter_exome_low_coverage_regions: bool = False, + high_conf_regions: Optional[List[str]] = None, + checkpoint_path: Optional[str] = None, + n_partitions: Optional[int] = None, + block_size: Optional[int] = None, +) -> hl.MatrixTable: + """ + Create a QC-ready MT. + + Has options to filter to the following: + - Variants outside known problematic regions + - Bi-allelic sites only + - SNVs only + - Variants passing hard thresholds + - Variants passing the set call rate and MAF thresholds + - Genotypes passing gnomAD ADJ criteria (GQ>=20, DP>=10, AB>0.2 for hets) + + In addition, the MT will be LD-pruned if `ld_r2` is set. + + :param mt: Input MT. + :param bi_allelic_only: Whether to only keep bi-allelic sites or include multi-allelic sites too. + :param snv_only: Whether to only keep SNVs or include other variant types. + :param adj_only: If set, only ADJ genotypes are kept. This filter is applied before the call rate and AF calculation. + :param min_af: Minimum allele frequency to keep. Not applied if set to ``None``. + :param min_callrate: Minimum call rate to keep. Not applied if set to ``None``. + :param min_inbreeding_coeff_threshold: Minimum site inbreeding coefficient to keep. Not applied if set to ``None``. + :param min_hardy_weinberg_threshold: Minimum site HW test p-value to keep. Not applied if set to ``None``. + :param apply_hard_filters: Whether to apply standard GATK default site hard filters: QD >= 2, FS <= 60 and MQ >= 30. + :param ld_r2: Minimum r2 to keep when LD-pruning (set to `None` for no LD pruning). + :param filter_lcr: Filter LCR regions. + :param filter_decoy: Filter decoy regions. + :param filter_segdup: Filter segmental duplication regions. + :param filter_exome_low_coverage_regions: If set, only high coverage exome regions (computed from gnomAD) are kept. + :param high_conf_regions: If given, the data will be filtered to only include variants in those regions. + :param checkpoint_path: If given, the QC MT will be checkpointed to the specified path before running LD pruning. If not specified, persist will be used instead. + :param n_partitions: If given, the QC MT will be repartitioned to the specified number of partitions before running LD pruning. `checkpoint_path` must also be specified as the MT will first be written to the `checkpoint_path` before being reread with the new number of partitions. + :param block_size: If given, set the block size to this value when LD pruning. + :return: Filtered MT. + """ + logger.info("Creating QC MatrixTable") + if ld_r2 is not None: + logger.warning( + "The LD-prune step of this function requires non-preemptible workers only!" 
+ ) + + if n_partitions and not checkpoint_path: + raise ValueError("checkpoint_path must be supplied if repartitioning!") + + qc_mt = filter_low_conf_regions( + mt, + filter_lcr=filter_lcr, + filter_decoy=filter_decoy, + filter_segdup=filter_segdup, + filter_exome_low_coverage_regions=filter_exome_low_coverage_regions, + high_conf_regions=high_conf_regions, + ) + + if adj_only: + qc_mt = filter_to_adj( + qc_mt + ) # TODO: Make sure that this works fine before call rate filtering + + qc_mt = filter_rows_for_qc( + qc_mt, + min_af, + min_callrate, + min_inbreeding_coeff_threshold, + min_hardy_weinberg_threshold, + apply_hard_filters, + bi_allelic_only, + snv_only, + ) + + if ld_r2 is not None: + if checkpoint_path: + if n_partitions: + logger.info("Checkpointing and repartitioning the MT and LD pruning") + qc_mt.write(checkpoint_path, overwrite=True) + qc_mt = hl.read_matrix_table( + checkpoint_path, _n_partitions=n_partitions + ) + else: + logger.info("Checkpointing the MT and LD pruning") + qc_mt = qc_mt.checkpoint(checkpoint_path, overwrite=True) + else: + logger.info("Persisting the MT and LD pruning") + qc_mt = qc_mt.persist() + unfiltered_qc_mt = qc_mt.unfilter_entries() + pruned_ht = hl.ld_prune(unfiltered_qc_mt.GT, r2=ld_r2, block_size=block_size) + qc_mt = qc_mt.filter_rows(hl.is_defined(pruned_ht[qc_mt.row_key])) + + qc_mt = qc_mt.annotate_globals( + qc_mt_params=hl.struct( + bi_allelic_only=bi_allelic_only, + snv_only=snv_only, + adj_only=adj_only, + min_af=min_af if min_af is not None else hl.null(hl.tfloat32), + min_callrate=( + min_callrate if min_callrate is not None else hl.null(hl.tfloat32) + ), + inbreeding_coeff_threshold=( + min_inbreeding_coeff_threshold + if min_inbreeding_coeff_threshold is not None + else hl.null(hl.tfloat32) + ), + min_hardy_weinberg_threshold=( + min_hardy_weinberg_threshold + if min_hardy_weinberg_threshold is not None + else hl.null(hl.tfloat32) + ), + apply_hard_filters=apply_hard_filters, + ld_r2=ld_r2 if ld_r2 is not None else hl.null(hl.tfloat32), + filter_exome_low_coverage_regions=filter_exome_low_coverage_regions, + high_conf_regions=( + high_conf_regions + if high_conf_regions is not None + else hl.null(hl.tarray(hl.tstr)) + ), + ) + ) + return qc_mt.annotate_cols(sample_callrate=hl.agg.fraction(hl.is_defined(qc_mt.GT)))
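A usage sketch (paths invented; note that `checkpoint_path` must be set whenever `n_partitions` is, since repartitioning is done by writing and re-reading the MT):

qc_mt = get_qc_mt(
    mt,
    min_af=0.001,
    min_callrate=0.99,
    ld_r2=0.1,
    checkpoint_path="gs://my-bucket/tmp/qc_checkpoint.mt",
    n_partitions=1000,
)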
+ + +
[docs]def infer_sex_karyotype( + ploidy_ht: hl.Table, + f_stat_cutoff: float = 0.5, + use_gaussian_mixture_model: bool = False, + normal_ploidy_cutoff: int = 5, + aneuploidy_cutoff: int = 6, + chr_x_frac_hom_alt_expr: Optional[hl.expr.NumericExpression] = None, + normal_chr_x_hom_alt_cutoff: int = 5, +) -> hl.Table: + """ + Create a Table with X_karyotype, Y_karyotype, and sex_karyotype. + + This function uses `get_ploidy_cutoffs` to determine X and Y ploidy cutoffs and then `get_sex_expr` to get + karyotype annotations from those cutoffs. + + By default `f_stat_cutoff` will be used to roughly split samples into 'XX' and 'XY' for use in `get_ploidy_cutoffs`. + If `use_gaussian_mixture_model` is True, a gaussian mixture model will be used to split samples into 'XX' and 'XY' + instead of f-stat. + + :param ploidy_ht: Input Table with chromosome X and chromosome Y ploidy values and optionally f-stat. + :param f_stat_cutoff: f-stat to roughly divide 'XX' from 'XY' samples. Assumes XX samples are below cutoff and XY + are above cutoff. Default is 0.5. + :param use_gaussian_mixture_model: Use gaussian mixture model to split samples into 'XX' and 'XY' instead of f-stat. + :param normal_ploidy_cutoff: Number of standard deviations to use when determining sex chromosome ploidy cutoffs + for XX, XY karyotypes. + :param aneuploidy_cutoff: Number of standard deviations to use when determining sex chromosome ploidy cutoffs for + aneuploidies. + :param chr_x_frac_hom_alt_expr: Fraction of homozygous alternate genotypes (hom-alt/(hom-alt + het)) on chromosome X. + :param normal_chr_x_hom_alt_cutoff: Number of standard deviations to use when determining cutoffs for the fraction + of homozygous alternate genotypes (hom-alt/(hom-alt + het)) on chromosome X for XX and XY karyotypes. Only + used if `chr_x_frac_hom_alt_expr` is supplied. + :return: Table of samples' imputed sex karyotypes. + """ + logger.info("Inferring sex karyotype") + if chr_x_frac_hom_alt_expr is not None: + ploidy_ht = ploidy_ht.annotate(_chr_x_frac_hom_alt=chr_x_frac_hom_alt_expr) + + if use_gaussian_mixture_model: + logger.info("Using Gaussian Mixture Model for karyotype assignment") + gmm_sex_ht = gaussian_mixture_model_karyotype_assignment(ploidy_ht) + x_ploidy_cutoffs, y_ploidy_cutoffs = get_ploidy_cutoffs( + gmm_sex_ht, + group_by_expr=gmm_sex_ht.gmm_karyotype, + normal_ploidy_cutoff=normal_ploidy_cutoff, + aneuploidy_cutoff=aneuploidy_cutoff, + ) + ploidy_ht = ploidy_ht.annotate( + gmm_karyotype=gmm_sex_ht[ploidy_ht.key].gmm_karyotype + ) + group_by_expr = ploidy_ht.gmm_karyotype + f_stat_cutoff = None + else: + logger.info("Using f-stat for karyotype assignment") + x_ploidy_cutoffs, y_ploidy_cutoffs = get_ploidy_cutoffs( + ploidy_ht, + f_stat_cutoff=f_stat_cutoff, + normal_ploidy_cutoff=normal_ploidy_cutoff, + aneuploidy_cutoff=aneuploidy_cutoff, + ) + group_by_expr = None + + if chr_x_frac_hom_alt_expr is not None: + logger.info( + "Including cutoffs for the fraction of homozygous alternate genotypes" + " (hom-alt/(hom-alt + het)) on chromosome X. 
Using %d standard deviations" + " to determine cutoffs.", + normal_chr_x_hom_alt_cutoff, + ) + chr_x_frac_hom_alt_expr = ploidy_ht._chr_x_frac_hom_alt + chr_x_frac_hom_alt_cutoffs = get_chr_x_hom_alt_cutoffs( + ploidy_ht, + chr_x_frac_hom_alt_expr, + f_stat_cutoff=f_stat_cutoff, + group_by_expr=group_by_expr, + cutoff_stdev=normal_chr_x_hom_alt_cutoff, + ) + + else: + chr_x_frac_hom_alt_cutoffs = None + + karyotype_ht = ploidy_ht.select( + **get_sex_expr( + ploidy_ht.chrX_ploidy, + ploidy_ht.chrY_ploidy, + x_ploidy_cutoffs, + y_ploidy_cutoffs, + chr_x_frac_hom_alt_expr=chr_x_frac_hom_alt_expr, + chr_x_frac_hom_alt_cutoffs=chr_x_frac_hom_alt_cutoffs, + ) + ) + karyotype_ht = karyotype_ht.annotate_globals( + use_gaussian_mixture_model=use_gaussian_mixture_model, + normal_ploidy_cutoff=normal_ploidy_cutoff, + aneuploidy_cutoff=aneuploidy_cutoff, + x_ploidy_cutoffs=hl.struct( + upper_cutoff_X=x_ploidy_cutoffs[0], + lower_cutoff_XX=x_ploidy_cutoffs[1][0], + upper_cutoff_XX=x_ploidy_cutoffs[1][1], + lower_cutoff_XXX=x_ploidy_cutoffs[2], + ), + y_ploidy_cutoffs=hl.struct( + lower_cutoff_Y=y_ploidy_cutoffs[0][0], + upper_cutoff_Y=y_ploidy_cutoffs[0][1], + lower_cutoff_YY=y_ploidy_cutoffs[1], + ), + ) + if chr_x_frac_hom_alt_expr is not None: + karyotype_ht = karyotype_ht.annotate_globals( + x_frac_hom_alt_cutoffs=hl.struct( + lower_cutoff_more_than_one_X=chr_x_frac_hom_alt_cutoffs[0][0], + upper_cutoff_more_than_one_X=chr_x_frac_hom_alt_cutoffs[0][1], + lower_cutoff_single_X=chr_x_frac_hom_alt_cutoffs[1], + ) + ) + + if use_gaussian_mixture_model: + karyotype_ht = karyotype_ht.annotate( + gmm_sex_karyotype=ploidy_ht[karyotype_ht.key].gmm_karyotype + ) + else: + karyotype_ht = karyotype_ht.annotate_globals(f_stat_cutoff=f_stat_cutoff) + + return karyotype_ht
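A usage sketch (assumes `ploidy_ht` carries chrX_ploidy, chrY_ploidy and, for the default f-stat path, an f_stat annotation, e.g. from `annotate_sex` below):

karyotype_ht = infer_sex_karyotype(ploidy_ht, f_stat_cutoff=0.5)
print(karyotype_ht.aggregate(hl.agg.counter(karyotype_ht.sex_karyotype)))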
+ + +
[docs]def annotate_sex( + mtds: Union[hl.MatrixTable, hl.vds.VariantDataset], + is_sparse: bool = True, + excluded_intervals: Optional[hl.Table] = None, + included_intervals: Optional[hl.Table] = None, + normalization_contig: str = "chr20", + sites_ht: Optional[hl.Table] = None, + aaf_expr: Optional[str] = None, + gt_expr: str = "GT", + f_stat_cutoff: float = 0.5, + aaf_threshold: float = 0.001, + variants_only_x_ploidy: bool = False, + variants_only_y_ploidy: bool = False, + variants_filter_lcr: bool = True, + variants_filter_segdup: bool = True, + variants_filter_decoy: bool = False, + variants_snv_only: bool = False, + coverage_mt: Optional[hl.MatrixTable] = None, + compute_x_frac_variants_hom_alt: bool = False, + compute_fstat: bool = True, + infer_karyotype: bool = True, + use_gaussian_mixture_model: bool = False, +) -> hl.Table: + """ + Impute sample sex based on X-chromosome heterozygosity and sex chromosome ploidy. + + Return Table with the following fields: + - s (str): Sample + - `normalization_contig`_mean_dp (float32): Sample's mean coverage over the specified `normalization_contig`. + - chrX_mean_dp (float32): Sample's mean coverage over chromosome X. + - chrY_mean_dp (float32): Sample's mean coverage over chromosome Y. + - chrX_ploidy (float32): Sample's imputed ploidy over chromosome X. + - chrY_ploidy (float32): Sample's imputed ploidy over chromosome Y. + + If `compute_fstat`: + - f_stat (float64): Sample f-stat. Calculated using hl.impute_sex. + - n_called (int64): Number of variants with a genotype call. Calculated using hl.impute_sex. + - expected_homs (float64): Expected number of homozygotes. Calculated using hl.impute_sex. + - observed_homs (int64): Observed number of homozygotes. Calculated using hl.impute_sex. + + If `infer_karyotype`: + - X_karyotype (str): Sample's chromosome X karyotype. + - Y_karyotype (str): Sample's chromosome Y karyotype. + - sex_karyotype (str): Sample's sex karyotype. + + .. note:: + + In order to infer sex karyotype (`infer_karyotype`=True), one of `compute_fstat` or + `use_gaussian_mixture_model` must be set to True. + + :param mtds: Input MatrixTable or VariantDataset. + :param is_sparse: Whether input MatrixTable is in sparse data format. Default is True. + :param excluded_intervals: Optional table of intervals to exclude from the computation. This option is currently + not implemented for imputing sex chromosome ploidy on a VDS. + :param included_intervals: Optional table of intervals to use in the computation. REQUIRED for exomes. + :param normalization_contig: Which chromosome to use to normalize sex chromosome coverage. Used in determining sex + chromosome ploidies. Default is "chr20". + :param sites_ht: Optional Table of sites and alternate allele frequencies for filtering the input MatrixTable prior to imputing sex. + :param aaf_expr: Optional. Name of field in input MatrixTable with alternate allele frequency. + :param gt_expr: Name of entry field storing the genotype. Default is 'GT'. + :param f_stat_cutoff: f-stat to roughly divide 'XX' from 'XY' samples. Assumes XX samples are below cutoff and XY + samples are above cutoff. Default is 0.5. + :param aaf_threshold: Minimum alternate allele frequency to be used in f-stat calculations. Default is 0.001. + :param variants_only_x_ploidy: Whether to use depth of only variant data for the x ploidy estimation. + :param variants_only_y_ploidy: Whether to use depth of only variant data for the y ploidy estimation. 
+ :param variants_filter_lcr: Whether to filter out variants in LCR regions for variants only ploidy estimation and + fraction of homozygous alternate variants on chromosome X. Default is True. + :param variants_filter_segdup: Whether to filter out variants in segdup regions for variants only ploidy estimation + and fraction of homozygous alternate variants on chromosome X. Default is True. + :param variants_filter_decoy: Whether to filter out variants in decoy regions for variants only ploidy estimation + and fraction of homozygous alternate variants on chromosome X. Default is False. Note: this option doesn't + exist for GRCh38. + :param variants_snv_only: Whether to filter to only single nucleotide variants for variants only ploidy estimation + and fraction of homozygous alternate variants on chromosome X. Default is False. + :param coverage_mt: Optional precomputed coverage MatrixTable to use in reference based VDS ploidy estimation. + :param compute_x_frac_variants_hom_alt: Whether to return an annotation for the fraction of homozygous alternate + variants on chromosome X. Default is False. + :param compute_fstat: Whether to compute f-stat. Default is True. + :param infer_karyotype: Whether to infer sex karyotypes. Default is True. + :param use_gaussian_mixture_model: Whether to use gaussian mixture model to split samples into 'XX' and 'XY' + instead of f-stat. Default is False. + :return: Table of samples and their imputed sex karyotypes. + """ + logger.info("Imputing sex chromosome ploidies...") + + if infer_karyotype and not (compute_fstat or use_gaussian_mixture_model): + raise ValueError( + "In order to infer sex karyotype (infer_karyotype=True), one of" + " 'compute_fstat' or 'use_gaussian_mixture_model' must be set to True!" + ) + + is_vds = isinstance(mtds, hl.vds.VariantDataset) + if is_vds: + if excluded_intervals is not None: + raise NotImplementedError( + "The use of the parameter 'excluded_intervals' is currently not" + " implemented for imputing sex chromosome ploidy on a VDS!" + ) + if included_intervals is None: + raise NotImplementedError( + "The current implementation for imputing sex chromosome ploidy on a VDS" + " requires a list of 'included_intervals'!" + ) + mt = mtds.variant_data + else: + if not is_sparse: + raise NotImplementedError( + "Imputing sex ploidy does not exist yet for dense data." + ) + mt = mtds + + # Determine the contigs that are needed for variant only and reference + # block only sex ploidy imputation + rg = get_reference_genome(mt.locus) + if normalization_contig not in rg.contigs: + raise ValueError( + f"Normalization contig {normalization_contig} is not found in reference" + f" genome {rg.name}!" 
+ ) + + x_contigs = set(rg.x_contigs) + y_contigs = set(rg.y_contigs) + if variants_only_x_ploidy: + var_keep_contigs = x_contigs | {normalization_contig} + ref_keep_contigs = set() + else: + ref_keep_contigs = x_contigs | {normalization_contig} + var_keep_contigs = set() + if variants_only_y_ploidy: + var_keep_contigs = {normalization_contig} | var_keep_contigs | y_contigs + else: + ref_keep_contigs = {normalization_contig} | ref_keep_contigs | y_contigs + + ref_keep_locus_intervals = [ + hl.parse_locus_interval(contig, reference_genome=rg.name) + for contig in ref_keep_contigs + ] + var_keep_locus_intervals = [ + hl.parse_locus_interval(contig, reference_genome=rg.name) + for contig in var_keep_contigs + ] + x_locus_intervals = [ + hl.parse_locus_interval(contig, reference_genome=rg.name) + for contig in x_contigs + ] + + if ref_keep_contigs: + logger.info( + "Imputing sex chromosome ploidy using only reference block depth" + " information on the following contigs: %s", + ref_keep_contigs, + ) + if is_vds: + if coverage_mt is not None: + ploidy_ht = hl.vds.impute_sex_chr_ploidy_from_interval_coverage( + coverage_mt.filter_rows( + hl.is_defined(included_intervals[coverage_mt.row_key]) + & hl.literal(ref_keep_contigs).contains( + coverage_mt.interval.start.contig + ) + ), + normalization_contig=normalization_contig, + ) + else: + ploidy_ht = hl.vds.impute_sex_chromosome_ploidy( + hl.vds.filter_intervals(mtds, ref_keep_locus_intervals), + calling_intervals=included_intervals, + normalization_contig=normalization_contig, + use_variant_dataset=False, + ) + ploidy_ht = ploidy_ht.rename( + { + "x_ploidy": "chrX_ploidy", + "y_ploidy": "chrY_ploidy", + "x_mean_dp": "chrX_mean_dp", + "y_mean_dp": "chrY_mean_dp", + } + ) + else: + ploidy_ht = impute_sex_ploidy( + hl.filter_intervals(mt, ref_keep_locus_intervals), + excluded_intervals, + included_intervals, + normalization_contig, + use_only_variants=False, + ) + if variants_only_x_ploidy: + ploidy_ht = ploidy_ht.drop("chrX_ploidy", "chrX_mean_dp") + if variants_only_y_ploidy: + ploidy_ht = ploidy_ht.drop("chrY_ploidy", "chrY_mean_dp") + + add_globals = hl.struct() + if compute_x_frac_variants_hom_alt or var_keep_contigs: + logger.info( + "Filtering variants for variant only sex chromosome ploidy imputation" + " and/or computation of the fraction of homozygous alternate variants on" + " chromosome X", + ) + filtered_mt = hl.filter_intervals( + mt, var_keep_locus_intervals + x_locus_intervals + ) + if variants_filter_lcr or variants_filter_segdup or variants_filter_decoy: + logger.info( + "Filtering out variants in: %s", + ("segmental duplications, " if variants_filter_segdup else "") + + ("low confidence regions, " if variants_filter_lcr else "") + + (" decoy regions" if variants_filter_decoy else ""), + ) + filtered_mt = filter_low_conf_regions( + filtered_mt, + filter_lcr=variants_filter_lcr, + filter_decoy=variants_filter_decoy, + filter_segdup=variants_filter_segdup, + ) + if variants_snv_only: + logger.info("Filtering to SNVs") + filtered_mt = filtered_mt.filter_rows( + hl.is_snp(filtered_mt.alleles[0], filtered_mt.alleles[1]) + ) + + add_globals = add_globals.annotate( + variants_filter_lcr=variants_filter_lcr, + variants_segdup=variants_filter_segdup, + variants_filter_decoy=variants_filter_decoy, + variants_snv_only=variants_snv_only, + ) + + if var_keep_contigs: + logger.info( + "Imputing sex chromosome ploidy using only variant depth information on the" + " following contigs: %s", + var_keep_contigs, + ) + var_filtered_mt = 
hl.filter_intervals(filtered_mt, var_keep_locus_intervals) + if is_vds: + var_ploidy_ht = hl.vds.impute_sex_chromosome_ploidy( + hl.vds.VariantDataset(mtds.reference_data, var_filtered_mt), + calling_intervals=included_intervals, + normalization_contig=normalization_contig, + use_variant_dataset=True, + ) + var_ploidy_ht = var_ploidy_ht.rename( + { + "autosomal_mean_dp": f"var_data_{normalization_contig}_mean_dp", + "x_ploidy": "chrX_ploidy", + "y_ploidy": "chrY_ploidy", + "x_mean_dp": "chrX_mean_dp", + "y_mean_dp": "chrY_mean_dp", + } + ) + else: + var_ploidy_ht = impute_sex_ploidy( + var_filtered_mt, + excluded_intervals, + included_intervals, + normalization_contig, + use_only_variants=True, + ) + var_ploidy_ht = var_ploidy_ht.rename( + { + f"{normalization_contig}_mean_dp": ( + f"var_data_{normalization_contig}_mean_dp" + ) + } + ) + + if ref_keep_contigs: + ploidy_ht = var_ploidy_ht.annotate(**ploidy_ht[var_ploidy_ht.key]) + else: + ploidy_ht = var_ploidy_ht + + ploidy_ht = ploidy_ht.annotate_globals( + normalization_contig=normalization_contig, + variants_only_x_ploidy=variants_only_x_ploidy, + variants_only_y_ploidy=variants_only_y_ploidy, + **add_globals, + ) + + if compute_x_frac_variants_hom_alt: + logger.info( + "Computing fraction of variants that are homozygous alternate on" + " chromosome X" + ) + filtered_mt = hl.filter_intervals(filtered_mt, x_locus_intervals) + filtered_mt = filtered_mt.filter_rows( + hl.is_defined(included_intervals[filtered_mt.locus]) + ) + filtered_mt = filtered_mt.annotate_entries( + adj=get_adj_expr( + filtered_mt.LGT, filtered_mt.GQ, filtered_mt.DP, filtered_mt.LAD + ) + ) + frac_hom_alt_ht = filtered_mt.select_cols( + chrx_frac_hom_alt=hl.agg.count_where(filtered_mt.LGT.is_hom_var()) + / hl.agg.count_where(hl.is_defined(filtered_mt.LGT)), + chrx_frac_hom_alt_adj=hl.agg.filter( + filtered_mt.adj, + hl.agg.count_where(filtered_mt.LGT.is_hom_var()) + / hl.agg.count_where(hl.is_defined(filtered_mt.LGT)), + ), + ).cols() + ploidy_ht = ploidy_ht.annotate(**frac_hom_alt_ht[ploidy_ht.key]) + + if compute_fstat: + logger.info("Filtering mt to biallelic SNPs in X contigs: %s", x_contigs) + if "was_split" in list(mt.row): + mt = mt.filter_rows( + (~mt.was_split) & hl.is_snp(mt.alleles[0], mt.alleles[1]) + ) + else: + mt = mt.filter_rows( + (hl.len(mt.alleles) == 2) & hl.is_snp(mt.alleles[0], mt.alleles[1]) + ) + + mt = hl.filter_intervals(mt, x_locus_intervals) + if sites_ht is not None: + if aaf_expr is None: + logger.warning( + "sites_ht was provided, but aaf_expr is missing. Assuming name of" + " field with alternate allele frequency is 'AF'." + ) + aaf_expr = "AF" + logger.info("Filtering to provided sites") + mt = mt.annotate_rows(**sites_ht[mt.row_key]) + mt = mt.filter_rows(hl.is_defined(mt[aaf_expr])) + + logger.info("Calculating inbreeding coefficient on chrX") + sex_ht = hl.impute_sex( + mt[gt_expr], + aaf_threshold=aaf_threshold, + male_threshold=f_stat_cutoff, + female_threshold=f_stat_cutoff, + aaf=aaf_expr, + ) + + logger.info("Annotating sex chromosome ploidy HT with impute_sex HT") + ploidy_ht = ploidy_ht.annotate(**sex_ht[ploidy_ht.key]) + ploidy_ht = ploidy_ht.annotate_globals(f_stat_cutoff=f_stat_cutoff) + + if infer_karyotype: + karyotype_ht = infer_sex_karyotype( + ploidy_ht, f_stat_cutoff, use_gaussian_mixture_model + ) + ploidy_ht = ploidy_ht.annotate(**karyotype_ht[ploidy_ht.key]) + ploidy_ht = ploidy_ht.annotate_globals(**karyotype_ht.index_globals()) + + return ploidy_ht
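A hedged sketch for a VDS input (paths are invented; `included_intervals` is required for VDS input, and the Gaussian mixture model is used here in place of f-stat):

vds = hl.vds.read_vds("gs://my-bucket/dataset.vds")
calling_intervals = hl.read_table("gs://my-bucket/calling_intervals.ht")

sex_ht = annotate_sex(
    vds,
    included_intervals=calling_intervals,
    normalization_contig="chr20",
    variants_only_x_ploidy=True,
    compute_fstat=False,
    use_gaussian_mixture_model=True,
)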
+
+ +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/_modules/gnomad/sample_qc/platform.html b/_modules/gnomad/sample_qc/platform.html new file mode 100644 index 000000000..1ae4a5178 --- /dev/null +++ b/_modules/gnomad/sample_qc/platform.html @@ -0,0 +1,265 @@ + + + + + + gnomad.sample_qc.platform — gnomad master documentation + + + + + + + + + + + + + + + + +

Source code for gnomad.sample_qc.platform

+# noqa: D100
+
+import logging
+from typing import List, Optional, Tuple
+
+import hail as hl
+import numpy as np
+
+from gnomad.utils.annotations import bi_allelic_expr
+from gnomad.utils.filtering import filter_to_autosomes
+
+logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s")
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+
+
[docs]def compute_callrate_mt(
+    mt: hl.MatrixTable,
+    intervals_ht: hl.Table,
+    bi_allelic_only: bool = True,
+    autosomes_only: bool = True,
+    match: bool = True,
+) -> hl.MatrixTable:
+    """
+    Compute a sample/interval MT with each entry containing the call rate for that sample/interval.
+
+    This can be used as input for imputing exome sequencing platforms.
+
+    .. note::
+
+        The input interval HT should have a key of type Interval.
+        The resulting table will have a key of the same type as the `intervals_ht` table and
+        contain an `interval_info` field containing all non-key fields of the `intervals_ht`.
+
+    :param mt: Input MT
+    :param intervals_ht: Table containing the intervals. This table has to be keyed by an interval.
+    :param bi_allelic_only: If set, only bi-allelic sites are used for the computation
+    :param autosomes_only: If set, only autosomal intervals are used.
+    :param match: If set, returns all intervals in intervals_ht that overlap the locus in the input MT.
+    :return: Callrate MT
+    """
+    logger.info("Computing call rate MatrixTable")
+
+    if len(intervals_ht.key) != 1 or not isinstance(
+        intervals_ht.key[0], hl.expr.IntervalExpression
+    ):
+        logger.warning(
+            "Call rate matrix computation expects `intervals_ht` with a key of type"
+            " Interval. Found: %s",
+            intervals_ht.key,
+        )
+
+    # Guard against `callrate_mt` being unbound when `autosomes_only` is False.
+    callrate_mt = filter_to_autosomes(mt) if autosomes_only else mt
+
+    if bi_allelic_only:
+        callrate_mt = callrate_mt.filter_rows(bi_allelic_expr(callrate_mt))
+
+    intervals_ht = intervals_ht.annotate(_interval_key=intervals_ht.key)
+    callrate_mt = callrate_mt.annotate_rows(
+        _interval_key=intervals_ht.index(
+            callrate_mt.locus, all_matches=match
+        )._interval_key
+    )
+
+    if match:
+        callrate_mt = callrate_mt.explode_rows("_interval_key")
+
+    callrate_mt = callrate_mt.filter_rows(
+        hl.is_defined(callrate_mt._interval_key.interval)
+    )
+    callrate_mt = callrate_mt.select_entries(
+        GT=hl.or_missing(hl.is_defined(callrate_mt.GT), hl.struct())
+    )
+    callrate_mt = callrate_mt.group_rows_by(**callrate_mt._interval_key).aggregate(
+        callrate=hl.agg.fraction(hl.is_defined(callrate_mt.GT))
+    )
+    intervals_ht = intervals_ht.drop("_interval_key")
+    callrate_mt = callrate_mt.annotate_rows(
+        interval_info=hl.struct(**intervals_ht[callrate_mt.row_key])
+    )
+    return callrate_mt
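A minimal usage sketch (not part of the library source); the bucket paths and the interval list are hypothetical placeholders:

# Hypothetical inputs: a dense MT with a GT entry field and a capture-target
# interval list; both paths are placeholders.
import hail as hl

from gnomad.sample_qc.platform import compute_callrate_mt

mt = hl.read_matrix_table("gs://my-bucket/dense.mt")
intervals_ht = hl.import_locus_intervals(
    "gs://my-bucket/capture_targets.interval_list", reference_genome="GRCh38"
)
callrate_mt = compute_callrate_mt(mt, intervals_ht, bi_allelic_only=True)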
+ + +
[docs]def run_platform_pca(
+    callrate_mt: hl.MatrixTable,
+    binarization_threshold: Optional[float] = 0.25,
+    n_pcs: int = 10,
+) -> Tuple[List[float], hl.Table, hl.Table]:
+    """
+    Run PCA on a sample/interval MT with each entry containing the call rate.
+
+    When `binarization_threshold` is set, the callrate is transformed to a 0/1 value based on the threshold.
+    E.g. with the default threshold of 0.25, all entries with a callrate < 0.25 are considered as 0s, others as 1s.
+
+    :param callrate_mt: Input callrate MT
+    :param binarization_threshold: Threshold used to binarize the call rate. If `None`, no binarization is applied.
+    :param n_pcs: Number of PCs to compute
+    :return: eigenvalues, scores_ht, loadings_ht
+    """
+    logger.info("Running platform PCA")
+
+    if binarization_threshold is not None:
+        callrate_mt = callrate_mt.annotate_entries(
+            callrate=hl.int(callrate_mt.callrate > binarization_threshold)
+        )
+    # Center until Hail's PCA does it for you
+    callrate_mt = callrate_mt.annotate_rows(
+        mean_callrate=hl.agg.mean(callrate_mt.callrate)
+    )
+    callrate_mt = callrate_mt.annotate_entries(
+        callrate=callrate_mt.callrate - callrate_mt.mean_callrate
+    )
+    eigenvalues, scores, loadings = hl.pca(
+        callrate_mt.callrate,
+        compute_loadings=True,
+        k=n_pcs,
+    )  # TODO: Evaluate whether computing loadings is a good / worthy thing
+    logger.info("Platform PCA eigenvalues: %s", eigenvalues)
+
+    return eigenvalues, scores, loadings
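Continuing the sketch above, PCA on the binarized, centered call-rate matrix; the path is again a placeholder:

# Run PCA on a call-rate MT such as the one produced by compute_callrate_mt.
import hail as hl

from gnomad.sample_qc.platform import run_platform_pca

callrate_mt = hl.read_matrix_table("gs://my-bucket/callrate.mt")  # placeholder
eigenvalues, scores_ht, loadings_ht = run_platform_pca(
    callrate_mt, binarization_threshold=0.25, n_pcs=10
)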
+ + +
[docs]def assign_platform_from_pcs(
+    platform_pca_scores_ht: hl.Table,
+    pc_scores_ann: str = "scores",
+    hdbscan_min_cluster_size: Optional[int] = None,
+    hdbscan_min_samples: Optional[int] = None,
+) -> hl.Table:
+    """
+    Assign platforms using HDBSCAN on the results of call rate PCA.
+
+    :param platform_pca_scores_ht: Input table with the PCA score for each sample
+    :param pc_scores_ann: Field containing the scores
+    :param hdbscan_min_cluster_size: HDBSCAN `min_cluster_size` parameter. If not specified the smallest of 500 and 0.1*n_samples will be used.
+    :param hdbscan_min_samples: HDBSCAN `min_samples` parameter
+    :return: A Table with a `qc_platform` annotation containing the platform based on HDBSCAN clustering
+    """
+    import hdbscan
+
+    logger.info("Assigning platforms based on platform PCA clustering")
+
+    # Read and format data for clustering
+    data = platform_pca_scores_ht.to_pandas()
+    callrate_data = np.matrix(data[pc_scores_ann].tolist())
+    logger.info("Assigning platforms to %d samples.", len(callrate_data))
+
+    # Cluster data. `min_cluster_size` must be an int, so truncate the 10% heuristic.
+    if hdbscan_min_cluster_size is None:
+        hdbscan_min_cluster_size = int(min(500, 0.1 * data.shape[0]))
+    clusterer = hdbscan.HDBSCAN(
+        min_cluster_size=hdbscan_min_cluster_size, min_samples=hdbscan_min_samples
+    )
+    cluster_labels = clusterer.fit_predict(callrate_data)
+    n_clusters = len(set(cluster_labels)) - (
+        -1 in cluster_labels
+    )  # NOTE: -1 is the label for noisy (un-classifiable) data points
+    logger.info("Found %d unique platforms during platform imputation.", n_clusters)
+
+    data["qc_platform"] = cluster_labels
+    ht = hl.Table.from_pandas(data, key=[*platform_pca_scores_ht.key])
+    ht = ht.annotate(qc_platform="platform_" + hl.str(ht.qc_platform))
+    return ht
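A short sketch clustering the PCA scores from the previous step; the `min_cluster_size` value is an arbitrary illustrative choice, not a recommended default:

# Cluster the platform PCA scores into platform labels.
from gnomad.sample_qc.platform import assign_platform_from_pcs

platform_ht = assign_platform_from_pcs(
    scores_ht, pc_scores_ann="scores", hdbscan_min_cluster_size=100
)
platform_ht.show()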
\ No newline at end of file
diff --git a/_modules/gnomad/sample_qc/relatedness.html b/_modules/gnomad/sample_qc/relatedness.html
new file mode 100644
index 000000000..851ca9013
--- /dev/null
+++ b/_modules/gnomad/sample_qc/relatedness.html
@@ -0,0 +1,1391 @@
+gnomad.sample_qc.relatedness — gnomad master documentation

Source code for gnomad.sample_qc.relatedness

+# noqa: D100
+
+import logging
+import random
+from collections import defaultdict
+from typing import Dict, Iterable, List, Optional, Set, Tuple, Union
+
+import hail as hl
+import networkx as nx
+
+from gnomad.utils.annotations import annotate_adj
+
+logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s")
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+UNRELATED = "unrelated"
+"""
+String representation for a pair of unrelated individuals in this module.
+Typically >2nd degree relatives, but the threshold is user-dependent.
+"""
+
+SECOND_DEGREE_RELATIVES = "second degree relatives"
+"""
+String representation for a pair of 2nd degree relatives in this module.
+"""
+
+PARENT_CHILD = "parent-child"
+"""
+String representation for a parent-child pair in this module.
+"""
+
+SIBLINGS = "siblings"
+"""
+String representation for a sibling pair in this module.
+"""
+
+DUPLICATE_OR_TWINS = "duplicate/twins"
+"""
+String representation for a pair of samples who are identical (either MZ twins or duplicates) in this module.
+"""
+
+AMBIGUOUS_RELATIONSHIP = "ambiguous"
+"""
+String representation for a pair of samples whose relationship is ambiguous.
+This is used in the case of a pair of samples whose kinship/IBD values do not correspond to any biological relationship between two individuals.
+"""
+
+
+
[docs]def get_duplicated_samples(
+    relationship_ht: hl.Table,
+    i_col: str = "i",
+    j_col: str = "j",
+    rel_col: str = "relationship",
+) -> List[Set[str]]:
+    """
+    Extract the list of duplicate samples using a Table output from pc_relate.
+
+    :param relationship_ht: Table with relationships between pairs of samples
+    :param i_col: Column containing the 1st sample
+    :param j_col: Column containing the 2nd sample
+    :param rel_col: Column containing the sample pair relationship annotated with get_relationship_expr
+    :return: List of sets of samples that are duplicates
+    """
+
+    def get_all_dups(
+        s: str, dups: Set[str], samples_duplicates: Dict[str, Set[str]]
+    ) -> Tuple[Set[str], Dict[str, Set[str]]]:
+        """
+        Create the set of all duplicated samples corresponding to `s` that are found in `samples_duplicates`.
+
+        Also return the remaining sample duplicates after removing all duplicated samples corresponding to `s`.
+
+        Works by recursively adding duplicated samples to the set.
+
+        :param s: sample to identify duplicates for
+        :param dups: set of corresponding samples already identified
+        :param samples_duplicates: dict of sample -> duplicate-pair left to assign
+        :return: (set of duplicates corresponding to s found in samples_duplicates, remaining samples_duplicates)
+        """
+        if s in samples_duplicates:
+            dups.add(s)
+            s_dups = samples_duplicates.pop(s)
+            for s_dup in s_dups:
+                if s_dup not in dups:
+                    dups, samples_duplicates = get_all_dups(
+                        s_dup, dups, samples_duplicates
+                    )
+        return dups, samples_duplicates
+
+    logger.info("Computing duplicate sets")
+    dup_pairs = relationship_ht.aggregate(
+        hl.agg.filter(
+            relationship_ht[rel_col] == DUPLICATE_OR_TWINS,
+            hl.agg.collect(hl.tuple([relationship_ht[i_col], relationship_ht[j_col]])),
+        )
+    )
+
+    samples_duplicates = defaultdict(set)
+    for i, j in dup_pairs:
+        samples_duplicates[i].add(j)
+        samples_duplicates[j].add(i)
+
+    duplicated_samples = []
+    while len(samples_duplicates) > 0:
+        dup_set, samples_duplicates = get_all_dups(
+            list(samples_duplicates)[0], set(), samples_duplicates
+        )
+        duplicated_samples.append(dup_set)
+
+    return duplicated_samples
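A toy example (hypothetical sample IDs): s1/s2 and s2/s3 are flagged as duplicates, so the transitive closure yields a single set:

import hail as hl

from gnomad.sample_qc.relatedness import DUPLICATE_OR_TWINS, get_duplicated_samples

# Minimal relationship table with string sample IDs.
rel_ht = hl.Table.parallelize(
    [
        {"i": "s1", "j": "s2", "relationship": DUPLICATE_OR_TWINS},
        {"i": "s2", "j": "s3", "relationship": DUPLICATE_OR_TWINS},
    ],
    hl.tstruct(i=hl.tstr, j=hl.tstr, relationship=hl.tstr),
)
dup_sets = get_duplicated_samples(rel_ht)  # [{"s1", "s2", "s3"}]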
+ + +
[docs]def get_duplicated_samples_ht( + duplicated_samples: List[Set[str]], + samples_rankings_ht: hl.Table, + rank_ann: str = "rank", +): + """ + Create a HT with duplicated samples sets. + + Each row is indexed by the sample that is kept and also contains the set of duplicate samples that should be filtered. + + `samples_rankings_ht` is a HT containing a global rank for each of the samples (smaller is better). + + :param duplicated_samples: List of sets of duplicated samples + :param samples_rankings_ht: HT with global rank for each sample + :param rank_ann: Annotation in `samples_ranking_ht` containing each sample global rank (smaller is better). + :return: HT with duplicate sample sets, including which to keep/filter + """ + dups_ht = hl.Table.parallelize( + [ + hl.struct(dup_set=i, dups=duplicated_samples[i]) + for i in range(0, len(duplicated_samples)) + ] + ) + dups_ht = dups_ht.explode(dups_ht.dups, name="_dup") + dups_ht = dups_ht.key_by("_dup") + dups_ht = dups_ht.annotate(rank=samples_rankings_ht[dups_ht.key][rank_ann]) + dups_cols = hl.bind( + lambda x: hl.struct(kept=x[0], filtered=x[1:]), + hl.sorted( + hl.agg.collect(hl.tuple([dups_ht._dup, dups_ht.rank])), key=lambda x: x[1] + ).map(lambda x: x[0]), + ) + dups_ht = dups_ht.group_by(dups_ht.dup_set).aggregate(**dups_cols) + + if isinstance(dups_ht.kept, hl.expr.StructExpression): + dups_ht = dups_ht.key_by(**dups_ht.kept).drop("kept") + else: + dups_ht = dups_ht.key_by( + s=dups_ht.kept + ) # Since there is no defined name in the case of a non-struct type, use `s` + return dups_ht
+ + +
[docs]def explode_duplicate_samples_ht(dups_ht: hl.Table) -> hl.Table:
+    """
+    Explode the result of `get_duplicated_samples_ht`, so that each line contains a single sample.
+
+    An additional annotation `dup_filtered` is added, indicating whether the sample was among the filtered duplicates (False for the kept sample).
+    Requires a field `filtered` whose type should be the same as the input duplicated samples Table key.
+
+    :param dups_ht: Input HT
+    :return: Flattened HT
+    """
+
+    def get_dups_to_keep_expr():
+        if dups_ht.filtered.dtype.element_type == dups_ht.key.dtype:
+            return (dups_ht.key, False)
+        elif (len(dups_ht.key) == 1) & (
+            dups_ht.filtered.dtype.element_type == dups_ht.key[0].dtype
+        ):
+            return (dups_ht.key[0], False)
+        else:
+            raise TypeError(
+                "Cannot explode table as types of the filtered field"
+                f" ({dups_ht.filtered.dtype}) and the key ({dups_ht.key.dtype}) are"
+                " incompatible."
+            )
+
+    dups_ht = dups_ht.annotate(
+        dups=hl.array([get_dups_to_keep_expr()]).extend(
+            dups_ht.filtered.map(lambda x: (x, True))
+        )
+    )
+    dups_ht = dups_ht.explode("dups")
+    dups_ht = dups_ht.key_by()
+    return dups_ht.select(s=dups_ht.dups[0], dup_filtered=dups_ht.dups[1]).key_by("s")
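A sketch chaining the two functions above: pick one sample to keep per duplicate set by rank, then flatten to one row per sample. The rank values are illustrative:

import hail as hl

from gnomad.sample_qc.relatedness import (
    explode_duplicate_samples_ht,
    get_duplicated_samples_ht,
)

# Hypothetical global ranking (smaller is better).
rank_ht = hl.Table.parallelize(
    [{"s": "s1", "rank": 0}, {"s": "s2", "rank": 1}, {"s": "s3", "rank": 2}],
    hl.tstruct(s=hl.tstr, rank=hl.tint32),
    key="s",
)
dups_ht = get_duplicated_samples_ht([{"s1", "s2", "s3"}], rank_ht)
flat_ht = explode_duplicate_samples_ht(dups_ht)  # adds `dup_filtered`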
+ + +
[docs]def get_relationship_expr(  # TODO: The threshold detection could be easily automated by fitting distributions over the data.
+    kin_expr: hl.expr.NumericExpression,
+    ibd0_expr: hl.expr.NumericExpression,
+    ibd1_expr: hl.expr.NumericExpression,
+    ibd2_expr: hl.expr.NumericExpression,
+    first_degree_kin_thresholds: Tuple[float, float] = (0.19, 0.4),
+    second_degree_min_kin: float = 0.1,
+    ibd0_0_max: float = 0.025,
+    ibd0_25_thresholds: Tuple[float, float] = (0.1, 0.425),
+    # ibd0_50_thresholds = [0.37, 0.625], Not useful for relationship inference
+    # ibd0_100_threshold = 0.625 , Not useful for relationship inference
+    ibd1_0_thresholds: Tuple[float, float] = (-0.15, 0.1),
+    # ibd1_25_thresholds: Tuple[float, float] = (0.1, 0.37), Not useful for
+    # relationship inference
+    ibd1_50_thresholds: Tuple[float, float] = (0.275, 0.75),
+    ibd1_100_min: float = 0.75,
+    ibd2_0_max: float = 0.125,
+    ibd2_25_thresholds: Tuple[float, float] = (0.1, 0.5),
+    ibd2_100_thresholds: Tuple[float, float] = (0.75, 1.25),
+) -> hl.expr.StringExpression:
+    """
+    Return an expression indicating the relationship between a pair of samples given their kin coefficient and IBD0, IBD1, IBD2 values.
+
+    The kinship coefficient values in the defaults are in line with those output from
+    `hail.methods.pc_relate <https://hail.is/docs/0.2/methods/genetics.html?highlight=pc_relate#hail.methods.pc_relate>`.
+
+    :param kin_expr: Kin coefficient expression
+    :param ibd0_expr: IBD0 expression
+    :param ibd1_expr: IBD1 expression
+    :param ibd2_expr: IBD2 expression
+    :param first_degree_kin_thresholds: (min, max) kinship threshold for 1st degree relatives
+    :param second_degree_min_kin: min kinship threshold for 2nd degree relatives
+    :param ibd0_0_max: max IBD0 threshold for 0 IBD0 sharing
+    :param ibd0_25_thresholds: (min, max) thresholds for 0.25 IBD0 sharing
+    :param ibd1_0_thresholds: (min, max) thresholds for 0 IBD1 sharing. Note that the min is there because pc_relate can output large negative values in some corner cases.
+    :param ibd1_50_thresholds: (min, max) thresholds for 0.5 IBD1 sharing
+    :param ibd1_100_min: min IBD1 threshold for 1.0 IBD1 sharing
+    :param ibd2_0_max: max IBD2 threshold for 0 IBD2 sharing
+    :param ibd2_25_thresholds: (min, max) thresholds for 0.25 IBD2 sharing
+    :param ibd2_100_thresholds: (min, max) thresholds for 1.00 IBD2 sharing. Note that the min is there because pc_relate can output much larger IBD2 values in some corner cases.
+    :return: The relationship annotation using the constants defined in this module.
+    """
+    return (
+        hl.case()
+        .when(kin_expr < second_degree_min_kin, UNRELATED)
+        .when((kin_expr < first_degree_kin_thresholds[0]), SECOND_DEGREE_RELATIVES)
+        .when(
+            (kin_expr < first_degree_kin_thresholds[1])
+            & (ibd0_expr <= ibd0_0_max)
+            & (ibd1_expr >= ibd1_100_min)
+            & (ibd2_expr <= ibd2_0_max),
+            PARENT_CHILD,
+        )
+        .when(
+            (kin_expr < first_degree_kin_thresholds[1])
+            & (ibd0_expr >= ibd0_25_thresholds[0])
+            & (ibd0_expr <= ibd0_25_thresholds[1])
+            & (ibd1_expr >= ibd1_50_thresholds[0])
+            & (ibd1_expr <= ibd1_50_thresholds[1])
+            & (ibd2_expr >= ibd2_25_thresholds[0])
+            & (ibd2_expr <= ibd2_25_thresholds[1]),
+            SIBLINGS,
+        )
+        .when(
+            (kin_expr > first_degree_kin_thresholds[1])
+            & (ibd0_expr < ibd0_0_max)
+            & (ibd1_expr >= ibd1_0_thresholds[0])
+            & (ibd1_expr <= ibd1_0_thresholds[1])
+            & (ibd2_expr >= ibd2_100_thresholds[0])
+            & (ibd2_expr <= ibd2_100_thresholds[1]),
+            DUPLICATE_OR_TWINS,
+        )
+        .default(AMBIGUOUS_RELATIONSHIP)
+    )
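A usage sketch annotating a pc_relate output table (which has kin/ibd0/ibd1/ibd2 fields); the table path is a placeholder:

import hail as hl

from gnomad.sample_qc.relatedness import get_relationship_expr

relatedness_ht = hl.read_table("gs://my-bucket/pc_relate.ht")  # placeholder
relatedness_ht = relatedness_ht.annotate(
    relationship=get_relationship_expr(
        kin_expr=relatedness_ht.kin,
        ibd0_expr=relatedness_ht.ibd0,
        ibd1_expr=relatedness_ht.ibd1,
        ibd2_expr=relatedness_ht.ibd2,
    )
)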
+ + +
[docs]def get_slope_int_relationship_expr( + kin_expr: hl.expr.NumericExpression, + y_expr: hl.expr.NumericExpression, + parent_child_max_y: float, + second_degree_sibling_lower_cutoff_slope: float, + second_degree_sibling_lower_cutoff_intercept: float, + second_degree_upper_sibling_lower_cutoff_slope: float, + second_degree_upper_sibling_lower_cutoff_intercept: float, + duplicate_twin_min_kin: float = 0.42, + second_degree_min_kin: float = 0.1, + duplicate_twin_ibd1_min: float = -0.15, + duplicate_twin_ibd1_max: float = 0.1, + ibd1_expr: Optional[hl.expr.NumericExpression] = None, +): + """ + Return an expression indicating the relationship between a pair of samples given slope and intercept cutoffs. + + The kinship coefficient (`kin_expr`) and an additional metric (`y_expr`) are used + to define the relationship between a pair of samples. For this function the + slope and intercepts should refer to cutoff lines where the x-axis, or independent + variable is the kinship coefficient and the y-axis, or dependent variable, is + the metric defined by `y_expr`. Typically, the y-axis metric IBS0, IBS0/IBS2, or + IBD0. + + .. note:: + + No defaults are provided for the slope and intercept cutoffs because they are + highly dependent on the dataset and the metric used in `y_expr`. + + The relationship expression is determined as follows: + - If `kin_expr` < `second_degree_min_kin` -> UNRELATED + - If `kin_expr` > `duplicate_twin_min_kin`: + - If `y_expr` < `parent_child_max_y`: + - If `ibd1_expr` is defined: + - If `duplicate_twin_ibd1_min` <= `ibd1_expr` <= ` + duplicate_twin_ibd1_max` -> DUPLICATE_OR_TWINS + - Else -> AMBIGUOUS_RELATIONSHIP + - Else -> DUPLICATE_OR_TWINS + - If `y_expr` < `parent_child_max_y` -> PARENT_CHILD + - If pair is over second_degree_sibling_lower_cutoff line: + - If pair is over second_degree_upper_sibling_lower_cutoff line -> SIBLINGS + - Else -> SECOND_DEGREE_RELATIVES + - If none of the above conditions are met -> AMBIGUOUS_RELATIONSHIP + + :param kin_expr: Kin coefficient expression. Used as the x-axis, or independent + variable, for the slope and intercept cutoffs. + :param y_expr: Expression for the metric to use as the y-axis, or dependent + variable, for the slope and intercept cutoffs. This is typically an expression + for IBS0, IBS0/IBS2, or IBD0. + :param parent_child_max_y: Maximum value of the metric defined by `y_expr` for a + parent-child pair. + :param second_degree_sibling_lower_cutoff_slope: Slope of the line to use as a + lower cutoff for second degree relatives and siblings from parent-child pairs. + :param second_degree_sibling_lower_cutoff_intercept: Intercept of the line to use + as a lower cutoff for second degree relatives and siblings from parent-child + pairs. + :param second_degree_upper_sibling_lower_cutoff_slope: Slope of the line to use as + an upper cutoff for second degree relatives and a lower cutoff for siblings. + :param second_degree_upper_sibling_lower_cutoff_intercept: Intercept of the line to + use as an upper cutoff for second degree relatives and a lower cutoff for + siblings. + :param duplicate_twin_min_kin: Minimum kinship for duplicate or twin pairs. + Default is 0.42. + :param second_degree_min_kin: Minimum kinship threshold for 2nd degree relatives. + Default is 0.08838835. Bycroft et al. (2018) calculates a theoretical kinship + of 0.08838835 for a second degree relationship cutoff, but this cutoff should be + determined by evaluation of the kinship distribution. + :param ibd1_expr: Optional IBD1 expression. 
If this expression is provided, + `duplicate_twin_ibd1_min` and `duplicate_twin_ibd1_max` will be used as an + additional cutoff for duplicate or twin pairs. + :param duplicate_twin_ibd1_min: Minimum IBD1 cutoff for duplicate or twin pairs. + Note: the min is because pc_relate can output large negative values in some + corner cases. + :param duplicate_twin_ibd1_max: Maximum IBD1 cutoff for duplicate or twin pairs. + :return: The relationship annotation using the constants defined in this module. + """ + # Only use a duplicate/twin IBD1 cutoff if an ibd1_expr is supplied. + if ibd1_expr is not None: + dup_twin_ibd1_expr = (ibd1_expr >= duplicate_twin_ibd1_min) & ( + ibd1_expr <= duplicate_twin_ibd1_max + ) + else: + dup_twin_ibd1_expr = True + + return ( + hl.case() + .when(kin_expr < second_degree_min_kin, UNRELATED) + .when( + kin_expr > duplicate_twin_min_kin, + hl.if_else( + dup_twin_ibd1_expr & (y_expr < parent_child_max_y), + DUPLICATE_OR_TWINS, + AMBIGUOUS_RELATIONSHIP, + ), + ) + .when(y_expr < parent_child_max_y, PARENT_CHILD) + .when( + y_expr + > second_degree_sibling_lower_cutoff_slope * kin_expr + + second_degree_sibling_lower_cutoff_intercept, + hl.if_else( + y_expr + > second_degree_upper_sibling_lower_cutoff_slope * kin_expr + + second_degree_upper_sibling_lower_cutoff_intercept, + SIBLINGS, + SECOND_DEGREE_RELATIVES, + ), + ) + .default(AMBIGUOUS_RELATIONSHIP) + )
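A sketch using IBD0 as the y-axis metric. Every slope and intercept below is a made-up placeholder; as the note above stresses, real cutoffs must come from inspecting the kinship vs. IBD0 distribution of the dataset:

import hail as hl

from gnomad.sample_qc.relatedness import get_slope_int_relationship_expr

relatedness_ht = hl.read_table("gs://my-bucket/pc_relate.ht")  # placeholder
relatedness_ht = relatedness_ht.annotate(
    relationship=get_slope_int_relationship_expr(
        kin_expr=relatedness_ht.kin,
        y_expr=relatedness_ht.ibd0,
        # Placeholder cutoffs for illustration only.
        parent_child_max_y=0.1,
        second_degree_sibling_lower_cutoff_slope=-2.0,
        second_degree_sibling_lower_cutoff_intercept=0.7,
        second_degree_upper_sibling_lower_cutoff_slope=-1.6,
        second_degree_upper_sibling_lower_cutoff_intercept=0.6,
        ibd1_expr=relatedness_ht.ibd1,
    )
)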
+ + +
[docs]def infer_families( + relationship_ht: hl.Table, + sex: Union[hl.Table, Dict[str, bool]], + duplicate_samples_ht: hl.Table, + i_col: str = "i", + j_col: str = "j", + relationship_col: str = "relationship", +) -> hl.Pedigree: + """ + Generate a pedigree containing trios inferred from the `relationship_ht`. + + This function takes a hail Table with a row for each pair of related individuals i, j in the data (it's OK to have + unrelated samples too). + + The `relationship_col` should be a column specifying the relationship between each two samples as defined in this + module's constants. + + This function returns a pedigree containing trios inferred from the data. Family ID can be the same for multiple + trios if one or more members of the trios are related (e.g. sibs, multi-generational family). Trios are ordered by family ID. + + .. note:: + + This function only returns complete trios defined as: one child, one father and one mother (sex is required for both parents). + + :param relationship_ht: Input relationship table + :param sex: A Table or dict giving the sex for each sample (`TRUE`=female, `FALSE`=male). If a Table is given, it should have a field `is_female`. + :param duplicated_samples: All duplicated samples TO REMOVE (If not provided, this function won't work as it assumes that each child has exactly two parents) + :param i_col: Column containing the 1st sample of the pair in the relationship table + :param j_col: Column containing the 2nd sample of the pair in the relationship table + :param relationship_col: Column contatining the relationship for the sample pair as defined in this module constants. + :return: Pedigree of complete trios + """ + + def group_parent_child_pairs_by_fam( + parent_child_pairs: Iterable[Tuple[str, str]], + ) -> List[List[Tuple[str, str]]]: + """ + Group parent-child pairs into a list of families. + + A family here is defined as a list of sample-pairs which all share at least one sample with at least one other + sample-pair in the list. 
+ + :param parent_child_pairs: All the parent-children pairs + :return: A list of families, where each element of the list is a list of the parent-children pairs + """ + fam_id = 1 # stores the current family id + s_fam = dict() # stores the family id for each sample + fams = defaultdict(list) # stores fam_id -> sample-pairs + for pair in parent_child_pairs: + if pair[0] in s_fam: + if pair[1] in s_fam: + if ( + s_fam[pair[0]] != s_fam[pair[1]] + ): # If both samples are in different families, merge the families + new_fam_id = s_fam[pair[0]] + fam_id_to_merge = s_fam[pair[1]] + for s in s_fam: + if s_fam[s] == fam_id_to_merge: + s_fam[s] = new_fam_id + fams[new_fam_id].extend(fams.pop(fam_id_to_merge)) + else: # If only the 1st sample in the pair is already in a family, assign the 2nd sample in the pair to the same family + s_fam[pair[1]] = s_fam[pair[0]] + fams[s_fam[pair[0]]].append(pair) + elif ( + pair[1] in s_fam + ): # If only the 2nd sample in the pair is already in a family, assign the 1st sample in the pair to the same family + s_fam[pair[0]] = s_fam[pair[1]] + fams[s_fam[pair[1]]].append(pair) + else: # If none of the samples in the pair is already in a family, create a new family + s_fam[pair[0]] = fam_id + s_fam[pair[1]] = fam_id + fams[fam_id].append(pair) + fam_id += 1 + + return list(fams.values()) + + def get_trios( + fam_id: str, + parent_child_pairs: List[Tuple[str, str]], + related_pairs: Dict[Tuple[str, str], str], + ) -> List[hl.Trio]: + """ + Generate trios based on the list of parent-child pairs in the family and all related pairs in the data. + + Only complete parent/offspring trios are included in the results. + + The trios are assembled as follows: + 1. All pairs of unrelated samples with different sexes within the family are extracted as possible parent pairs + 2. For each possible parent pair, a list of all children is constructed (each child in the list has a parent-offspring pair with each parent) + 3. If there are multiple children for a given parent pair, all children should be siblings with each other + 4. Check that each child was only assigned a single pair of parents. If a child is found to have multiple parent pairs, they are ALL discarded. + + :param fam_id: The family ID + :param parent_child_pairs: The parent-child pairs for this family + :param related_pairs: All related sample pairs in the data + :return: List of trios in the family + """ + + def get_possible_parents(samples: List[str]) -> List[Tuple[str, str]]: + """ + Return all pairs of unrelated samples with different sexes within the family are extracted as possible parent pairs. + + :param samples: All samples in the family + :return: Possible parent pairs + """ + possible_parents = [] + for i in range(len(samples)): + for j in range(i + 1, len(samples)): + if ( + related_pairs.get(tuple(sorted([samples[i], samples[j]]))) + is None + ): + if sex.get(samples[i]) is False and sex.get(samples[j]) is True: + possible_parents.append((samples[i], samples[j])) + elif ( + sex.get(samples[i]) is True and sex.get(samples[j]) is False + ): + possible_parents.append((samples[j], samples[i])) + return possible_parents + + def get_children(possible_parents: Tuple[str, str]) -> List[str]: + """ + Construct a list of all children for a given possible parent pair. + + Each child in the list has a parent-offspring pair with each parent. 
+ + :param possible_parents: A pair of possible parents + :return: The list of all children (if any) corresponding to the possible parents + """ + possible_offsprings = defaultdict( + set + ) # stores sample -> set of parents in the possible_parents where (sample, parent) is found in possible_child_pairs + for pair in parent_child_pairs: + if possible_parents[0] == pair[0]: + possible_offsprings[pair[1]].add(possible_parents[0]) + elif possible_parents[0] == pair[1]: + possible_offsprings[pair[0]].add(possible_parents[0]) + elif possible_parents[1] == pair[0]: + possible_offsprings[pair[1]].add(possible_parents[1]) + elif possible_parents[1] == pair[1]: + possible_offsprings[pair[0]].add(possible_parents[1]) + + return [ + s for s, parents in possible_offsprings.items() if len(parents) == 2 + ] + + def check_sibs(children: List[str]) -> bool: + """ + Confirm that all children of a parent pair are siblings with each other. + + If there are multiple children for a given parent pair, all children should be siblings with each other. + + :param children: List of all children for a given parent pair + :return: Whether all children in the list are siblings + """ + for i in range(len(children)): + for j in range(i + 1, len(children)): + if ( + related_pairs[tuple(sorted([children[i], children[j]]))] + != SIBLINGS + ): + return False + return True + + def discard_multi_parents_children(trios: List[hl.Trio]): + """ + Check that each child was only assigned a single pair of parents. + + If a child is found to have multiple parent pairs, they are ALL discarded. + + :param trios: All trios formed for this family + :return: The list of trios for which each child has a single parents pair. + """ + children_trios = defaultdict(list) + for trio in trios: + children_trios[trio.s].append(trio) + + for s, s_trios in children_trios.items(): + if len(s_trios) > 1: + logger.warning( + "Discarded duplicated child %s found multiple in trios: %s", + s, + ", ".join([str(trio) for trio in s_trios]), + ) + + return [trios[0] for trios in children_trios.values() if len(trios) == 1] + + # Get all possible pairs of parents in (father, mother) order + all_possible_parents = get_possible_parents( + list({s for pair in parent_child_pairs for s in pair}) + ) + + trios = [] + for possible_parents in all_possible_parents: + children = get_children(possible_parents) + if check_sibs(children): + trios.extend( + [ + hl.Trio( + s=s, + fam_id=fam_id, + pat_id=possible_parents[0], + mat_id=possible_parents[1], + is_female=sex.get(s), + ) + for s in children + ] + ) + else: + logger.warning( + "Discarded family with same parents, and multiple offspring that" + " weren't siblings:\nMother: %s\nFather:%s\nChildren:%s", + possible_parents[0], + possible_parents[1], + ", ".join(children), + ) + + return discard_multi_parents_children(trios) + + # Get all the relations we care about: + # => Remove unrelateds and duplicates + dups = duplicate_samples_ht.aggregate( + hl.agg.explode( + lambda dup: hl.agg.collect_as_set(dup), duplicate_samples_ht.filtered + ), + _localize=False, + ) + relationship_ht = relationship_ht.filter( + ~dups.contains(relationship_ht[i_col]) + & ~dups.contains(relationship_ht[j_col]) + & (relationship_ht[relationship_col] != UNRELATED) + ) + + # Check relatedness table format + if not relationship_ht[i_col].dtype == relationship_ht[j_col].dtype: + logger.error( + "i_col and j_col of the relatedness table need to be of the same type." 
+ ) + + # If i_col and j_col aren't str, then convert them + if not isinstance(relationship_ht[i_col], hl.expr.StringExpression): + logger.warning( + "Pedigrees can only be constructed from string IDs, but your relatedness_ht" + " ID column is of type: %s. Expression will be converted to string in" + " Pedigrees.", + relationship_ht[i_col].dtype, + ) + if isinstance(relationship_ht[i_col], hl.expr.StructExpression): + logger.warning( + "Struct fields %s will be joined by underscores to use as sample names" + " in Pedigree.", + list(relationship_ht[i_col]), + ) + relationship_ht = relationship_ht.key_by( + **{ + i_col: hl.delimit( + hl.array( + [ + hl.str(relationship_ht[i_col][x]) + for x in relationship_ht[i_col] + ] + ), + "_", + ), + j_col: hl.delimit( + hl.array( + [ + hl.str(relationship_ht[j_col][x]) + for x in relationship_ht[j_col] + ] + ), + "_", + ), + } + ) + else: + raise NotImplementedError( + "The `i_col` and `j_col` columns of the `relationship_ht` argument" + " passed to infer_families are not of type StringExpression or Struct." + ) + + # If sex is a Table, extract sex information as a Dict + if isinstance(sex, hl.Table): + sex = dict(hl.tuple([sex.s, sex.is_female]).collect()) + + # Collect all related sample pairs and + # create a dictionnary with pairs as keys and relationships as values + # Sample-pairs are tuples ordered by sample name + related_pairs = { + tuple(sorted([i, j])): rel + for i, j, rel in hl.tuple( + [relationship_ht.i, relationship_ht.j, relationship_ht.relationship] + ).collect() + } + + parent_child_pairs_by_fam = group_parent_child_pairs_by_fam( + [pair for pair, rel in related_pairs.items() if rel == PARENT_CHILD] + ) + return hl.Pedigree( + [ + trio + for fam_index, parent_child_pairs in enumerate(parent_child_pairs_by_fam) + for trio in get_trios(str(fam_index), parent_child_pairs, related_pairs) + ] + )
+ + +
[docs]def create_fake_pedigree( + n: int, + sample_list: List[str], + exclude_real_probands: bool = False, + max_tries: int = 10, + real_pedigree: Optional[hl.Pedigree] = None, + sample_list_stratification: Optional[Dict[str, str]] = None, +) -> hl.Pedigree: + """ + Generate a pedigree made of trios created by sampling 3 random samples in the sample list. + + - If `real_pedigree` is given, then children in the resulting fake trios will not + include any trio with proband - parents that are in the real ones. + - Each sample can be used only once as a proband in the resulting trios. + - Sex of probands in fake trios is random. + + :param n: Number of fake trios desired in the pedigree. + :param sample_list: List of samples. + :param exclude_real_probands: If set, then fake trios probands cannot be in the + real trios probands. + :param max_tries: Maximum number of sampling to try before bailing out (preventing + infinite loop if `n` is too large w.r.t. the number of samples). + :param real_pedigree: Optional pedigree to exclude children from. + :param sample_list_stratification: Optional dictionary with samples as keys and + a value that should be used to stratify samples in `sample_list` into groups + that the trio should be picked from. This ensures that each fake trio will + contain samples from only the same stratification. For example, if all samples + within a fake trio should be chosen from the same platform, this can be a + dictionary of sample: platform. + :return: Fake pedigree. + """ + real_trios = ( + {trio.s: trio for trio in real_pedigree.trios} + if real_pedigree is not None + else dict() + ) + + if sample_list_stratification is not None: + sample_list_stratified = defaultdict(list) + for s in sample_list: + s_strata = sample_list_stratification.get(s) + if s_strata is None: + raise ValueError( + f"Sample {s} not found in 'sample_list_stratification' dict!" + ) + sample_list_stratified[s_strata].append(s) + else: + sample_list_stratified = None + + if exclude_real_probands and len(real_trios) == len(set(sample_list)): + logger.warning( + "All samples are in the real probands list; cannot create any fake" + " pedigrees with exclude_real_probands=True. Returning an empty Pedigree." + ) + return hl.Pedigree([]) + + fake_trios = {} + tries = 0 + while len(fake_trios) < n and tries < max_tries: + s = random.choice(sample_list) + if sample_list_stratified is None: + curr_sample_list = sample_list + else: + curr_sample_list = sample_list_stratified[sample_list_stratification[s]] + + mat_id, pat_id = random.sample(curr_sample_list, 2) + if ( + s in real_trios + and ( + exclude_real_probands + or {mat_id, pat_id} == {real_trios[s].mat_id, real_trios[s].pat_id} + ) + ) or s in fake_trios: + tries += 1 + else: + tries = 0 + fake_trios[s] = hl.Trio( + s=s, + pat_id=pat_id, + mat_id=mat_id, + fam_id=f"fake_{str(len(fake_trios))}", + is_female=bool(random.getrandbits(1)), + ) + + if tries == max_tries: + logger.warning( + "Only returning %d fake trios; random trio sampling stopped after reaching" + " the maximum %d iterations", + len(fake_trios), + max_tries, + ) + + return hl.Pedigree(list(fake_trios.values()))
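A sketch building random (non-biological) trios from a hypothetical sample list, excluding real proband/parent combinations found in the `ped` pedigree from the previous sketch:

from gnomad.sample_qc.relatedness import create_fake_pedigree

samples = [f"s{i}" for i in range(500)]  # placeholder sample IDs
fake_ped = create_fake_pedigree(n=10, sample_list=samples, real_pedigree=ped)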
+ + + + + +
[docs]def filter_mt_to_trios(mt: hl.MatrixTable, fam_ht: hl.Table) -> hl.MatrixTable:
+    """
+    Filter a MatrixTable to a set of trios in `fam_ht` and annotate it with adj.
+
+    :param mt: A MatrixTable to filter to only trios
+    :param fam_ht: A Table of trios to filter to, loaded using `hl.import_fam`
+    :return: A MT filtered to trios and annotated with adj
+    """
+    # Filter MT to samples present in any of the trios
+    fam_ht = fam_ht.annotate(fam_members=[fam_ht.id, fam_ht.pat_id, fam_ht.mat_id])
+    fam_ht = fam_ht.explode("fam_members", name="s")
+    fam_ht = fam_ht.key_by("s").select().distinct()
+
+    mt = mt.filter_cols(hl.is_defined(fam_ht[mt.col_key]))
+    if "adj" not in mt.entry:
+        mt = annotate_adj(mt)
+
+    return mt
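A short sketch subsetting a dense MT to trio members; both paths are placeholders:

import hail as hl

from gnomad.sample_qc.relatedness import filter_mt_to_trios

mt = hl.read_matrix_table("gs://my-bucket/dense.mt")  # placeholder
fam_ht = hl.import_fam("gs://my-bucket/trios.fam")    # placeholder .fam file
trio_mt = filter_mt_to_trios(mt, fam_ht)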
+ + +
[docs]def generate_trio_stats_expr( + trio_mt: hl.MatrixTable, + transmitted_strata: Dict[str, hl.expr.BooleanExpression] = {"raw": True}, + de_novo_strata: Dict[str, hl.expr.BooleanExpression] = {"raw": True}, + ac_strata: Dict[str, hl.expr.BooleanExpression] = {"raw": True}, + proband_is_female_expr: Optional[hl.expr.BooleanExpression] = None, +) -> hl.expr.StructExpression: + """ + Generate a row-wise expression containing trio transmission stats. + + The expression will generate the following counts: + - Number of alleles in het parents transmitted to the proband + - Number of alleles in het parents not transmitted to the proband + - Number of de novo mutations + - Parent allele count + - Proband allele count + + Transmission and de novo mutation metrics and allele counts can be stratified using additional filters. + `transmitted_strata`, `de_novo_strata`, and `ac_strata` all expect a dictionary of filtering expressions keyed + by their desired suffix to append for labeling. The default will perform counts using all genotypes and append + 'raw' to the label. + + .. note:: + + Expects that `mt` is dense if dealing with a sparse MT `hl.experimental.densify` must be run first. + + :param trio_mt: A trio standard trio MT (with the format as produced by hail.methods.trio_matrix) + :param transmitted_strata: Strata for the transmission counts + :param de_novo_strata: Strata for the de novo counts + :param ac_strata: Strata for the parent and child allele counts + :param proband_is_female_expr: An optional expression giving the sex the proband. If not given, DNMs are only computed for autosomes. + :return: An expression with the counts + """ + # Create map for transmitted, untransmitted and DNM + hom_ref = 0 + het = 1 + hom_var = 2 + + auto_or_par = 2 + hemi_x = 1 + hemi_y = 0 + + trans_config_counts = { + # kid, dad, mom, copy -> t, u + (hom_ref, het, het, auto_or_par): (0, 2), + (hom_ref, hom_ref, het, auto_or_par): (0, 1), + (hom_ref, het, hom_ref, auto_or_par): (0, 1), + (het, het, het, auto_or_par): (1, 1), + (het, hom_ref, het, auto_or_par): (1, 0), + (het, het, hom_ref, auto_or_par): (1, 0), + (het, hom_var, het, auto_or_par): (0, 1), + (het, het, hom_var, auto_or_par): (0, 1), + (hom_var, het, het, auto_or_par): (2, 0), + (hom_var, het, hom_var, auto_or_par): (1, 0), + (hom_var, hom_var, het, auto_or_par): (1, 0), + (hom_ref, hom_ref, het, hemi_x): (0, 1), + (hom_ref, hom_var, het, hemi_x): (0, 1), + (hom_var, hom_ref, het, hemi_x): (1, 0), + (hom_var, hom_var, het, hemi_x): (1, 0), + } + + trans_count_map = hl.literal(trans_config_counts) + + def _get_copy_state(locus: hl.expr.LocusExpression) -> hl.expr.Int32Expression: + """Get copy-state int from LocusExpression for indexing into trans_count_map.""" + return ( + hl.case() + .when(locus.in_autosome_or_par(), auto_or_par) + .when(locus.in_x_nonpar(), hemi_x) + .when(locus.in_y_nonpar(), hemi_y) + .or_missing() + ) + + def _is_dnm( + proband_gt: hl.expr.CallExpression, + father_gt: hl.expr.CallExpression, + mother_gt: hl.expr.CallExpression, + locus: hl.expr.LocusExpression, + proband_is_female: Optional[hl.expr.BooleanExpression], + ) -> hl.expr.BooleanExpression: + """Determine whether a trio genotype combination is a DNM.""" + if proband_is_female is None: + logger.warning( + "Since no proband sex expression was given to generate_trio_stats_expr," + " only DNMs in autosomes will be counted." 
+ ) + return hl.or_missing( + locus.in_autosome(), + proband_gt.is_het() & father_gt.is_hom_ref() & mother_gt.is_hom_ref(), + ) + return hl.if_else( + locus.in_autosome_or_par() | (proband_is_female & locus.in_x_nonpar()), + proband_gt.is_het() & father_gt.is_hom_ref() & mother_gt.is_hom_ref(), + hl.or_missing( + ~proband_is_female, proband_gt.is_hom_var() & father_gt.is_hom_ref() + ), + ) + + def _ac_an_parent_child_count( + proband_gt: hl.expr.CallExpression, + father_gt: hl.expr.CallExpression, + mother_gt: hl.expr.CallExpression, + ) -> Dict[str, hl.expr.Int64Expression]: + """Get AC and AN for parents and children.""" + ac_parent_expr = hl.agg.sum( + father_gt.n_alt_alleles() + mother_gt.n_alt_alleles() + ) + an_parent_expr = hl.agg.sum( + (hl.is_defined(father_gt) + hl.is_defined(mother_gt)) * 2 + ) + ac_child_expr = hl.agg.sum(proband_gt.n_alt_alleles()) + an_child_expr = hl.agg.sum(hl.is_defined(proband_gt) * 2) + + return { + "ac_parents": ac_parent_expr, + "an_parents": an_parent_expr, + "ac_children": ac_child_expr, + "an_children": an_child_expr, + } + + # Create transmission counters + trio_stats = hl.struct( + **{ + f"{name2}_{name}": hl.agg.filter( + ( + trio_mt.proband_entry.GT.is_non_ref() + | trio_mt.father_entry.GT.is_non_ref() + | trio_mt.mother_entry.GT.is_non_ref() + ) + & expr, + hl.agg.sum( + trans_count_map.get( + ( + trio_mt.proband_entry.GT.n_alt_alleles(), + trio_mt.father_entry.GT.n_alt_alleles(), + trio_mt.mother_entry.GT.n_alt_alleles(), + _get_copy_state(trio_mt.locus), + ), + default=(0, 0), + )[i] + ), + ) + for name, expr in transmitted_strata.items() + for i, name2 in enumerate(["n_transmitted", "n_untransmitted"]) + } + ) + + # Create de novo counters + trio_stats = trio_stats.annotate( + **{ + f"n_de_novos_{name}": hl.agg.filter( + _is_dnm( + trio_mt.proband_entry.GT, + trio_mt.father_entry.GT, + trio_mt.mother_entry.GT, + trio_mt.locus, + proband_is_female_expr, + ) + & expr, + hl.agg.count(), + ) + for name, expr in de_novo_strata.items() + } + ) + + trio_stats = trio_stats.annotate( + **{ + f"{name2}_{name}": hl.agg.filter( + expr, + _ac_an_parent_child_count( + trio_mt.proband_entry.GT, + trio_mt.father_entry.GT, + trio_mt.mother_entry.GT, + )[name2], + ) + for name, expr in ac_strata.items() + for name2 in ["ac_parents", "an_parents", "ac_children", "an_children"] + } + ) + + return trio_stats
+ + +
[docs]def generate_sib_stats_expr( + mt: hl.MatrixTable, + sib_ht: hl.Table, + i_col: str = "i", + j_col: str = "j", + strata: Dict[str, hl.expr.BooleanExpression] = {"raw": True}, + is_female: Optional[hl.expr.BooleanExpression] = None, +) -> hl.expr.StructExpression: + """ + Generate a row-wise expression containing the number of alternate alleles in common between sibling pairs. + + The sibling sharing counts can be stratified using additional filters using `stata`. + + .. note:: + + This function expects that the `mt` has either been split or filtered to only bi-allelics + If a sample has multiple sibling pairs, only one pair will be counted + + :param mt: Input matrix table + :param sib_ht: Table defining sibling pairs with one sample in a col (`i_col`) and the second in another col (`j_col`) + :param i_col: Column containing the 1st sample of the pair in the relationship table + :param j_col: Column containing the 2nd sample of the pair in the relationship table + :param strata: Dict with additional strata to use when computing shared sibling variant counts + :param is_female: An optional column in mt giving the sample sex. If not given, counts are only computed for autosomes. + :return: A Table with the sibling shared variant counts + """ + + def _get_alt_count(locus, gt, is_female): + """Calculate alt allele count with sex info if present.""" + if is_female is None: + return hl.or_missing(locus.in_autosome(), gt.n_alt_alleles()) + return ( + hl.case() + .when(locus.in_autosome_or_par(), gt.n_alt_alleles()) + .when( + ~is_female & (locus.in_x_nonpar() | locus.in_y_nonpar()), + hl.min(1, gt.n_alt_alleles()), + ) + .when(is_female & locus.in_y_nonpar(), 0) + .default(0) + ) + + if is_female is None: + logger.warning( + "Since no sex expression was given to generate_sib_stats_expr, only" + " variants in autosomes will be counted." + ) + + # If a sample is in sib_ht more than one time, keep only one of the sibling pairs + # First filter to only samples found in mt to keep as many pairs as possible + s_to_keep = mt.aggregate_cols(hl.agg.collect_as_set(mt.s), _localize=False) + sib_ht = sib_ht.filter( + s_to_keep.contains(sib_ht[i_col].s) & s_to_keep.contains(sib_ht[j_col].s) + ) + sib_ht = sib_ht.add_index("sib_idx") + sib_ht = sib_ht.annotate(sibs=[sib_ht[i_col].s, sib_ht[j_col].s]) + sib_ht = sib_ht.explode("sibs") + sib_ht = sib_ht.group_by("sibs").aggregate( + sib_idx=(hl.agg.take(sib_ht.sib_idx, 1, ordering=sib_ht.sib_idx)[0]) + ) + sib_ht = sib_ht.group_by(sib_ht.sib_idx).aggregate(sibs=hl.agg.collect(sib_ht.sibs)) + sib_ht = sib_ht.filter(hl.len(sib_ht.sibs) == 2).persist() + + logger.info( + "Generating sibling variant sharing counts using %d pairs.", sib_ht.count() + ) + sib_ht = sib_ht.explode("sibs").key_by("sibs")[mt.s] + + # Create sibling sharing counters + sib_stats = hl.struct( + **{ + f"n_sib_shared_variants_{name}": hl.sum( + hl.agg.filter( + expr, + hl.agg.group_by( + sib_ht.sib_idx, + hl.or_missing( + hl.agg.sum(hl.is_defined(mt.GT)) == 2, + hl.agg.min(_get_alt_count(mt.locus, mt.GT, is_female)), + ), + ), + ).values() + ) + for name, expr in strata.items() + } + ) + + sib_stats = sib_stats.annotate( + **{ + f"ac_sibs_{name}": hl.agg.filter( + expr & hl.is_defined(sib_ht.sib_idx), hl.agg.sum(mt.GT.n_alt_alleles()) + ) + for name, expr in strata.items() + } + ) + + return sib_stats
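A sketch counting variants shared by sibling pairs; `relatedness_ht` is the annotated pc_relate table from the earlier sketch, whose `i`/`j` columns are structs with an `s` field, and the MT path is a placeholder:

import hail as hl

from gnomad.sample_qc.relatedness import SIBLINGS, generate_sib_stats_expr

mt = hl.read_matrix_table("gs://my-bucket/dense.mt")  # placeholder
relatedness_ht = hl.read_table("gs://my-bucket/relatedness_annotated.ht")
sib_ht = relatedness_ht.filter(relatedness_ht.relationship == SIBLINGS)
sib_stats_ht = mt.annotate_rows(sib_stats=generate_sib_stats_expr(mt, sib_ht)).rows()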
\ No newline at end of file
diff --git a/_modules/gnomad/sample_qc/sex.html b/_modules/gnomad/sample_qc/sex.html
new file mode 100644
index 000000000..f5756f421
--- /dev/null
+++ b/_modules/gnomad/sample_qc/sex.html
@@ -0,0 +1,536 @@
+gnomad.sample_qc.sex — gnomad master documentation

Source code for gnomad.sample_qc.sex

+# noqa: D100
+
+import logging
+from typing import List, Optional, Tuple, Union
+
+import hail as hl
+import numpy as np
+import pandas as pd
+from sklearn.mixture import GaussianMixture
+
+from gnomad.utils.annotations import annotate_and_index_source_mt_for_sex_ploidy
+
+logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s")
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+SEXES = {"Male": "Male", "Female": "Female"}
+
+
+
[docs]def adjusted_sex_ploidy_expr( + locus_expr: hl.expr.LocusExpression, + gt_expr: hl.expr.CallExpression, + karyotype_expr: hl.expr.StringExpression, + xy_karyotype_str: str = "XY", + xx_karyotype_str: str = "XX", +) -> hl.expr.CallExpression: + """ + Create an entry expression to convert XY to haploid on non-PAR X/Y and XX to missing on Y. + + :param locus_expr: Locus expression. + :param gt_expr: Genotype expression. + :param karyotype_expr: Sex karyotype expression. + :param xy_karyotype_str: String representing XY karyotype. Default is "XY". + :param xx_karyotype_str: String representing XX karyotype. Default is "XX". + :return: Genotype adjusted for sex ploidy. + """ + # An optimization that annotates the locus's source matrix table with the + # fields in the case statements below, so they are not re-computed for every entry. + col_idx, row_idx = annotate_and_index_source_mt_for_sex_ploidy( + locus_expr, karyotype_expr, xy_karyotype_str, xx_karyotype_str + ) + + return ( + hl.case(missing_false=True) + # Added to reduce the checks by entry. + .when(row_idx.in_autosome, gt_expr) + .when((row_idx.y_par | row_idx.y_nonpar) & col_idx.xx, hl.missing(hl.tcall)) + .when(~row_idx.in_non_par, gt_expr) + .when( + (row_idx.x_nonpar | row_idx.y_nonpar) & col_idx.xy & gt_expr.is_het(), + hl.missing(hl.tcall), + ) + .when( + (row_idx.x_nonpar | row_idx.y_nonpar) & col_idx.xy, + hl.call(gt_expr[0], phased=False), + ) + .default(gt_expr) + )
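A sketch applying the karyotype-aware ploidy adjustment to a dense MT; the paths and the `sex_karyotype` column are placeholders from a prior sex-imputation step:

import hail as hl

from gnomad.sample_qc.sex import adjusted_sex_ploidy_expr

mt = hl.read_matrix_table("gs://my-bucket/dense.mt")  # placeholder
sex_ht = hl.read_table("gs://my-bucket/sex.ht")       # has `sex_karyotype`
mt = mt.annotate_cols(sex_karyotype=sex_ht[mt.col_key].sex_karyotype)
mt = mt.annotate_entries(
    GT=adjusted_sex_ploidy_expr(mt.locus, mt.GT, mt.sex_karyotype)
)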
+ + +
[docs]def adjust_sex_ploidy(
+    mt: hl.MatrixTable,
+    sex_expr: hl.expr.StringExpression,
+    male_str: str = "male",
+    female_str: str = "female",
+) -> hl.MatrixTable:
+    """
+    Convert males to haploid on non-PAR X/Y and set females to missing on Y.
+
+    :param mt: Input MatrixTable
+    :param sex_expr: Expression pointing to sex in MT (if not male_str or female_str, no change)
+    :param male_str: String for males (default 'male')
+    :param female_str: String for females (default 'female')
+    :return: MatrixTable with fixed ploidy for sex chromosomes
+    """
+    return mt.annotate_entries(
+        GT=adjusted_sex_ploidy_expr(mt.locus, mt.GT, sex_expr, male_str, female_str)
+    )
+ + +
[docs]def gaussian_mixture_model_karyotype_assignment( + sex_ht: hl.Table, + chrx_ploidy_expr: Union[hl.expr.NumericExpression, str] = "chrX_ploidy", + chry_ploidy_expr: Union[hl.expr.NumericExpression, str] = "chrY_ploidy", + karyotype_output_prefix: str = "gmm", +) -> hl.Table: + """ + Annotate the input Table with an X karyotype, Y karyotype, and sex karyotype based on a gaussian mixture model. + + This function uses two component Gaussian mixture models on `chrx_ploidy_expr` and `chry_ploidy_expr` to assign + an X karyotype and a Y karyotype which are then combined into the sex karyotype. + + The following annotations are added: + - {karyotype_output_prefix}_x_karyotype + - {karyotype_output_prefix_y_karyotype + - {karyotype_output_prefix}_karyotype = {karyotype_output_prefix}_x_karyotype + {karyotype_output_prefix}_y_karyotype + + .. note:: + + This uses a two component Gaussian mixture model so all samples are given one of the following sex karyotypes: + X, XX, XY, YY. It's recommended that this annotation is only used to split samples into XX and + XY groups that can then be used in `get_ploidy_cutoffs` to determine XX and XY ploidy means and stdevs. + + :param sex_ht: Input Table with chromosome X and chromosome Y ploidy values. + :param chrx_ploidy_expr: Expression pointing to chromosome X ploidy in `sex_ht`. Default is 'chrX_ploidy'. + :param chry_ploidy_expr: Expression pointing to chromosome Y ploidy in `sex_ht`. Default is 'chrY_ploidy'. + :param karyotype_output_prefix: String to use as the prefix for the Gaussian mixture model karyotype output. Default is 'gmm'. + :return: Input Table with Gaussian mixture model karyotype annotations added. + """ + if isinstance(chrx_ploidy_expr, str): + chrx_ploidy_expr = sex_ht[chrx_ploidy_expr] + if isinstance(chry_ploidy_expr, str): + chry_ploidy_expr = sex_ht[chry_ploidy_expr] + + sex_pd = sex_ht.select( + chrX_ploidy=chrx_ploidy_expr, + chrY_ploidy=chry_ploidy_expr, + ).to_pandas() + + def _run_gaussian_mixture_model( + feature: str, karyotypes: List[str], karyotype_name: str + ) -> pd.DataFrame: + """ + Run Gaussian mixture model on ploidy estimates and infer karyotype. + + :param feature: Column name of ploidy feature to use in Gaussian mixture model. + :param karyotypes: List of possible karyotypes in order of expected `feature` mean. + :param karyotype_name: Column name to use for karyotype output. + :return: Pandas DataFrame with karyotype assignment. 
+ """ + df = sex_pd[["s", feature]].set_index("s") + gmm = GaussianMixture(n_components=2) + gmm.fit(df) + probs = gmm.predict_proba(df) + # Assign cluster to karyotype based on cluster means and the order of + # `karyotypes` + cluster_to_karyotype = dict( + zip(np.argsort([m[0] for m in gmm.means_]), karyotypes) + ) + + df[f"{feature}_cluster"] = gmm.predict(df) + df[karyotype_name] = df.apply( + lambda row: cluster_to_karyotype[row[f"{feature}_cluster"]], axis=1 + ) + for i in cluster_to_karyotype: + df[f"{feature}_prob_{cluster_to_karyotype[i]}"] = probs[:, i] + + return df + + x_df = _run_gaussian_mixture_model( + "chrX_ploidy", ["X", "XX"], f"{karyotype_output_prefix}_x_karyotype" + ) + y_df = _run_gaussian_mixture_model( + "chrY_ploidy", ["", "Y"], f"{karyotype_output_prefix}_y_karyotype" + ) + xy_df = pd.concat( + [ + x_df[f"{karyotype_output_prefix}_x_karyotype"], + y_df[f"{karyotype_output_prefix}_y_karyotype"], + ], + axis=1, + ) + xy_df[f"{karyotype_output_prefix}_karyotype"] = ( + xy_df[f"{karyotype_output_prefix}_x_karyotype"] + + xy_df[f"{karyotype_output_prefix}_y_karyotype"] + ) + xy_ht = hl.Table.from_pandas(xy_df.reset_index(), key=["s"]) + + return sex_ht.annotate(**xy_ht[sex_ht.key])
+ + +
[docs]def get_ploidy_cutoffs( + ht: hl.Table, + f_stat_cutoff: float = None, + normal_ploidy_cutoff: int = 5, + aneuploidy_cutoff: int = 6, + group_by_expr: hl.expr.StringExpression = None, +) -> Tuple[Tuple[float, Tuple[float, float], float], Tuple[Tuple[float, float], float]]: + """ + Get chromosome X and Y ploidy cutoffs for XY and XX samples. + + .. note:: + + This assumes the input hail Table has the fields chrX_ploidy, and chrY_ploidy, and f_stat if `f_stat_cutoff` is + set. + + Return a tuple of sex chromosome ploidy cutoffs: ((x_ploidy_cutoffs), (y_ploidy_cutoffs)). + x_ploidy_cutoffs: (upper cutoff for single X, (lower cutoff for double X, upper cutoff for double X), lower cutoff for triple X) + y_ploidy_cutoffs: ((lower cutoff for single Y, upper cutoff for single Y), lower cutoff for double Y) + + Uses the normal_ploidy_cutoff parameter to determine the ploidy cutoffs for XX and XY karyotypes. + Uses the aneuploidy_cutoff parameter to determine the cutoffs for sex aneuploidies. + + .. note:: + + `f_stat_cutoff` or `group_by_expr` must be supplied. If `f_stat_cutoff` is supplied then f-stat is used to + split the samples into roughly 'XX' and 'XY'. If `group_by_expr` is supplied instead, then it must include an + annotation grouping samples by 'XX' and 'XY'. These are both only used to divide samples into XX and XY to + determine means and standard deviations for these categories and are not used in the final karyotype annotation. + + :param ht: Table with f_stat and sex chromosome ploidies + :param f_stat_cutoff: f-stat to roughly divide 'XX' from 'XY' samples. Assumes XX samples are below cutoff and XY + are above cutoff. + :param normal_ploidy_cutoff: Number of standard deviations to use when determining sex chromosome ploidy cutoffs + for XX, XY karyotypes. + :param aneuploidy_cutoff: Number of standard deviations to use when sex chromosome ploidy cutoffs for aneuploidies. + :param group_by_expr: Expression grouping samples into 'XX' and 'XY'. Can be used instead of and `f_stat_cutoff`. + :return: Tuple of ploidy cutoff tuples: ((x_ploidy_cutoffs), (y_ploidy_cutoffs)) + """ + if (f_stat_cutoff is None and group_by_expr is None) or ( + f_stat_cutoff is not None and group_by_expr is not None + ): + raise ValueError( + "One and only one of 'f_stat_cutoff' or 'group_by_expr' must be supplied!" 
+ ) + + # If 'f_stat_cutoff' is supplied, group the sex chromosome ploidy table by + # f_stat cutoff + if f_stat_cutoff is not None: + group_by_expr = hl.if_else(ht.f_stat < f_stat_cutoff, "XX", "XY") + + # Get mean/stdev for chrX/Y ploidies based on 'group_by_expr' + sex_stats = ht.aggregate( + hl.agg.group_by( + group_by_expr, + hl.struct(x=hl.agg.stats(ht.chrX_ploidy), y=hl.agg.stats(ht.chrY_ploidy)), + ) + ) + if "XX" not in sex_stats: + raise ValueError("No samples are grouped as XX!") + if "XY" not in sex_stats: + raise ValueError("No samples are grouped as XY!") + logger.info("XX stats: %s", sex_stats["XX"]) + logger.info("XY stats: %s", sex_stats["XY"]) + + cutoffs = ( + ( + sex_stats["XY"].x.mean + (normal_ploidy_cutoff * sex_stats["XY"].x.stdev), + ( + sex_stats["XX"].x.mean + - (normal_ploidy_cutoff * sex_stats["XX"].x.stdev), + sex_stats["XX"].x.mean + + (normal_ploidy_cutoff * sex_stats["XX"].x.stdev), + ), + sex_stats["XX"].x.mean + (aneuploidy_cutoff * sex_stats["XX"].x.stdev), + ), + ( + ( + sex_stats["XX"].y.mean + + (normal_ploidy_cutoff * sex_stats["XX"].y.stdev), + sex_stats["XY"].y.mean + + (normal_ploidy_cutoff * sex_stats["XY"].y.stdev), + ), + sex_stats["XY"].y.mean + (aneuploidy_cutoff * sex_stats["XY"].y.stdev), + ), + ) + + logger.info("X ploidy cutoffs: %s", cutoffs[0]) + logger.info("Y ploidy cutoffs: %s", cutoffs[1]) + return cutoffs
+ + +
[docs]def get_chr_x_hom_alt_cutoffs( + ht: hl.Table, + chr_x_frac_hom_alt_expr: hl.expr.NumericExpression, + f_stat_cutoff: float = None, + group_by_expr: hl.expr.StringExpression = None, + cutoff_stdev: int = 5, +) -> Tuple[Tuple[float, float], float]: + """ + Get cutoffs for the fraction homozygous alternate genotypes on chromosome X in 'XY' and 'XX' samples. + + .. note:: + + This assumes the input hail Table has the field 'f_stat' if `f_stat_cutoff` is set. + + Return a tuple of cutoffs for the fraction of homozygous alternate genotypes (hom-alt/(hom-alt + het)) on + chromosome X: ((lower cutoff for more than one X, upper cutoff for more than one X), lower cutoff for single X). + + Uses the `cutoff_stdev` parameter to determine the fraction of homozygous alternate genotypes + (hom-alt/(hom-alt + het)) on chromosome X cutoffs for 'XX' and 'XY' karyotypes. + + .. note:: + + `f_stat_cutoff` or `group_by_expr` must be supplied. If `f_stat_cutoff` is supplied then f-stat is used to + split the samples into roughly 'XX' and 'XY'. If `group_by_expr` is supplied instead, then it must include an + annotation grouping samples by 'XX' and 'XY'. These are both only used to divide samples into XX and XY to + determine means and standard deviations for these categories and are not used in the final karyotype annotation. + + :param ht: Table with f_stat and fraction of homozygous alternate genotypes on chromosome X. + :param chr_x_frac_hom_alt_expr: Fraction of homozygous alternate genotypes (hom-alt/(hom-alt + het)) on chromosome X. + :param f_stat_cutoff: f-stat to roughly divide 'XX' from 'XY' samples. Assumes XX samples are below cutoff and XY + are above cutoff. + :param group_by_expr: Expression grouping samples into 'XX' and 'XY'. Can be used instead of `f_stat_cutoff`. + :param cutoff_stdev: Number of standard deviations to use when determining sex chromosome ploidy cutoffs + for XX, XY karyotypes. + :return: Tuple of cutoffs: ((lower cutoff for more than one X, upper cutoff for more than one X), lower cutoff for + single X). + """ + if (f_stat_cutoff is None and group_by_expr is None) or ( + f_stat_cutoff is not None and group_by_expr is not None + ): + raise ValueError( + "One and only one of 'f_stat_cutoff' or 'group_by_expr' must be supplied!" + ) + + # If 'f_stat_cutoff' is supplied, group the input Table by f_stat cutoff + if f_stat_cutoff is not None: + group_by_expr = hl.if_else(ht.f_stat < f_stat_cutoff, "XX", "XY") + + # Get mean/stdev based on 'group_by_expr' + sex_stats = ht.aggregate( + hl.agg.group_by( + group_by_expr, + hl.struct(chrx_homalt=hl.agg.stats(chr_x_frac_hom_alt_expr)), + ) + ) + if "XX" not in sex_stats: + raise ValueError("No samples are grouped as XX!") + if "XY" not in sex_stats: + raise ValueError("No samples are grouped as XY!") + + logger.info("XX stats: %s", sex_stats["XX"]) + logger.info("XY stats: %s", sex_stats["XY"]) + + cutoffs = ( + ( + sex_stats["XX"].chrx_homalt.mean + - (cutoff_stdev * sex_stats["XX"].chrx_homalt.stdev), + sex_stats["XX"].chrx_homalt.mean + + (cutoff_stdev * sex_stats["XX"].chrx_homalt.stdev), + ), + sex_stats["XY"].chrx_homalt.mean + - (cutoff_stdev * sex_stats["XY"].chrx_homalt.stdev), + ) + + logger.info("chrx_homalt cutoffs: %s", cutoffs) + + return cutoffs
+ + +
[docs]def get_sex_expr( + chr_x_ploidy: hl.expr.NumericExpression, + chr_y_ploidy: hl.expr.NumericExpression, + x_ploidy_cutoffs: Tuple[float, Tuple[float, float], float], + y_ploidy_cutoffs: Tuple[Tuple[float, float], float], + chr_x_frac_hom_alt_expr: Optional[hl.expr.NumericExpression] = None, + chr_x_frac_hom_alt_cutoffs: Optional[Tuple[Tuple[float, float], float]] = None, +) -> hl.expr.StructExpression: + """ + Create a struct with X_karyotype, Y_karyotype, and sex_karyotype. + + Note that X0 is currently returned as 'X'. + + :param chr_x_ploidy: Chromosome X ploidy (or relative ploidy). + :param chr_y_ploidy: Chromosome Y ploidy (or relative ploidy). + :param x_ploidy_cutoffs: Tuple of X chromosome ploidy cutoffs: (upper cutoff for single X, (lower cutoff for + double X, upper cutoff for double X), lower cutoff for triple X). + :param y_ploidy_cutoffs: Tuple of Y chromosome ploidy cutoffs: ((lower cutoff for single Y, upper cutoff for + single Y), lower cutoff for double Y). + :param chr_x_frac_hom_alt_expr: Fraction of homozygous alternate genotypes (hom-alt/(hom-alt + het)) on chromosome X. + :param chr_x_frac_hom_alt_cutoffs: Tuple of cutoffs for the fraction of homozygous alternate genotypes + (hom-alt/(hom-alt + het)) on chromosome X: ((lower cutoff for more than one X, upper cutoff for more than one X), + lower cutoff for single X). + :return: Struct containing X_karyotype, Y_karyotype, and sex_karyotype. + """ + if sum([chr_x_frac_hom_alt_expr is None, chr_x_frac_hom_alt_cutoffs is None]) == 1: + raise ValueError( + "None or both of `chr_x_frac_hom_alt_expr` and `chr_x_frac_hom_alt_cutoffs`" + " must be set!" + ) + + if chr_x_frac_hom_alt_expr is not None: + lower_cutoff_for_single_x = chr_x_frac_hom_alt_cutoffs[1] + lower_cutoff_for_multiple_x = chr_x_frac_hom_alt_cutoffs[0][0] + upper_cutoff_for_multiple_x = chr_x_frac_hom_alt_cutoffs[0][1] + + add_x_condition = chr_x_frac_hom_alt_expr > lower_cutoff_for_single_x + add_xx_condition = (chr_x_frac_hom_alt_expr > lower_cutoff_for_multiple_x) & ( + chr_x_frac_hom_alt_expr < upper_cutoff_for_multiple_x + ) + add_xxx_condition = chr_x_frac_hom_alt_expr < upper_cutoff_for_multiple_x + else: + add_x_condition = add_xx_condition = add_xxx_condition = True + + upper_ploidy_cutoff_for_x = x_ploidy_cutoffs[0] + lower_ploidy_cutoff_for_xx = x_ploidy_cutoffs[1][0] + upper_ploidy_cutoff_for_xx = x_ploidy_cutoffs[1][1] + lower_ploidy_cutoff_for_xxx = x_ploidy_cutoffs[2] + + lower_ploidy_cutoff_for_y = y_ploidy_cutoffs[0][0] + upper_ploidy_cutoff_for_y = y_ploidy_cutoffs[0][1] + lower_ploidy_cutoff_for_yy = y_ploidy_cutoffs[1] + + sex_expr = hl.struct( + X_karyotype=( + hl.case() + .when((chr_x_ploidy < upper_ploidy_cutoff_for_x) & add_x_condition, "X") + .when( + ( + (chr_x_ploidy > lower_ploidy_cutoff_for_xx) + & (chr_x_ploidy < upper_ploidy_cutoff_for_xx) + & add_xx_condition + ), + "XX", + ) + .when( + (chr_x_ploidy >= lower_ploidy_cutoff_for_xxx) & add_xxx_condition, "XXX" + ) + .default("ambiguous") + ), + Y_karyotype=( + hl.case() + .when(chr_y_ploidy < lower_ploidy_cutoff_for_y, "") + .when( + ( + (chr_y_ploidy > lower_ploidy_cutoff_for_y) + & (chr_y_ploidy < upper_ploidy_cutoff_for_y) + ), + "Y", + ) + .when(chr_y_ploidy >= lower_ploidy_cutoff_for_yy, "YY") + .default("ambiguous") + ), + ) + + return sex_expr.annotate( + sex_karyotype=hl.if_else( + (sex_expr.X_karyotype == "ambiguous") + | (sex_expr.Y_karyotype == "ambiguous"), + "ambiguous", + sex_expr.X_karyotype + sex_expr.Y_karyotype, + ) + )
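+
+# A minimal sketch with literal ploidies and hypothetical cutoff values (in
+# practice the cutoffs come from the cutoff functions above); this evaluates to
+# a struct with X_karyotype "X", Y_karyotype "Y", and sex_karyotype "XY":
+#
+#     hl.eval(
+#         get_sex_expr(
+#             chr_x_ploidy=hl.literal(0.98),
+#             chr_y_ploidy=hl.literal(1.02),
+#             x_ploidy_cutoffs=(1.4, (1.6, 2.4), 2.7),
+#             y_ploidy_cutoffs=((0.8, 1.2), 1.8),
+#         )
+#     )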
+ + + + \ No newline at end of file diff --git a/_modules/gnomad/utils/annotations.html b/_modules/gnomad/utils/annotations.html new file mode 100644 index 000000000..cc3eb1e1d --- /dev/null +++ b/_modules/gnomad/utils/annotations.html @@ -0,0 +1,2750 @@ + + + + + + gnomad.utils.annotations — gnomad master documentation + + + + + + + + + + + + + + + + +

Source code for gnomad.utils.annotations

+# noqa: D100
+
+import itertools
+import logging
+from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
+
+import hail as hl
+
+import gnomad.utils.filtering as filter_utils
+from gnomad.utils.gen_stats import to_phred
+
+logging.basicConfig(
+    format="%(asctime)s (%(name)s %(lineno)s): %(message)s",
+    datefmt="%m/%d/%Y %I:%M:%S %p",
+)
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+ANNOTATIONS_HISTS = {
+    "FS": (0, 50, 50),  # NOTE: in 2.0.2 release this was on (0,20)
+    "InbreedingCoeff": (-0.25, 0.25, 50),
+    "MQ": (0, 80, 40),
+    "RAW_MQ": (2, 13, 33),
+    "MQRankSum": (-15, 15, 60),
+    "QD": (0, 40, 40),
+    "ReadPosRankSum": (-15, 15, 60),
+    "SOR": (0, 10, 50),
+    "BaseQRankSum": (-15, 15, 60),
+    "ClippingRankSum": (-5, 5, 40),
+    "DP": (1, 9, 32),  # NOTE: in 2.0.2 release this was on (0,8)
+    "VQSLOD": (-30, 30, 60),  # NOTE: in 2.0.2 release this was on (-20,20)
+    "AS_VQSLOD": (-30, 30, 60),
+    "rf_tp_probability": (0, 1, 50),
+    "pab_max": (0, 1, 50),
+}
+
+VRS_CHROM_IDS = {
+    "GRCh38": {
+        "chr1": "ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO",
+        "chr2": "ga4gh:SQ.pnAqCRBrTsUoBghSD1yp_jXWSmlbdh4g",
+        "chr3": "ga4gh:SQ.Zu7h9AggXxhTaGVsy7h_EZSChSZGcmgX",
+        "chr4": "ga4gh:SQ.HxuclGHh0XCDuF8x6yQrpHUBL7ZntAHc",
+        "chr5": "ga4gh:SQ.aUiQCzCPZ2d0csHbMSbh2NzInhonSXwI",
+        "chr6": "ga4gh:SQ.0iKlIQk2oZLoeOG9P1riRU6hvL5Ux8TV",
+        "chr7": "ga4gh:SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul",
+        "chr8": "ga4gh:SQ.209Z7zJ-mFypBEWLk4rNC6S_OxY5p7bs",
+        "chr9": "ga4gh:SQ.KEO-4XBcm1cxeo_DIQ8_ofqGUkp4iZhI",
+        "chr10": "ga4gh:SQ.ss8r_wB0-b9r44TQTMmVTI92884QvBiB",
+        "chr11": "ga4gh:SQ.2NkFm8HK88MqeNkCgj78KidCAXgnsfV1",
+        "chr12": "ga4gh:SQ.6wlJpONE3oNb4D69ULmEXhqyDZ4vwNfl",
+        "chr13": "ga4gh:SQ._0wi-qoDrvram155UmcSC-zA5ZK4fpLT",
+        "chr14": "ga4gh:SQ.eK4D2MosgK_ivBkgi6FVPg5UXs1bYESm",
+        "chr15": "ga4gh:SQ.AsXvWL1-2i5U_buw6_niVIxD6zTbAuS6",
+        "chr16": "ga4gh:SQ.yC_0RBj3fgBlvgyAuycbzdubtLxq-rE0",
+        "chr17": "ga4gh:SQ.dLZ15tNO1Ur0IcGjwc3Sdi_0A6Yf4zm7",
+        "chr18": "ga4gh:SQ.vWwFhJ5lQDMhh-czg06YtlWqu0lvFAZV",
+        "chr19": "ga4gh:SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl",
+        "chr20": "ga4gh:SQ.-A1QmD_MatoqxvgVxBLZTONHz9-c7nQo",
+        "chr21": "ga4gh:SQ.5ZUqxCmDDgN4xTRbaSjN8LwgZironmB8",
+        "chr22": "ga4gh:SQ.7B7SHsmchAR0dFcDCuSFjJAo7tX87krQ",
+        "chrX": "ga4gh:SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP",
+        "chrY": "ga4gh:SQ.8_liLu1aycC0tPQPFmUaGXJLDs5SbPZ5",
+    },
+    "GRCh37": {
+        "1": "ga4gh:SQ.S_KjnFVz-FE7M0W6yoaUDgYxLPc1jyWU",
+        "2": "ga4gh:SQ.9KdcA9ZpY1Cpvxvg8bMSLYDUpsX6GDLO",
+        "3": "ga4gh:SQ.VNBualIltAyi2AI_uXcKU7M9XUOuA7MS",
+        "4": "ga4gh:SQ.iy7Zfceb5_VGtTQzJ-v5JpPbpeifHD_V",
+        "5": "ga4gh:SQ.vbjOdMfHJvTjK_nqvFvpaSKhZillW0SX",
+        "6": "ga4gh:SQ.KqaUhJMW3CDjhoVtBetdEKT1n6hM-7Ek",
+        "7": "ga4gh:SQ.IW78mgV5Cqf6M24hy52hPjyyo5tCCd86",
+        "8": "ga4gh:SQ.tTm7wmhz0G4lpt8wPspcNkAD_qiminj6",
+        "9": "ga4gh:SQ.HBckYGQ4wYG9APHLpjoQ9UUe9v7NxExt",
+        "10": "ga4gh:SQ.-BOZ8Esn8J88qDwNiSEwUr5425UXdiGX",
+        "11": "ga4gh:SQ.XXi2_O1ly-CCOi3HP5TypAw7LtC6niFG",
+        "12": "ga4gh:SQ.105bBysLoDFQHhajooTAUyUkNiZ8LJEH",
+        "13": "ga4gh:SQ.Ewb9qlgTqN6e_XQiRVYpoUfZJHXeiUfH",
+        "14": "ga4gh:SQ.5Ji6FGEKfejK1U6BMScqrdKJK8GqmIGf",
+        "15": "ga4gh:SQ.zIMZb3Ft7RdWa5XYq0PxIlezLY2ccCgt",
+        "16": "ga4gh:SQ.W6wLoIFOn4G7cjopxPxYNk2lcEqhLQFb",
+        "17": "ga4gh:SQ.AjWXsI7AkTK35XW9pgd3UbjpC3MAevlz",
+        "18": "ga4gh:SQ.BTj4BDaaHYoPhD3oY2GdwC_l0uqZ92UD",
+        "19": "ga4gh:SQ.ItRDD47aMoioDCNW_occY5fWKZBKlxCX",
+        "20": "ga4gh:SQ.iy_UbUrvECxFRX5LPTH_KPojdlT7BKsf",
+        "21": "ga4gh:SQ.LpTaNW-hwuY_yARP0rtarCnpCQLkgVCg",
+        "22": "ga4gh:SQ.XOgHwwR3Upfp5sZYk6ZKzvV25a4RBVu8",
+        "X": "ga4gh:SQ.v7noePfnNpK8ghYXEqZ9NukMXW7YeNsm",
+        "Y": "ga4gh:SQ.BT7QyW5iXaX_1PSX-msSGYsqRdMKqkj-",
+    },
+}
+
+
+
[docs]def pop_max_expr(
+    freq: hl.expr.ArrayExpression,
+    freq_meta: hl.expr.ArrayExpression,
+    pops_to_exclude: Optional[Set[str]] = None,
+    pop_label: str = "pop",
+) -> hl.expr.StructExpression:
+    """
+    Create an expression containing the frequency information about the population that has the highest AF in `freq_meta`.
+
+    Populations specified in `pops_to_exclude` are excluded and only frequencies from adj populations are considered.
+
+    This resulting struct contains the following fields:
+
+        - AC: int32
+        - AF: float64
+        - AN: int32
+        - homozygote_count: int32
+        - pop: str
+
+    :param freq: ArrayExpression of Structs with fields ['AC', 'AF', 'AN', 'homozygote_count']
+    :param freq_meta: ArrayExpression of meta dictionaries corresponding to freq (as returned by annotate_freq)
+    :param pops_to_exclude: Set of populations to skip for popmax calculation
+    :param pop_label: Label of the population field in the meta dictionary
+    :return: Popmax struct
+    """
+    _pops_to_exclude = (
+        hl.literal(pops_to_exclude)
+        if pops_to_exclude is not None
+        else hl.empty_set(hl.tstr)
+    )
+
+    # pylint: disable=invalid-unary-operand-type
+    popmax_freq_indices = hl.range(0, hl.len(freq_meta)).filter(
+        lambda i: (hl.set(freq_meta[i].keys()) == {"group", pop_label})
+        & (freq_meta[i]["group"] == "adj")
+        & (~_pops_to_exclude.contains(freq_meta[i][pop_label]))
+    )
+    freq_filtered = popmax_freq_indices.map(
+        lambda i: freq[i].annotate(**{pop_label: freq_meta[i][pop_label]})
+    ).filter(lambda f: f.AC > 0)
+
+    sorted_freqs = hl.sorted(freq_filtered, key=lambda x: x.AF, reverse=True)
+    return hl.or_missing(hl.len(sorted_freqs) > 0, sorted_freqs[0])
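+
+# A minimal sketch evaluating pop_max_expr on literal data; in practice `freq`
+# and `freq_meta` come from `annotate_freq`. The 'afr' entry has the highest AF
+# among the per-population adj entries, so it is returned:
+#
+#     freq = hl.array([
+#         hl.struct(AC=10, AF=0.00100, AN=10000, homozygote_count=0),
+#         hl.struct(AC=8, AF=0.00400, AN=2000, homozygote_count=0),
+#         hl.struct(AC=2, AF=0.00025, AN=8000, homozygote_count=0),
+#     ])
+#     freq_meta = hl.literal([
+#         {"group": "adj"},
+#         {"group": "adj", "pop": "afr"},
+#         {"group": "adj", "pop": "nfe"},
+#     ])
+#     hl.eval(pop_max_expr(freq, freq_meta))
+#     # Struct(AC=8, AF=0.004, AN=2000, homozygote_count=0, pop='afr')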
+ + +
[docs]def project_max_expr( + project_expr: hl.expr.StringExpression, + gt_expr: hl.expr.CallExpression, + alleles_expr: hl.expr.ArrayExpression, + n_projects: int = 5, +) -> hl.expr.ArrayExpression: + """ + Create an expression that computes allele frequency information by project for the `n_projects` with the largest AF at this row. + + Will return an array with one element per non-reference allele. + + Each of these elements is itself an array of structs with the following fields: + + - AC: int32 + - AF: float64 + - AN: int32 + - homozygote_count: int32 + - project: str + + .. note:: + + Only projects with AF > 0 are returned. + In case of ties, the project ordering is not guaranteed, and at most `n_projects` are returned. + + :param project_expr: column expression containing the project + :param gt_expr: entry expression containing the genotype + :param alleles_expr: row expression containing the alleles + :param n_projects: Maximum number of projects to return for each row + :return: projectmax expression + """ + n_alleles = hl.len(alleles_expr) + + # compute call stats by project + project_cs = hl.array( + hl.agg.group_by(project_expr, hl.agg.call_stats(gt_expr, alleles_expr)) + ) + + return hl.or_missing( + n_alleles > 1, # Exclude monomorphic sites + hl.range(1, n_alleles).map( + lambda ai: hl.sorted( + project_cs.filter( + # filter to projects with AF > 0 + lambda x: x[1].AF[ai] + > 0 + ), + # order the callstats computed by AF in decreasing order + lambda x: -x[1].AF[ai], + # take the n_projects projects with largest AF + )[:n_projects].map( + # add the project in the callstats struct + lambda x: x[1].annotate( + AC=x[1].AC[ai], + AF=x[1].AF[ai], + AN=x[1].AN, + homozygote_count=x[1].homozygote_count[ai], + project=x[0], + ) + ) + ), + )
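+
+# A minimal sketch on a hypothetical MatrixTable `mt` with a `project` column
+# annotation (the annotation name is illustrative):
+#
+#     mt = mt.annotate_rows(
+#         project_max=project_max_expr(mt.project, mt.GT, mt.alleles, n_projects=5)
+#     )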
+ + +
[docs]def faf_expr(
+    freq: hl.expr.ArrayExpression,
+    freq_meta: hl.expr.ArrayExpression,
+    locus: hl.expr.LocusExpression,
+    pops_to_exclude: Optional[Set[str]] = None,
+    faf_thresholds: List[float] = [0.95, 0.99],
+    pop_label: str = "pop",
+) -> Tuple[hl.expr.ArrayExpression, List[Dict[str, str]]]:
+    """
+    Calculate the filtering allele frequency (FAF) for each threshold specified in `faf_thresholds`.
+
+    See http://cardiodb.org/allelefrequencyapp/ for more information.
+
+    The FAF is computed for each of the following population stratifications, if found in `freq_meta`:
+
+        - All samples, with adj criteria
+        - For each population, with adj criteria
+        - For all sex/population on the non-PAR regions of sex chromosomes (will be missing on autosomes and PAR regions of sex chromosomes)
+
+    Each FAF entry is a struct with one field per threshold specified in `faf_thresholds`, of type float64.
+
+    This returns a tuple with two expressions:
+
+        1. An array of FAF expressions as described above
+        2. An array of dicts containing the metadata for each of the array elements, in the same format as that produced by `annotate_freq`.
+
+    :param freq: ArrayExpression of call stats structs (typically generated by hl.agg.call_stats)
+    :param freq_meta: ArrayExpression of meta dictionaries corresponding to freq (typically generated using annotate_freq)
+    :param locus: locus
+    :param pops_to_exclude: Set of populations to exclude from faf calculation (typically bottlenecked or consanguineous populations)
+    :param faf_thresholds: List of FAF thresholds to compute
+    :param pop_label: Label of the population field in the meta dictionary
+    :return: (FAF expression, FAF metadata)
+    """
+    _pops_to_exclude = (
+        hl.literal(pops_to_exclude)
+        if pops_to_exclude is not None
+        else hl.empty_set(hl.tstr)
+    )
+
+    # pylint: disable=invalid-unary-operand-type
+    faf_freq_indices = hl.range(0, hl.len(freq_meta)).filter(
+        lambda i: (freq_meta[i].get("group") == "adj")
+        & (
+            (freq_meta[i].size() == 1)
+            | (
+                (hl.set(freq_meta[i].keys()) == {pop_label, "group"})
+                & (~_pops_to_exclude.contains(freq_meta[i][pop_label]))
+            )
+        )
+    )
+    sex_faf_freq_indices = hl.range(0, hl.len(freq_meta)).filter(
+        lambda i: (freq_meta[i].get("group") == "adj")
+        & (freq_meta[i].contains("sex"))
+        & (
+            (freq_meta[i].size() == 2)
+            | (
+                (hl.set(freq_meta[i].keys()) == {pop_label, "group", "sex"})
+                & (~_pops_to_exclude.contains(freq_meta[i][pop_label]))
+            )
+        )
+    )
+
+    faf_expr = faf_freq_indices.map(
+        lambda i: hl.struct(
+            **{
+                f"faf{str(threshold)[2:]}": hl.experimental.filtering_allele_frequency(
+                    freq[i].AC, freq[i].AN, threshold
+                )
+                for threshold in faf_thresholds
+            }
+        )
+    )
+
+    faf_expr = faf_expr.extend(
+        sex_faf_freq_indices.map(
+            lambda i: hl.or_missing(
+                ~locus.in_autosome_or_par(),
+                hl.struct(
+                    **{
+                        f"faf{str(threshold)[2:]}": (
+                            hl.experimental.filtering_allele_frequency(
+                                freq[i].AC, freq[i].AN, threshold
+                            )
+                        )
+                        for threshold in faf_thresholds
+                    }
+                ),
+            )
+        )
+    )
+
+    faf_meta = faf_freq_indices.extend(sex_faf_freq_indices).map(lambda i: freq_meta[i])
+    return faf_expr, hl.eval(faf_meta)
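+
+# A minimal sketch on literal data; in practice `freq`/`freq_meta` come from
+# `annotate_freq` and `locus` is the row's locus:
+#
+#     freq = hl.array([hl.struct(AC=5, AF=0.0005, AN=10000, homozygote_count=0)])
+#     freq_meta = hl.literal([{"group": "adj"}])
+#     faf, faf_meta = faf_expr(freq, freq_meta, hl.locus("1", 12345))
+#     hl.eval(faf)  # one struct with fields 'faf95' and 'faf99'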
+ + +
[docs]def gen_anc_faf_max_expr( + faf: hl.expr.ArrayExpression, + faf_meta: hl.expr.ArrayExpression, + pop_label: str = "pop", +) -> hl.expr.StructExpression: + """ + Retrieve the maximum FAF and corresponding genetic ancestry for each of the thresholds in `faf`. + + This resulting struct contains the following fields: + + - faf95_max: float64 + - faf95_max_gen_anc: str + - faf99_max: float64 + - faf99_max_gen_anc: str + + :param faf: ArrayExpression of Structs of FAF thresholds previously computed. When + `faf_expr` is used, contains fields 'faf95' and 'faf99'. + :param faf_meta: ArrayExpression of meta dictionaries corresponding to faf (as + returned by faf_expr) + :param pop_label: Label of the population field in the meta dictionary + :return: Genetic ancestry group struct for FAF max + """ + faf_gen_anc_indices = hl.enumerate(faf_meta).filter( + lambda i: (hl.set(i[1].keys()) == {"group", pop_label}) + & (i[1]["group"] == "adj") + ) + max_fafs_expr = hl.struct() + + # Iterate through faf thresholds, generally 'faf95' and 'faf99', and + # take the maximum faf value, '[0]', and its gen_anc from the sorted faf array + for threshold in faf[0].keys(): + faf_struct = hl.sorted( + faf_gen_anc_indices.map( + lambda x: { + f"{threshold}_max": hl.or_missing( + faf[x[0]][threshold] > 0, faf[x[0]][threshold] + ), + f"{threshold}_max_gen_anc": hl.or_missing( + faf[x[0]][threshold] > 0, x[1][pop_label] + ), + } + ), + key=lambda faf: faf[f"{threshold}_max"], + reverse=True, + )[0] + + max_fafs_expr = max_fafs_expr.annotate(**faf_struct) + + return max_fafs_expr
+ + +
[docs]def qual_hist_expr( + gt_expr: Optional[hl.expr.CallExpression] = None, + gq_expr: Optional[hl.expr.NumericExpression] = None, + dp_expr: Optional[hl.expr.NumericExpression] = None, + ad_expr: Optional[hl.expr.ArrayNumericExpression] = None, + adj_expr: Optional[hl.expr.BooleanExpression] = None, + ab_expr: Optional[hl.expr.NumericExpression] = None, + split_adj_and_raw: bool = False, +) -> hl.expr.StructExpression: + """ + Return a struct expression with genotype quality histograms based on the arguments given (dp, gq, ad, ab). + + .. note:: + + - If `gt_expr` is provided, will return histograms for non-reference samples only as well as all samples. + - `gt_expr` is required for the allele-balance histogram, as it is only computed on het samples. + - If `ab_expr` is provided, the allele-balance histogram is computed using this expression instead of the ad_expr. + - If `adj_expr` is provided, additional histograms are computed using only adj samples. + + :param gt_expr: Entry expression containing genotype. + :param gq_expr: Entry expression containing genotype quality. + :param dp_expr: Entry expression containing depth. + :param ad_expr: Entry expression containing allelic depth (bi-allelic here). + :param adj_expr: Entry expression containing adj (high quality) genotype status. + :param ab_expr: Entry expression containing allele balance (bi-allelic here). + :param split_adj_and_raw: Whether to split the adj and raw histograms into separate fields in the returned struct expr. + :return: Genotype quality histograms expression. + """ + qual_hists = {} + if gq_expr is not None: + qual_hists["gq_hist"] = hl.agg.hist(gq_expr, 0, 100, 20) + if dp_expr is not None: + qual_hists["dp_hist"] = hl.agg.hist(dp_expr, 0, 100, 20) + + if gt_expr is not None: + qual_hists = { + **{ + f"{qual_hist_name}_all": qual_hist_expr + for qual_hist_name, qual_hist_expr in qual_hists.items() + }, + **{ + f"{qual_hist_name}_alt": hl.agg.filter( + gt_expr.is_non_ref(), qual_hist_expr + ) + for qual_hist_name, qual_hist_expr in qual_hists.items() + }, + } + ab_hist_msg = "Using the %s to compute allele balance histogram..." + if ab_expr is not None: + logger.info(ab_hist_msg, "ab_expr") + qual_hists["ab_hist_alt"] = hl.agg.filter( + gt_expr.is_het(), hl.agg.hist(ab_expr, 0, 1, 20) + ) + elif ad_expr is not None: + logger.info(ab_hist_msg, "ad_expr") + qual_hists["ab_hist_alt"] = hl.agg.filter( + gt_expr.is_het(), hl.agg.hist(ad_expr[1] / hl.sum(ad_expr), 0, 1, 20) + ) + + else: + qual_hists = { + f"{qual_hist_name}_all": qual_hist_expr + for qual_hist_name, qual_hist_expr in qual_hists.items() + } + + if adj_expr is not None: + adj_qual_hists = { + qual_hist_name: hl.agg.filter(adj_expr, qual_hist_expr) + for qual_hist_name, qual_hist_expr in qual_hists.items() + } + if split_adj_and_raw: + return hl.struct( + raw_qual_hists=hl.struct(**qual_hists), + qual_hists=hl.struct(**adj_qual_hists), + ) + else: + qual_hists.update({f"{k}_adj": v for k, v in adj_qual_hists.items()}) + return hl.struct(**qual_hists)
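+
+# A minimal sketch on a hypothetical MatrixTable `mt` with standard GT/GQ/DP/AD
+# entry fields and an `adj` entry annotation (e.g. added by `annotate_adj`):
+#
+#     mt = mt.annotate_rows(
+#         qual_hists=qual_hist_expr(
+#             gt_expr=mt.GT, gq_expr=mt.GQ, dp_expr=mt.DP,
+#             ad_expr=mt.AD, adj_expr=mt.adj,
+#         )
+#     )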
+ + +
[docs]def age_hists_expr( + adj_expr: hl.expr.BooleanExpression, + gt_expr: hl.expr.CallExpression, + age_expr: hl.expr.NumericExpression, + lowest_boundary: int = 30, + highest_boundary: int = 80, + n_bins: int = 10, +) -> hl.expr.StructExpression: + """ + Return a StructExpression with the age histograms for hets and homs. + + :param adj_expr: Entry expression containing whether a genotype is high quality (adj) or not + :param gt_expr: Entry expression containing the genotype + :param age_expr: Col expression containing the sample's age + :param lowest_boundary: Lowest bin boundary (any younger sample will be binned in n_smaller) + :param highest_boundary: Highest bin boundary (any older sample will be binned in n_larger) + :param n_bins: Total number of bins + :return: A struct with `age_hist_het` and `age_hist_hom` + """ + return hl.struct( + age_hist_het=hl.agg.filter( + adj_expr & gt_expr.is_het(), + hl.agg.hist(age_expr, lowest_boundary, highest_boundary, n_bins), + ), + age_hist_hom=hl.agg.filter( + adj_expr & gt_expr.is_hom_var(), + hl.agg.hist(age_expr, lowest_boundary, highest_boundary, n_bins), + ), + )
+ + +
[docs]def get_lowqual_expr(
+    alleles: hl.expr.ArrayExpression,
+    qual_approx_expr: Union[hl.expr.ArrayNumericExpression, hl.expr.NumericExpression],
+    snv_phred_threshold: int = 30,
+    snv_phred_het_prior: int = 30,  # 1/1000
+    indel_phred_threshold: int = 30,
+    indel_phred_het_prior: int = 39,  # 1/8,000
+) -> Union[hl.expr.BooleanExpression, hl.expr.ArrayExpression]:
+    """
+    Compute lowqual threshold expression for either split or unsplit alleles based on QUALapprox or AS_QUALapprox.
+
+    .. note::
+
+        When running this lowqual annotation using QUALapprox, it differs from the GATK LowQual filter.
+        This is because GATK computes this annotation at the site level, which uses the least stringent prior for mixed sites.
+        When run using AS_QUALapprox, this implementation can thus be more stringent for certain alleles at mixed sites.
+
+    :param alleles: Array of alleles
+    :param qual_approx_expr: QUALapprox or AS_QUALapprox
+    :param snv_phred_threshold: Phred-scaled SNV "emission" threshold (similar to GATK emission threshold)
+    :param snv_phred_het_prior: Phred-scaled SNV heterozygosity prior (30 = 1/1000 bases, GATK default)
+    :param indel_phred_threshold: Phred-scaled indel "emission" threshold (similar to GATK emission threshold)
+    :param indel_phred_het_prior: Phred-scaled indel heterozygosity prior (39 = 1/8,000 bases)
+    :return: lowqual expression (BooleanExpression if `qual_approx_expr` is Numeric, Array[BooleanExpression] if `qual_approx_expr` is ArrayNumeric)
+    """
+    min_snv_qual = snv_phred_threshold + snv_phred_het_prior
+    min_indel_qual = indel_phred_threshold + indel_phred_het_prior
+    min_mixed_qual = max(min_snv_qual, min_indel_qual)
+
+    if isinstance(qual_approx_expr, hl.expr.ArrayNumericExpression):
+        return hl.range(1, hl.len(alleles)).map(
+            lambda ai: hl.if_else(
+                hl.is_snp(alleles[0], alleles[ai]),
+                qual_approx_expr[ai - 1] < min_snv_qual,
+                qual_approx_expr[ai - 1] < min_indel_qual,
+            )
+        )
+    else:
+        return (
+            hl.case()
+            .when(
+                hl.range(1, hl.len(alleles)).all(
+                    lambda ai: hl.is_snp(alleles[0], alleles[ai])
+                ),
+                qual_approx_expr < min_snv_qual,
+            )
+            .when(
+                hl.range(1, hl.len(alleles)).all(
+                    lambda ai: hl.is_indel(alleles[0], alleles[ai])
+                ),
+                qual_approx_expr < min_indel_qual,
+            )
+            .default(qual_approx_expr < min_mixed_qual)
+        )
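+
+# A minimal sketch on literal data: a bi-allelic SNV with QUALapprox of 55 falls
+# below the default SNV threshold of 30 + 30 = 60, so it is flagged as lowqual:
+#
+#     hl.eval(get_lowqual_expr(hl.literal(["A", "T"]), hl.literal(55)))  # True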
+ + +
[docs]def get_annotations_hists(
+    ht: hl.Table,
+    annotations_hists: Dict[str, Tuple],
+    log10_annotations: List[str] = ["DP"],
+) -> Dict[str, hl.expr.StructExpression]:
+    """
+    Create histograms for variant metrics in ht.info.
+
+    Used when creating site quality distribution json files.
+
+    :param ht: Table with variant metrics
+    :param annotations_hists: Dictionary of metric names and their histogram values (start, end, bins)
+    :param log10_annotations: List of metrics to log scale
+    :return: Dictionary of metrics and their histograms
+    :rtype: Dict[str, hl.expr.StructExpression]
+    """
+    # Check all fields in ht.info and create histograms if they are in
+    # annotations_hists dict
+    return {
+        field: hl.agg.hist(
+            hl.log10(ht.info[field]) if field in log10_annotations else ht.info[field],
+            start,
+            end,
+            bins,
+        )
+        for field, (start, end, bins) in annotations_hists.items()
+        if field in ht.row.info
+    }
+ + +
[docs]def create_frequency_bins_expr(
+    AC: hl.expr.NumericExpression, AF: hl.expr.NumericExpression
+) -> hl.expr.StringExpression:
+    """
+    Create bins for frequencies in preparation for aggregating QUAL by frequency bin.
+
+    Bins:
+        - singleton
+        - doubleton
+        - 0.00005
+        - 0.0001
+        - 0.0002
+        - 0.0005
+        - 0.001
+        - 0.002
+        - 0.005
+        - 0.01
+        - 0.02
+        - 0.05
+        - 0.1
+        - 0.2
+        - 0.5
+        - 1
+
+    NOTE: Frequencies should be frequencies from raw data.
+    Used when creating site quality distribution json files.
+
+    :param AC: Field in input that contains the allele count information
+    :param AF: Field in input that contains the allele frequency information
+    :return: Expression containing bin name
+    :rtype: hl.expr.StringExpression
+    """
+    bin_expr = (
+        hl.case()
+        .when(AC == 1, "binned_singleton")
+        .when(AC == 2, "binned_doubleton")
+        .when((AC > 2) & (AF < 0.00005), "binned_0.00005")
+        .when((AF >= 0.00005) & (AF < 0.0001), "binned_0.0001")
+        .when((AF >= 0.0001) & (AF < 0.0002), "binned_0.0002")
+        .when((AF >= 0.0002) & (AF < 0.0005), "binned_0.0005")
+        .when((AF >= 0.0005) & (AF < 0.001), "binned_0.001")
+        .when((AF >= 0.001) & (AF < 0.002), "binned_0.002")
+        .when((AF >= 0.002) & (AF < 0.005), "binned_0.005")
+        .when((AF >= 0.005) & (AF < 0.01), "binned_0.01")
+        .when((AF >= 0.01) & (AF < 0.02), "binned_0.02")
+        .when((AF >= 0.02) & (AF < 0.05), "binned_0.05")
+        .when((AF >= 0.05) & (AF < 0.1), "binned_0.1")
+        .when((AF >= 0.1) & (AF < 0.2), "binned_0.2")
+        .when((AF >= 0.2) & (AF < 0.5), "binned_0.5")
+        .when((AF >= 0.5) & (AF <= 1), "binned_1")
+        .default(hl.null(hl.tstr))
+    )
+    return bin_expr
+ + +
[docs]def annotate_and_index_source_mt_for_sex_ploidy(
+    locus_expr: hl.expr.LocusExpression,
+    karyotype_expr: hl.expr.StringExpression,
+    xy_karyotype_str: str = "XY",
+    xx_karyotype_str: str = "XX",
+) -> Tuple[hl.expr.StructExpression, hl.expr.StructExpression]:
+    """
+    Prepare relevant ploidy annotations for downstream calculations on a matrix table.
+
+    This method is used as an optimization for the `get_is_haploid_expr` and
+    `adjusted_sex_ploidy_expr` methods.
+
+    This method annotates the `locus_expr` source matrix table with the following
+    fields:
+
+        - `xy`: Boolean indicating if the sample is XY.
+        - `xx`: Boolean indicating if the sample is XX.
+        - `in_non_par`: Boolean indicating if the locus is in a non-PAR region.
+        - `in_autosome`: Boolean indicating if the locus is in an autosome.
+        - `x_nonpar`: Boolean indicating if the locus is in a non-PAR region of the X
+          chromosome.
+        - `y_par`: Boolean indicating if the locus is in a PAR region of the Y
+          chromosome.
+        - `y_nonpar`: Boolean indicating if the locus is in a non-PAR region of the Y
+          chromosome.
+
+    :param locus_expr: Locus expression.
+    :param karyotype_expr: Karyotype expression.
+    :param xy_karyotype_str: String representing XY karyotype. Default is "XY".
+    :param xx_karyotype_str: String representing XX karyotype. Default is "XX".
+    :return: Tuple of index expressions for columns and rows.
+    """
+    source_mt = locus_expr._indices.source
+    col_ht = source_mt.annotate_cols(
+        xy=karyotype_expr.upper() == xy_karyotype_str,
+        xx=karyotype_expr.upper() == xx_karyotype_str,
+    ).cols()
+    row_ht = source_mt.annotate_rows(
+        in_non_par=~locus_expr.in_autosome_or_par(),
+        in_autosome=locus_expr.in_autosome(),
+        x_nonpar=locus_expr.in_x_nonpar(),
+        y_par=locus_expr.in_y_par(),
+        y_nonpar=locus_expr.in_y_nonpar(),
+    ).rows()
+    col_idx = col_ht[source_mt.col_key]
+    row_idx = row_ht[source_mt.row_key]
+
+    return col_idx, row_idx
+ + +
[docs]def get_is_haploid_expr(
+    gt_expr: Optional[hl.expr.CallExpression] = None,
+    locus_expr: Optional[hl.expr.LocusExpression] = None,
+    karyotype_expr: Optional[hl.expr.StringExpression] = None,
+    xy_karyotype_str: str = "XY",
+    xx_karyotype_str: str = "XX",
+) -> hl.expr.BooleanExpression:
+    """
+    Determine if a genotype or locus and karyotype combination is haploid.
+
+    .. note::
+
+        One of `gt_expr` or `locus_expr` and `karyotype_expr` is required.
+
+    :param gt_expr: Optional genotype expression.
+    :param locus_expr: Optional locus expression.
+    :param karyotype_expr: Optional sex karyotype expression.
+    :param xy_karyotype_str: String representing XY karyotype. Default is "XY".
+    :param xx_karyotype_str: String representing XX karyotype. Default is "XX".
+    :return: Boolean expression indicating if the genotype is haploid.
+    """
+    if gt_expr is None and locus_expr is None and karyotype_expr is None:
+        raise ValueError(
+            "One of 'gt_expr' or 'locus_expr' and 'karyotype_expr' is required."
+        )
+
+    if gt_expr is not None:
+        return gt_expr.is_haploid()
+
+    if locus_expr is None or karyotype_expr is None:
+        raise ValueError(
+            "Both 'locus_expr' and 'karyotype_expr' are required if no 'gt_expr' is "
+            "supplied."
+        )
+    # As an optimization, annotate and index the locus's source matrix table
+    # with the fields used in the expression below
+    col_idx, row_idx = annotate_and_index_source_mt_for_sex_ploidy(
+        locus_expr, karyotype_expr, xy_karyotype_str, xx_karyotype_str
+    )
+
+    return row_idx.in_non_par & hl.or_missing(
+        ~(col_idx.xx & (row_idx.y_par | row_idx.y_nonpar)),
+        col_idx.xy & (row_idx.x_nonpar | row_idx.y_nonpar),
+    )
+ + +
[docs]def get_gq_dp_adj_expr( + gq_expr: Union[hl.expr.Int32Expression, hl.expr.Int64Expression], + dp_expr: Union[hl.expr.Int32Expression, hl.expr.Int64Expression], + gt_expr: Optional[hl.expr.CallExpression] = None, + locus_expr: Optional[hl.expr.LocusExpression] = None, + karyotype_expr: Optional[hl.expr.StringExpression] = None, + adj_gq: int = 20, + adj_dp: int = 10, + haploid_adj_dp: int = 5, +) -> hl.expr.BooleanExpression: + """ + Get adj annotation using only GQ and DP. + + Default thresholds correspond to gnomAD values. + + .. note:: + + This function can be used to annotate adj taking into account only GQ and DP. + It is useful for cases where the GT field is not available, such as in the + reference data of a VariantDataset. + + .. note:: + + One of `gt_expr` or `locus_expr` and `karyotype_expr` is required. + + :param gq_expr: GQ expression. + :param dp_expr: DP expression. + :param gt_expr: Optional genotype expression. + :param locus_expr: Optional locus expression. + :param karyotype_expr: Optional sex karyotype expression. + :param adj_gq: GQ threshold for adj. Default is 20. + :param adj_dp: DP threshold for adj. Default is 10. + :param haploid_adj_dp: Haploid DP threshold for adj. Default is 5. + :return: Boolean expression indicating adj filter. + """ + return (gq_expr >= adj_gq) & hl.if_else( + get_is_haploid_expr(gt_expr, locus_expr, karyotype_expr), + dp_expr >= haploid_adj_dp, + dp_expr >= adj_dp, + )
+ + +
[docs]def get_het_ab_adj_expr( + gt_expr: hl.expr.CallExpression, + dp_expr: Union[hl.expr.Int32Expression, hl.expr.Int64Expression], + ad_expr: hl.expr.ArrayNumericExpression, + adj_ab: float = 0.2, +) -> hl.expr.BooleanExpression: + """ + Get adj het AB annotation. + + :param gt_expr: Genotype expression. + :param dp_expr: DP expression. + :param ad_expr: AD expression. + :param adj_ab: AB threshold for adj. Default is 0.2. + :return: Boolean expression indicating adj het AB filter. + """ + return ( + hl.case() + .when(~gt_expr.is_het(), True) + .when(gt_expr.is_het_ref(), ad_expr[gt_expr[1]] / dp_expr >= adj_ab) + .default( + (ad_expr[gt_expr[0]] / dp_expr >= adj_ab) + & (ad_expr[gt_expr[1]] / dp_expr >= adj_ab) + ) + )
+ + +
[docs]def get_adj_expr( + gt_expr: hl.expr.CallExpression, + gq_expr: Union[hl.expr.Int32Expression, hl.expr.Int64Expression], + dp_expr: Union[hl.expr.Int32Expression, hl.expr.Int64Expression], + ad_expr: hl.expr.ArrayNumericExpression, + adj_gq: int = 20, + adj_dp: int = 10, + adj_ab: float = 0.2, + haploid_adj_dp: int = 5, +) -> hl.expr.BooleanExpression: + """ + Get adj genotype annotation. + + Defaults correspond to gnomAD values. + """ + return get_gq_dp_adj_expr( + gq_expr, + dp_expr, + gt_expr=gt_expr, + adj_gq=adj_gq, + adj_dp=adj_dp, + haploid_adj_dp=haploid_adj_dp, + ) & get_het_ab_adj_expr(gt_expr, dp_expr, ad_expr, adj_ab)
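+
+# A minimal sketch on literal data: a 0/1 call with GQ 30, DP 25, and AD [12, 13]
+# passes the default GQ (>= 20), DP (>= 10), and allele-balance (>= 0.2) checks:
+#
+#     hl.eval(
+#         get_adj_expr(
+#             hl.call(0, 1), hl.literal(30), hl.literal(25), hl.literal([12, 13])
+#         )
+#     )  # True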
+ + +
[docs]def annotate_adj( + mt: hl.MatrixTable, + adj_gq: int = 20, + adj_dp: int = 10, + adj_ab: float = 0.2, + haploid_adj_dp: int = 5, +) -> hl.MatrixTable: + """ + Annotate genotypes with adj criteria (assumes diploid). + + Defaults correspond to gnomAD values. + """ + if "GT" not in mt.entry and "LGT" in mt.entry: + logger.warning("No GT field found, using LGT instead.") + gt_expr = mt.LGT + else: + gt_expr = mt.GT + + if "AD" not in mt.entry and "LAD" in mt.entry: + logger.warning("No AD field found, using LAD instead.") + ad_expr = mt.LAD + else: + ad_expr = mt.AD + + return mt.annotate_entries( + adj=get_adj_expr( + gt_expr, mt.GQ, mt.DP, ad_expr, adj_gq, adj_dp, adj_ab, haploid_adj_dp + ) + )
+ + +
[docs]def add_variant_type(alt_alleles: hl.expr.ArrayExpression) -> hl.expr.StructExpression:
+    """Get Struct of variant_type and n_alt_alleles from an ArrayExpression of allele strings (reference allele first, followed by alternates)."""
+    ref = alt_alleles[0]
+    alts = alt_alleles[1:]
+    non_star_alleles = hl.filter(lambda a: a != "*", alts)
+    return hl.struct(
+        variant_type=hl.if_else(
+            hl.all(lambda a: hl.is_snp(ref, a), non_star_alleles),
+            hl.if_else(hl.len(non_star_alleles) > 1, "multi-snv", "snv"),
+            hl.if_else(
+                hl.all(lambda a: hl.is_indel(ref, a), non_star_alleles),
+                hl.if_else(hl.len(non_star_alleles) > 1, "multi-indel", "indel"),
+                "mixed",
+            ),
+        ),
+        n_alt_alleles=hl.len(non_star_alleles),
+    )
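+
+# A minimal sketch: the array passed in contains the reference allele first, so
+# two SNV alternates yield a 'multi-snv' variant_type:
+#
+#     hl.eval(add_variant_type(hl.literal(["A", "T", "C"])))
+#     # Struct(variant_type='multi-snv', n_alt_alleles=2)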
+ + +
[docs]def annotate_allele_info(ht: hl.Table) -> hl.Table:
+    """
+    Return bi-allelic sites Table with an 'allele_info' annotation.
+
+    .. note::
+
+        This function requires that the input `ht` is unsplit and returns a split `ht`.
+
+    'allele_info' is a struct with the following information:
+        - variant_type: Variant type (snv, indel, multi-snv, multi-indel, or mixed).
+        - n_alt_alleles: Total number of alternate alleles observed at variant locus.
+        - has_star: True if the variant contains a star allele.
+        - allele_type: Allele type (snv, ins, del, or complex).
+        - was_mixed: True if the variant was mixed (i.e. contained both SNVs and indels).
+        - nonsplit_alleles: Array of alleles before splitting.
+
+    :param ht: Unsplit input Table.
+    :return: Split Table with allele data annotation added.
+    """
+    ht = ht.annotate(
+        allele_info=hl.struct(
+            **add_variant_type(ht.alleles),
+            has_star=hl.any(lambda a: a == "*", ht.alleles),
+        )
+    )
+
+    ht = hl.split_multi(ht)
+
+    ref_expr = ht.alleles[0]
+    alt_expr = ht.alleles[1]
+    allele_type_expr = (
+        hl.case()
+        .when(hl.is_snp(ref_expr, alt_expr), "snv")
+        .when(hl.is_insertion(ref_expr, alt_expr), "ins")
+        .when(hl.is_deletion(ref_expr, alt_expr), "del")
+        .default("complex")
+    )
+    ht = ht.transmute(
+        allele_info=ht.allele_info.annotate(
+            allele_type=allele_type_expr,
+            was_mixed=ht.allele_info.variant_type == "mixed",
+            nonsplit_alleles=ht.old_alleles,
+        )
+    )
+
+    return ht
+ + +
[docs]def annotation_type_is_numeric(t: Any) -> bool: + """ + Given an annotation type, return whether it is a numerical type or not. + + :param t: Type to test + :return: If the input type is numeric + """ + return t in (hl.tint32, hl.tint64, hl.tfloat32, hl.tfloat64)
+ + +
[docs]def annotation_type_in_vcf_info(t: Any) -> bool: + """ + Given an annotation type, returns whether that type can be natively exported to a VCF INFO field. + + .. note:: + + Types that aren't natively exportable to VCF will be converted to String on export. + + :param t: Type to test + :return: If the input type can be exported to VCF + """ + return ( + annotation_type_is_numeric(t) + or t in (hl.tstr, hl.tbool) + or ( + isinstance(t, (hl.tarray, hl.tset)) + and annotation_type_in_vcf_info(t.element_type) + ) + )
+ + +
[docs]def bi_allelic_site_inbreeding_expr( + call: Optional[hl.expr.CallExpression] = None, + callstats_expr: Optional[hl.expr.StructExpression] = None, +) -> hl.expr.Float32Expression: + """ + Return the site inbreeding coefficient as an expression to be computed on a MatrixTable. + + This is implemented based on the GATK InbreedingCoeff metric: + https://software.broadinstitute.org/gatk/documentation/article.php?id=8032 + + .. note:: + + The computation is run based on the counts of alternate alleles and thus should only be run on bi-allelic sites. + + :param call: Expression giving the calls in the MT + :param callstats_expr: StructExpression containing only alternate allele AC, AN, and homozygote_count as integers. If passed, used to create expression in place of GT calls. + :return: Site inbreeding coefficient expression + """ + if call is None and callstats_expr is None: + raise ValueError("One of `call` or `callstats_expr` must be passed.") + + def inbreeding_coeff( + gt_counts: hl.expr.DictExpression, + ) -> hl.expr.Float32Expression: + n = gt_counts.get(0, 0) + gt_counts.get(1, 0) + gt_counts.get(2, 0) + p = (2 * gt_counts.get(0, 0) + gt_counts.get(1, 0)) / (2 * n) + q = (2 * gt_counts.get(2, 0) + gt_counts.get(1, 0)) / (2 * n) + return 1 - (gt_counts.get(1, 0) / (2 * p * q * n)) + + if callstats_expr is not None: + # Check that AC, AN, and homozygote count are all ints + if not ( + ( + (callstats_expr.AC.dtype == hl.tint32) + | (callstats_expr.AC.dtype == hl.tint64) + ) + & ( + (callstats_expr.AN.dtype == hl.tint32) + | (callstats_expr.AN.dtype == hl.tint64) + ) + & ( + (callstats_expr.homozygote_count.dtype == hl.tint32) + | (callstats_expr.homozygote_count.dtype == hl.tint64) + ) + ): + raise ValueError( + "callstats_expr must be a StructExpression containing fields 'AC'," + " 'AN', and 'homozygote_count' of types int32 or int64." + ) + n = callstats_expr.AN / 2 + q = callstats_expr.AC / callstats_expr.AN + p = 1 - q + return 1 - (callstats_expr.AC - (2 * callstats_expr.homozygote_count)) / ( + 2 * p * q * n + ) + else: + return hl.bind(inbreeding_coeff, hl.agg.counter(call.n_alt_alleles()))
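+
+# A minimal worked sketch using the callstats form: with AC=20, AN=200, and 4
+# homozygotes, q = 20/200 = 0.1, p = 0.9, n = AN/2 = 100, observed hets =
+# 20 - 2*4 = 12, and expected hets = 2*p*q*n = 18, so F = 1 - 12/18 ≈ 0.333:
+#
+#     cs = hl.struct(AC=20, AN=200, homozygote_count=4)
+#     hl.eval(bi_allelic_site_inbreeding_expr(callstats_expr=cs))  # ~0.333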
+ + +
[docs]def fs_from_sb(
+    sb: Union[hl.expr.ArrayNumericExpression, hl.expr.ArrayExpression],
+    normalize: bool = True,
+    min_cell_count: int = 200,
+    min_count: int = 4,
+    min_p_value: float = 1e-320,
+) -> hl.expr.Int64Expression:
+    """
+    Compute `FS` (Fisher strand balance) annotation from the `SB` (strand balance table) field.
+
+    `FS` is the phred-scaled value of the double-sided Fisher exact test on strand balance.
+
+    Using default values will have the same behavior as the GATK implementation, that is:
+        - If sum(counts) > 2*`min_cell_count` (default to GATK value of 200), they are normalized
+        - If sum(counts) < `min_count` (default to GATK value of 4), returns missing
+        - Any p-value < `min_p_value` (default to GATK value of 1e-320) is truncated to that value
+
+    In addition to the default GATK behavior, setting `normalize` to `False` will perform a chi-squared test
+    for large counts (> `min_cell_count`) instead of normalizing the cell values.
+
+    .. note::
+
+        This function can either take
+            - an array of length four containing the forward and reverse strands' counts of ref and alt alleles: [ref fwd, ref rev, alt fwd, alt rev]
+            - a two dimensional array with arrays of length two, containing the counts: [[ref fwd, ref rev], [alt fwd, alt rev]]
+
+    GATK code here: https://github.com/broadinstitute/gatk/blob/master/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/FisherStrand.java
+
+    :param sb: Count of ref/alt reads on each strand
+    :param normalize: Whether to normalize counts if sum(counts) > min_cell_count (normalize=True), or to use a chi-squared test instead of a FET (normalize=False)
+    :param min_cell_count: Maximum count for performing a FET
+    :param min_count: Minimum total count to output FS (otherwise null is output)
+    :param min_p_value: Minimum p-value; smaller p-values are truncated to this value before phred-scaling
+    :return: FS value
+    """
+    if not isinstance(sb, hl.expr.ArrayNumericExpression):
+        sb = hl.bind(lambda x: hl.flatten(x), sb)
+
+    sb_sum = hl.bind(lambda x: hl.sum(x), sb)
+
+    # Normalize table if counts get too large
+    if normalize:
+        fs_expr = hl.bind(
+            lambda sb, sb_sum: hl.if_else(
+                sb_sum <= 2 * min_cell_count,
+                sb,
+                sb.map(lambda x: hl.int(x / (sb_sum / min_cell_count))),
+            ),
+            sb,
+            sb_sum,
+        )
+
+        # FET
+        fs_expr = to_phred(
+            hl.max(
+                hl.fisher_exact_test(
+                    fs_expr[0], fs_expr[1], fs_expr[2], fs_expr[3]
+                ).p_value,
+                min_p_value,
+            )
+        )
+    else:
+        fs_expr = to_phred(
+            hl.max(
+                hl.contingency_table_test(
+                    sb[0], sb[1], sb[2], sb[3], min_cell_count=min_cell_count
+                ).p_value,
+                min_p_value,
+            )
+        )
+
+    # Return null if counts <= `min_count`
+    return hl.or_missing(
+        sb_sum > min_count, hl.max(0, fs_expr)  # Needed to avoid -0.0 values
+    )
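+
+# A minimal sketch on a literal strand-balance table [ref fwd, ref rev, alt fwd,
+# alt rev]; the total count here (50) is small enough that a Fisher exact test
+# is applied without normalization:
+#
+#     hl.eval(fs_from_sb(hl.literal([10, 12, 3, 25])))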
+ + +
[docs]def sor_from_sb( + sb: Union[hl.expr.ArrayNumericExpression, hl.expr.ArrayExpression], +) -> hl.expr.Float64Expression: + """ + Compute `SOR` (Symmetric Odds Ratio test) annotation from the `SB` (strand balance table) field. + + .. note:: + + This function can either take + - an array of length four containing the forward and reverse strands' counts of ref and alt alleles: [ref fwd, ref rev, alt fwd, alt rev] + - a two dimensional array with arrays of length two, containing the counts: [[ref fwd, ref rev], [alt fwd, alt rev]] + + GATK code here: https://github.com/broadinstitute/gatk/blob/master/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/StrandOddsRatio.java + + :param sb: Count of ref/alt reads on each strand + :return: SOR value + """ + if not isinstance(sb, hl.expr.ArrayNumericExpression): + sb = hl.bind(lambda x: hl.flatten(x), sb) + + sb = sb.map(lambda x: hl.float64(x) + 1) + + ref_fw = sb[0] + ref_rv = sb[1] + alt_fw = sb[2] + alt_rv = sb[3] + symmetrical_ratio = ((ref_fw * alt_rv) / (alt_fw * ref_rv)) + ( + (alt_fw * ref_rv) / (ref_fw * alt_rv) + ) + ref_ratio = hl.min(ref_rv, ref_fw) / hl.max(ref_rv, ref_fw) + alt_ratio = hl.min(alt_fw, alt_rv) / hl.max(alt_fw, alt_rv) + sor = hl.log(symmetrical_ratio) + hl.log(ref_ratio) - hl.log(alt_ratio) + + return sor
+ + +
[docs]def pab_max_expr( + gt_expr: hl.expr.CallExpression, + ad_expr: hl.expr.ArrayExpression, + la_expr: Optional[hl.expr.ArrayExpression] = None, + n_alleles_expr: Optional[hl.expr.Int32Expression] = None, +) -> hl.expr.ArrayExpression: + """ + Compute the maximum p-value of the binomial test for the alternate allele balance (PAB) for each allele. + + .. note:: + + This function can take a `gt_expr` and `ad_expr` that use local or global + alleles. If they use local alleles, `la_expr` and `n_alleles_expr` should be + provided to transform `gt_expr` and `ad_expr` to global alleles. + + :param gt_expr: Genotype call expression. + :param ad_expr: Allele depth expression. + :param la_expr: Allele local index expression. When provided `gt_expr` and + `ad_expr` are transformed from using local alleles to global alleles using + `la_expr`. + :param n_alleles_expr: Number of alleles expression. Required when 'la_expr' is + provided. + :return: Array expression of maximum p-values. + """ + if la_expr is not None: + if n_alleles_expr is None: + raise ValueError("Must provide `n_alleles_expr` if `la_expr` is provided!") + + ad_expr = hl.vds.local_to_global( + ad_expr, la_expr, n_alleles_expr, fill_value=0, number="R" + ) + gt_expr = hl.vds.lgt_to_gt(gt_expr, la_expr) + + expr = hl.agg.array_agg( + lambda x: hl.agg.filter( + gt_expr.is_het(), + hl.agg.max(hl.binom_test(x, hl.sum(ad_expr), 0.5, "two-sided")), + ), + ad_expr[1:], # Skip ref allele + ) + + return expr
+ + +
[docs]def bi_allelic_expr(t: Union[hl.Table, hl.MatrixTable]) -> hl.expr.BooleanExpression: + """ + Return a boolean expression selecting bi-allelic sites only, accounting for whether the input MT/HT was split. + + :param t: Input HT/MT + :return: Boolean expression selecting only bi-allelic sites + """ + return ~t.was_split if "was_split" in t.row else (hl.len(t.alleles) == 2)
+ + +
[docs]def unphase_call_expr(call_expr: hl.expr.CallExpression) -> hl.expr.CallExpression: + """ + Generate unphased version of a call expression (which can be phased or not). + + :param call_expr: Input call expression + :return: unphased call expression + """ + return ( + hl.case() + .when(call_expr.is_diploid(), hl.call(call_expr[0], call_expr[1], phased=False)) + .when(call_expr.is_haploid(), hl.call(call_expr[0], phased=False)) + .default(hl.null(hl.tcall)) + )
+ + +
[docs]def region_flag_expr(
+    t: Union[hl.Table, hl.MatrixTable],
+    non_par: bool = True,
+    prob_regions: Optional[Dict[str, hl.Table]] = None,
+) -> hl.expr.StructExpression:
+    """
+    Create a `region_flag` struct that contains flags for problematic regions (i.e., LCR, decoy, segdup, and nonpar regions).
+
+    .. note:: No hg38 resources for decoy or self chain are available yet.
+
+    :param t: Input Table/MatrixTable
+    :param non_par: If True, flag loci that occur within non-pseudoautosomal (non-PAR) regions on sex chromosomes
+    :param prob_regions: If supplied, flag loci that occur within regions defined in Hail Table(s)
+    :return: `region_flag` struct row annotation
+    """
+    prob_flags_expr = (
+        {"non_par": t.locus.in_x_nonpar() | t.locus.in_y_nonpar()} if non_par else {}
+    )
+
+    if prob_regions is not None:
+        prob_flags_expr.update(
+            {
+                region_name: hl.is_defined(region_table[t.locus])
+                for region_name, region_table in prob_regions.items()
+            }
+        )
+
+    return hl.struct(**prob_flags_expr)
+ + +
[docs]def missing_callstats_expr() -> hl.expr.StructExpression: + """ + Create a missing callstats struct for insertion into frequency annotation arrays when data is missing. + + :return: Hail Struct with missing values for each callstats element + """ + return hl.struct( + AC=hl.missing(hl.tint32), + AF=hl.missing(hl.tfloat64), + AN=hl.missing(hl.tint32), + homozygote_count=hl.missing(hl.tint32), + )
+ + +
[docs]def set_female_y_metrics_to_na_expr( + t: Union[hl.Table, hl.MatrixTable], + freq_expr: Union[hl.expr.ArrayExpression, str] = "freq", + freq_meta_expr: Union[hl.expr.ArrayExpression, str] = "freq_meta", + freq_index_dict_expr: Union[hl.expr.DictExpression, str] = "freq_index_dict", +) -> hl.expr.ArrayExpression: + """ + Set Y-variant frequency callstats for female-specific metrics to missing structs. + + :param t: Table or MatrixTable for which to adjust female metrics. + :param freq_expr: Array expression or string annotation name for the frequency + array. Default is "freq". + :param freq_meta_expr: Array expression or string annotation name for the frequency + metadata. Default is "freq_meta". + :param freq_index_dict_expr: Dict expression or string annotation name for the + frequency metadata index dictionary. Default is "freq_index_dict". + :return: Hail array expression to set female Y-variant metrics to missing values. + """ + if isinstance(freq_expr, str): + freq_expr = t[freq_expr] + if isinstance(freq_meta_expr, str): + freq_meta_expr = t[freq_meta_expr] + if isinstance(freq_index_dict_expr, str): + freq_index_dict_expr = t[freq_index_dict_expr] + + female_idx = hl.map( + lambda x: freq_index_dict_expr[x], + hl.filter(lambda x: x.contains("XX"), freq_index_dict_expr.keys()), + ) + freq_idx_range = hl.range(hl.len(freq_meta_expr)) + + new_freq_expr = hl.if_else( + (t.locus.in_y_nonpar() | t.locus.in_y_par()), + hl.map( + lambda x: hl.if_else( + female_idx.contains(x), missing_callstats_expr(), freq_expr[x] + ), + freq_idx_range, + ), + freq_expr, + ) + + return new_freq_expr
+ + +
[docs]def hemi_expr(
+    locus: hl.expr.LocusExpression,
+    sex_expr: hl.expr.StringExpression,
+    gt: hl.expr.CallExpression,
+    male_str: str = "XY",
+) -> hl.expr.BooleanExpression:
+    """
+    Return whether genotypes are hemizygous.
+
+    Return missing expression if locus is not in chrX/chrY non-PAR regions.
+
+    :param locus: Input locus.
+    :param sex_expr: Input StringExpression indicating whether sample is XX or XY.
+    :param gt: Input genotype.
+    :param male_str: String indicating that a sample is XY. Default is "XY".
+    :return: BooleanExpression indicating whether genotypes are hemizygous.
+    """
+    return hl.or_missing(
+        locus.in_x_nonpar() | locus.in_y_nonpar(),
+        # Haploid genotypes have a single integer, so checking if
+        # mt.GT[0] is the alternate allele
+        gt.is_haploid() & (sex_expr == male_str) & (gt[0] == 1),
+    )
+ + +
[docs]def merge_freq_arrays(
+    farrays: List[hl.expr.ArrayExpression],
+    fmeta: List[List[Dict[str, str]]],
+    operation: str = "sum",
+    set_negatives_to_zero: bool = False,
+    count_arrays: Optional[Dict[str, List[hl.expr.ArrayExpression]]] = None,
+) -> Union[
+    Tuple[hl.expr.ArrayExpression, List[Dict[str, int]]],
+    Tuple[
+        hl.expr.ArrayExpression,
+        List[Dict[str, int]],
+        Dict[str, List[hl.expr.ArrayExpression]],
+    ],
+]:
+    """
+    Merge a list of frequency arrays based on the supplied `operation`.
+
+    .. warning::
+        Arrays must be on the same Table.
+
+    .. note::
+
+        Arrays do not have to contain the same groupings or order of groupings, but
+        the array indices for a freq array in `farrays` must be the same as its associated
+        frequency metadata index in `fmeta`, i.e., if `farrays = [freq1, freq2]` then `fmeta`
+        must equal `[fmeta1, fmeta2]` where fmeta1 contains the metadata information
+        for freq1.
+
+        If `operation` is set to "sum", groups in the merged array
+        will be the union of groupings found within the arrays' metadata and all arrays
+        will be summed by grouping. If `operation` is set to "diff", the merged array
+        will contain groups only found in the first array of `fmeta`. Any array containing
+        any of these groups will have their values subtracted from the values of the first array.
+
+    :param farrays: List of frequency arrays to merge. First entry in the list is the primary array to which other arrays will be added or subtracted. All arrays must be on the same Table.
+    :param fmeta: List of frequency metadata for arrays being merged.
+    :param operation: Merge operation to perform. Options are "sum" and "diff". If "diff" is passed, the first freq array in the list will have the other arrays subtracted from it.
+    :param set_negatives_to_zero: If True, set negative array values to 0 for AC, AN, AF, and homozygote_count. If False, raise a ValueError. Default is False.
+    :param count_arrays: Dictionary of Lists of arrays containing counts to merge using the passed operation. Must use the same group indexing as fmeta. Keys are the descriptor names, values are Lists of arrays to merge. Default is None.
+    :return: Tuple of merged frequency array, frequency metadata list, and, if `count_arrays` is not None, a dictionary of merged count arrays.
+    """
+    if len(farrays) < 2:
+        raise ValueError("Must provide at least two frequency arrays to merge!")
+    if len(farrays) != len(fmeta):
+        raise ValueError("Length of farrays and fmeta must be equal!")
+    if operation not in ["sum", "diff"]:
+        raise ValueError("Operation must be either 'sum' or 'diff'!")
+    if count_arrays is not None:
+        for k, count_array in count_arrays.items():
+            if len(count_array) != len(fmeta):
+                raise ValueError(
+                    f"Length of count_array '{k}' and fmeta must be equal!"
+                )
+
+    # Create a list where each entry is a dictionary whose key is an aggregation
+    # group and the value is the corresponding index in the freq array.
+    fmeta = [hl.dict(hl.enumerate(f).map(lambda x: (x[1], [x[0]]))) for f in fmeta]
+    all_keys = hl.fold(lambda i, j: (i | j.key_set()), fmeta[0].key_set(), fmeta[1:])
+
+    # Merge dictionaries in the list into a single dictionary where key is aggregation
+    # group and the value is a list of the group's index in each of the freq arrays, if
+    # it exists. For the "sum" operation, use keys, aka groups, found in all freq dictionaries.
+    # For "diff" operations, only use the key_set from the first entry.
+ fmeta = hl.fold( + lambda i, j: hl.dict( + (hl.if_else(operation == "sum", all_keys, i.key_set())).map( + lambda k: ( + k, + i.get(k, [hl.missing(hl.tint32)]).extend( + j.get(k, [hl.missing(hl.tint32)]) + ), + ) + ) + ), + fmeta[0], + fmeta[1:], + ) + + # Create a list of tuples from the dictionary, sorted by the list of indices for + # each aggregation group. + fmeta = hl.sorted(fmeta.items(), key=lambda f: f[1]) + + # Create a list of the aggregation groups, maintaining the sorted order. + new_freq_meta = fmeta.map(lambda x: x[0]) + + # Create array for each aggregation group of arrays containing the group's freq + # values from each freq array. + freq_meta_idx = fmeta.map(lambda x: hl.zip(farrays, x[1]).map(lambda i: i[0][i[1]])) + + def _sum_or_diff_fields( + field_1_expr: str, field_2_expr: str + ) -> hl.expr.Int32Expression: + """ + Sum or subtract fields in call statistics struct. + + :param field_1_expr: First field to sum or diff. + :param field_2_expr: Second field to sum or diff. + :return: Merged field value. + """ + return hl.if_else( + operation == "sum", + hl.or_else(field_1_expr, 0) + hl.or_else(field_2_expr, 0), + hl.or_else(field_1_expr, 0) - hl.or_else(field_2_expr, 0), + ) + + # Iterate through the groups and their freq lists to merge callstats. + callstat_ann = ["AC", "AN", "homozygote_count"] + callstat_ann_af = ["AC", "AF", "AN", "homozygote_count"] + new_freq = freq_meta_idx.map( + lambda x: hl.bind( + lambda y: y.annotate(AF=hl.or_missing(y.AN > 0, y.AC / y.AN)).select( + *callstat_ann_af + ), + hl.fold( + lambda i, j: hl.struct( + **{ann: _sum_or_diff_fields(i[ann], j[ann]) for ann in callstat_ann} + ), + x[0].select(*callstat_ann), + x[1:], + ), + ) + ) + # Create count_array_meta_idx using the fmeta then iterate through each group + # in the list of tuples to access each group's entry per array. Sum or diff the + # values for each group across arrays to make a new_counts_array annotation. + if count_arrays: + new_counts_array_dict = {} + for k, count_array in count_arrays.items(): + count_array_meta_idx = fmeta.map( + lambda x: hl.zip(count_array, x[1]).map(lambda i: i[0][i[1]]) + ) + + new_counts_array_dict[k] = count_array_meta_idx.map( + lambda x: hl.fold( + lambda i, j: _sum_or_diff_fields(i, j), + x[0], + x[1:], + ), + ) + # Check and see if any annotation within the merged array is negative. If so, + # raise an error if set_negatives_to_zero is False or set the value to 0 if + # set_negatives_to_zero is True. + if operation == "diff": + negative_value_error_msg = ( + "Negative values found in merged %s array. Review data or set" + " `set_negatives_to_zero` to True to set negative values to 0." + ) + callstat_ann.append("AF") + new_freq = new_freq.map( + lambda x: x.annotate( + **{ + ann: ( + hl.case() + .when(set_negatives_to_zero, hl.max(x[ann], 0)) + .when(x[ann] >= 0, x[ann]) + .or_error(negative_value_error_msg % "freq") + ) + for ann in callstat_ann + } + ) + ) + if count_arrays: + for k, new_counts_array in new_counts_array_dict.items(): + new_counts_array_dict[k] = new_counts_array.map( + lambda x: hl.case() + .when(set_negatives_to_zero, hl.max(x, 0)) + .when(x >= 0, x) + .or_error(negative_value_error_msg % "counts") + ) + + new_freq_meta = hl.eval(new_freq_meta) + if count_arrays: + return new_freq, new_freq_meta, new_counts_array_dict + else: + return new_freq, new_freq_meta
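+
+# A minimal sketch, assuming a hypothetical Table `ht` that carries a full
+# cohort's `freq`/`freq_meta` and a subset's `subset_freq`/`subset_freq_meta`
+# (names illustrative); "diff" subtracts the subset callstats from the full
+# cohort's:
+#
+#     new_freq, new_freq_meta = merge_freq_arrays(
+#         farrays=[ht.freq, ht.subset_freq],
+#         fmeta=[hl.eval(ht.freq_meta), hl.eval(ht.subset_freq_meta)],
+#         operation="diff",
+#     )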
+ + +
[docs]def merge_histograms(hists: List[hl.expr.StructExpression]) -> hl.expr.Expression:
+    """
+    Merge a list of histogram annotations.
+
+    This function merges a list of histogram annotations by summing the arrays
+    in an element-wise fashion. It keeps one 'bin_edges' annotation but merges the
+    'bin_freq', 'n_smaller', and 'n_larger' annotations by summing them.
+
+    .. note::
+
+        Bin edges are assumed to be the same for all histograms.
+
+    :param hists: List of histogram structs to merge.
+    :return: Merged histogram struct.
+    """
+    return hl.fold(
+        lambda i, j: hl.struct(
+            **{
+                "bin_edges": hl.or_else(i.bin_edges, j.bin_edges),
+                "bin_freq": hl.zip(
+                    hl.or_else(i.bin_freq, hl.literal([hl.missing(hl.tint)])),
+                    hl.or_else(j.bin_freq, hl.literal([hl.missing(hl.tint)])),
+                    fill_missing=True,
+                ).map(lambda x: hl.or_else(x[0], 0) + hl.or_else(x[1], 0)),
+                "n_smaller": hl.or_else(i.n_smaller, 0) + hl.or_else(j.n_smaller, 0),
+                "n_larger": hl.or_else(i.n_larger, 0) + hl.or_else(j.n_larger, 0),
+            }
+        ),
+        hists[0].select("bin_edges", "bin_freq", "n_smaller", "n_larger"),
+        hists[1:],
+    )
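+
+# A minimal sketch merging two literal histograms with identical bin edges:
+#
+#     h1 = hl.struct(bin_edges=[0.0, 0.5, 1.0], bin_freq=[5, 7], n_smaller=0, n_larger=2)
+#     h2 = hl.struct(bin_edges=[0.0, 0.5, 1.0], bin_freq=[3, 1], n_smaller=1, n_larger=0)
+#     hl.eval(merge_histograms([h1, h2]))
+#     # Struct(bin_edges=[0.0, 0.5, 1.0], bin_freq=[8, 8], n_smaller=1, n_larger=2)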
+ + +# Functions used for computing allele frequency. +
[docs]def annotate_freq(
+    mt: hl.MatrixTable,
+    sex_expr: Optional[hl.expr.StringExpression] = None,
+    pop_expr: Optional[hl.expr.StringExpression] = None,
+    subpop_expr: Optional[hl.expr.StringExpression] = None,
+    additional_strata_expr: Optional[
+        Union[
+            List[Dict[str, hl.expr.StringExpression]],
+            Dict[str, hl.expr.StringExpression],
+        ]
+    ] = None,
+    downsamplings: Optional[List[int]] = None,
+    downsampling_expr: Optional[hl.expr.StructExpression] = None,
+    ds_pop_counts: Optional[Dict[str, int]] = None,
+    entry_agg_funcs: Optional[Dict[str, Tuple[Callable, Callable]]] = None,
+    annotate_mt: bool = True,
+) -> Union[hl.Table, hl.MatrixTable]:
+    """
+    Annotate `mt` with stratified allele frequencies.
+
+    The output MatrixTable will include:
+        - row annotation `freq` containing the stratified allele frequencies
+        - global annotation `freq_meta` with metadata
+        - global annotation `freq_meta_sample_count` with sample count information
+
+    .. note::
+
+        Currently this only supports bi-allelic sites.
+
+        The input `mt` needs to have the following entry fields:
+            - GT: a CallExpression containing the genotype
+            - adj: a BooleanExpression containing whether the genotype is of high quality
+              or not.
+
+        All expression arguments need to be expressions on the input `mt`.
+
+    .. rubric:: `freq` row annotation
+
+    The `freq` row annotation is an Array of Structs, with each Struct containing the
+    following fields:
+
+        - AC: int32
+        - AF: float64
+        - AN: int32
+        - homozygote_count: int32
+
+    Each element of the array corresponds to a stratification of the data, and the
+    metadata about these annotations is stored in the globals.
+
+    .. rubric:: Global `freq_meta` metadata annotation
+
+    The global annotation `freq_meta` is added to the input `mt`. It is a list of dicts.
+    Each element of the list contains metadata on a frequency stratification and the
+    index in the list corresponds to the index of that frequency stratification in the
+    `freq` row annotation.
+
+    .. rubric:: Global `freq_meta_sample_count` annotation
+
+    The global annotation `freq_meta_sample_count` is added to the input `mt`. This is a
+    sample count per sample grouping defined in the `freq_meta` global annotation.
+
+    .. rubric:: The `additional_strata_expr` parameter
+
+    If the `additional_strata_expr` parameter is used, frequencies will be computed for
+    each of the strata dictionaries across all values. For example, if
+    `additional_strata_expr` is set to `[{'platform': mt.platform},
+    {'platform': mt.platform, 'pop': mt.pop}, {'age_bin': mt.age_bin}]`, then
+    frequencies will be computed for each of the values of `mt.platform`, each of the
+    combined values of `mt.platform` and `mt.pop`, and each of the values of
+    `mt.age_bin`.
+
+    .. rubric:: The `downsamplings` parameter
+
+    If the `downsamplings` parameter is used without the `downsampling_expr`,
+    frequencies will be computed for all samples and by population (if `pop_expr` is
+    specified) by downsampling the number of samples without replacement to each of the
+    numbers specified in the `downsamplings` array, provided that there are enough
+    samples in the dataset. In addition, if `pop_expr` is specified, a downsampling to
+    each of the exact number of samples present in each population is added. Note that
+    samples are randomly sampled only once, meaning that the lower downsamplings are
+    subsets of the higher ones.
If the `downsampling_expr` parameter is used with the + `downsamplings` parameter, the `downsamplings` parameter informs the function which + downsampling groups were already created and are to be used in the frequency + calculation. + + .. rubric:: The `downsampling_expr` and `ds_pop_counts` parameters + + If the `downsampling_expr` parameter is used, `downsamplings` must also be set + and frequencies will be computed for all samples and by population (if `pop_expr` + is specified) using the downsampling indices to each of the numbers specified in + the `downsamplings` array. The function expects a 'global_idx', and if `pop_expr` + is used, a 'pop_idx' within the `downsampling_expr` to be used to determine if a + sample belongs within a certain downsampling group, i.e. the index is less than + the group size. The function `annotate_downsamplings` can be used to create + the `downsampling_expr`, `downsamplings`, and `ds_pop_counts` expressions. + + .. rubric:: The `entry_agg_funcs` parameter + + If the `entry_agg_funcs` parameter is used, the output MatrixTable will also + contain the annotations specified in the `entry_agg_funcs` parameter. The keys of + the dict are the names of the annotations and the values are tuples of functions. + The first function is used to transform the `mt` entries in some way, and the + second function is used to aggregate the output from the first function. For + example, if `entry_agg_funcs` is set to `{'adj_samples': (get_adj_expr, hl.agg.sum)}`, + then the output MatrixTable will contain an annotation `adj_samples` which is an + array of the number of adj samples per strata in each row. + + :param mt: Input MatrixTable. + :param sex_expr: When specified, frequencies are stratified by sex. If `pop_expr` + is also specified, then a pop/sex stratification is added. + :param pop_expr: When specified, frequencies are stratified by population. If + `sex_expr` is also specified, then a pop/sex stratification is added. + :param subpop_expr: When specified, frequencies are stratified by sub-continental + population. Note that `pop_expr` is required as well when using this option. + :param additional_strata_expr: When specified, frequencies are stratified by the + given additional strata. This can e.g. be used to stratify by platform, + platform-pop, platform-pop-sex. + :param downsamplings: When specified, frequencies are computed by downsampling the + data to the number of samples given in the list. Note that if `pop_expr` is + specified, downsamplings by population are also computed. + :param downsampling_expr: When specified, frequencies are computed using the + downsampling indices in the provided StructExpression. Note that if `pop_idx` + is specified within the struct, downsamplings by population are also computed. + :param ds_pop_counts: When specified, frequencies are computed by downsampling the + data to the number of samples per pop in the dict. The key is the population + and the value is the number of samples. + :param entry_agg_funcs: When specified, additional annotations are added to the + output Table/MatrixTable. The keys of the dict are the names of the annotations + and the values are tuples of functions. The first function is used to transform + the `mt` entries in some way, and the second function is used to aggregate the + output from the first function. + :param annotate_mt: Whether to return the full MatrixTable with annotations added + instead of only a Table with `freq` and other annotations. Default is True. 
+ :return: MatrixTable or Table with `freq` annotation. + """ + errors = [] + if downsampling_expr is not None: + if downsamplings is None: + errors.append( + "annotate_freq requires `downsamplings` when using `downsampling_expr`" + ) + if downsampling_expr.get("pop_idx") is not None: + if ds_pop_counts is None: + errors.append( + "annotate_freq requires `ds_pop_counts` when using " + "`downsampling_expr` with pop_idx" + ) + if errors: + raise ValueError("The following errors were found: \n" + "\n".join(errors)) + + # Generate downsamplings and assign downsampling_expr if it is None when + # downsamplings is supplied. + if downsamplings is not None and downsampling_expr is None: + ds_ht = annotate_downsamplings(mt, downsamplings, pop_expr=pop_expr).cols() + downsamplings = hl.eval(ds_ht.downsamplings) + ds_pop_counts = hl.eval(ds_ht.ds_pop_counts) + downsampling_expr = ds_ht[mt.col_key].downsampling + + # Build list of all stratification groups to be used in the frequency calculation. + strata_expr = build_freq_stratification_list( + sex_expr=sex_expr, + pop_expr=pop_expr, + subpop_expr=subpop_expr, + additional_strata_expr=additional_strata_expr, + downsampling_expr=downsampling_expr, + ) + + # Annotate the MT cols with each of the expressions in strata_expr and redefine + # strata_expr based on the column HT with added annotations. + ht = mt.annotate_cols(**{k: v for d in strata_expr for k, v in d.items()}).cols() + strata_expr = [{k: ht[k] for k in d} for d in strata_expr] + + # Annotate HT with a freq_meta global and group membership array for each sample + # indicating whether the sample belongs to the group defined by freq_meta elements. + ht = generate_freq_group_membership_array( + ht, + strata_expr, + downsamplings=downsamplings, + ds_pop_counts=ds_pop_counts, + ) + + freq_ht = compute_freq_by_strata( + mt.annotate_cols(group_membership=ht[mt.col_key].group_membership), + entry_agg_funcs=entry_agg_funcs, + ) + freq_ht = freq_ht.annotate_globals(**ht.index_globals()) + + if annotate_mt: + mt = mt.annotate_rows(**freq_ht[mt.row_key]) + mt = mt.annotate_globals(**freq_ht.index_globals()) + return mt + + else: + return freq_ht
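+
+# Usage sketch (illustrative, not part of the library): annotate a bi-allelic
+# MatrixTable that has 'GT' and 'adj' entry fields with frequencies stratified
+# by population and sex, including 1,000- and 5,000-sample downsamplings. The
+# 'pop' and 'sex' column fields are hypothetical.
+def _example_annotate_freq(mt: hl.MatrixTable) -> hl.MatrixTable:
+    return annotate_freq(
+        mt,
+        sex_expr=mt.sex,
+        pop_expr=mt.pop,
+        downsamplings=[1000, 5000],
+    )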
+ + +
[docs]def annotate_downsamplings( + t: Union[hl.MatrixTable, hl.Table], + downsamplings: List[int], + pop_expr: Optional[hl.expr.StringExpression] = None, +) -> Union[hl.MatrixTable, hl.Table]: + """ + Annotate MatrixTable or Table with downsampling groups. + + :param t: Input MatrixTable or Table. + :param downsamplings: List of downsampling sizes. + :param pop_expr: Optional expression for population group. When provided, population + sample sizes are added as values to downsamplings. + :return: MatrixTable or Table with downsampling annotations. + """ + if isinstance(t, hl.MatrixTable): + if pop_expr is not None: + ht = t.annotate_cols(pop=pop_expr).cols() + else: + ht = t.cols() + else: + if pop_expr is not None: + ht = t.annotate(pop=pop_expr) + else: + ht = t + + ht = ht.key_by(r=hl.rand_unif(0, 1)) + + # Add a global index for use in computing frequencies, or other aggregate stats on + # the downsamplings. + scan_expr = {"global_idx": hl.scan.count()} + + # If pop_expr is provided, add all pop counts to the downsamplings list. + if pop_expr is not None: + pop_counts = ht.aggregate( + hl.agg.filter(hl.is_defined(ht.pop), hl.agg.counter(ht.pop)) + ) + downsamplings = [x for x in downsamplings if x <= sum(pop_counts.values())] + downsamplings = sorted(set(downsamplings + list(pop_counts.values()))) + # Add an index by pop for use in computing frequencies, or other aggregate stats + # on the downsamplings. + scan_expr["pop_idx"] = hl.scan.counter(ht.pop).get(ht.pop, 0) + else: + pop_counts = None + logger.info("Found %i downsamplings: %s", len(downsamplings), downsamplings) + + ht = ht.annotate(**scan_expr) + ht = ht.key_by("s").select(*scan_expr) + + if isinstance(t, hl.MatrixTable): + t = t.annotate_cols(downsampling=ht[t.s]) + else: + t = t.annotate(downsampling=ht[t.s]) + + t = t.annotate_globals( + downsamplings=downsamplings, + ds_pop_counts=pop_counts, + ) + + return t
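+
+# Usage sketch (illustrative, not part of the library): add a 'downsampling'
+# column annotation plus 'downsamplings'/'ds_pop_counts' globals, with
+# per-population sample sizes appended to the requested sizes. The 'pop'
+# column field is hypothetical.
+def _example_annotate_downsamplings(mt: hl.MatrixTable) -> hl.MatrixTable:
+    return annotate_downsamplings(mt, [1000, 5000], pop_expr=mt.pop)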
+ + +
[docs]def build_freq_stratification_list( + sex_expr: Optional[hl.expr.StringExpression] = None, + pop_expr: Optional[hl.expr.StringExpression] = None, + subpop_expr: Optional[hl.expr.StringExpression] = None, + additional_strata_expr: Optional[ + Union[ + List[Dict[str, hl.expr.StringExpression]], + Dict[str, hl.expr.StringExpression], + ] + ] = None, + downsampling_expr: Optional[hl.expr.StructExpression] = None, +) -> List[Dict[str, hl.expr.StringExpression]]: + """ + Build a list of stratification groupings to be used in frequency calculations based on supplied parameters. + + .. note:: + This function is primarily used through `annotate_freq` but can be used + independently if desired. The returned list of stratifications can be passed to + `generate_freq_group_membership_array`. + + :param sex_expr: When specified, the returned list contains a stratification for + sex. If `pop_expr` is also specified, then the returned list also contains a + pop/sex stratification. + :param pop_expr: When specified, the returned list contains a stratification for + population. If `sex_expr` is also specified, then the returned list also + contains a pop/sex stratification. + :param subpop_expr: When specified, the returned list contains a stratification for + sub-continental population. Note that `pop_expr` is required as well when using + this option. + :param additional_strata_expr: When specified, the returned list contains a + stratification for each of the additional strata. This can e.g. be used to + stratify by platform, platform-pop, platform-pop-sex. + :param downsampling_expr: When specified, the returned list contains a + stratification for downsampling. If `pop_expr` is also specified, then the + returned list also contains a downsampling/pop stratification. + :return: List of dictionaries specifying stratification groups where the keys of + each dictionary are strings and the values are corresponding expressions that + define the values to stratify frequency calculations by. + """ + errors = [] + if subpop_expr is not None and pop_expr is None: + errors.append("annotate_freq requires pop_expr when using subpop_expr") + + if downsampling_expr is not None: + if downsampling_expr.get("global_idx") is None: + errors.append( + "annotate_freq requires `downsampling_expr` with key 'global_idx'" + ) + if downsampling_expr.get("pop_idx") is None: + if pop_expr is not None: + errors.append( + "annotate_freq requires `downsampling_expr` with key 'pop_idx' when" + " using `pop_expr`" + ) + else: + if pop_expr is None: + errors.append( + "annotate_freq requires `pop_expr` when using `downsampling_expr` " + "with pop_idx" + ) + + if errors: + raise ValueError("The following errors were found: \n" + "\n".join(errors)) + + # Build list of strata expressions based on supplied parameters. + strata_expr = [] + if pop_expr is not None: + strata_expr.append({"pop": pop_expr}) + if sex_expr is not None: + strata_expr.append({"sex": sex_expr}) + if pop_expr is not None: + strata_expr.append({"pop": pop_expr, "sex": sex_expr}) + if subpop_expr is not None: + strata_expr.append({"pop": pop_expr, "subpop": subpop_expr}) + + # Add downsampling to strata expressions, include pop in the strata if supplied. + if downsampling_expr is not None: + downsampling_strata = {"downsampling": downsampling_expr} + if pop_expr is not None: + downsampling_strata["pop"] = pop_expr + strata_expr.append(downsampling_strata) + + # Add additional strata expressions. 
+ if additional_strata_expr is not None: + if isinstance(additional_strata_expr, dict): + additional_strata_expr = [additional_strata_expr] + strata_expr.extend(additional_strata_expr) + + return strata_expr
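+
+# Usage sketch (illustrative, not part of the library): build strata for a
+# pop/sex analysis plus a platform stratification; the result can be passed to
+# generate_freq_group_membership_array. Column field names are hypothetical.
+def _example_build_strata(ht: hl.Table) -> List[Dict[str, hl.expr.StringExpression]]:
+    return build_freq_stratification_list(
+        sex_expr=ht.sex,
+        pop_expr=ht.pop,
+        additional_strata_expr={"platform": ht.platform},
+    )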
+ + +
[docs]def generate_freq_group_membership_array( + ht: hl.Table, + strata_expr: List[Dict[str, hl.expr.StringExpression]], + downsamplings: Optional[List[int]] = None, + ds_pop_counts: Optional[Dict[str, int]] = None, + remove_zero_sample_groups: bool = False, + no_raw_group: bool = False, +) -> hl.Table: + """ + Generate a Table with a 'group_membership' array for each sample indicating whether the sample belongs to specific stratification groups. + + .. note:: + This function is primarily used through `annotate_freq` but can be used + independently if desired. Please see the `annotate_freq` function for more + complete documentation. + + The following global annotations are added to the returned Table: + - freq_meta: Each element of the list contains metadata on a stratification + group. + - freq_meta_sample_count: sample count per grouping defined in `freq_meta`. + - If downsamplings or ds_pop_counts are specified, they are also added as + global annotations on the returned Table. + + Each sample is annotated with a 'group_membership' array indicating whether the + sample belongs to specific stratification groups. All possible value combinations + are determined for each stratification grouping in the `strata_expr` list. + + :param ht: Input Table that contains Expressions specified by `strata_expr`. + :param strata_expr: List of dictionaries specifying stratification groups where + the keys of each dictionary are strings and the values are corresponding + expressions that define the values to stratify frequency calculations by. + :param downsamplings: List of downsampling values to include in the stratifications. + :param ds_pop_counts: Dictionary of the total number of samples in each population. + :param remove_zero_sample_groups: Whether to remove groups with a sample count of 0. + Default is False. + :param no_raw_group: Whether to remove the raw group from the 'group_membership' + annotation and the 'freq_meta' and 'freq_meta_sample_count' global annotations. + Default is False. + :return: Table with the 'group_membership' array annotation. + """ + errors = [] + ds_in_strata = any("downsampling" in s for s in strata_expr) + global_idx_in_ds_expr = any( + "global_idx" in s["downsampling"] for s in strata_expr if "downsampling" in s + ) + pop_in_strata = any("pop" in s for s in strata_expr) + pop_idx_in_ds_expr = any( + "pop_idx" in s["downsampling"] + for s in strata_expr + if "downsampling" in s and ds_pop_counts is not None + ) + + if downsamplings is not None and not ds_in_strata: + errors.append( + "Strata must contain a downsampling expression when downsamplings " + "are provided." + ) + if downsamplings is not None and not global_idx_in_ds_expr: + errors.append( + "Strata must contain a downsampling expression with 'global_idx' when " + "downsamplings are provided." + ) + if ds_pop_counts is not None and not pop_in_strata: + errors.append( + "Strata must contain a population expression 'pop' when ds_pop_counts " + "are provided." + ) + if ds_pop_counts is not None and not pop_idx_in_ds_expr: + errors.append( + "Strata must contain a downsampling expression with 'pop_idx' when " + "ds_pop_counts are provided." + ) + + if errors: + raise ValueError("The following errors were found: \n" + "\n".join(errors)) + + # Get counters for all strata. + strata_counts = ht.aggregate( + hl.struct( + **{ + k: hl.agg.filter(hl.is_defined(v), hl.agg.counter({k: v})) + for strata in strata_expr + for k, v in strata.items() + } + ) + ) + + # Add all desired strata to sample group filters. 
+ sample_group_filters = [({}, True)] + for strata in strata_expr: + downsampling_expr = strata.get("downsampling") + strata_values = [] + # Add all downsampling groups, both global and population-specific, to the + # strata. + for s in strata: + if s == "downsampling": + v = [("downsampling", d) for d in downsamplings] + else: + v = [(s, k[s]) for k in strata_counts.get(s, {})] + if s == "pop" and downsampling_expr is not None: + v.append(("pop", "global")) + strata_values.append(v) + + # Get all combinations of strata values. + strata_combinations = itertools.product(*strata_values) + # Create sample group filters that are evaluated on each sample for each strata + # combination. Strata combinations are evaluated as a logical AND, e.g. + # {"pop": "nfe", "downsampling": 10000}, or "nfe-10000", creates the filter + # expression pop == "nfe" AND pop_idx < 10000. + for combo in strata_combinations: + combo = dict(combo) + ds = combo.get("downsampling") + pop = combo.get("pop") + # If combo contains downsampling, determine the downsampling index + # annotation to use. + downsampling_idx = "global_idx" + if ds is not None: + if pop is not None and pop != "global": + # Don't include population downsamplings where the downsampling is + # larger than the number of samples in the population. + if ds > ds_pop_counts[pop]: + continue + downsampling_idx = "pop_idx" + + # If combo contains downsampling, add downsampling filter expression. + combo_filter_exprs = [] + for s, v in combo.items(): + if s == "downsampling": + combo_filter_exprs.append(downsampling_expr[downsampling_idx] < v) + else: + if s != "pop" or v != "global": + combo_filter_exprs.append(strata[s] == v) + combo = {k: str(v) for k, v in combo.items()} + sample_group_filters.append((combo, hl.all(combo_filter_exprs))) + + n_groups = len(sample_group_filters) + logger.info("number of filters: %i", n_groups) + + # Get sample count per strata group. + freq_meta_sample_count = ht.aggregate( + [hl.agg.count_where(x[1]) for x in sample_group_filters] + ) + + if remove_zero_sample_groups: + filter_freq = hl.enumerate(freq_meta_sample_count).filter(lambda x: x[1] > 0) + freq_meta_sample_count = filter_freq.map(lambda x: x[1]) + idx_keep = hl.eval(filter_freq.map(lambda x: x[0])) + sample_group_filters = [sample_group_filters[i] for i in idx_keep] + + # Annotate columns with group_membership. + ht = ht.select(group_membership=[x[1] for x in sample_group_filters]) + + # Create and annotate global expression with meta and sample count information. + freq_meta = [ + dict(**sample_group[0], group="adj") for sample_group in sample_group_filters + ] + + if not no_raw_group: + # Sample group membership for the "raw" group, representing all samples, is + # the same as the first group in the group_membership array. + ht = ht.annotate( + group_membership=hl.array([ht.group_membership[0]]).extend( + ht.group_membership + ) + ) + # Add the "raw" group, representing all samples, to the freq_meta_expr list. + freq_meta.insert(1, {"group": "raw"}) + freq_meta_sample_count = hl.array([freq_meta_sample_count[0]]).extend( + freq_meta_sample_count + ) + + global_expr = { + "freq_meta": freq_meta, + "freq_meta_sample_count": freq_meta_sample_count, + } + + if downsamplings is not None: + global_expr["downsamplings"] = downsamplings + if ds_pop_counts is not None: + global_expr["ds_pop_counts"] = ds_pop_counts + + ht = ht.select_globals(**global_expr) + ht = ht.checkpoint(hl.utils.new_temp_file("group_membership", "ht")) + + return ht
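+
+# Usage sketch (illustrative, not part of the library): build the group
+# membership Table for a column Table with hypothetical 'pop' and 'sex'
+# fields, dropping strata that contain no samples.
+def _example_group_membership(ht: hl.Table) -> hl.Table:
+    strata = build_freq_stratification_list(sex_expr=ht.sex, pop_expr=ht.pop)
+    return generate_freq_group_membership_array(
+        ht, strata, remove_zero_sample_groups=True
+    )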
+ + +
[docs]def compute_freq_by_strata( + mt: hl.MatrixTable, + entry_agg_funcs: Optional[Dict[str, Tuple[Callable, Callable]]] = None, + select_fields: Optional[List[str]] = None, + group_membership_includes_raw_group: bool = True, +) -> hl.Table: + """ + Compute call statistics and, when supplied, additional entry aggregations by strata. + + The computed call statistics are AC, AF, AN, and homozygote_count. The entry + aggregation functions are applied to the MatrixTable entries and aggregated. The + MatrixTable must contain a 'group_membership' annotation (like the one added by + `generate_freq_group_membership_array`) that is a list of bools to aggregate the + columns by. + + .. note:: + This function is primarily used through `annotate_freq` but can be used + independently if desired. Please see the `annotate_freq` function for more + complete documentation. + + :param mt: Input MatrixTable. + :param entry_agg_funcs: Optional dict of entry aggregation functions. When + specified, additional annotations are added to the output Table/MatrixTable. + The keys of the dict are the names of the annotations and the values are tuples + of functions. The first function is used to transform the `mt` entries in some + way, and the second function is used to aggregate the output from the first + function. + :param select_fields: Optional list of row fields from `mt` to keep on the output + Table. + :param group_membership_includes_raw_group: Whether the 'group_membership' + annotation includes an entry for the 'raw' group, representing all samples. If + False, the 'raw' group is inserted as the second element in all added + annotations (reusing the first element of 'group_membership'), resulting in + annotation arrays one element longer than 'group_membership'. If True, the + second element of each added annotation is still the 'raw' group, but the group + membership is determined by the values in the second element of + 'group_membership', and the output annotations will be the same length as + 'group_membership'. Default is True. + :return: Table with allele frequencies by strata. + """ + if not group_membership_includes_raw_group: + # Add the 'raw' group to the 'group_membership' annotation. + mt = mt.annotate_cols( + group_membership=hl.array([mt.group_membership[0]]).extend( + mt.group_membership + ) + ) + + # Add adj_groups global annotation indicating that the second element in + # group_membership is 'raw' and all others are 'adj'. + mt = mt.annotate_globals( + adj_groups=hl.range(hl.len(mt.group_membership.take(1)[0])).map( + lambda x: x != 1 + ) + ) + + if entry_agg_funcs is None: + entry_agg_funcs = {} + + def _get_freq_expr(gt_expr: hl.expr.CallExpression) -> hl.expr.StructExpression: + """ + Get struct expression with call statistics. + + :param gt_expr: CallExpression to compute call statistics on. + :return: StructExpression with call statistics. + """ + # Get the source Table for the CallExpression to grab alleles. + ht = gt_expr._indices.source + freq_expr = hl.agg.call_stats(gt_expr, ht.alleles) + # Select non-ref allele (assumes bi-allelic). + freq_expr = freq_expr.annotate( + AC=freq_expr.AC[1], + AF=freq_expr.AF[1], + homozygote_count=freq_expr.homozygote_count[1], + ) + + return freq_expr + + entry_agg_funcs["freq"] = (lambda x: x.GT, _get_freq_expr) + + return agg_by_strata(mt, entry_agg_funcs, select_fields).drop("adj_groups")
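+
+# Usage sketch (illustrative, not part of the library): compute stratified
+# call statistics plus a per-stratum count of adj genotypes on a bi-allelic
+# MatrixTable with 'GT' and 'adj' entry fields. Assumes `membership_ht` comes
+# from generate_freq_group_membership_array (so the 'raw' group is included).
+def _example_compute_freq(mt: hl.MatrixTable, membership_ht: hl.Table) -> hl.Table:
+    mt = mt.annotate_cols(
+        group_membership=membership_ht[mt.col_key].group_membership
+    )
+    return compute_freq_by_strata(
+        mt, entry_agg_funcs={"n_adj": (lambda m: m.adj, hl.agg.count_where)}
+    )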
+ + +
[docs]def agg_by_strata( + mt: hl.MatrixTable, + entry_agg_funcs: Dict[str, Tuple[Callable, Callable]], + select_fields: Optional[List[str]] = None, + group_membership_ht: Optional[hl.Table] = None, + entry_agg_group_membership: Optional[Dict[str, List[dict]]] = None, +) -> hl.Table: + """ + Aggregate entries by strata and return a Table with one row annotation per entry aggregation function. + + The entry aggregation functions are applied to the MatrixTable entries and + aggregated. If no `group_membership_ht` (like the one returned by + `generate_freq_group_membership_array`) is supplied, `mt` must contain a + 'group_membership' annotation that is a list of bools to aggregate the columns by. + + :param mt: Input MatrixTable. + :param entry_agg_funcs: Dict of entry aggregation functions where the + keys of the dict are the names of the annotations and the values are tuples + of functions. The first function is used to transform the `mt` entries in some + way, and the second function is used to aggregate the output from the first + function. + :param select_fields: Optional list of row fields from `mt` to keep on the output + Table. + :param group_membership_ht: Optional Table containing group membership annotations + to stratify the aggregations by. If not provided, the 'group_membership' + annotation is expected to be present on `mt`. + :param entry_agg_group_membership: Optional dict indicating the subset of group + strata in 'freq_meta' to run the entry aggregation functions on. The keys of + the dict can be any of the keys in `entry_agg_funcs` and the values are lists + of dicts. Each dict in the list contains the strata in 'freq_meta' to use for + the corresponding entry aggregation function. If provided, 'freq_meta' must be + present in `group_membership_ht` or `mt` and represent the same strata as those + in 'group_membership'. If not provided, all entries of the 'group_membership' + annotation will have the entry aggregation functions applied to them. + :return: Table with annotations of stratified aggregations. + """ + if group_membership_ht is None and "group_membership" not in mt.col: + raise ValueError( + "The 'group_membership' annotation is not found in the input MatrixTable " + "and 'group_membership_ht' is not specified." + ) + + if select_fields is None: + select_fields = [] + + if group_membership_ht is None: + logger.info( + "'group_membership_ht' is not specified, using sample stratification " + "indicated by the 'group_membership' annotation on the input MatrixTable." + ) + group_globals = mt.index_globals() + else: + logger.info( + "'group_membership_ht' is specified, using sample stratification indicated " + "by its 'group_membership' annotation." + ) + group_globals = group_membership_ht.index_globals() + mt = mt.annotate_cols( + group_membership=group_membership_ht[mt.col_key].group_membership + ) + + global_expr = {} + n_groups = len(mt.group_membership.take(1)[0]) + if "adj_groups" in group_globals: + logger.info( + "Using the 'adj_groups' global annotation to determine adj filtered " + "stratification groups." + ) + global_expr["adj_groups"] = group_globals.adj_groups + elif "freq_meta" in group_globals: + logger.info( + "No 'adj_groups' global annotation found, using the 'freq_meta' global " + "annotation to determine adj filtered stratification groups." + ) + global_expr["adj_groups"] = group_globals.freq_meta.map( + lambda x: x.get("group", "NA") == "adj" + ) + else: + logger.info( + "No 'adj_groups' or 'freq_meta' global annotations found. 
All groups will " + "be considered non-adj." + ) + global_expr["adj_groups"] = hl.range(n_groups).map(lambda x: False) + + if entry_agg_group_membership is not None and "freq_meta" not in group_globals: + raise ValueError( + "The 'freq_meta' global annotation must be supplied when the" + " 'entry_agg_group_membership' is specified." + ) + + entry_agg_group_membership = entry_agg_group_membership or {} + entry_agg_group_membership = { + ann: [group_globals["freq_meta"].index(s) for s in strata] + for ann, strata in entry_agg_group_membership.items() + } + + n_adj_groups = hl.eval(hl.len(global_expr["adj_groups"])) + if n_adj_groups != n_groups: + raise ValueError( + f"The number of elements in the 'adj_groups' ({n_adj_groups}) global " + "annotation does not match the number of elements in the " + f"'group_membership' annotation ({n_groups})!", + ) + + # Keep only the entries needed for the aggregation functions. + select_expr = {**{ann: f[0](mt) for ann, f in entry_agg_funcs.items()}} + has_adj = hl.eval(hl.any(global_expr["adj_groups"])) + if has_adj: + select_expr["adj"] = mt.adj + + mt = mt.select_entries(**select_expr) + + # Convert MT to HT with a row annotation that is an array of all samples' entries + # for that variant. + ht = mt.localize_entries("entries", "cols") + + # For each stratification group in group_membership, determine the indices of the + # samples that belong to that group. + global_expr["indices_by_group"] = hl.range(n_groups).map( + lambda g_i: hl.range(mt.count_cols()).filter( + lambda s_i: ht.cols[s_i].group_membership[g_i] + ) + ) + ht = ht.annotate_globals(**global_expr) + + # Pull out each annotation that will be used in the array aggregation below as its + # own ArrayExpression. This is important to prevent memory issues when performing + # the below array aggregations. + ht = ht.select( + *select_fields, + **{ann: ht.entries.map(lambda e: e[ann]) for ann in select_expr.keys()}, + ) + + def _agg_by_group( + indices_by_group_expr: hl.expr.ArrayExpression, + adj_groups_expr: hl.expr.ArrayExpression, + agg_func: Callable, + ann_expr: hl.expr.ArrayExpression, + ) -> hl.expr.ArrayExpression: + """ + Aggregate `ann_expr` by group using the `agg_func` function. + + :param indices_by_group_expr: ArrayExpression of indices of samples in each group. + :param adj_groups_expr: ArrayExpression indicating whether each group is adj. + :param agg_func: Aggregation function to apply to `ann_expr`. + :param ann_expr: Expression to aggregate by group. + :return: Aggregated array expression. + """ + f_no_adj = lambda i, *args: agg_func(ann_expr[i]) + if has_adj: + f = lambda i, adj: hl.if_else( + adj, hl.agg.filter(ht.adj[i], f_no_adj(i)), f_no_adj(i) + ) + else: + f = f_no_adj + + return hl.map( + lambda s_indices, adj: s_indices.aggregate(lambda i: f(i, adj)), + indices_by_group_expr, + adj_groups_expr, + ) + + # Add annotations for any supplied entry transform and aggregation functions. + # Filter groups to only those in entry_agg_group_membership if specified. + # If there are no specific entry group indices for an annotation, use ht[g] + # to consider all groups without filtering. + ht = ht.select( + *select_fields, + **{ + ann: _agg_by_group( + *[ + [ht[g][i] for i in entry_agg_group_membership.get(ann, [])] or ht[g] + for g in ["indices_by_group", "adj_groups"] + ], + agg_func=f[1], + ann_expr=ht[ann], + ) + for ann, f in entry_agg_funcs.items() + }, + ) + + return ht.drop("cols")
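+
+# Usage sketch (illustrative, not part of the library): compute mean depth per
+# stratum using an external group membership Table. Assumes `mt` has 'DP' and
+# 'adj' entry fields.
+def _example_agg_by_strata(mt: hl.MatrixTable, membership_ht: hl.Table) -> hl.Table:
+    return agg_by_strata(
+        mt,
+        entry_agg_funcs={"mean_dp": (lambda m: m.DP, hl.agg.mean)},
+        group_membership_ht=membership_ht,
+    )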
+ + +
[docs]def update_structured_annotations( + ht: hl.Table, + annotation_update_exprs: Dict[str, hl.Expression], + annotation_update_label: Optional[str] = None, +) -> hl.Table: + """ + Update highly structured annotations on a Table. + + This function recursively updates annotations defined by `annotation_update_exprs` + and if `annotation_update_label` is supplied, it checks if the sample annotations + are different from the input and adds a flag to the Table, indicating which + annotations have been updated for each sample. + + :param ht: Input Table with structured annotations to update. + :param annotation_update_exprs: Dictionary of annotations to update, structured as + they are structured on the input `ht`. + :param annotation_update_label: Optional string of the label to use for an + annotation indicating which annotations have been updated. Default is None, so + no annotation is added. + :return: Table with updated annotations and optionally a flag indicating which + annotations were changed. + """ + + def _update_struct( + struct_expr: hl.expr.StructExpression, + update_exprs: Union[Dict[str, hl.expr.Expression], hl.expr.Expression], + ) -> Tuple[Dict[str, hl.expr.BooleanExpression], Any]: + """ + Update a StructExpression. + + :param struct_expr: StructExpression to update. + :param update_exprs: Dictionary of annotations to update. + :return: Tuple of the updated annotations and the updated flag. + """ + if isinstance(update_exprs, dict): + updated_struct_expr = {} + updated_flag_expr = {} + for ann, expr in update_exprs.items(): + if ann in struct_expr: + updated_flag, updated_ann = _update_struct(struct_expr[ann], expr) + else: + updated_flag = {"": True} + updated_ann = expr + updated_flag_expr.update( + {ann + ("." + k if k else ""): v for k, v in updated_flag.items()} + ) + updated_struct_expr[ann] = updated_ann + return updated_flag_expr, struct_expr.annotate(**updated_struct_expr) + else: + return {"": update_exprs != struct_expr}, update_exprs + + annotation_update_flag, updated_rows = _update_struct( + ht.row_value, annotation_update_exprs + ) + if annotation_update_label is not None: + updated_rows = updated_rows.annotate( + **{ + annotation_update_label: filter_utils.add_filters_expr( + filters=annotation_update_flag + ) + } + ) + + return ht.annotate(**updated_rows)
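+
+# Usage sketch (illustrative, not part of the library): overwrite a nested
+# 'info.AN' field and flag the rows where it changed. Field names are
+# hypothetical, and `new_an` must be an expression on `ht`.
+def _example_update_annotations(ht: hl.Table, new_an: hl.expr.Int32Expression) -> hl.Table:
+    return update_structured_annotations(
+        ht,
+        {"info": {"AN": new_an}},
+        annotation_update_label="updated_annotations",
+    )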
+ + +
[docs]def add_gks_vrs( + input_locus: hl.locus, + input_vrs: hl.struct, +) -> dict: + """ + Generate a dictionary containing VRS information from a given locus and struct of VRS information. + + Dict will have GA4GH GKS VRS structure. + + :param input_locus: Locus field from a struct (locus of result of running .collect() on a Hail table). + :param input_vrs: VRS struct (such as from a ht.info.vrs field). + :return: Python dictionary conforming to GA4GH GKS VRS structure. + """ + # NOTE: The pinned ga4gh.vrs module breaks logging when this annotations module is + # imported. Importing ga4gh here to avoid this issue. + import ga4gh.core as ga4gh_core + import ga4gh.vrs as ga4gh_vrs + + build_in = input_locus.reference_genome.name + chr_in = input_locus.contig + + chrom_dict = VRS_CHROM_IDS[build_in] + vrs_id = input_vrs.VRS_Allele_IDs[1] + vrs_chrom_id = chrom_dict[chr_in] + vrs_start_value = input_vrs.VRS_Starts[1] + vrs_end_value = input_vrs.VRS_Ends[1] + vrs_state_sequence = input_vrs.VRS_States[1] + + vrs_dict_out = { + "_id": vrs_id, + "type": "Allele", + "location": { + "type": "SequenceLocation", + "sequence_id": vrs_chrom_id, + "interval": { + "start": {"type": "Number", "value": vrs_start_value}, + "end": {"type": "Number", "value": vrs_end_value}, + "type": "SequenceInterval", + }, + }, + "state": {"type": "LiteralSequenceExpression", "sequence": vrs_state_sequence}, + } + + location_id = ga4gh_core._internal.identifiers.ga4gh_identify( + ga4gh_vrs.models.SequenceLocation(**vrs_dict_out["location"]) + ) + + vrs_dict_out["location"]["_id"] = location_id + + return vrs_dict_out
+ + +
[docs]def add_gks_va( + input_struct: hl.struct, + label_name: str = "gnomAD", + label_version: str = "3.1.2", + ancestry_groups: list = None, + ancestry_groups_dict: dict = None, + by_sex: bool = False, + freq_index_dict: dict = None, +) -> dict: + """ + Generate Python dictionary containing GKS VA annotations. + + Populate the dictionary with frequency information conforming to the GKS VA frequency schema. + If ancestry_groups or by_sex is provided, also include subcohort schemas for each cohort. + If input_struct has mean_depth, it is added to ancillaryResults. + This annotation is added under the gks_va_freq_dict field of the table. + The focusAllele field is not populated, and must be filled in by the caller. + + :param input_struct: Hail struct for a desired variant (such as result of running .collect()[0] on a Table). + :param label_name: Label name to use within the returned dictionary. Example: "gnomAD". + :param label_version: String listing the version of the table being used. Example: "3.1.2". + :param ancestry_groups: List of strings of shortened names of cohorts to return results for. + Example: ['afr','fin','nfe']. Default is None. + :param ancestry_groups_dict: Dict mapping shortened genetic ancestry group names to full names. + Example: {'afr':'African/African American'}. Default is None. + :param by_sex: Boolean to include breakdown of cohorts by inferred sex (XX and XY) as well. + Default is False. + :param freq_index_dict: Dict mapping groups to their index for freq info in ht.freq_index_dict[0]. + Default is None. + :return: Dictionary containing GKS VA frequency information + (split by ancestry groups and sex if desired) for the specified variant. + """ + # Throw a warning if contradictory arguments are passed. + if by_sex and not ancestry_groups: + logger.warning( + "Splitting whole database by sex is not yet supported. If using 'by_sex'," + " please also specify 'ancestry_groups' to stratify by." + ) + + contig = input_struct.locus.contig + pos = input_struct.locus.position + ref = input_struct.alleles[0] + var = input_struct.alleles[1] + gnomad_id = f"{contig}-{pos}-{ref}-{var}" + + # Define function to return a frequency report dictionary for a given group. + def _create_group_dicts( + group_id: str, + group_label: str, + group_sex: str = None, + ) -> dict: + """ + Generate a dictionary containing the frequency information of a given variant for a given group. + + :param group_id: Short name of the genetic ancestry group requested. + - Example: "afr". + :param group_label: String containing the full name of genetic ancestry group requested. + - Example: "African/African American". + :param group_sex: String indicating the sex of the group. + - Example: "XX" or "XY". + :return: Dictionary containing variant frequency information + (by genetic ancestry group and sex if desired) for the specified variant. + """ + if group_sex: + cohort_id = f"{group_id.upper()}.{group_sex}" + freq_index_key = f"{group_id}_{group_sex}_adj" + else: + cohort_id = f"{group_id.upper()}" + freq_index_key = f"{group_id}_adj" + record_id = f"{gnomad_id}.{cohort_id}" + + # Obtain frequency information for the specified variant. + group_freq = input_struct.freq[freq_index_dict[freq_index_key]] + + # Cohort characteristics. 
+ characteristics = [] + characteristics.append({"name": "genetic ancestry", "value": group_label}) + if group_sex is not None: + characteristics.append({"name": "biological sex", "value": group_sex}) + + # Dictionary to be returned containing information for a specified group. + freq_record = { + "id": record_id, + "type": "CohortAlleleFrequency", + "label": f"{group_label} Cohort Allele Frequency for {gnomad_id}", + "focusAllele": "#/focusAllele", + "focusAlleleCount": group_freq["AC"], + "locusAlleleCount": group_freq["AN"], + "alleleFrequency": ( + group_freq["AF"] if group_freq["AF"] is not None else 0.0 + ), + "cohort": {"id": cohort_id, "characteristics": characteristics}, + "ancillaryResults": {"homozygotes": group_freq["homozygote_count"]}, + } + + # Add hemizygote allele count if variant is non-autosomal/non-PAR. + # Only XY groups can be hemizygous. Other group AC is mixed homo/hetero. + # If not a by_sex group, include the XY hemizygote count for XY subgroup. + if not input_struct.in_autosome_or_par: + if group_sex == "XY": + freq_record["ancillaryResults"]["hemizygotes"] = group_freq.AC + elif group_sex is None: + # Group is not by_sex, but still need to report hemizygotes. + hemi_group_freq = input_struct.freq[ + freq_index_dict[f"{group_id}_XY_adj"] + ] + freq_record["ancillaryResults"]["hemizygotes"] = hemi_group_freq.AC + + return freq_record + + # Create a list to then add the dictionaries for frequency reports for + # different ancestry groups to. + list_of_group_info_dicts = [] + + # Iterate through provided groups and generate dictionaries. + if ancestry_groups: + for group in ancestry_groups: + group_result = _create_group_dicts( + group_id=group, + group_label=ancestry_groups_dict[group], + ) + + # If specified, stratify group information by sex. + if by_sex: + sex_list = [] + for sex in ["XX", "XY"]: + sex_result = _create_group_dicts( + group_id=group, + group_label=ancestry_groups_dict[group], + group_sex=sex, + ) + sex_list.append(sex_result) + + group_result["subcohortFrequency"] = sex_list + + list_of_group_info_dicts.append(group_result) + + # Add overall frequency, via label 'adj' which is currently stored at + # position #1 (index 0). + overall_freq = input_struct.freq[0] + + # Create final dictionary to be returned. + final_freq_dict = { + "id": f"{label_name}-{label_version}-{gnomad_id}", + "type": "CohortAlleleFrequency", + "label": f"Overall Cohort Allele Frequency for {gnomad_id}", + "derivedFrom": { + "id": f"{label_name}{label_version}", + "type": "DataSet", + "label": f"{label_name} v{label_version}", + "version": f"{label_version}", + }, + "focusAllele": ( + "" + ), # Information can be populated with the result of add_gks_vrs() + "focusAlleleCount": overall_freq["AC"], + "locusAlleleCount": overall_freq["AN"], + "alleleFrequency": ( + overall_freq["AF"] if overall_freq["AF"] is not None else 0.0 + ), + "cohort": {"id": "ALL"}, + } + + # Create ancillaryResults for additional frequency and popMaxFAF95 information. + ancillaryResults = { + "homozygotes": overall_freq["homozygote_count"], + } + + # Add hemizygote count if not autosomal or PAR. 
+ if not input_struct.in_autosome_or_par: + hemizygote_count = input_struct.freq[freq_index_dict["XY_adj"]].AC + ancillaryResults["hemizygotes"] = hemizygote_count + + # Add group max FAF if it exists + if input_struct.grpMaxFAF95.popmax_population is not None: + ancillaryResults["grpMaxFAF95"] = { + "frequency": input_struct.grpMaxFAF95.popmax, + "confidenceInterval": 0.95, + "groupId": ( + f"{gnomad_id}.{input_struct.grpMaxFAF95.popmax_population.upper()}" + ), + } + + # Add joint group max FAF if it exists. + if ( + "jointGrpMaxFAF95" in input_struct + and input_struct.jointGrpMaxFAF95.popmax_population is not None + ): + ancillaryResults["jointGrpMaxFAF95"] = { + "frequency": input_struct.jointGrpMaxFAF95.popmax, + "confidenceInterval": 0.95, + "groupId": ( + f"{gnomad_id}.{input_struct.jointGrpMaxFAF95.popmax_population.upper()}" + ), + } + + final_freq_dict["ancillaryResults"] = ancillaryResults + + # Check allele balance for heterozygotes values. + # Flagged allele balance values are those in bins > 0.90. + # Each bin is 0.05, so flagged values are in the last 2 bins. + if len(input_struct.ab_hist_alt.bin_freq) != 20: + raise ValueError( + f"{gnomad_id} ab_hist_alt.bin_freq had " + f"{len(input_struct.ab_hist_alt.bin_freq)} items, expected 20" + ) + # The bin_freq should be in order but we can verify the order from bin_edges. + ab_bin_freq = list( + map( + lambda x: x[1], + sorted( + zip( + input_struct.ab_hist_alt.bin_edges, + input_struct.ab_hist_alt.bin_freq, + ), + key=lambda x: x[0], + ), + ) + ) + + qualityMeasures = { + "qcFilters": list(input_struct.filters), + "lowComplexityRegion": input_struct.lcr, + "heterozygousSkewedAlleleCount": sum(ab_bin_freq[-2:]), + } + + # Add coverage depth statistics if the input was annotated + # with coverage information. + if "mean_depth" in input_struct: + qualityMeasures["meanDepth"] = input_struct.mean_depth + + if "fraction_cov_over_20" in input_struct: + qualityMeasures["fractionCoverage20x"] = input_struct.fraction_cov_over_20 + + # Add monoallelic flag (all samples homozygous for alternate allele) + qualityMeasures["monoallelic"] = input_struct.monoallelic + + final_freq_dict["qualityMeasures"] = qualityMeasures + + # If ancestry_groups were passed, add the ancestry group dictionary to the + # final frequency dictionary to be returned. + if ancestry_groups: + final_freq_dict["subcohortFrequency"] = list_of_group_info_dicts + + return final_freq_dict
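+
+# Usage sketch (illustrative, not part of the library): build the VA frequency
+# record for one collected row and attach its VRS allele as the focusAllele.
+# Assumes the row carries the fields this function expects (freq, filters,
+# lcr, monoallelic, ab_hist_alt, grpMaxFAF95, in_autosome_or_par) and a VRS
+# struct under 'info.vrs'; the dict arguments are hypothetical.
+def _example_add_gks(ht: hl.Table, freq_index_dict: dict, ancestry_groups_dict: dict) -> dict:
+    row = ht.collect()[0]
+    va_dict = add_gks_va(
+        row,
+        ancestry_groups=list(ancestry_groups_dict),
+        ancestry_groups_dict=ancestry_groups_dict,
+        by_sex=True,
+        freq_index_dict=freq_index_dict,
+    )
+    va_dict["focusAllele"] = add_gks_vrs(row.locus, row.info.vrs)
+    return va_dict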
\ No newline at end of file
diff --git a/_modules/gnomad/utils/constraint.html b/_modules/gnomad/utils/constraint.html
new file mode 100644
index 000000000..2193ef4c7
--- /dev/null
+++ b/_modules/gnomad/utils/constraint.html
@@ -0,0 +1,1514 @@
+gnomad.utils.constraint — gnomad master documentation

Source code for gnomad.utils.constraint

+"""Script containing generic constraint functions that may be used in the constraint pipeline."""
+
+import copy
+import logging
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import hail as hl
+from hail.utils.misc import divide_null, new_temp_file
+
+from gnomad.utils.vep import explode_by_vep_annotation, process_consequences
+
+logging.basicConfig(
+    format="%(asctime)s (%(name)s %(lineno)s): %(message)s",
+    datefmt="%m/%d/%Y %I:%M:%S %p",
+)
+logger = logging.getLogger("constraint_utils")
+logger.setLevel(logging.INFO)
+
+COVERAGE_CUTOFF = 30
+"""
+Minimum median exome coverage differentiating high coverage sites from low coverage sites.
+
+Low coverage sites require an extra calibration when computing the proportion of expected variation.
+"""
+
+
+
[docs]def annotate_with_mu( + ht: hl.Table, + mutation_ht: hl.Table, + mu_annotation: str = "mu_snp", +) -> hl.Table: + """ + Annotate SNP mutation rate for the input Table. + + .. note:: + + Function expects that `ht` includes `mutation_ht`'s key fields. Note that these + annotations don't need to be the keys of `ht`. + + :param ht: Input Table to annotate. + :param mutation_ht: Mutation rate Table. + :param mu_annotation: The name of the mutation rate annotation in `mutation_ht`. + Default is 'mu_snp'. + :return: Table with mutation rate annotation added. + """ + mu = mutation_ht.index(*[ht[k] for k in mutation_ht.key])[mu_annotation] + return ht.annotate( + **{mu_annotation: hl.case().when(hl.is_defined(mu), mu).or_error("Missing mu")} + )
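+
+# Usage sketch (illustrative, not part of the library): annotate a
+# context/ref/alt-annotated Table with per-site SNP mutation rates from a
+# mutation rate Table keyed by those fields; a missing rate raises an error.
+def _example_annotate_with_mu(ht: hl.Table, mutation_ht: hl.Table) -> hl.Table:
+    return annotate_with_mu(ht, mutation_ht)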
+ + +
[docs]def count_variants_by_group( + ht: hl.Table, + freq_expr: Optional[hl.expr.ArrayExpression] = None, + freq_meta_expr: Optional[hl.expr.ArrayExpression] = None, + count_singletons: bool = False, + count_downsamplings: Tuple[str] = (), + downsamplings: Optional[List[int]] = None, + additional_grouping: Tuple[str] = (), + partition_hint: int = 100, + omit_methylation: bool = False, + use_table_group_by: bool = False, + singleton_expr: Optional[hl.expr.BooleanExpression] = None, + max_af: Optional[float] = None, +) -> Union[hl.Table, Any]: + """ + Count number of observed or possible variants by context, ref, alt, and optionally methylation_level. + + Performs variant count aggregations based on specified criteria + (`count_singletons`, `count_downsamplings`, and `max_af`), and grouped by: + 'context', 'ref', 'alt', 'methylation_level' (optional), and all annotations + provided in `additional_grouping`. + + If variant allele frequency information is required based on other parameter + selections (described in detail below) and `freq_expr` is not supplied, `freq_expr` + defaults to `ht.freq` if it exists. + + `freq_expr` should be an ArrayExpression of Structs with 'AC' and 'AF' annotations. + This is the same format as the `freq` annotation that is created using + `annotate_freq()`. + + Variant allele frequency information is needed when: + - `max_af` is not None - `freq_expr[0].AF` is used to filter to only variants + with a maximum allele frequency of `max_af` prior to counting variants. In + the standard `freq` ArrayExpression annotated by `annotate_freq()`, this + first element corresponds to the allele frequency information for high quality + genotypes (adj). + - `count_singletons` is True and `singleton_expr` is None - If singleton counts + are requested and no expression is specified to determine whether a variant + is a singleton, `singleton_expr` defaults to `freq_expr[0].AC == 1`. In the + standard `freq` ArrayExpression annotated by `annotate_freq()`, this + corresponds to allele count of only 1 in the callset after filtering to high + quality genotypes. + - `count_downsamplings` is not empty - When downsampling counts are requested, + `freq_expr` needs to contain frequency information for downsamplings within + each population requested. In addition to needing `freq_expr`, this also + requires the use of `freq_meta_expr`. If `freq_meta_expr` is None, it + defaults to `ht.freq_meta` if it exists. Similar to + `freq_expr`, `freq_meta_expr` is expected to have the same format as + the `freq_meta` global annotation that is created using `annotate_freq()`. + `freq_meta_expr` is used to determine the index of allele frequency + information within `freq_expr` for each population requested and its + downsamplings. + + This function will return a Table with annotations used for grouping ('context', + 'ref', 'alt', 'methylation_level' (optional), `additional_grouping`) and + 'variant_count' annotation. + + .. note:: + + The following annotations should be present in `ht`: + - ref - the reference allele + - alt - the alternate base + - context - trinucleotide genomic context + - methylation_level - methylation level (optional if omit_methylation==True) + - freq - allele frequency information (AC, AN, AF, homozygote count; not + required if `freq_expr` is given) + - freq_meta - an ordered list containing the frequency aggregation group + for each element of the `freq` array row annotation (not required if + `freq_meta_expr` is given) + + :param ht: Input Hail Table. 
+ :param freq_expr: ArrayExpression of Structs with 'AC' and 'AF' annotations. If + `freq_expr` is None and any of `count_downsamplings`, `max_af`, and + `count_singletons` is specified, `freq_expr` defaults to `ht.freq`. + :param freq_meta_expr: ArrayExpression of meta dictionaries corresponding to + `freq_expr`. If `count_downsamplings` is specified and `freq_meta_expr` is + None, `freq_meta_expr` defaults to `ht.freq_meta`. + :param count_singletons: Whether to count singletons (defined by `singleton_expr`). + Default is False. + :param count_downsamplings: Tuple of populations to use for downsampling counts. + Default is (). + :param downsamplings: Optional List of integers specifying what downsampling + indices to obtain. Default is None, which will return all downsampling counts. + :param additional_grouping: Additional features to group by. e.g. 'exome_coverage'. + Default is (). + :param partition_hint: Target number of partitions for aggregation. Default is 100. + :param omit_methylation: Whether to omit 'methylation_level' from the grouping when + counting variants. Default is False. + :param use_table_group_by: Whether to group `ht` before aggregating the variant + counts. If `use_table_group_by` is False, the function will return an + hl.StructExpression. Default is False. + :param singleton_expr: Expression for defining a singleton. When `count_singletons` + is True and `singleton_expr` is None, `singleton_expr` defaults to + `freq_expr[0].AC == 1`. Default is None. + :param max_af: Maximum variant allele frequency to keep. By default, no cutoff is + applied. + :return: Table including 'variant_count' annotation and if requested, + `singleton_count` and downsampling counts. + """ + if freq_expr is None and ( + count_downsamplings or max_af or (count_singletons and singleton_expr is None) + ): + logger.warning( + "freq_expr was not provided, using 'freq' as the frequency annotation." + ) + freq_expr = ht.freq + if count_downsamplings and freq_meta_expr is None: + logger.warning( + "freq_meta_expr was not provided, using 'freq_meta' as the frequency" + " metadata annotation." + ) + freq_meta_expr = ht.freq_meta + if count_singletons and singleton_expr is None: + logger.warning( + "count_singletons is True and singleton_expr was not provided, using" + " freq_expr[0].AC == 1 as the singleton expression." + ) + singleton_expr = freq_expr[0].AC == 1 + + grouping = hl.struct(context=ht.context, ref=ht.ref, alt=ht.alt) + if not omit_methylation: + logger.info( + "'methylation_level' annotation is included in the grouping when counting" + " variants." + ) + grouping = grouping.annotate(methylation_level=ht.methylation_level) + for group in additional_grouping: + grouping = grouping.annotate(**{group: ht[group]}) + logger.info( + "The following annotations will be used to group the input Table rows when" + " counting variants: %s.", + ", ".join(grouping.keys()), + ) + + if max_af: + logger.info( + "The maximum variant allele frequency to be included in `variant_count` is" + " %.3f.", + max_af, + ) + agg = {"variant_count": hl.agg.count_where(freq_expr[0].AF <= max_af)} + else: + agg = {"variant_count": hl.agg.count()} + + if count_singletons: + logger.info( + "Counting singleton variants and adding as 'singleton_count' annotation." 
+ ) + agg["singleton_count"] = hl.agg.count_where(singleton_expr) + + for pop in count_downsamplings: + logger.info( + "Counting variants in downsamplings for population '%s', and adding as" + " 'downsampling_counts_%s' annotation.", + pop, + pop, + ) + agg[f"downsampling_counts_{pop}"] = downsampling_counts_expr( + freq_expr, + freq_meta_expr, + pop, + max_af=max_af, + downsamplings=downsamplings, + ) + if count_singletons: + logger.info( + "Counting singleton variants in downsamplings for population '%s', and" + " adding as 'singleton_downsampling_counts_%s' annotation.", + pop, + pop, + ) + agg[f"singleton_downsampling_counts_{pop}"] = downsampling_counts_expr( + freq_expr, + freq_meta_expr, + pop, + max_af=max_af, + downsamplings=downsamplings, + singleton=True, + ) + # Apply each variant count aggregation in `agg` to get counts for all + # combinations of `grouping`. + if use_table_group_by: + return ht.group_by(**grouping).partition_hint(partition_hint).aggregate(**agg) + else: + return ht.aggregate( + hl.struct(**{field: hl.agg.group_by(grouping, agg[field]) for field in agg}) + )
+ + +
[docs]def get_downsampling_freq_indices( + freq_meta_expr: hl.expr.ArrayExpression, + pop: str = "global", + variant_quality: str = "adj", + genetic_ancestry_label: Optional[str] = None, + subset: Optional[str] = None, + downsamplings: Optional[List[int]] = None, +) -> hl.expr.ArrayExpression: + """ + Get the (index, metadata) pairs of entries in `freq_meta_expr` that have a "downsampling" key and match the specified `pop` and `variant_quality` values. + + :param freq_meta_expr: ArrayExpression containing the set of groupings for each + element of the `freq_expr` array (e.g., [{'group': 'adj'}, {'group': 'adj', + 'pop': 'nfe'}, {'downsampling': '5000', 'group': 'adj', 'pop': 'global'}]). + :param pop: Population to use for filtering by the `genetic_ancestry_label` key in + `freq_meta_expr`. Default is 'global'. + :param variant_quality: Variant quality to use for filtering by the 'group' key in + `freq_meta_expr`. Default is 'adj'. + :param genetic_ancestry_label: Label defining the genetic ancestry groups. If None, + "gen_anc" or "pop" is used (in that order of preference) if present. Default is + None. + :param subset: Subset to use for filtering by the 'subset' key in `freq_meta_expr`. + Default is None, which will return all downsampling indices without a 'subset' + key in `freq_meta_expr`. + :param downsamplings: Optional List of integers specifying what downsampling + indices to obtain. Default is None, which will return all downsampling indices. + :return: ArrayExpression of (index, metadata dictionary) tuples for all matching + downsampling entries in `freq_meta_expr`, sorted by the "downsampling" value. + """ + if genetic_ancestry_label is None: + gen_anc = ["gen_anc", "pop"] + else: + gen_anc = [genetic_ancestry_label] + + def _get_filter_expr(m: hl.expr.StructExpression) -> hl.expr.BooleanExpression: + filter_expr = ( + (m.get("group") == variant_quality) + & (hl.any([m.get(l, "") == pop for l in gen_anc])) + & m.contains("downsampling") + ) + if downsamplings is not None: + filter_expr &= hl.literal(downsamplings).contains( + hl.int(m.get("downsampling", "0")) + ) + if subset is None: + filter_expr &= ~m.contains("subset") + else: + filter_expr &= m.get("subset", "") == subset + return filter_expr + + indices = hl.enumerate(freq_meta_expr).filter(lambda f: _get_filter_expr(f[1])) + + # Get an array of indices and meta dictionaries sorted by "downsampling" key. + return hl.sorted(indices, key=lambda f: hl.int(f[1]["downsampling"]))
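+
+# Usage sketch (illustrative, not part of the library): pull the sorted
+# downsampling indices for the 'nfe' group out of a Table's 'freq_meta'
+# global annotation.
+def _example_downsampling_indices(ht: hl.Table) -> List[int]:
+    indices = get_downsampling_freq_indices(ht.freq_meta, pop="nfe")
+    return hl.eval(indices.map(lambda x: x[0]))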
+ + +
[docs]def downsampling_counts_expr( + freq_expr: hl.expr.ArrayExpression, + freq_meta_expr: hl.expr.ArrayExpression, + pop: str = "global", + variant_quality: str = "adj", + singleton: bool = False, + max_af: Optional[float] = None, + genetic_ancestry_label: Optional[str] = None, + subset: Optional[str] = None, + downsamplings: Optional[List[int]] = None, +) -> hl.expr.ArrayExpression: + """ + Return an aggregation expression to compute an array of counts of all downsamplings found in `freq_expr` where specified criteria is met. + + The frequency metadata (`freq_meta_expr`) should be in a similar format to the + `freq_meta` annotation added by `annotate_freq()`. Each downsampling should have + 'group', `genetic_ancestry_label`, and 'downsampling' keys. Included downsamplings + are those where 'group' == `variant_quality` and `genetic_ancestry_label` == `pop`. + + :param freq_expr: ArrayExpression of Structs with 'AC' and 'AF' annotations. + :param freq_meta_expr: ArrayExpression containing the set of groupings for each + element of the `freq_expr` array (e.g., [{'group': 'adj'}, {'group': 'adj', + 'pop': 'nfe'}, {'downsampling': '5000', 'group': 'adj', 'pop': 'global'}]). + :param pop: Population to use for filtering by the `genetic_ancestry_label` key in + `freq_meta_expr`. Default is 'global'. + :param variant_quality: Variant quality to use for filtering by the 'group' key in + `freq_meta_expr`. Default is 'adj'. + :param singleton: Whether to filter to only singletons before counting (AC == 1). + Default is False. + :param max_af: Maximum variant allele frequency to keep. By default no allele + frequency cutoff is applied. + :param genetic_ancestry_label: Label defining the genetic ancestry groups. If None, + "gen_anc" or "pop" is used (in that order of preference) if present. Default is + None. + :param subset: Subset to use for filtering by the 'subset' key in `freq_meta_expr`. + Default is None, which will return all downsampling counts without a 'subset' + key in `freq_meta_expr`. If specified, only downsamplings with the specified + subset will be included. + :param downsamplings: Optional List of integers specifying what downsampling + indices to obtain. Default is None, which will return all downsampling counts. + :return: Aggregation Expression for an array of the variant counts in downsamplings + for specified population. + """ + # Get an array of indices sorted by "downsampling" key. + sorted_indices = get_downsampling_freq_indices( + freq_meta_expr, + pop, + variant_quality, + genetic_ancestry_label, + subset, + downsamplings, + ).map(lambda x: x[0]) + + def _get_criteria(i: hl.expr.Int32Expression) -> hl.expr.Int32Expression: + """ + Return 1 when variant meets specified criteria (`singleton` or `max_af`), if requested, or with an AC > 0. + + :param i: The index of a downsampling. + :return: Returns 1 if the variant in the downsampling with specified index met + the criteria. Otherwise, returns 0. + """ + if singleton: + return hl.int(freq_expr[i].AC == 1) + elif max_af: + return hl.int((freq_expr[i].AC > 0) & (freq_expr[i].AF <= max_af)) + else: + return hl.int(freq_expr[i].AC > 0) + + # Map `_get_criteria` function to each downsampling indexed by `sorted_indices` to + # generate a list of 1's and 0's for each variant, where the length of the array is + # the total number of downsamplings for the specified population and each element + # in the array indicates if the variant in the downsampling indexed by + # `sorted_indices` meets the specified criteria. 
+ # Return an array sum aggregation that aggregates arrays generated from mapping. + return hl.agg.array_sum(hl.map(_get_criteria, sorted_indices))
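A minimal usage sketch, assuming a Table `ht` with `freq` and `freq_meta` annotations in the `annotate_freq()` format:

.. code-block:: python

    # Number of singletons per global downsampling, ordered by downsampling size.
    singleton_counts = ht.aggregate(
        downsampling_counts_expr(ht.freq, ht.freq_meta, singleton=True)
    )

    # Number of rare (AF <= 0.1%) variants per global downsampling.
    rare_counts = ht.aggregate(
        downsampling_counts_expr(ht.freq, ht.freq_meta, max_af=0.001)
    )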
+ + +
[docs]def annotate_mutation_type(
+    t: Union[hl.MatrixTable, hl.Table],
+    context_length: Optional[int] = None,
+    num_scan_context_length: Optional[int] = 100,
+) -> Union[hl.MatrixTable, hl.Table]:
+    """
+    Annotate mutation types.
+
+    The following annotations are added to the output Table:
+        - cpg
+        - transition
+        - mutation_type - one of "CpG", "non-CpG transition", or "transversion"
+        - mutation_type_model
+
+    .. note::
+
+        This function uses the term 'mutation_type' because 'variant_type' is already
+        used in this repo to indicate a variant's multiallelic and SNP/indel status.
+
+    :param t: Input Table or MatrixTable.
+    :param context_length: Length of the 'context' annotation in 't'. If this is not
+        specified, the value will be determined by examining the first
+        `num_scan_context_length` values of the 'context' annotation. Default is None.
+    :param num_scan_context_length: Number of values in the 'context' annotation to use
+        for determining `context_length` if it is not specified. If set to None, all
+        values in 'context' will be used. Default is 100.
+    :return: Table with mutation type annotations added.
+    """
+    if context_length is None:
+        # Determine the context length by collecting all the context lengths.
+        if num_scan_context_length is None:
+            context_lengths = t.aggregate(hl.agg.collect_as_set(hl.len(t.context)))
+            msg = "all"
+        else:
+            context_lengths = hl.len(t.context).take(num_scan_context_length)
+            msg = f"the first {num_scan_context_length}"
+        context_lengths = list(filter(None, set(context_lengths)))
+        if len(context_lengths) > 1:
+            raise ValueError(
+                f"More than one length was found among {msg} 'context' values. Length"
+                " of 'context' should be consistent.",
+            )
+        else:
+            context_length = context_lengths[0]
+            logger.info(
+                "Detected a length of %d for context length using %s 'context' values.",
+                context_length,
+                msg,
+            )
+
+    # Determine the middle index of the context annotation.
+    if context_length == 3:
+        mid_index = 1
+    elif context_length == 7:
+        mid_index = 3
+    else:
+        raise ValueError(
+            "The length of context should be either 3 or 7, instead of"
+            f" {context_length}."
+        )
+
+    transition_expr = hl.is_transition(t.ref, t.alt)
+    cpg_expr = (
+        (t.ref == "G") & (t.alt == "A") & (t.context[mid_index - 1 : mid_index] == "C")
+    ) | (
+        (t.ref == "C")
+        & (t.alt == "T")
+        & (t.context[mid_index + 1 : mid_index + 2] == "G")
+    )
+    if isinstance(t, hl.MatrixTable):
+        t = t.annotate_rows(transition=transition_expr, cpg=cpg_expr)
+    else:
+        t = t.annotate(transition=transition_expr, cpg=cpg_expr)
+    mutation_type_expr = (
+        hl.switch(hl.len(t.context))
+        .when(
+            context_length,
+            hl.case()
+            .when(t.cpg, "CpG")
+            .when(t.transition, "non-CpG transition")
+            .default("transversion"),
+        )
+        .or_error("Found 'context' value with unexpected context length!")
+    )
+    mutation_type_model_expr = hl.if_else(t.cpg, t.context, "non-CpG")
+    if isinstance(t, hl.MatrixTable):
+        return t.annotate_rows(
+            mutation_type=mutation_type_expr,
+            mutation_type_model=mutation_type_model_expr,
+        )
+    else:
+        return t.annotate(
+            mutation_type=mutation_type_expr,
+            mutation_type_model=mutation_type_model_expr,
+        )
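The classification logic can be checked on a toy Table (the rows below are illustrative only):

.. code-block:: python

    import hail as hl

    ht = hl.Table.parallelize(
        [
            {"ref": "C", "alt": "T", "context": "ACG"},  # C>T followed by G -> CpG
            {"ref": "A", "alt": "G", "context": "TAC"},  # transition, not CpG
            {"ref": "A", "alt": "T", "context": "GAC"},  # transversion
        ],
        hl.tstruct(ref=hl.tstr, alt=hl.tstr, context=hl.tstr),
    )
    # mutation_type: "CpG", "non-CpG transition", "transversion"
    annotate_mutation_type(ht).show()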
+ + +
[docs]def trimer_from_heptamer( + t: Union[hl.MatrixTable, hl.Table], +) -> Union[hl.MatrixTable, hl.Table]: + """ + Trim heptamer context to create trimer context. + + :param t: Input MatrixTable or Table with context annotation. + :return: MatrixTable or Table with trimer context annotated. + """ + trimer_expr = hl.if_else(hl.len(t.context) == 7, t.context[2:5], t.context) + return ( + t.annotate_rows(context=trimer_expr) + if isinstance(t, hl.MatrixTable) + else t.annotate(context=trimer_expr) + )
+ + +
[docs]def collapse_strand(
+    t: Union[hl.Table, hl.MatrixTable],
+) -> Union[hl.Table, hl.MatrixTable]:
+    """
+    Return the deduplicated context by collapsing DNA strands.
+
+    Function returns the reverse complement for 'ref', 'alt', and 'context' if the
+    reference allele is either 'G' or 'T'.
+
+    The following annotations are added to the output Table:
+        - was_flipped - whether the 'ref', 'alt', and 'context' were flipped (reverse
+          complement taken)
+
+    :param t: Input Table or MatrixTable.
+    :return: Table or MatrixTable with deduplicated context annotation (ref, alt,
+        context, was_flipped).
+    """
+    ref_g_or_t_expr = (t.ref == "G") | (t.ref == "T")
+    collapse_expr = {
+        "ref": hl.if_else(ref_g_or_t_expr, hl.reverse_complement(t.ref), t.ref),
+        "alt": hl.if_else(ref_g_or_t_expr, hl.reverse_complement(t.alt), t.alt),
+        "context": hl.if_else(
+            ref_g_or_t_expr,
+            hl.reverse_complement(t.context),
+            t.context,
+        ),
+        "was_flipped": ref_g_or_t_expr,
+    }
+    return (
+        t.annotate(**collapse_expr)
+        if isinstance(t, hl.Table)
+        else t.annotate_rows(**collapse_expr)
+    )
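For example, a 'G' reference allele triggers a flip (a minimal sketch with a made-up row):

.. code-block:: python

    import hail as hl

    ht = hl.Table.parallelize(
        [{"ref": "G", "alt": "A", "context": "AGT"}],
        hl.tstruct(ref=hl.tstr, alt=hl.tstr, context=hl.tstr),
    )
    # ref -> "C", alt -> "T", context -> "ACT" (reverse complement),
    # was_flipped -> True.
    collapse_strand(ht).show()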
+ + +
[docs]def build_models( + coverage_ht: hl.Table, + weighted: bool = False, + pops: Tuple[str] = (), + keys: Tuple[str] = ( + "context", + "ref", + "alt", + "methylation_level", + "mu_snp", + ), + high_cov_definition: int = COVERAGE_CUTOFF, + upper_cov_cutoff: Optional[int] = None, + skip_coverage_model: bool = False, +) -> Tuple[Optional[Tuple[float, float]], hl.expr.StructExpression]: + """ + Build coverage and plateau models. + + This function builds models (plateau_models) using linear regression to calibrate + mutation rate estimates against the proportion observed of each substitution, + context, and methylation level in `coverage_ht`. + + Two plateau models are fit, one for CpG transitions, and one for the remainder of + sites (transversions and non CpG transitions). + + The plateau models only consider high coverage sites, or sites above a median + coverage of `high_cov_definition` and median coverage below `upper_cov_cutoff`. + + Plateau model: adjusts proportion of expected variation based on location in the + genome and CpG status. + The x and y of the plateau models: + - x: `mu_snp` - mutation rate + - y: proportion observed ('observed_variants' or 'observed_{pop}' / 'possible_variants') + + This function also builds models (coverage models) to calibrate the proportion of + expected variation at low coverage sites (sites below `high_cov_definition`). + + The coverage models are built by creating a scaling factor across all high coverage + sites, applying this ratio to the low coverage sites, and running a linear + regression. + + Coverage model: corrects proportion of expected variation at low coverage sites. + Low coverage sites are defined as sites with median coverage < `high_cov_definition`. + + The x and y of the coverage model: + - x: log10 groupings of exome coverage at low coverage sites + - y: sum('observed_variants')/ (`high_coverage_scale_factor` * sum('possible_variants' * 'mu_snp') at low coverage sites + + `high_coverage_scale_factor` = sum('observed_variants') / + sum('possible_variants' * 'mu_snp') at high coverage sites + + .. note:: + + This function expects that the input Table(`coverage_ht`) was created using + `get_proportion_observed_by_coverage`, which means that `coverage_ht` should + contain only high quality synonymous variants below 0.1% frequency. + + This function also expects that the following fields are present in + `coverage_ht`: + - context - trinucleotide genomic context + - ref - the reference allele + - alt - the alternate allele + - methylation_level - methylation level + - cpg - whether the site is CpG site + - exome_coverage - median exome coverage at integer values between 1-100 + - observed_variants - the number of observed variants in the dataset for each + variant. Note that the term "variant" here refers to a specific substitution, + context, methylation level, and coverage combination + - downsampling_counts_{pop} (optional) - array of observed variant counts per + population after downsampling. Used only when `pops` is specified. + - mu_snp - mutation rate + - possible_variants - the number of possible variants in the dataset for each + variant + + :param coverage_ht: Input coverage Table. + :param weighted: Whether to weight the plateau models (a linear regression + model) by 'possible_variants'. Default is False. + :param pops: List of populations used to build plateau models. + Default is (). + :param keys: Annotations used to group observed and possible variant counts. 
+ Default is ("context", "ref", "alt", "methylation_level", "mu_snp"). + :param high_cov_definition: Lower median coverage cutoff. Sites with coverage above this cutoff + are considered well covered. Default is `COVERAGE_CUTOFF`. + :param upper_cov_cutoff: Upper median coverage cutoff. Sites with coverage above this cutoff + are excluded from the high coverage Table. Default is None. + :param skip_coverage_model: Whether to skip generating the coverage model. If set to True, + None is returned instead of the coverage model. Default is False. + :return: Coverage model and plateau models. + """ + # Filter to sites with coverage equal to or above `high_cov_definition`. + high_cov_ht = coverage_ht.filter(coverage_ht.exome_coverage >= high_cov_definition) + + # Filter to sites with coverage equal to or below `upper_cov_cutoff` if specified. + if upper_cov_cutoff is not None: + high_cov_ht = high_cov_ht.filter(high_cov_ht.exome_coverage <= upper_cov_cutoff) + + agg_expr = { + "observed_variants": hl.agg.sum(high_cov_ht.observed_variants), + "possible_variants": hl.agg.sum(high_cov_ht.possible_variants), + } + for pop in pops: + agg_expr[f"observed_{pop}"] = hl.agg.array_sum( + high_cov_ht[f"downsampling_counts_{pop}"] + ) + + # Generate a Table with all necessary annotations (x and y listed above) + # for the plateau models. + high_cov_group_ht = high_cov_ht.group_by(*keys).aggregate(**agg_expr) + high_cov_group_ht = annotate_mutation_type(high_cov_group_ht) + + # Build plateau models. + plateau_models_agg_expr = build_plateau_models( + cpg_expr=high_cov_group_ht.cpg, + mu_snp_expr=high_cov_group_ht.mu_snp, + observed_variants_expr=high_cov_group_ht.observed_variants, + possible_variants_expr=high_cov_group_ht.possible_variants, + pops_observed_variants_array_expr=[ + high_cov_group_ht[f"observed_{pop}"] for pop in pops + ], + weighted=weighted, + ) + if pops: + # Map the models to their corresponding populations if pops is specified. + _plateau_models = dict( + high_cov_group_ht.aggregate(hl.struct(**plateau_models_agg_expr)) + ) + pop_models = _plateau_models["pop"] + plateau_models = { + pop: hl.literal(pop_models[idx]) for idx, pop in enumerate(pops) + } + plateau_models["total"] = _plateau_models["total"] + plateau_models = hl.struct(**plateau_models) + else: + plateau_models = high_cov_group_ht.aggregate( + hl.struct(**plateau_models_agg_expr) + ) + + if not skip_coverage_model: + # Filter to sites with coverage below `high_cov_definition` and larger than 0. + low_cov_ht = coverage_ht.filter( + (coverage_ht.exome_coverage < high_cov_definition) + & (coverage_ht.exome_coverage > 0) + ) + + # Create a metric that represents the relative mutability of the exome calculated + # on high coverage sites and will be used as scaling factor when building the + # coverage model. + high_coverage_scale_factor = high_cov_ht.aggregate( + hl.agg.sum(high_cov_ht.observed_variants) + / hl.agg.sum(high_cov_ht.possible_variants * high_cov_ht.mu_snp) + ) + + # Generate a Table with all necessary annotations (x and y listed above) + # for the coverage model. + low_cov_group_ht = low_cov_ht.group_by( + log_coverage=hl.log10(low_cov_ht.exome_coverage) + ).aggregate( + low_coverage_oe=hl.agg.sum(low_cov_ht.observed_variants) + / ( + high_coverage_scale_factor + * hl.agg.sum(low_cov_ht.possible_variants * low_cov_ht.mu_snp) + ) + ) + + # Build the coverage model. 
+ # TODO: consider weighting here as well + coverage_model_expr = build_coverage_model( + low_coverage_oe_expr=low_cov_group_ht.low_coverage_oe, + log_coverage_expr=low_cov_group_ht.log_coverage, + ) + coverage_model = tuple(low_cov_group_ht.aggregate(coverage_model_expr).beta) + else: + coverage_model = None + + return coverage_model, plateau_models
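A minimal usage sketch, assuming `coverage_ht` was produced by `get_proportion_observed_by_coverage` (the cutoff value is chosen purely for illustration):

.. code-block:: python

    coverage_model, plateau_models = build_models(
        coverage_ht,
        weighted=True,
        high_cov_definition=40,  # illustrative cutoff
    )
    # coverage_model: (intercept, slope) of the log10(coverage) regression.
    # plateau_models.total: dict keyed by CpG status -> [intercept, slope].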
+ + +
[docs]def build_plateau_models(
+    cpg_expr: hl.expr.BooleanExpression,
+    mu_snp_expr: hl.expr.Float64Expression,
+    observed_variants_expr: hl.expr.Int64Expression,
+    possible_variants_expr: hl.expr.Int64Expression,
+    pops_observed_variants_array_expr: List[hl.expr.ArrayExpression] = [],
+    weighted: bool = False,
+) -> Dict[str, Union[Dict[bool, hl.expr.ArrayExpression], hl.ArrayExpression]]:
+    """
+    Build plateau models to calibrate mutation rate to compute predicted proportion observed value.
+
+    The x and y of the plateau models:
+        - x: `mu_snp_expr`
+        - y: `observed_variants_expr` / `possible_variants_expr`
+          or `pops_observed_variants_array_expr`[index] / `possible_variants_expr`
+          if `pops` is specified
+
+    :param cpg_expr: BooleanExpression noting whether a site is a CpG site.
+    :param mu_snp_expr: Float64Expression of the mutation rate.
+    :param observed_variants_expr: Int64Expression of the observed variant counts.
+    :param possible_variants_expr: Int64Expression of the possible variant counts.
+    :param pops_observed_variants_array_expr: Nested ArrayExpression with all observed
+        variant counts ArrayNumericExpressions for specified populations. e.g., `[[1,1,
+        1],[1,1,1]]`. Default is [].
+    :param weighted: Whether to generalize the model to weighted least squares using
+        'possible_variants'. Default is False.
+    :return: A dictionary of intercepts and slopes of plateau models. The keys are
+        'total' (for all sites) and 'pop' (optional; for populations). The value for
+        'total' is a dictionary (e.g., <DictExpression of type dict<bool,
+        array<float64>>>), and the value for 'pop' is a nested list of dictionaries (e.
+        g., <ArrayExpression of type array<array<dict<bool, array<float64>>>>>). The
+        key of the dictionary in the nested list is CpG status (BooleanExpression), and
+        the value is an ArrayExpression containing intercept and slope values.
+    """
+    # Build plateau models for all sites.
+    plateau_models_agg_expr = {
+        "total": hl.agg.group_by(
+            cpg_expr,
+            hl.agg.linreg(
+                observed_variants_expr / possible_variants_expr,
+                [1, mu_snp_expr],
+                weight=possible_variants_expr if weighted else None,
+            ).beta,
+        )
+    }
+    if pops_observed_variants_array_expr:
+        # Build plateau models using sites in population downsamplings if
+        # population is specified. Note that the population models are always
+        # weighted by the possible variant counts.
+        plateau_models_agg_expr["pop"] = hl.agg.array_agg(
+            lambda pop_obs_var_array_expr: hl.agg.array_agg(
+                lambda pop_observed_variants: hl.agg.group_by(
+                    cpg_expr,
+                    hl.agg.linreg(
+                        pop_observed_variants / possible_variants_expr,
+                        [1, mu_snp_expr],
+                        weight=possible_variants_expr,
+                    ).beta,
+                ),
+                pop_obs_var_array_expr,
+            ),
+            pops_observed_variants_array_expr,
+        )
+    return plateau_models_agg_expr
+ + +
[docs]def build_coverage_model( + low_coverage_oe_expr: hl.expr.Float64Expression, + log_coverage_expr: hl.expr.Float64Expression, +) -> hl.expr.StructExpression: + """ + Build coverage model. + + This function uses linear regression to build a model of log10(coverage) to correct + proportion of expected variation at low coverage sites. + + The x and y of the coverage model: + - x: `log_coverage_expr` + - y: `low_coverage_oe_expr` + + :param low_coverage_oe_expr: The Float64Expression of observed:expected ratio + for a given coverage level. + :param log_coverage_expr: The Float64Expression of log10 coverage. + :return: StructExpression with intercept and slope of the model. + """ + return hl.agg.linreg(low_coverage_oe_expr, [1, log_coverage_expr])
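A sketch of how the fitted (intercept, slope) pair might be turned into a per-site correction factor, assuming a Table `ht` with an 'exome_coverage' annotation and the `coverage_model` and `high_cov_definition` values from `build_models` above:

.. code-block:: python

    cov_corr_expr = (
        hl.case()
        .when(ht.exome_coverage >= high_cov_definition, 1.0)  # no correction needed
        .when(ht.exome_coverage == 0, 0.0)  # uncovered sites contribute nothing
        .default(coverage_model[1] * hl.log10(ht.exome_coverage) + coverage_model[0])
    )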
+ + +
[docs]def get_all_pop_lengths(
+    ht: hl.Table,
+    pops: Tuple[str],
+    obs_expr: hl.expr.StructExpression,
+) -> List[Tuple[int, str]]:
+    """
+    Get the minimum length of observed variant counts array for each population downsampling.
+
+    The observed variant counts for each population in `pops` are specified by
+    annotations on the `obs_expr` expression.
+
+    The function also performs a check that arrays of variant counts within population
+    downsamplings all have the same lengths.
+
+    :param ht: Input Table containing `obs_expr`.
+    :param pops: Populations used to categorize observed variant counts in downsamplings.
+    :param obs_expr: Expression for the population observed variant counts. Should be a
+        struct containing an array for each pop in `pops`.
+    :return: A list of (minimum array length, population name) tuples, one per
+        population.
+    """
+    # TODO: This function will be converted into doing just the length check if there
+    # is no usage of pop_lengths in the constraint pipeline.
+    # Get minimum length of downsamplings for each population.
+    pop_downsampling_lengths = ht.aggregate(
+        [hl.agg.min(hl.len(obs_expr[pop])) for pop in pops]
+    )
+
+    # Zip population name with their downsampling length.
+    pop_lengths = list(zip(pop_downsampling_lengths, pops))
+    logger.info("Found: %s", ", ".join(map(str, pop_lengths)))
+
+    assert ht.all(
+        hl.all(
+            lambda f: f,
+            [hl.len(obs_expr[pop]) == length for length, pop in pop_lengths],
+        )
+    ), (
+        "The arrays of variant counts within population downsamplings have different"
+        " lengths!"
+    )
+
+    return pop_lengths
+ + +
[docs]def get_constraint_grouping_expr( + vep_annotation_expr: hl.StructExpression, + coverage_expr: Optional[hl.Int32Expression] = None, + include_transcript_group: bool = True, + include_canonical_group: bool = True, + include_mane_select_group: bool = False, +) -> Dict[str, Union[hl.StringExpression, hl.Int32Expression, hl.BooleanExpression]]: + """ + Collect annotations used for constraint groupings. + + Function collects the following annotations: + - annotation - 'most_severe_consequence' annotation in `vep_annotation_expr` + - modifier - classic lof annotation from 'lof' annotation in + `vep_annotation_expr`, LOFTEE annotation from 'lof' annotation in + `vep_annotation_expr`, PolyPhen annotation from 'polyphen_prediction' in + `vep_annotation_expr`, or "None" if neither is defined + - gene - 'gene_symbol' annotation inside `vep_annotation_expr` + - coverage - exome coverage if `coverage_expr` is specified + - transcript - id from 'transcript_id' in `vep_annotation_expr` (added when + `include_transcript_group` is True) + - canonical from `vep_annotation_expr` (added when `include_canonical_group` is + True) + - mane_select from `vep_annotation_expr` (added when `include_mane_select_group` is + True) + + .. note:: + This function expects that the following fields are present in + `vep_annotation_expr`: + - lof + - polyphen_prediction + - most_severe_consequence + - gene_symbol + - transcript_id (if `include_transcript_group` is True) + - canonical (if `include_canonical_group` is True) + - mane_select (if `include_mane_select_group` is True) + + :param vep_annotation_expr: StructExpression of VEP annotation. + :param coverage_expr: Optional Int32Expression of exome coverage. Default is None. + :param include_transcript_group: Whether to include the transcript annotation in the + groupings. Default is True. + :param include_canonical_group: Whether to include canonical annotation in the + groupings. Default is True. + :param include_mane_select_group: Whether to include mane_select annotation in the + groupings. Default is False. + + :return: A dictionary with keys as annotation names and values as actual + annotations. + """ + lof_expr = vep_annotation_expr.lof + polyphen_prediction_expr = vep_annotation_expr.polyphen_prediction + + # Create constraint annotations to be used for groupings. + groupings = { + "annotation": vep_annotation_expr.most_severe_consequence, + "modifier": hl.coalesce(lof_expr, polyphen_prediction_expr, "None"), + "gene": vep_annotation_expr.gene_symbol, + "gene_id": vep_annotation_expr.gene_id, + } + if coverage_expr is not None: + groupings["coverage"] = coverage_expr + + # Add 'transcript' and 'canonical' annotation if requested. + if include_transcript_group: + groupings["transcript"] = vep_annotation_expr.transcript_id + if include_canonical_group: + groupings["canonical"] = hl.or_else(vep_annotation_expr.canonical == 1, False) + if include_mane_select_group: + groupings["mane_select"] = hl.or_else( + hl.is_defined(vep_annotation_expr.mane_select), False + ) + + return groupings
+ + +
[docs]def annotate_exploded_vep_for_constraint_groupings(
+    ht: hl.Table,
+    vep_annotation: str = "transcript_consequences",
+    include_canonical_group: bool = True,
+    include_mane_select_group: bool = False,
+) -> Tuple[hl.Table, Tuple[str]]:
+    """
+    Annotate Table with annotations used for constraint groupings.
+
+    Function explodes the specified VEP annotation (`vep_annotation`) and adds the following annotations:
+        - annotation - 'most_severe_consequence' annotation in `vep_annotation`
+        - modifier - classic lof annotation from 'lof' annotation in
+          `vep_annotation`, LOFTEE annotation from 'lof' annotation in
+          `vep_annotation`, PolyPhen annotation from 'polyphen_prediction' in
+          `vep_annotation`, or "None" if neither is defined
+        - gene - 'gene_symbol' annotation inside `vep_annotation`
+        - coverage - exome coverage in `ht`
+        - transcript - id from 'transcript_id' in `vep_annotation` (added when
+          `vep_annotation` is "transcript_consequences")
+        - canonical from `vep_annotation` (added when `include_canonical_group` is
+          True)
+        - mane_select from `vep_annotation` (added when `include_mane_select_group` is
+          True)
+
+    .. note::
+        This function expects that the following annotations are present in `ht`:
+            - vep
+            - exome_coverage
+
+    :param ht: Input Table.
+    :param vep_annotation: Name of annotation in 'vep' annotation (one of
+        "transcript_consequences" and "worst_csq_by_gene") that will be used for
+        obtaining constraint annotations. Default is "transcript_consequences".
+    :param include_canonical_group: Whether to include 'canonical' annotation in the
+        groupings. Default is True. Ignored unless `vep_annotation` is "transcript_consequences".
+    :param include_mane_select_group: Whether to include 'mane_select' annotation in the
+        groupings. Default is False. Ignored unless `vep_annotation` is "transcript_consequences".
+    :return: A tuple of the input Table with grouping annotations added and
+        the names of the added annotations.
+    """
+    if vep_annotation == "transcript_consequences":
+        if not include_canonical_group and not include_mane_select_group:
+            raise ValueError(
+                "If 'vep_annotation' is 'transcript_consequences', one of either"
+                " 'include_canonical_group' or 'include_mane_select_group' must be set!"
+            )
+        include_transcript_group = True
+    else:
+        logger.warning(
+            "Setting both 'include_canonical_group' and 'include_mane_select_group' to"
+            " False (options cannot be used unless 'vep_annotation' is"
+            " 'transcript_consequences')."
+        )
+        include_transcript_group = False
+        include_canonical_group = False
+        include_mane_select_group = False
+
+    # Annotate 'worst_csq_by_gene' to `ht` if it's specified for `vep_annotation`.
+    if vep_annotation == "worst_csq_by_gene":
+        ht = process_consequences(ht)
+
+    # Explode the specified VEP annotation.
+    ht = explode_by_vep_annotation(ht, vep_annotation)
+
+    # Collect the annotations used for groupings.
+    groupings = get_constraint_grouping_expr(
+        ht[vep_annotation],
+        coverage_expr=ht.exome_coverage,
+        include_transcript_group=include_transcript_group,
+        include_canonical_group=include_canonical_group,
+        include_mane_select_group=include_mane_select_group,
+    )
+
+    return ht.annotate(**groupings), tuple(groupings.keys())
+ + +
[docs]def compute_expected_variants(
+    ht: hl.Table,
+    plateau_models_expr: hl.StructExpression,
+    mu_expr: hl.Float64Expression,
+    cov_corr_expr: hl.Float64Expression,
+    possible_variants_expr: hl.Int64Expression,
+    cpg_expr: hl.BooleanExpression,
+    pop: Optional[str] = None,
+) -> Dict[str, Union[hl.Float64Expression, hl.Int64Expression]]:
+    """
+    Apply plateau models for all sites and for a population (if specified) to compute the predicted proportion observed ratio and expected variant counts.
+
+    :param ht: Input Table.
+    :param plateau_models_expr: Linear models (output of `build_models()`), with the
+        values of the dictionary formatted as StructExpressions of intercept and
+        slope, that calibrate mutation rate to proportion observed for high coverage
+        exomes. It includes models for CpG sites, non-CpG sites, and each population
+        if specified.
+    :param mu_expr: Float64Expression of mutation rate.
+    :param cov_corr_expr: Float64Expression of corrected coverage expression.
+    :param possible_variants_expr: Int64Expression of possible variant counts.
+    :param cpg_expr: BooleanExpression noting whether a site is a CpG site.
+    :param pop: Optional population to use when applying plateau model. Default is
+        None.
+    :return: A dictionary with predicted proportion observed ratio and expected variant
+        counts.
+    """
+    if pop is None:
+        pop = ""
+        plateau_model = hl.literal(plateau_models_expr.total)[cpg_expr]
+        slope = plateau_model[1]
+        intercept = plateau_model[0]
+        agg_func = hl.agg.sum
+        ann_to_sum = ["observed_variants", "possible_variants"]
+    else:
+        plateau_model = hl.literal(plateau_models_expr[pop])
+        slope = hl.map(lambda f: f[cpg_expr][1], plateau_model)
+        intercept = hl.map(lambda f: f[cpg_expr][0], plateau_model)
+        agg_func = hl.agg.array_sum
+        pop = f"_{pop}"
+        ann_to_sum = [f"downsampling_counts{pop}"]
+
+    # Apply plateau models for the specified population.
+    ppo_expr = mu_expr * slope + intercept
+
+    # Generate sum aggregators for 'predicted_proportion_observed' and
+    # 'expected_variants' for the specified population.
+    agg_expr = {
+        f"predicted_proportion_observed{pop}": agg_func(ppo_expr),
+        f"expected_variants{pop}": agg_func(
+            ppo_expr * cov_corr_expr * possible_variants_expr
+        ),
+    }
+
+    # Generate sum aggregators for 'observed_variants' and 'possible_variants' on
+    # the entire dataset if pop is None, and for 'downsampling_counts' for the
+    # specified population if pop is not None.
+    agg_expr.update({ann: agg_func(ht[ann]) for ann in ann_to_sum})
+
+    return agg_expr
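Because the function returns a dictionary of aggregators, it is normally unpacked inside a grouped aggregation. A sketch, assuming `ht` carries the constraint grouping annotations plus 'observed_variants', 'possible_variants', 'mu_snp', 'coverage_correction', and 'cpg' (names here are illustrative):

.. code-block:: python

    grouped_ht = ht.group_by("gene", "transcript").aggregate(
        **compute_expected_variants(
            ht,
            plateau_models_expr=plateau_models,
            mu_expr=ht.mu_snp,
            cov_corr_expr=ht.coverage_correction,
            possible_variants_expr=ht.possible_variants,
            cpg_expr=ht.cpg,
        )
    )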
+ + +
[docs]def oe_aggregation_expr( + ht: hl.Table, + filter_expr: hl.expr.BooleanExpression, + pops: Tuple[str] = (), + exclude_mu_sum: bool = False, +) -> hl.expr.StructExpression: + """ + Get aggregation expressions to compute the observed:expected ratio for rows defined by `filter_expr`. + + Return a Struct containing aggregation expressions to sum the number of observed + variants, possible variants, expected variants, and mutation rate (if + `exclude_mu_sum` is not True) for rows defined by `filter_expr`. The Struct also + includes an aggregation expression for the observed:expected ratio. + + The following annotations are in the returned StructExpression: + - obs - the sum of observed variants filtered to `filter_expr`. + - mu - the sum of mutation rate of variants filtered to `filter_expr`. + - possible - possible number of variants filtered to `filter_expr`. + - exp - expected number of variants filtered to `filter_expr`. + - oe - observed:expected ratio of variants filtered to `filter_expr`. + + If `pops` is specified: + - pop_exp - Struct with the expected number of variants per population (for + all pop in `pops`) filtered to `filter_expr`. + - pop_obs - Struct with the observed number of variants per population (for + all pop in `pops`) filtered to `filter_expr`. + + .. note:: + The following annotations should be present in `ht`: + - observed_variants + - mu + - possible_variants + - expected_variants + If `pops` is specified, the following annotations should also be present: + - expected_variants_{pop} for all pop in `pops` + - downsampling_counts_{pop} for all pop in `pops` + + :param ht: Input Table to create observed:expected ratio aggregation expressions for. + :param filter_expr: Boolean expression used to filter `ht` before aggregation. + :param pops: List of populations to compute constraint metrics for. Default is (). + :param exclude_mu_sum: Whether to exclude mu sum aggregation expression from + returned struct. Default is False. + :return: StructExpression with observed:expected ratio aggregation expressions. + """ + # Create aggregators that sum the number of observed variants, possible variants, + # and expected variants and compute observed:expected ratio. + agg_expr = { + "obs": hl.agg.sum(ht.observed_variants), + "exp": hl.agg.sum(ht.expected_variants), + "possible": hl.agg.sum(ht.possible_variants), + } + agg_expr["oe"] = divide_null(agg_expr["obs"], agg_expr["exp"]) + + # Create an aggregator that sums the mutation rate. + if not exclude_mu_sum: + agg_expr["mu"] = hl.agg.sum(ht.mu) + + # Create aggregators that sum the number of observed variants + # and expected variants for each population if pops is specified. + if pops: + agg_expr["pop_exp"] = hl.struct( + **{pop: hl.agg.array_sum(ht[f"expected_variants_{pop}"]) for pop in pops} + ) + agg_expr["pop_obs"] = hl.struct( + **{pop: hl.agg.array_sum(ht[f"downsampling_counts_{pop}"]) for pop in pops} + ) + + agg_expr = hl.struct(**agg_expr) + return hl.agg.group_by(filter_expr, agg_expr).get(True, hl.missing(agg_expr.dtype))
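A usage sketch, assuming `ht` has the annotations listed above plus the 'annotation'/'modifier' grouping fields:

.. code-block:: python

    # Observed:expected metrics for LOFTEE high-confidence stop-gained variants,
    # per transcript.
    lof_metrics_ht = ht.group_by("transcript").aggregate(
        lof=oe_aggregation_expr(
            ht, (ht.annotation == "stop_gained") & (ht.modifier == "HC")
        )
    )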
+ + +
[docs]def compute_pli(
+    ht: hl.Table,
+    obs_expr: hl.expr.Int64Expression,
+    exp_expr: hl.expr.Float64Expression,
+    expected_values: Optional[Dict[str, float]] = None,
+    min_diff_convergence: float = 0.001,
+) -> hl.StructExpression:
+    """
+    Compute the pLI score using the observed and expected variant counts.
+
+    Full details on pLI can be found in the ExAC paper: Lek, M., Karczewski, K.,
+    Minikel, E. et al. Analysis of protein-coding genetic variation in 60,706 humans.
+    Nature 536, 285–291 (2016).
+
+    pLI is the probability of being loss-of-function intolerant, and this function
+    computes that probability using the expectation-maximization (EM) algorithm.
+
+    We assume a 3 state model, where each gene fits into one of three categories
+    with respect to loss-of-function variation sensitivity:
+
+        - Null: where protein truncating variation is completely tolerated by natural
+          selection.
+        - Recessive (Rec): where heterozygous pLoFs are tolerated but homozygous pLoFs
+          are not.
+        - Haploinsufficient (LI): where heterozygous pLoFs are not tolerated.
+
+    The function requires the expected amount of loss-of-function depletion for each of
+    these states. The default provided is based on the observed depletion of
+    protein-truncating variation in the Blekhman autosomal recessive and ClinGen
+    dosage sensitivity gene sets (Supplementary Information Table 12 of the above
+    reference):
+
+        - Null: 1.0, assume tolerant genes have the expected amount of truncating
+          variation.
+        - Rec: 0.463, derived from the empirical mean observed/expected rate of
+          truncating variation for recessive disease genes.
+        - LI: 0.089, derived from the empirical mean observed/expected rate of
+          truncating variation for severe haploinsufficient genes.
+
+    The output StructExpression will include the following annotations:
+
+        - pLI: Probability of loss-of-function intolerance; probability that transcript
+          falls into distribution of haploinsufficient genes.
+        - pNull: Probability that transcript falls into distribution of unconstrained
+          genes.
+        - pRec: Probability that transcript falls into distribution of recessive genes.
+
+    :param ht: Input Table containing `obs_expr` and `exp_expr`.
+    :param obs_expr: Expression for the number of observed variants on each gene or
+        transcript in `ht`.
+    :param exp_expr: Expression for the number of expected variants on each gene or
+        transcript in `ht`.
+    :param expected_values: Dictionary containing the expected observed/expected rates
+        of truncating variation for 'Null', 'Rec', and 'LI' genes. Default is None,
+        which uses {'Null': 1.0, 'Rec': 0.463, 'LI': 0.089}.
+    :param min_diff_convergence: Minimum iteration change in LI to consider the EM
+        model convergence criteria as met. Default is 0.001.
+    :return: StructExpression for pLI scores.
+    """
+    if expected_values is None:
+        expected_values = {"Null": 1.0, "Rec": 0.463, "LI": 0.089}
+
+    # Set up initial values: start the EM with uniform mixture proportions.
+    last_pi = {k: 0 for k in expected_values.keys()}
+    pi = {k: 1 / len(expected_values.keys()) for k in expected_values.keys()}
+
+    dpois_expr = {
+        k: hl.or_missing(
+            exp_expr > 0, hl.dpois(obs_expr, exp_expr * expected_values[k])
+        )
+        for k in pi
+    }
+    _ht = ht.select(dpois=dpois_expr)
+    # Checkpoint the temp HT because it will need to be aggregated several times.
+    _ht = _ht.checkpoint(new_temp_file(prefix="compute_pli", extension="ht"))
+
+    # Calculate pLI scores.
+ while abs(pi["LI"] - last_pi["LI"]) > min_diff_convergence: + last_pi = copy.deepcopy(pi) + pi_expr = {k: v * _ht.dpois[k] for k, v in pi.items()} + row_sum_expr = hl.sum([pi_expr[k] for k in pi]) + pi_expr = {k: pi_expr[k] / row_sum_expr for k, v in pi.items()} + pi = _ht.aggregate({k: hl.agg.mean(pi_expr[k]) for k in pi.keys()}) + + # Get expression for pLI scores. + pli_expr = {k: v * dpois_expr[k] for k, v in pi.items()} + row_sum_expr = hl.sum([pli_expr[k] for k in pi]) + + return hl.struct(**{f"p{k}": pli_expr[k] / row_sum_expr for k in pi.keys()})
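A usage sketch on a per-transcript Table (the 'obs_lof'/'exp_lof' field names are hypothetical):

.. code-block:: python

    # compute_pli returns a StructExpression, so it can be unpacked directly.
    ht = ht.annotate(**compute_pli(ht, obs_expr=ht.obs_lof, exp_expr=ht.exp_lof))
    # Adds pLI, pRec, and pNull annotations.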
+ + +
[docs]def oe_confidence_interval( + obs_expr: hl.expr.Int64Expression, + exp_expr: hl.expr.Float64Expression, + alpha: float = 0.05, +) -> hl.expr.StructExpression: + """ + Determine the confidence interval around the observed:expected ratio. + + For a given pair of observed (`obs_expr`) and expected (`exp_expr`) values, the + function computes the density of the Poisson distribution (performed using Hail's + `dpois` module) with fixed k (`x` in `dpois` is set to the observed number of + variants) over a range of lambda (`lamb` in `dpois`) values, which are given by the + expected number of variants times a varying parameter ranging between 0 and 2 (the + observed:expected ratio is typically between 0 and 1, so we want to extend the + upper bound of the confidence interval to capture this). The cumulative density + function of the Poisson distribution density is computed and the value of the + varying parameter is extracted at points corresponding to `alpha` (defaults to 5%) + and 1-`alpha` (defaults to 95%) to indicate the lower and upper bounds of the + confidence interval. + + The following annotations are in the output StructExpression: + - lower - the lower bound of confidence interval + - upper - the upper bound of confidence interval + + :param obs_expr: Expression for the observed variant counts of pLoF, missense, or + synonymous variants in `ht`. + :param exp_expr: Expression for the expected variant counts of pLoF, missense, or + synonymous variants in `ht`. + :param alpha: The significance level used to compute the confidence interval. + Default is 0.05. + :return: StructExpression for the confidence interval lower and upper bounds. + """ + # Set up range between 0 and 2. + range_expr = hl.range(0, 2000).map(lambda x: hl.float64(x) / 1000) + range_dpois_expr = range_expr.map(lambda x: hl.dpois(obs_expr, exp_expr * x)) + + # Compute cumulative density function of the Poisson distribution density. + cumulative_dpois_expr = hl.cumulative_sum(range_dpois_expr) + max_cumulative_dpois_expr = cumulative_dpois_expr[-1] + norm_dpois_expr = cumulative_dpois_expr.map(lambda x: x / max_cumulative_dpois_expr) + + # Extract the value of the varying parameter within specified range. + lower_idx_expr = hl.argmax( + norm_dpois_expr.map(lambda x: hl.or_missing(x < alpha, x)) + ) + upper_idx_expr = hl.argmin( + norm_dpois_expr.map(lambda x: hl.or_missing(x > 1 - alpha, x)) + ) + return hl.struct( + lower=hl.if_else(obs_expr > 0, range_expr[lower_idx_expr], 0), + upper=range_expr[upper_idx_expr], + )
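A usage sketch with the same hypothetical field names as above:

.. code-block:: python

    ht = ht.annotate(oe_ci=oe_confidence_interval(ht.obs_lof, ht.exp_lof))
    # With the default alpha of 0.05, ht.oe_ci.upper is the upper bound of the
    # 90% confidence interval around obs/exp; applied to pLoF counts, this is
    # the quantity gnomAD reports as LOEUF.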
+ + +
[docs]def calculate_raw_z_score( + obs_expr: hl.expr.Int64Expression, + exp_expr: hl.expr.Float64Expression, +) -> hl.expr.StructExpression: + """ + Compute the signed raw z-score using observed and expected variant counts. + + The raw z-scores are positive when the transcript had fewer variants than expected, + and are negative when transcripts had more variants than expected. + + :param obs_expr: Observed variant count expression. + :param exp_expr: Expected variant count expression. + :return: StructExpression for the raw z-score. + """ + chisq_expr = divide_null((obs_expr - exp_expr) ** 2, exp_expr) + return hl.sqrt(chisq_expr) * hl.if_else(obs_expr > exp_expr, -1, 1)
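For instance, 5 observed versus 10 expected variants gives a positive score (a quick check via `hl.eval`):

.. code-block:: python

    import hail as hl

    # sqrt((5 - 10)**2 / 10) ~= 1.58; obs < exp, so the sign is positive.
    hl.eval(calculate_raw_z_score(hl.int64(5), hl.float64(10.0)))  # ~1.58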
+ + +
[docs]def get_constraint_flags( + exp_expr: hl.expr.Float64Expression, + raw_z_expr: hl.expr.Float64Expression, + raw_z_lower_threshold: Optional[float] = -5.0, + raw_z_upper_threshold: Optional[float] = 5.0, + flag_postfix: str = "", +) -> Dict[str, hl.expr.Expression]: + """ + Determine the constraint flags that define why constraint will not be calculated. + + Flags which are added: + - "no_exp_{flag_postfix}" - for genes that have missing or zero expected variants. + - "outlier_{flag_postfix}" - for genes that are raw z-score outliers: + (`raw_z_expr` < `raw_z_lower_threshold`) or (`raw_z_expr` > + `raw_z_upper_threshold`). + + :param exp_expr: Expression for the expected variant counts of pLoF, missense, or + synonymous variants. + :param raw_z_expr: Expression for the signed raw z-score of pLoF, missense, or + synonymous variants. + :param raw_z_lower_threshold: Lower threshold for the raw z-score. When `raw_z_expr` + is less than this threshold it is considered an 'outlier'. Default is -5.0. + :param raw_z_upper_threshold: Upper threshold for the raw z-score. When `raw_z_expr` + is greater than this threshold it is considered an 'outlier'. Default is 5.0. + :param flag_postfix: Postfix to add to the end of the constraint flag names. + :return: Dictionary containing expressions for constraint flags. + """ + outlier_expr = False + if raw_z_lower_threshold is not None: + outlier_expr |= raw_z_expr < raw_z_lower_threshold + if raw_z_upper_threshold is not None: + outlier_expr |= raw_z_expr > raw_z_upper_threshold + + if flag_postfix: + flag_postfix = f"_{flag_postfix}" + + constraint_flags = { + f"no_exp{flag_postfix}": hl.or_else(exp_expr <= 0, True), + f"outlier{flag_postfix}": hl.or_else(outlier_expr, False), + } + + return constraint_flags
+ + +
[docs]def calculate_raw_z_score_sd(
+    raw_z_expr: hl.expr.Float64Expression,
+    flag_expr: hl.expr.StringExpression,
+    mirror_neg_raw_z: bool = True,
+) -> hl.expr.Expression:
+    """
+    Calculate the standard deviation of the raw z-score.
+
+    When `mirror_neg_raw_z` is True, all the negative raw z-scores (defined by
+    `raw_z_expr`) are combined with those same z-scores multiplied by -1 (to create a
+    mirrored distribution), and the standard deviation is computed on that
+    distribution.
+
+    :param raw_z_expr: Expression for the raw z-score.
+    :param flag_expr: Expression for the constraint flags. The standard deviation will
+        not be computed over rows where flags are present.
+    :param mirror_neg_raw_z: Whether the standard deviation should be computed using a
+        mirrored distribution of negative `raw_z_expr`.
+    :return: Aggregation expression for the standard deviation of the raw z-score.
+    """
+    filter_expr = (hl.len(flag_expr) == 0) & hl.is_defined(raw_z_expr)
+
+    if mirror_neg_raw_z:
+        filter_expr &= raw_z_expr < 0
+        sd_expr = hl.agg.explode(
+            lambda x: hl.agg.stats(x), [raw_z_expr, -raw_z_expr]
+        ).stdev
+    else:
+        sd_expr = hl.agg.stats(raw_z_expr).stdev
+
+    return hl.agg.filter(filter_expr, sd_expr)
+ + +
[docs]def add_gencode_transcript_annotations( + ht: hl.Table, + gencode_ht: hl.Table, + annotations: List[str] = ["level", "transcript_type"], +) -> hl.Table: + """ + Add GENCODE annotations to Table based on transcript id. + + .. note:: + Added annotations by default are: + - level + - transcript_type + + Computed annotations are: + - chromosome + - cds_length + - num_coding_exons + + :param ht: Input Table. + :param gencode_ht: Table with GENCODE annotations. + :param annotations: List of GENCODE annotations to add. Default is ["level", "transcript_type"]. + Added annotations also become keys for the group by when computing "cds_length" and "num_coding_exons". + :return: Table with transcript annotations from GENCODE added. + """ + gencode_ht = gencode_ht.annotate( + length=gencode_ht.interval.end.position + - gencode_ht.interval.start.position + + 1, + chromosome=gencode_ht.interval.start.contig, + ) + + # Obtain CDS annotations from GENCODE file and calculate CDS length and + # number of exons. + annotations_to_add = set(annotations + ["chromosome", "transcript_id", "length"]) + + gencode_cds = ( + gencode_ht.filter(gencode_ht.feature == "CDS") + .select(*annotations_to_add) + .key_by("transcript_id") + .drop("interval") + ) + + annotations_to_add.remove("length") + + gencode_cds = ( + gencode_cds.group_by(*annotations_to_add) + .aggregate( + cds_length=hl.agg.sum(gencode_cds.length), num_coding_exons=hl.agg.count() + ) + .key_by("transcript_id") + ) + + gencode_cds = gencode_cds.checkpoint( + new_temp_file(prefix="gencode_cds", extension="ht") + ) + + # Add GENCODE annotations to input Table. + ht = ht.annotate(**gencode_cds[ht.transcript]) + + return ht
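A usage sketch, assuming the GENCODE resource Table and an input `ht` with a 'transcript' field:

.. code-block:: python

    from gnomad.resources.grch38.reference_data import gencode

    ht = add_gencode_transcript_annotations(ht, gencode.ht())
    # Adds: level, transcript_type, chromosome, cds_length, num_coding_exons.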
\ No newline at end of file
diff --git a/_modules/gnomad/utils/file_utils.html b/_modules/gnomad/utils/file_utils.html
new file mode 100644
index 000000000..617fcfd7a
--- /dev/null
+++ b/_modules/gnomad/utils/file_utils.html
@@ -0,0 +1,326 @@

Source code for gnomad.utils.file_utils

+# noqa: D100
+
+import base64
+import gzip
+import logging
+import os
+import subprocess
+import uuid
+from typing import List, Optional, Tuple, Union
+
+import hail as hl
+
+from gnomad.resources.resource_utils import DataException
+
+logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s")
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+
+
[docs]def file_exists(fname: str) -> bool: + """ + Check whether a file exists. + + Supports either local or Google cloud (gs://) paths. + If the file is a Hail file (.ht, .mt, .bm, .parquet, .he, and .vds extensions), it + checks that _SUCCESS is present. + + :param fname: File name. + :return: Whether the file exists. + """ + fext = os.path.splitext(fname)[1] + if fext in {".ht", ".mt", ".bm", ".parquet", ".he"}: + paths = [f"{fname}/_SUCCESS"] + elif fext == ".vds": + paths = [f"{fname}/reference_data/_SUCCESS", f"{fname}/variant_data/_SUCCESS"] + else: + paths = [fname] + + if fname.startswith("gs://"): + exists_func = hl.hadoop_exists + else: + exists_func = os.path.isfile + + exists = all([exists_func(p) for p in paths]) + + return exists
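For example (hypothetical paths):

.. code-block:: python

    file_exists("gs://my-bucket/data/table.ht")   # checks .../table.ht/_SUCCESS
    file_exists("gs://my-bucket/data/calls.vds")  # checks both component _SUCCESS files
    file_exists("/tmp/samples.tsv")               # plain local file check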
+ + +
[docs]def check_file_exists_raise_error(
+    fname: Union[str, List[str]],
+    error_if_exists: bool = False,
+    error_if_not_exists: bool = False,
+    error_if_exists_msg: str = "The following files already exist: ",
+    error_if_not_exists_msg: str = "The following files do not exist: ",
+) -> bool:
+    """
+    Check whether the file or all files in a list of files exist and optionally raise an exception.
+
+    This can be useful when writing out to files at the end of a pipeline to first check if the file already
+    exists and therefore requires the file to be removed or overwrite specified so the pipeline doesn't fail.
+
+    :param fname: File path, or list of file paths to check the existence of.
+    :param error_if_exists: Whether to raise an exception if any of the files exist. Default is False.
+    :param error_if_not_exists: Whether to raise an exception if any of the files do not exist. Default is False.
+    :param error_if_exists_msg: String of the error message to print if any of the files exist.
+    :param error_if_not_exists_msg: String of the error message to print if any of the files do not exist.
+    :return: Boolean indicating if `fname` or all files in `fname` exist.
+    """
+    if isinstance(fname, str):
+        fname = [fname]
+
+    all_exist = True
+    exist = []
+    not_exist = []
+    for f in fname:
+        exists = file_exists(f)
+        all_exist &= exists
+        if exists and error_if_exists:
+            exist.append(f)
+        if not exists and error_if_not_exists:
+            not_exist.append(f)
+
+    error_msg = ""
+    if exist:
+        error_msg = error_if_exists_msg + ", ".join(exist)
+    if not_exist:
+        error_msg = error_msg + "\n" + error_if_not_exists_msg + ", ".join(not_exist)
+    if error_msg:
+        raise DataException(error_msg)
+
+    return all_exist
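A typical use at the start of a pipeline, failing fast when outputs already exist and overwrite was not requested (the paths and `args.overwrite` flag are hypothetical):

.. code-block:: python

    check_file_exists_raise_error(
        ["gs://my-bucket/out/results.ht", "gs://my-bucket/out/results.tsv.bgz"],
        error_if_exists=not args.overwrite,
    )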
+ + +
[docs]def write_temp_gcs(
+    t: Union[hl.MatrixTable, hl.Table],
+    gcs_path: str,
+    overwrite: bool = False,
+    temp_path: Optional[str] = None,
+) -> None:
+    """
+    Write a MatrixTable or Table to a temporary path, read it back, and write it to the final GCS path.
+
+    Writing to a temporary location and reading back first forces evaluation, so the
+    final write does not recompute the full pipeline.
+
+    :param t: Input MatrixTable or Table.
+    :param gcs_path: Final path to write to.
+    :param overwrite: Whether to overwrite `gcs_path` if it already exists. Default is False.
+    :param temp_path: Optional temporary path to use for the intermediate write.
+    :return: None.
+    """
+    if not temp_path:
+        temp_path = f"/tmp_{uuid.uuid4()}.h"
+    t.write(temp_path, overwrite=True)
+    t = (
+        hl.read_matrix_table(temp_path)
+        if isinstance(t, hl.MatrixTable)
+        else hl.read_table(temp_path)
+    )
+    t.write(gcs_path, overwrite=overwrite)
+ + +
[docs]def select_primitives_from_ht(ht: hl.Table) -> hl.Table: + """ + Select only primitive types (string, int, float, bool) from a Table. + + Particularly useful for exporting a Table. + + :param ht: Input Table + :return: Table with only primitive types selected + """ + return ht.select( + **{ + x: v + for x, v in ht.row_value.items() + if v.dtype + in {hl.tstr, hl.tint32, hl.tfloat32, hl.tint64, hl.tfloat64, hl.tbool} + } + )
+ + +
[docs]def get_file_stats(url: str, project_id: Optional[str] = None) -> Tuple[int, str, str]: + """ + Get size (as both int and str) and md5 for file at specified URL. + + Typically used to get stats on VCFs. + + :param url: Path to file of interest. + :param project_id: Google project ID. Specify if URL points to a requester-pays bucket. + :return: Tuple of file size and md5. + """ + one_gibibyte = 2**30 + one_mebibyte = 2**20 + + if project_id: + output = subprocess.check_output( + ["gsutil", "-u", project_id, "stat", url] + ).decode("utf8") + else: + output = subprocess.check_output(["gsutil", "stat", url]).decode("utf8") + lines = output.split("\n") + + info = {} + for line in lines: + if not line: + continue + + label, value = [s.strip() for s in line.split(":", 1)] + if label == "Content-Length": + size = int(value) + if size >= one_gibibyte: + info["size"] = f"{round(size / one_gibibyte, 2)} GiB" + else: + info["size"] = f"{round(size / one_mebibyte, 2)} MiB" + + if label == "Hash (md5)": + info["md5"] = base64.b64decode(value).hex() + + return (size, info["size"], info["md5"])
+ + +
[docs]def read_list_data(input_file_path: str) -> List[str]:
+    """
+    Read a file input into a python list (each line will be an element).
+
+    Supports Google storage paths and .gz compression.
+
+    :param input_file_path: File path
+    :return: List of lines
+    """
+    if input_file_path.startswith("gs://"):
+        hl.hadoop_copy(input_file_path, "file:///" + input_file_path.split("/")[-1])
+        f = (
+            # gzip.open defaults to binary mode; "rt" is required to use an encoding.
+            gzip.open("/" + os.path.basename(input_file_path), "rt", encoding="utf-8")
+            if input_file_path.endswith("gz")
+            else open("/" + os.path.basename(input_file_path), encoding="utf-8")
+        )
+    else:
+        f = (
+            gzip.open(input_file_path, "rt", encoding="utf-8")
+            if input_file_path.endswith("gz")
+            else open(input_file_path, encoding="utf-8")
+        )
+    output = []
+    for line in f:
+        output.append(line.strip())
+    f.close()
+    return output
+ + +
[docs]def repartition_for_join(
+    ht_path: str,
+    new_partition_percent: float = 1.1,
+) -> List[hl.expr.IntervalExpression]:
+    """
+    Calculate new partition intervals using input Table.
+
+    Reading in all Tables using the same partition intervals (via
+    `_intervals`) before they are joined makes the joins much more efficient.
+    For more information, see:
+    https://discuss.hail.is/t/room-for-improvement-when-joining-multiple-hts/2278/8
+
+    :param ht_path: Path to Table to use for interval partition calculation.
+    :param new_partition_percent: Percent of initial dataset partitions to use.
+        Value should be greater than 1 so that input Table will have more
+        partitions for the join. Defaults to 1.1.
+    :return: List of IntervalExpressions calculated over new set of partitions
+        (number of partitions in HT * desired percent increase).
+    """
+    ht = hl.read_table(ht_path)
+    if new_partition_percent < 1:
+        logger.warning(
+            "new_partition_percent value is less than 1! The new HT will have fewer"
+            " partitions than the original HT!"
+        )
+    # Round to an integer partition count before calculating the new intervals.
+    return ht._calculate_new_partitions(int(ht.n_partitions() * new_partition_percent))
+
\ No newline at end of file
diff --git a/_modules/gnomad/utils/filtering.html b/_modules/gnomad/utils/filtering.html
new file mode 100644
index 000000000..6d5c65a5c
--- /dev/null
+++ b/_modules/gnomad/utils/filtering.html
@@ -0,0 +1,823 @@

Source code for gnomad.utils.filtering

+# noqa: D100
+
+import functools
+import logging
+import operator
+from typing import Callable, Dict, List, Optional, Tuple, Union
+
+import hail as hl
+
+import gnomad.utils.annotations as annotate_utils
+from gnomad.resources.resource_utils import DataException
+from gnomad.utils.reference_genome import get_reference_genome
+
+logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s")
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+
+
[docs]def filter_to_adj(mt: hl.MatrixTable) -> hl.MatrixTable: + """Filter genotypes to adj criteria.""" + if "adj" not in list(mt.entry): + mt = annotate_utils.annotate_adj(mt) + mt = mt.filter_entries(mt.adj) + return mt.drop(mt.adj)
+ + +
[docs]def filter_by_frequency( + t: Union[hl.MatrixTable, hl.Table], + direction: str, + frequency: float = None, + allele_count: int = None, + population: str = None, + subpop: str = None, + downsampling: int = None, + keep: bool = True, + adj: bool = True, +) -> Union[hl.MatrixTable, hl.Table]: + """ + Filter MatrixTable or Table with gnomAD-format frequency data (assumed bi-allelic/split). + + gnomAD frequency data format expectation is: Array[Struct(Array[AC], Array[AF], AN, homozygote_count, meta)]. + + At least one of frequency or allele_count is required. + + Subpop can be specified without a population if desired. + + :param t: Input MatrixTable or Table + :param direction: One of "above", "below", and "equal" (how to apply the filter) + :param frequency: Frequency to filter by (one of frequency or allele_count is required) + :param allele_count: Allele count to filter by (one of frequency or allele_count is required) + :param population: Population in which to filter frequency + :param subpop: Sub-population in which to filter frequency + :param downsampling: Downsampling in which to filter frequency + :param keep: Whether to keep rows passing this frequency (passed to filter_rows) + :param adj: Whether to use adj frequency + :return: Filtered MatrixTable or Table + """ + if frequency is None and allele_count is None: + raise ValueError("At least one of frequency or allele_count must be specified") + if direction not in ("above", "below", "equal"): + raise ValueError('direction needs to be one of "above", "below", or "equal"') + group = "adj" if adj else "raw" + criteria = [lambda f: f.meta.get("group", "") == group] + if frequency is not None: + if direction == "above": + criteria.append(lambda f: f.AF[1] > frequency) + elif direction == "below": + criteria.append(lambda f: f.AF[1] < frequency) + else: + criteria.append(lambda f: f.AF[1] == frequency) + if allele_count is not None: + if direction == "above": + criteria.append(lambda f: f.AC[1] > allele_count) + elif direction == "below": + criteria.append(lambda f: f.AC[1] < allele_count) + else: + criteria.append(lambda f: f.AC[1] == allele_count) + size = 1 + if population: + criteria.append(lambda f: f.meta.get("pop", "") == population) + size += 1 + if subpop: + criteria.append(lambda f: f.meta.get("subpop", "") == subpop) + size += 1 + # If one supplies a subpop but not a population, this will ensure this + # gets it right + if not population: + size += 1 + if downsampling: + criteria.append(lambda f: f.meta.get("downsampling", "") == str(downsampling)) + size += 1 + if not population: + size += 1 + criteria.append(lambda f: f.meta.get("pop", "") == "global") + if subpop: + raise ValueError("No downsampling data for subpopulations implemented") + criteria.append(lambda f: f.meta.size() == size) + + filt = lambda x: combine_functions(criteria, x) + criteria = hl.any(filt, t.freq) + return ( + t.filter_rows(criteria, keep=keep) + if isinstance(t, hl.MatrixTable) + else t.filter(criteria, keep=keep) + )
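Two usage sketches, assuming `ht` carries gnomAD-format `freq` data:

.. code-block:: python

    # Keep sites with adj AF above 1%.
    common_ht = filter_by_frequency(ht, direction="above", frequency=0.01)

    # Drop sites that are rare (AF below 0.1%) in the NFE population.
    ht = filter_by_frequency(
        ht, direction="below", frequency=0.001, population="nfe", keep=False
    )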
+ + +
[docs]def combine_functions( + func_list: List[Callable[[bool], bool]], + x: hl.expr.StructExpression, + operator_func: Callable[[bool, bool], bool] = operator.iand, +) -> bool: + """ + Combine a list of boolean functions to an Expression using the specified operator. + + .. note:: + + The `operator_func` is applied cumulatively from left to right of the `func_list`. + + :param func_list: A list of boolean functions that can be applied to `x`. + :param x: Expression to be passed to each function in `func_list`. + :param operator_func: Operator function to combine the functions in `func_list`. Default is `operator.iand`. + :return: A boolean from the combined operations. + """ + cond = func_list[0](x) + for c in func_list[1:]: + cond = operator_func(cond, c(x)) + return cond
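A quick check of the combination logic with literal values:

.. code-block:: python

    import hail as hl

    funcs = [lambda s: s.ac > 0, lambda s: s.af < 0.01]
    expr = combine_functions(funcs, hl.struct(ac=5, af=0.001))
    hl.eval(expr)  # True: both conditions hold under the default operator.iand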
+ + +
[docs]def filter_low_conf_regions( + mt: Union[hl.MatrixTable, hl.Table], + filter_lcr: bool = True, + filter_decoy: bool = True, + filter_segdup: bool = True, + filter_exome_low_coverage_regions: bool = False, + filter_telomeres_and_centromeres: bool = False, + high_conf_regions: Optional[List[str]] = None, +) -> Union[hl.MatrixTable, hl.Table]: + """ + Filter low-confidence regions. + + :param mt: MatrixTable or Table to filter + :param filter_lcr: Whether to filter LCR regions + :param filter_decoy: Whether to filter decoy regions + :param filter_segdup: Whether to filter Segdup regions + :param filter_exome_low_coverage_regions: Whether to filter exome low confidence regions + :param filter_telomeres_and_centromeres: Whether to filter telomeres and centromeres + :param high_conf_regions: Paths to set of high confidence regions to restrict to (union of regions) + :return: MatrixTable or Table with low confidence regions removed + """ + build = get_reference_genome(mt.locus).name + if build == "GRCh37": + import gnomad.resources.grch37.reference_data as resources + elif build == "GRCh38": + import gnomad.resources.grch38.reference_data as resources + + criteria = [] + if filter_lcr: + lcr = resources.lcr_intervals.ht() + criteria.append(hl.is_missing(lcr[mt.locus])) + + if filter_decoy: + decoy = resources.decoy_intervals.ht() + criteria.append(hl.is_missing(decoy[mt.locus])) + + if filter_segdup: + segdup = resources.seg_dup_intervals.ht() + criteria.append(hl.is_missing(segdup[mt.locus])) + + if filter_exome_low_coverage_regions: + high_cov = resources.high_coverage_intervals.ht() + criteria.append(hl.is_missing(high_cov[mt.locus])) + + if filter_telomeres_and_centromeres: + if build != "GRCh38": + raise DataException( + "The telomeres_and_centromeres resource only exists for GRCh38" + ) + + telomeres_and_centromeres = resources.telomeres_and_centromeres.ht() + criteria.append(hl.is_missing(telomeres_and_centromeres[mt.locus])) + + if high_conf_regions is not None: + for region in high_conf_regions: + region = hl.import_locus_intervals(region) + criteria.append(hl.is_defined(region[mt.locus])) + + if criteria: + filter_criteria = functools.reduce(operator.iand, criteria) + if isinstance(mt, hl.MatrixTable): + mt = mt.filter_rows(filter_criteria) + else: + mt = mt.filter(filter_criteria) + + return mt
+ + +
[docs]def filter_to_autosomes( + t: Union[hl.MatrixTable, hl.Table], +) -> Union[hl.MatrixTable, hl.Table]: + """ + Filter the Table or MatrixTable to autosomes only. + + This assumes that the input contains a field named `locus` of type Locus + + :param t: Input MT/HT + :return: MT/HT autosomes + """ + reference = get_reference_genome(t.locus) + autosomes = hl.parse_locus_interval( + f"{reference.contigs[0]}-{reference.contigs[21]}", reference_genome=reference + ) + return hl.filter_intervals(t, [autosomes])
+ + +
[docs]def add_filters_expr( + filters: Dict[str, hl.expr.BooleanExpression], + current_filters: hl.expr.SetExpression = None, +) -> hl.expr.SetExpression: + """ + Create an expression to create or add filters. + + For each entry in the `filters` dictionary, if the value evaluates to `True`, + then the key is added as a filter name. + + Current filters are kept if provided using `current_filters` + + :param filters: The filters and their expressions + :param current_filters: The set of current filters + :return: An expression that can be used to annotate the filters + """ + if current_filters is None: + current_filters = hl.empty_set(hl.tstr) + + return hl.fold( + lambda x, y: x.union(y), + current_filters, + [ + hl.if_else(filter_condition, hl.set([filter_name]), hl.empty_set(hl.tstr)) + for filter_name, filter_condition in filters.items() + ], + )
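A usage sketch building a variant filter set (the 'ac_adj' and 'rf_probability' fields and the 0.9 cutoff are hypothetical):

.. code-block:: python

    ht = ht.annotate(
        filters=add_filters_expr(
            filters={
                "AC0": ht.ac_adj == 0,          # hypothetical field
                "RF": ht.rf_probability < 0.9,  # hypothetical field and cutoff
            },
            current_filters=ht.filters,
        )
    )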
+ + +
[docs]def subset_samples_and_variants( + mtds: Union[hl.MatrixTable, hl.vds.VariantDataset], + sample_path: str, + header: bool = True, + table_key: str = "s", + sparse: bool = False, + gt_expr: str = "GT", + remove_dead_alleles: bool = False, +) -> Union[hl.MatrixTable, hl.vds.VariantDataset]: + """ + Subset the MatrixTable or VariantDataset to the provided list of samples and their variants. + + :param mtds: Input MatrixTable or VariantDataset + :param sample_path: Path to a file with list of samples + :param header: Whether file with samples has a header. Default is True + :param table_key: Key to sample Table. Default is "s" + :param sparse: Whether the MatrixTable is sparse. Default is False + :param gt_expr: Name of field in MatrixTable containing genotype expression. Default is "GT" + :param remove_dead_alleles: Remove alleles observed in no samples. This option is currently only relevant when `mtds` is a VariantDataset. Default is False + :return: MatrixTable or VariantDataset subsetted to specified samples and their variants + """ + sample_ht = hl.import_table(sample_path, no_header=not header, key=table_key) + sample_count = sample_ht.count() + is_vds = isinstance(mtds, hl.vds.VariantDataset) + if is_vds: + mt = mtds.variant_data + else: + if remove_dead_alleles: + raise ValueError( + "Removal of alleles observed in no samples is currently only" + " implemented when the input dataset is a VariantDataset." + ) + mt = mtds + missing_ht = sample_ht.anti_join(mt.cols()) + missing_ht_count = missing_ht.count() + full_count = mt.count_cols() + + if missing_ht_count != 0: + missing_samples = missing_ht.s.collect() + raise DataException( + f"Only {sample_count - missing_ht_count} out of" + f" {sample_count} subsetting-table IDs matched IDs in the" + f" {'VariantDataset' if is_vds else 'MatrixTable'}.\nIDs that aren't in the" + f" MT: {missing_samples}\n" + ) + + if is_vds: + mtds = hl.vds.filter_samples( + mtds, sample_ht, keep=True, remove_dead_alleles=remove_dead_alleles + ) + n_cols = mtds.variant_data.count_cols() + else: + mtds = mtds.semi_join_cols(sample_ht) + if sparse: + mtds = mtds.filter_rows( + hl.agg.any(mtds[gt_expr].is_non_ref() | hl.is_defined(mtds.END)) + ) + else: + mtds = mtds.filter_rows(hl.agg.any(mtds[gt_expr].is_non_ref())) + n_cols = mtds.count_cols() + + logger.info( + "Finished subsetting samples. Kept %d out of %d samples in %s", + n_cols, + full_count, + "VariantDataset" if is_vds else "MatrixTable", + ) + return mtds
+ + +
[docs]def filter_to_clinvar_pathogenic( + t: Union[hl.MatrixTable, hl.Table], + clnrevstat_field: str = "CLNREVSTAT", + clnsig_field: str = "CLNSIG", + clnsigconf_field: str = "CLNSIGCONF", + remove_no_assertion: bool = True, + remove_conflicting: bool = True, +) -> Union[hl.MatrixTable, hl.Table]: + """ + Return a MatrixTable or Table with ClinVar data filtered to pathogenic and likely pathogenic variants. + + Example use: + + .. code-block:: python + + from gnomad.resources.grch38.reference_data import clinvar + clinvar_ht = clinvar.ht() + clinvar_ht = filter_to_clinvar_pathogenic(clinvar_ht) + + :param t: Input dataset that contains ClinVar data, could either be a MatrixTable or Table. + :param clnrevstat_field: The field string for the expression that contains the review status of the clinical significance of ClinVar variants. + :param clnsig_field: The field string for the expression that contains the clinical significance of the ClinVar variant. + :param clnsigconf_field: The field string for the expression that contains the conflicting clinical significance values for the variant. For variants with no conflicting significance, this field should be undefined. + :param remove_no_assertion: Flag for removing entries in which the clnrevstat (clinical significance) has no assertions (zero stars). + :param remove_conflicting: Flag for removing entries with conflicting clinical interpretations. + :return: Filtered MatrixTable or Table + """ + logger.info( + "Found %d variants before filtering", + t.count_rows() if isinstance(t, hl.MatrixTable) else t.count(), + ) + path_expr = ( + t.info[clnsig_field] + .map(lambda x: x.lower()) + .map(lambda x: x.contains("pathogenic")) + .any(lambda x: x) + ) + + if remove_no_assertion: + logger.info("Variants without assertions will be removed.") + no_star_assertions = hl.literal( + { + "no_assertion_provided", + "no_assertion_criteria_provided", + "no_interpretation_for_the_individual_variant", + } + ) + path_expr = path_expr & ( + hl.set(t.info[clnrevstat_field]).intersection(no_star_assertions).length() + == 0 + ) + + if remove_conflicting: + logger.info( + "Variants with conflicting clinical interpretations will be removed." + ) + path_expr = path_expr & hl.is_missing(t.info[clnsigconf_field]) + + if isinstance(t, hl.MatrixTable): + t = t.filter_rows(path_expr) + else: + t = t.filter(path_expr) + + logger.info( + "Found %d variants after filtering to ClinVar pathogenic variants.", + t.count_rows() if isinstance(t, hl.MatrixTable) else t.count(), + ) + return t
+ + +
[docs]def filter_to_gencode_cds( + t: Union[hl.MatrixTable, hl.Table], gencode_ht: Optional[hl.Table] = None +) -> Union[hl.MatrixTable, hl.Table]: + """ + Filter a Table/MatrixTable to only Gencode CDS regions in protein coding transcripts. + + Example use: + + .. code-block:: python + + from gnomad.resources.grch37.reference_data import gencode + gencode_ht = gencode.ht() + ht = filter_to_gencode_cds(ht, gencode_ht=gencode_ht) + + .. note:: + + If no Gencode Table is provided, the default version of the Gencode Table + resource for the genome build of the input Table/MatrixTable will be used. + + .. warning:: + + This Gencode CDS interval filter does not take into account the + transcript_id, it filters to any locus that is found in a CDS interval for + any protein coding transcript. Therefore, if downstream analyses require + filtering to CDS intervals by transcript, an additional step must be taken. + For example, when filtering VEP transcript consequences, there may be cases + where a variant is retained with this filter, but is considered outside the + CDS intervals of the transcript per the VEP predicted consequence of the + variant. + + :param t: Input Table/MatrixTable to filter. + :param gencode_ht: Gencode Table to use for filtering the input Table/MatrixTable + to CDS regions. Default is None, which will use the default version of the + Gencode Table resource. + :return: Table/MatrixTable filtered to loci in Gencode CDS intervals. + """ + if gencode_ht is None: + build = get_reference_genome(t.locus).name + if build == "GRCh37": + from gnomad.resources.grch37.reference_data import gencode + elif build == "GRCh38": + from gnomad.resources.grch38.reference_data import gencode + else: + raise ValueError(f"Unsupported reference genome build: {build}") + + logger.info( + "No Gencode Table was supplied, using Gencode version %s", + gencode.default_version, + ) + gencode_ht = gencode.ht() + + gencode_ht = gencode_ht.filter( + (gencode_ht.feature == "CDS") & (gencode_ht.transcript_type == "protein_coding") + ) + logger.warning( + "This Gencode CDS interval filter does not filter by transcript! Please see the" + " documentation for more details to confirm it's being used as intended." + ) + filter_expr = hl.is_defined(gencode_ht[t.locus]) + + if isinstance(t, hl.MatrixTable): + t = t.filter_rows(filter_expr) + else: + t = t.filter(filter_expr) + + return t
+ + +
[docs]def remove_fields_from_constant( + constant: List[str], fields_to_remove: List[str] +) -> List[str]: + """ + Remove fields from a list and log any field(s) missing from the original list. + + :param constant: List of fields + :param fields_to_remove: List of fields to remove from `constant` + :return: List of fields with the specified fields removed + """ + for field in fields_to_remove: + if field in constant: + constant.remove(field) + else: + logger.info("%s missing from %s", field, constant) + + return constant
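+# Example use (a minimal sketch):
+#
+#     remove_fields_from_constant(["AC", "AF", "AN"], ["AF"])  # -> ["AC", "AN"]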
+ + +
[docs]def filter_x_nonpar( + t: Union[hl.Table, hl.MatrixTable], +) -> Union[hl.Table, hl.MatrixTable]: + """ + Filter to loci that are in non-PAR regions on chromosome X. + + :param t: Input Table or MatrixTable. + :return: Filtered Table or MatrixTable. + """ + rg = t.locus.dtype.reference_genome + t = hl.filter_intervals( + t, + [ + hl.parse_locus_interval(contig, reference_genome=rg.name) + for contig in rg.x_contigs + ], + ) + non_par_expr = t.locus.in_x_nonpar() + + return ( + t.filter(non_par_expr) + if isinstance(t, hl.Table) + else t.filter_rows(non_par_expr) + )
+ + +
[docs]def filter_y_nonpar( + t: Union[hl.Table, hl.MatrixTable], +) -> Union[hl.Table, hl.MatrixTable]: + """ + Filter to loci that are in non-PAR regions on chromosome Y. + + :param t: Input Table or MatrixTable. + :return: Filtered Table or MatrixTable. + """ + rg = t.locus.dtype.reference_genome + t = hl.filter_intervals( + t, + [ + hl.parse_locus_interval(contig, reference_genome=rg.name) + for contig in rg.y_contigs + ], + ) + non_par_expr = t.locus.in_y_nonpar() + + return ( + t.filter(non_par_expr) + if isinstance(t, hl.Table) + else t.filter_rows(non_par_expr) + )
+ + +
[docs]def filter_by_numeric_expr_range( + t: Union[hl.MatrixTable, hl.Table], + filter_expr: hl.NumericExpression, + filter_range: tuple, + keep_between: bool = True, + inclusive: bool = True, +) -> Union[hl.MatrixTable, hl.Table]: + """ + Filter rows in the Table/MatrixTable based on the range of a numeric expression. + + :param t: Input Table/MatrixTable. + :param filter_expr: NumericExpression to apply `filter_range` to. + :param filter_range: Range of values to apply to `filter_expr`. + :param keep_between: Whether to keep the values between `filter_range` instead of keeping values outside `filter_range`. Default is True. + :param inclusive: Whether or not to include the `filter_range` values themselves. Default is True. + :return: Table/MatrixTable filtered to rows with specified criteria. + """ + if (inclusive and keep_between) or (not inclusive and not keep_between): + criteria = (filter_expr >= filter_range[0]) & (filter_expr <= filter_range[1]) + else: + criteria = (filter_expr > filter_range[0]) & (filter_expr < filter_range[1]) + + if isinstance(t, hl.MatrixTable): + return t.filter_rows(criteria, keep=keep_between) + else: + return t.filter(criteria, keep=keep_between)
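+# Example use (a minimal sketch; keeps rows where `idx` is in [2, 5]):
+#
+#     ht = hl.utils.range_table(10)
+#     ht = filter_by_numeric_expr_range(ht, ht.idx, (2, 5))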
+ + +
[docs]def filter_for_mu( + ht: hl.Table, gerp_lower_cutoff: float = -3.9885, gerp_upper_cutoff: float = 2.6607 +) -> hl.Table: + """ + Filter to non-coding annotations and remove GERP outliers. + + .. note:: + + Values for `gerp_lower_cutoff` and `gerp_upper_cutoff` default to -3.9885 and + 2.6607, respectively. These values were precalculated on the GRCh37 context + table and define the 5th and 95th percentiles. + + :param ht: Input Table. + :param gerp_lower_cutoff: Minimum GERP score for variant to be included. Default is -3.9885. + :param gerp_upper_cutoff: Maximum GERP score for variant to be included. Default is 2.6607. + :return: Table filtered to intron or intergenic variants with GERP outliers removed. + """ + ht = filter_by_numeric_expr_range( + ht, + filter_expr=ht.gerp, + filter_range=(gerp_lower_cutoff, gerp_upper_cutoff), + keep_between=True, + inclusive=False, + ) + ht = ht.filter( + (ht.vep.most_severe_consequence == "intron_variant") + | (ht.vep.most_severe_consequence == "intergenic_variant") + ) + + return ht
+ + +
[docs]def split_vds_by_strata( + vds: hl.vds.VariantDataset, strata_expr: hl.expr.Expression +) -> Dict[str, hl.vds.VariantDataset]: + """ + Split a VDS into multiple VDSs based on `strata_expr`. + + :param vds: Input VDS. + :param strata_expr: Expression on VDS variant_data MT to split on. + :return: Dictionary where strata value is key and VDS is value. + """ + vmt = vds.variant_data + s_by_strata = vmt.aggregate_cols( + hl.agg.group_by(strata_expr, hl.agg.collect_as_set(vmt.s)) + ) + + return { + strata: hl.vds.filter_samples(vds, list(s)) for strata, s in s_by_strata.items() + }
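+# Example use (a minimal sketch; assumes the VDS variant data has a sample
+# annotation `pop`):
+#
+#     vds_by_pop = split_vds_by_strata(vds, strata_expr=vds.variant_data.pop)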
+ + +
[docs]def filter_arrays_by_meta( + meta_expr: hl.expr.ArrayExpression, + meta_indexed_exprs: Union[ + Dict[str, hl.expr.ArrayExpression], hl.expr.ArrayExpression + ], + items_to_filter: Union[Dict[str, List[str]], List[str]], + keep: bool = True, + combine_operator: str = "and", + exact_match: bool = False, +) -> Tuple[ + hl.expr.ArrayExpression, + Union[Dict[str, hl.expr.ArrayExpression], hl.expr.ArrayExpression], +]: + """ + Filter both the metadata array expression and the metadata-indexed expressions by `items_to_filter`. + + The `items_to_filter` can be used to filter in the following ways based on + `meta_expr` items: + - By a list of keys, e.g. ["sex", "downsampling"]. + - By specific key: value pairs, e.g. to filter where 'pop' is 'han' or 'papuan' + {"pop": ["han", "papuan"]}, or where 'pop' is 'afr' and/or 'sex' is 'XX' + {"pop": ["afr"], "sex": ["XX"]}. + + The items can be kept or removed from `meta_indexed_exprs` and `meta_expr` based on + the value of `keep`. For example, if `meta_indexed_exprs` is {'freq': ht.freq, + 'freq_meta_sample_count': ht.index_globals().freq_meta_sample_count} and `meta_expr` + is ht.freq_meta, then if `keep` is True, the items specified by `items_to_filter` + such as 'pop' = 'han' will be kept and all other items will be removed from + ht.freq, ht.freq_meta_sample_count, and ht.freq_meta. `meta_indexed_exprs` can also + be a single array expression such as ht.freq. + + The filtering can also be applied such that all criteria must be met + (`combine_operator` = "and") by the `meta_expr` item in order to be filtered, + or at least one of the specified criteria must be met (`combine_operator` = "or") + by the `meta_expr` item in order to be filtered. + + The `exact_match` parameter can be used to apply the `keep` parameter to only items + specified in the `items_to_filter` parameter. For example, by default, if `keep` is + True, `combine_operator` is "and", and `items_to_filter` is ["sex", "downsampling"], + then all items in `meta_expr` with both "sex" and "downsampling" as keys will be + kept. However, if `exact_match` is True, then the items + in `meta_expr` will only be kept if "sex" and "downsampling" are the only keys in + the meta dict. + + :param meta_expr: Metadata expression that contains the values of the elements in + `meta_indexed_exprs`. The most often used expression is `freq_meta` to index into + a 'freq' array. + :param meta_indexed_exprs: Either a Dictionary where the keys are the expression name + and the values are the expressions indexed by the `meta_expr` such as a 'freq' + array or just a single expression indexed by the `meta_expr`. + :param items_to_filter: Items to filter by, either a list or a dictionary. + :param keep: Whether to keep or remove the items specified by `items_to_filter`. + :param combine_operator: Whether to use "and" or "or" to combine the items + specified by `items_to_filter`. + :param exact_match: Whether to apply the `keep` parameter to only the items + specified in the `items_to_filter` parameter or to all items in `meta_expr`. + See the example above for more details. Default is False. + :return: A Tuple of the filtered metadata expression and a dictionary of metadata-indexed + expressions when `meta_indexed_exprs` is a dictionary, or a single filtered + array expression when `meta_indexed_exprs` is a single array expression. 
+ """ + meta_expr = meta_expr.collect(_localize=False)[0] + + if isinstance(meta_indexed_exprs, hl.expr.ArrayExpression): + meta_indexed_exprs = {"_tmp": meta_indexed_exprs} + + if combine_operator == "and": + operator_func = hl.all + elif combine_operator == "or": + operator_func = hl.any + else: + raise ValueError( + "combine_operator must be one of 'and' or 'or', but found" + f" {combine_operator}!" + ) + + if isinstance(items_to_filter, list): + items_to_filter_set = hl.set(items_to_filter) + items_to_filter = [[k] for k in items_to_filter] + if exact_match: + filter_func = lambda m, k: ( + hl.len(hl.set(m.keys()).difference(items_to_filter_set)) == 0 + ) & m.contains(k) + else: + filter_func = lambda m, k: m.contains(k) + elif isinstance(items_to_filter, dict): + items_to_filter = [ + [(k, v) for v in values] for k, values in items_to_filter.items() + ] + items_to_filter_set = hl.set(hl.flatten(items_to_filter)) + if exact_match: + filter_func = lambda m, k: ( + (hl.len(hl.set(m.items()).difference(items_to_filter_set)) == 0) + & (m.get(k[0], "") == k[1]) + ) + else: + filter_func = lambda m, k: (m.get(k[0], "") == k[1]) + else: + raise TypeError("items_to_filter must be a list or a dictionary!") + + meta_expr = hl.enumerate(meta_expr).filter( + lambda m: hl.bind( + lambda x: hl.if_else(keep, x, ~x), + operator_func( + [hl.any([filter_func(m[1], v) for v in k]) for k in items_to_filter] + ), + ), + ) + + meta_indexed_exprs = { + k: meta_expr.map(lambda x: v[x[0]]) for k, v in meta_indexed_exprs.items() + } + meta_expr = meta_expr.map(lambda x: x[1]) + + if "_tmp" in meta_indexed_exprs: + meta_indexed_exprs = meta_indexed_exprs["_tmp"] + + return meta_expr, meta_indexed_exprs
+
+ +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/_modules/gnomad/utils/gen_stats.html b/_modules/gnomad/utils/gen_stats.html new file mode 100644 index 000000000..c95fbf6aa --- /dev/null +++ b/_modules/gnomad/utils/gen_stats.html @@ -0,0 +1,246 @@ + + + + + + gnomad.utils.gen_stats — gnomad master documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for gnomad.utils.gen_stats

+# noqa: D100
+
+import logging
+
+import hail as hl
+
+logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s")
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+
+
[docs]def to_phred(linear_expr: hl.expr.NumericExpression) -> hl.expr.Float64Expression: + """ + Compute the phred-scaled value of the linear-scale input. + + :param linear_expr: input + :return: Phred-scaled value + """ + return -10 * hl.log10(linear_expr)
+ + +
[docs]def from_phred( + phred_score_expr: hl.expr.NumericExpression, +) -> hl.expr.Float64Expression: + """ + Compute the linear-scale value of the phred-scaled input. + + :param phred_score_expr: phred-scaled value + :return: linear-scale value of the phred-scaled input. + """ + return 10 ** -(phred_score_expr / 10)
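+# Example (a minimal sketch; round-trip between linear and phred scales):
+#
+#     hl.eval(to_phred(0.001))  # 30.0
+#     hl.eval(from_phred(30))   # 0.001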
+ + +
[docs]def get_median_and_mad_expr( + metric_expr: hl.expr.ArrayNumericExpression, k: float = 1.4826 +) -> hl.expr.StructExpression: + """ + Compute the median and median absolute deviation (MAD) for the given expression. + + .. note:: + + The default value of k assumes normally distributed data. + + :param metric_expr: Expression to compute median and MAD for + :param k: The scaling factor for MAD calculation. Default assumes normally distributed data. + :return: Struct with median and MAD + """ + return hl.bind( + lambda x: hl.struct(median=x[1], mad=k * hl.median(hl.abs(x[0] - x[1]))), + hl.bind(lambda x: hl.tuple([x, hl.median(x)]), hl.agg.collect(metric_expr)), + )
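+# Example use (a minimal sketch on a toy table; the median of [0..4] is 2 and
+# the median absolute deviation is 1, so MAD = 1.4826 * 1):
+#
+#     ht = hl.utils.range_table(5)
+#     ht.aggregate(get_median_and_mad_expr(ht.idx))  # Struct(median=2, mad=1.4826)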
+ + +
[docs]def merge_stats_counters_expr( + stats: hl.expr.ArrayExpression, +) -> hl.expr.StructExpression: + """ + Merge multiple stats counters, assuming that they were computed on non-overlapping data. + + Examples: + - Merge stats computed on indel and snv separately + - Merge stats computed on bi-allelic and multi-allelic variants separately + - Merge stats computed on autosomes and sex chromosomes separately + + :param stats: An array of stats counters to merge + :return: Merged stats Struct + """ + + def add_stats( + i: hl.expr.StructExpression, j: hl.expr.StructExpression + ) -> hl.expr.StructExpression: + """ + Merge two stats counters together. + + Assumes that all stats counter fields are present in the struct. + + :param i: accumulator: struct with mean, n and variance + :param j: new element: stats_struct -- needs to contain mean, n and variance + :return: Accumulation over all elements: struct with mean, n and variance + """ + delta = j.mean - i.mean + n_tot = i.n + j.n + return hl.struct( + min=hl.min(i.min, j.min), + max=hl.max(i.max, j.max), + mean=(i.mean * i.n + j.mean * j.n) / n_tot, + variance=i.variance + j.variance + (delta * delta * i.n * j.n) / n_tot, + n=n_tot, + sum=i.sum + j.sum, + ) + + # Gather all metrics present in all stats counters + metrics = set(stats[0]) + dropped_metrics = set() + for stat_expr in stats[1:]: + stat_expr_metrics = set(stat_expr) + dropped_metrics = dropped_metrics.union(stat_expr_metrics.difference(metrics)) + metrics = metrics.intersection(stat_expr_metrics) + if dropped_metrics: + logger.warning( + "The following metrics will be dropped during stats counter merging as they" + " do not appear in all counters: %s", + ", ".join(dropped_metrics), + ) + + # Because merging standard deviation requires having the mean and n, + # check that they are also present if `stdev` is. Otherwise remove stdev + if "stdev" in metrics: + missing_fields = [x for x in ["n", "mean"] if x not in metrics] + if missing_fields: + logger.warning( + "Cannot merge `stdev` from given stats counters since they are missing" + " the following fields: %s", + ",".join(missing_fields), + ) + metrics.remove("stdev") + + # Create a struct with all possible stats for merging. + # This step helps when folding because we can rely on the struct schema + # Note that for intermediate merging, we compute the variance rather than the stdev + all_stats = hl.array(stats).map( + lambda x: hl.struct( + min=x.min if "min" in metrics else hl.null(hl.tfloat64), + max=x.max if "max" in metrics else hl.null(hl.tfloat64), + mean=x.mean if "mean" in metrics else hl.null(hl.tfloat64), + variance=x.stdev * x.stdev if "stdev" in metrics else hl.null(hl.tfloat64), + n=x.n if "n" in metrics else hl.null(hl.tfloat64), + sum=x.sum if "sum" in metrics else hl.null(hl.tfloat64), + ) + ) + + # Merge the stats + agg_stats = all_stats[1:].fold(add_stats, all_stats[0]) + + # Return only the metrics that were present in all independent stats counters + # If `stdev` is present, then compute it from the variance + return agg_stats.select( + **{ + metric: ( + agg_stats[metric] if metric != "stdev" else hl.sqrt(agg_stats.variance) + ) + for metric in metrics + } + )
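+# Example use (a minimal sketch; assumes `ht` has `alleles` and a numeric
+# `metric` field; note the function is given a Python list of stats structs):
+#
+#     snv = hl.is_snp(ht.alleles[0], ht.alleles[1])
+#     merged = ht.aggregate(
+#         merge_stats_counters_expr(
+#             [
+#                 hl.agg.filter(snv, hl.agg.stats(ht.metric)),
+#                 hl.agg.filter(~snv, hl.agg.stats(ht.metric)),
+#             ]
+#         )
+#     )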
+
+ +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/_modules/gnomad/utils/intervals.html b/_modules/gnomad/utils/intervals.html new file mode 100644 index 000000000..1363d5af6 --- /dev/null +++ b/_modules/gnomad/utils/intervals.html @@ -0,0 +1,175 @@ + + + + + + gnomad.utils.intervals — gnomad master documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for gnomad.utils.intervals

+# noqa: D100
+
+from typing import List
+
+import hail as hl
+
+
+
[docs]def sort_intervals(intervals: List[hl.Interval]): + """ + Sort an array of intervals by start contig, then start position, then end contig, then end position. + + :param intervals: Intervals to sort + :return: Sorted interval list + """ + return sorted( + intervals, + key=lambda interval: ( + interval.start.reference_genome.contigs.index(interval.start.contig), + interval.start.position, + interval.end.reference_genome.contigs.index(interval.end.contig), + interval.end.position, + ), + )
+ + +
[docs]def union_intervals(intervals: List[hl.Interval], is_sorted: bool = False): + """ + Generate a list with the union of all intervals in the input list by merging overlapping intervals. + + :param intervals: Intervals to merge + :param is_sorted: If set, assumes intervals are already sorted, otherwise will sort. + :return: List of merged intervals + """ + sorted_intervals = intervals if is_sorted else sort_intervals(intervals) + merged_intervals = sorted_intervals[:1] + for interval in sorted_intervals[1:]: + if merged_intervals[-1].start.contig == interval.start.contig: + if merged_intervals[-1].end.position < interval.end.position: + if interval.start.position <= merged_intervals[-1].end.position: + merged_intervals[-1] = hl.Interval( + merged_intervals[-1].start, interval.end + ) + else: + merged_intervals.append(interval) + else: + merged_intervals.append(interval) + + return merged_intervals
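+# Example use (a minimal sketch; overlapping intervals are merged):
+#
+#     iv1 = hl.eval(hl.parse_locus_interval("chr1:100-200", reference_genome="GRCh38"))
+#     iv2 = hl.eval(hl.parse_locus_interval("chr1:150-300", reference_genome="GRCh38"))
+#     union_intervals([iv1, iv2])  # -> [chr1:100-300]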
+ + +
[docs]def interval_length(interval: hl.Interval) -> int: + """ + Return the total number of bases in an Interval. + + :param interval: Input interval + :return: Total length of the interval + """ + if interval.start.contig != interval.end.contig: + ref = interval.start.reference_genome + return ( + ref.contig_length(interval.start.contig) + - interval.start.position + + sum( + ref.contig_length(contig) + for contig in ref.contigs[ + ref.contigs.index(interval.start.contig) + + 1 : ref.contigs.index(interval.end.contig) + ] + ) + + interval.end.position + ) + else: + return interval.end.position - interval.start.position
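+# Example use (a minimal sketch):
+#
+#     iv = hl.eval(hl.parse_locus_interval("chr1:100-200", reference_genome="GRCh38"))
+#     interval_length(iv)  # 100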
+
+ +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/_modules/gnomad/utils/liftover.html b/_modules/gnomad/utils/liftover.html new file mode 100644 index 000000000..3235d578c --- /dev/null +++ b/_modules/gnomad/utils/liftover.html @@ -0,0 +1,271 @@ + + + + + + gnomad.utils.liftover — gnomad master documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for gnomad.utils.liftover

+# noqa: D100
+
+import logging
+from typing import Tuple, Union
+
+import hail as hl
+
+from gnomad.utils.reference_genome import add_reference_sequence, get_reference_genome
+
+logging.basicConfig(
+    format="%(asctime)s (%(name)s %(lineno)s): %(message)s",
+    datefmt="%m/%d/%Y %I:%M:%S %p",
+)
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+
+GRCH37_to_GRCH38_CHAIN = "gs://hail-common/references/grch37_to_grch38.over.chain.gz"
+"""
+Path to chain file required to lift data from GRCh37 to GRCh38.
+"""
+
+GRCH38_TO_GRCH37_CHAIN = "gs://hail-common/references/grch38_to_grch37.over.chain.gz"
+"""
+Path to chain file required to lift data from GRCh38 to GRCh37.
+"""
+
+
+
[docs]def get_liftover_genome( + t: Union[hl.MatrixTable, hl.Table], +) -> Tuple[hl.genetics.ReferenceGenome, hl.genetics.ReferenceGenome]: + """ + Infer reference genome build of input data and assume destination reference genome build. + + Adds liftover chain to source reference genome and sequence to destination reference genome. + Returns tuple containing both reference genomes in preparation for liftover. + + :param t: Input Table or MatrixTable. + :return: Tuple of source reference genome (with liftover chain added) + and destination reference genome (with sequence loaded) + """ + logger.info("Inferring reference genome of input...") + input_build = get_reference_genome(t.locus).name + source = hl.get_reference(input_build) + + logger.info("Loading fasta sequence for destination build...") + if input_build == "GRCh38": + target = hl.get_reference("GRCh37") + chain = GRCH38_TO_GRCH37_CHAIN + + else: + target = hl.get_reference("GRCh38") + chain = GRCH37_to_GRCH38_CHAIN + + logger.info("Adding liftover chain to input build...") + if source.has_liftover(target): + logger.warning( + "Source reference build %s already has a chain file: %s! Using whichever" + " chain has already been added.", + source.name, + source._liftovers, + ) + else: + source.add_liftover(chain, target) + + return (source, add_reference_sequence(target))
+ + +
[docs]def liftover_expr( + locus: hl.expr.LocusExpression, + alleles: hl.expr.ArrayExpression, + destination_reference: hl.ReferenceGenome, +) -> hl.expr.StructExpression: + """ + Generate struct liftover expression. + + Struct contains: + - new_locus: Lifted-over coordinates + - new_alleles: Lifted-over alleles + - original_locus: Locus prior to liftover + - original_alleles: Alleles prior to liftover + - locus_fail_liftover: Whether the locus failed liftover + - ref_allele_mismatch: Whether the allele at index 0 of new_alleles (lifted over reference allele) + doesn't match the allele at that position in the destination reference + + :param locus: Input locus. + :param alleles: Input alleles. + :param destination_reference: Desired reference genome build for liftover. + :return: Struct containing expressions for lifted over locus/alleles as well as original locus/alleles. + """ + lifted_over_locus = hl.liftover(locus, destination_reference, include_strand=True) + lifted_over_alleles = alleles.map( + lambda a: hl.if_else( + lifted_over_locus.is_negative_strand, hl.reverse_complement(a), a + ) + ) + + return hl.struct( + new_locus=lifted_over_locus.result, + new_alleles=lifted_over_alleles, + original_locus=locus, + original_alleles=alleles, + locus_fail_liftover=hl.is_missing(lifted_over_locus), + ref_allele_mismatch=lifted_over_locus.result.sequence_context() + != lifted_over_alleles[0], + )
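+# Example use (a minimal sketch; mirrors what `default_lift_data` does for a
+# Table `ht` keyed by locus/alleles):
+#
+#     source, target = get_liftover_genome(ht)
+#     ht = ht.annotate(**liftover_expr(ht.locus, ht.alleles, target))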
+ + +
[docs]def default_lift_data( + t: Union[hl.MatrixTable, hl.Table], + remove_failed_sites: bool = True, +) -> Union[hl.MatrixTable, hl.Table]: + """ + Lift input Table or MatrixTable from one reference build to another. + + :param t: Table or MatrixTable. + :param remove_failed_sites: Whether to filter out sites that fail to lift over. Default is True. + :return: Table or MatrixTable with liftover annotations. + """ + logger.info("Inferring input reference and destination reference...") + _, target_genome = get_liftover_genome(t) + + logger.info("Annotating input data with liftover coordinates...") + t = ( + t.annotate(**liftover_expr(t.locus, t.alleles, target_genome)) + if isinstance(t, hl.Table) + else t.annotate_rows(**liftover_expr(t.locus, t.alleles, target_genome)) + ) + + no_target_expr = hl.agg.count_where(t.locus_fail_liftover) + num_no_target = ( + t.aggregate(no_target_expr) + if isinstance(t, hl.Table) + else t.aggregate_rows(no_target_expr) + ) + + if remove_failed_sites: + logger.info("Filtering out %d sites that failed to liftover...", num_no_target) + keep_expr = ~t.locus_fail_liftover + t = t.filter(keep_expr) if isinstance(t, hl.Table) else t.filter_rows(keep_expr) + + row_key_expr = {"locus": t.new_locus, "alleles": t.new_alleles} + return ( + t.key_by(**row_key_expr) + if isinstance(t, hl.Table) + else t.key_rows_by(**row_key_expr) + )
+ + +
[docs]def liftover_using_gnomad_map(ht: hl.Table, data_type: str): + """ + Liftover a gnomAD v2 table using already-established liftover file. + + .. note:: + This function shuffles! + + :param ht: Input Hail Table. + :param data_type: Which gnomAD data type to map across. One of "exomes" or "genomes". + :return: Lifted over Table + """ + from gnomad.resources.grch37.gnomad import liftover + + logger.warning("This function will trigger a shuffle! Pre-emptibles may not work.") + lift_ht = liftover(data_type).ht() + ht = ht.key_by(original_locus=ht.locus, original_alleles=ht.alleles).drop( + "locus", "alleles" + ) + return lift_ht.annotate( + **ht[(lift_ht.original_locus, lift_ht.original_alleles)] + ).key_by("locus", "alleles")
+
+ +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/_modules/gnomad/utils/plotting.html b/_modules/gnomad/utils/plotting.html new file mode 100644 index 000000000..a3f2245cf --- /dev/null +++ b/_modules/gnomad/utils/plotting.html @@ -0,0 +1,876 @@ + + + + + + gnomad.utils.plotting — gnomad master documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for gnomad.utils.plotting

+# noqa: D100
+
+import json
+import logging
+from typing import Callable, Dict, List, Optional, Union
+
+import bokeh
+import hail as hl
+import numpy as np
+import pandas as pd
+from bokeh.layouts import gridplot
+from bokeh.models import (
+    BooleanFilter,
+    CDSView,
+    Column,
+    ColumnDataSource,
+    DataRange1d,
+    Div,
+    Grid,
+    HoverTool,
+    Legend,
+    TabPanel,
+    Tabs,
+    Title,
+)
+from bokeh.palettes import Spectral8, d3, viridis  # pylint: disable=no-name-in-module
+from bokeh.plotting import figure
+from bokeh.transform import factor_cmap
+
+from gnomad.utils.vep import (
+    CSQ_CODING_HIGH_IMPACT,
+    CSQ_CODING_LOW_IMPACT,
+    CSQ_CODING_MEDIUM_IMPACT,
+    CSQ_NON_CODING,
+    CSQ_ORDER,
+)
+
+logging.basicConfig(
+    format="%(asctime)s (%(name)s %(lineno)s): %(message)s",
+    datefmt="%m/%d/%Y %I:%M:%S %p",
+)
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+# Setting some defaults for Table.show
+if "old_show" not in dir():
+    old_show = hl.Table.show
+
+
[docs] def new_show(t, n=10, width=140, truncate=40, types=True): # noqa: D103 + old_show(t, n, width, truncate, types)
+ + hl.Table.show = new_show + +TOOLS = "hover,save,pan,box_zoom,reset,wheel_zoom" + +COLOR_CPG = "#2E9FFE" +COLOR_TI = "#458B00" +COLOR_TV = "#EA4444" +ACG = "#422efe" +CCG = "#2e73fe" +GCG = "#2ec3fe" +TCG = "#2efef7" +COLOR_METHYLATED_CPG = "#2E37FE" + +variant_type_colors = { + "CpG": COLOR_CPG, + "non-CpG": COLOR_TI, + "non-CpG transition": COLOR_TI, + "transversion": COLOR_TV, + "ACG": ACG, + "CCG": CCG, + "GCG": GCG, + "TCG": TCG, + "methylated CpG": COLOR_METHYLATED_CPG, +} + +COLOR_SYN = "#AAAAAA" +COLOR_MIS = "#FF6103" +COLOR_LOF = "#9D1309" + +variant_annotation_colors = {x: COLOR_LOF for x in CSQ_CODING_HIGH_IMPACT} +variant_annotation_colors.update({x: COLOR_MIS for x in CSQ_CODING_MEDIUM_IMPACT}) +variant_annotation_colors.update( + {x: COLOR_SYN for x in CSQ_CODING_LOW_IMPACT + CSQ_NON_CODING} +) +variant_annotation_colors.update( + { + "stop_lost": COLOR_MIS, + "splice_region_variant": COLOR_SYN, + "start_lost": COLOR_SYN, + "Synonymous": COLOR_SYN, + "Missense": COLOR_MIS, + "LoF": COLOR_LOF, + "PTV": COLOR_LOF, + } +) + +variant_annotation_names = dict( + zip(CSQ_ORDER, [x.replace("_variant", "").replace("_", " ") for x in CSQ_ORDER]) +) +variant_annotation_names["stop_gained"] = "nonsense" +variant_annotation_names["5_prime_UTR_variant"] = "5' UTR" +variant_annotation_names["3_prime_UTR_variant"] = "3' UTR" + +dataset_colors = {"ExAC": "#4682B4", "gnomAD": "#73AB3D"} + + +
[docs]def plot_hail_hist( + hist_data: hl.Struct, + title: str = "Plot", + log: bool = False, + fill_color: str = "#033649", + outlier_fill_color: str = "#036564", + line_color: str = "#033649", + hover_mode: str = "mouse", + hide_zeros: bool = False, +) -> bokeh.plotting.figure: + """ + Plot histogram from Hail hist aggregation. + + `hist_data` can (and should) come straight from ht.aggregate(hl.agg.hist(ht.data, start, end, bins)) + + :param hist_data: Data to plot + :param title: Plot title + :param log: Whether the y-axis should be log + :param fill_color: Color to fill the histogram bars that fall within the hist boundaries + :param outlier_fill_color: Color to fill the histogram bars that fall outside the hist boundaries + :param line_color: Color of the lines around the histogram bars + :param hover_mode: Hover mode; one of 'mouse' (default), 'vline' or 'hline' + :param hide_zeros: Remove hist bars with 0 count + :return: Histogram plot + """ + return plot_multi_hail_hist( + {"hist": hist_data}, + title=title, + log=log, + fill_color={"hist": fill_color}, + outlier_fill_color={"hist": outlier_fill_color}, + line_color=line_color, + hover_mode=hover_mode, + hide_zeros=hide_zeros, + alpha=1.0, + )
+ + +
[docs]def plot_multi_hail_hist( + hist_data: Dict[str, hl.Struct], + title: str = "Plot", + log: bool = False, + fill_color: Dict[str, str] = None, + outlier_fill_color: Dict[str, str] = None, + line_color: str = "#033649", + hover_mode: str = "mouse", + hide_zeros: bool = False, + alpha: float = None, +) -> bokeh.plotting.figure: + """ + Plot multiple histograms on the same plot. + + Each histogram can (and should) come straight from ht.aggregate(hl.agg.hist(ht.data, start, end, bins)) + + Example usage: + + .. code-block:: python + + plot_multi_hail_hist(ht.aggregate(hl.agg.group_by(ht.pop, hl.agg.hist(ht.data, start, end, bins)))) + + :param hist_data: Data to plot + :param title: Plot title + :param log: Whether the y-axis should be log + :param fill_color: Color to fill the histogram bars that fall within the hist boundaries + :param outlier_fill_color: Color to fill the histogram bars that fall outside the hist boundaries + :param line_color: Color of the lines around the histogram bars + :param hover_mode: Hover mode; one of 'mouse' (default), 'vline' or 'hline' + :param hide_zeros: Remove hist bars with 0 count + :param alpha: Alpha value (if None, then 1.0/len(hist_data) is used) + :return: Histogram plot + """ + low = int(log) + + if alpha is None: + alpha = 1.0 / len(hist_data) + + if fill_color is None: + color_palette = d3["Category10"][max(3, len(hist_data))] + fill_color = { + hist_name: color_palette[i] for i, hist_name in enumerate(hist_data.keys()) + } + + if outlier_fill_color is None: + outlier_fill_color = fill_color + + p = ( + figure(title=title, y_axis_type="log", tools=TOOLS) + if log + else figure(title=title, tools=TOOLS) + ) + hists = [] + for label, hist in hist_data.items(): + data = {} + distance = abs(hist.bin_edges[0] - hist.bin_edges[1]) + data["top"] = [x + low for x in hist.bin_freq] + data["left"] = hist.bin_edges[:-1] + data["right"] = hist.bin_edges[1:] + data["color"] = [fill_color[label]] * len(hist.bin_freq) + if hist.n_smaller > 0: + data["top"].insert(0, hist.n_smaller + low) + data["left"].insert(0, hist.bin_edges[0] - distance) + data["right"].insert(0, hist.bin_edges[0]) + data["color"].insert(0, outlier_fill_color[label]) + if hist.n_larger > 0: + data["top"].append(hist.n_larger + low) + data["left"].append(hist.bin_edges[-1]) + data["right"].append(hist.bin_edges[-1] + distance) + data["color"].append(outlier_fill_color[label]) + + data["bottom"] = [low] * len(data["top"]) + data["label"] = [label] * len(data["top"]) + + hist_source = ColumnDataSource(data) + + # pylint: disable=unsubscriptable-object + hide_zeros_filter = BooleanFilter([top > 0 for top in hist_source.data["top"]]) + + view = ( + CDSView(source=hist_source, filters=[hide_zeros_filter]) + if hide_zeros + else CDSView(source=hist_source) + ) + hists.append( + ( + label, + [ + p.quad( + top="top", + bottom="bottom", + left="left", + right="right", + source=hist_source, + view=view, + fill_color="color", + alpha=alpha, + line_color=line_color, + ) + ], + ) + ) + + tooltips = [("bin", "$index"), ("bin_edges", "(@left, @right)"), ("freq", "@top")] + if len(hist_data) > 1: + tooltips.insert(0, ("label", "@label")) + p.add_layout( + Legend( + items=hists, + location=(0, 0), + orientation="horizontal", + click_policy="hide", + ), + "above", + ) + p.select_one(HoverTool).tooltips = tooltips + p.select_one(HoverTool).mode = hover_mode + num_data_points = sum([sum(x.bin_freq) for x in hist_data.values()]) + p.add_layout(Title(text=f"({num_data_points:,} data points)"), "above") + + 
return p
+ + +
[docs]def plot_hail_hist_cumulative( + hist_data: hl.Struct, + title: str = "Plot", + normalize: bool = True, + line_color: str = "#036564", + line_width: int = 3, + log: bool = False, + hover_mode: str = "mouse", +) -> bokeh.plotting.figure: + """ + Plot cumulative histogram from Hail hist aggregation. + + `hist_data` can (and should) come straight from ht.aggregate(hl.agg.hist(ht.data, start, end, bins)) + + :param hist_data: Data to plot + :param title: Plot title + :param normalize: Whether to normalize the data (0,1) + :param line_color: Color of the line + :param line_width: Width of the line + :param log: Whether the y-axis should be log + :param hover_mode: Hover mode; one of 'mouse' (default), 'vline' or 'hline' + :return: Histogram plot + """ + cumulative_data = np.cumsum(hist_data.bin_freq) + hist_data.n_smaller + np.append(cumulative_data, [cumulative_data[-1] + hist_data.n_larger]) + num_data_points = max(cumulative_data) + + if normalize: + cumulative_data = cumulative_data / num_data_points + p = ( + figure(title=title, y_axis_type="log", tools=TOOLS) + if log + else figure(title=title, tools=TOOLS) + ) + p.add_layout(Title(text=f"({num_data_points:,} data points)"), "above") + p.select_one(HoverTool).tooltips = [("index", "$index"), ("(x,y)", "(@x, @y)")] + p.select_one(HoverTool).mode = hover_mode + data_source = ColumnDataSource( + {"x": hist_data.bin_edges[:-1], "y": cumulative_data} + ) + p.line( + x="x", y="y", line_color=line_color, line_width=line_width, source=data_source + ) + return p
+ + +
[docs]def plot_hail_hist_both( + hist_data: hl.Struct, title: str, normalize: bool = True, log: bool = False +): # noqa: D103 + p1 = plot_hail_hist(hist_data, title, log) + p2 = plot_hail_hist_cumulative( + hist_data, f"{title} (Cumulative)", normalize, log=log + ) + return Tabs( + tabs=[TabPanel(child=p1, title="Raw"), TabPanel(child=p2, title="Cumulative")] + )
+ + +
[docs]def set_font_size(p, font_size: str = "12pt"): # noqa: D103 + p.title.text_font_size = font_size + p.legend.label_text_font_size = font_size + p.xaxis.axis_label_text_font_size = font_size + p.yaxis.axis_label_text_font_size = font_size + p.xaxis.major_label_text_font_size = font_size + p.yaxis.major_label_text_font_size = font_size + if hasattr(p.xaxis, "group_text_font_size"): + p.xaxis.group_text_font_size = font_size + return p
+ + +
[docs]def linear_and_log_tabs(plot_func: Callable, **kwargs) -> Tabs: # noqa: D103 + panels = [] + for axis_type in ["linear", "log"]: + fig = plot_func(**kwargs, axis_type=axis_type) + panel = TabPanel(child=fig, title=axis_type) + panels.append(panel) + + return Tabs(tabs=panels)
+ + +
[docs]def plot_hail_file_metadata( + t_path: str, +) -> Optional[Union[Grid, Tabs, bokeh.plotting.figure]]: + """ + Take path to hail Table or MatrixTable (gs://bucket/path/hail.mt), output Grid or Tabs, respectively. + + Or if an unordered Table is provided, a Figure with file sizes is output. + If metadata file or rows directory is missing, returns None. + """ + panel_size = 600 + subpanel_size = 150 + + files = hl.hadoop_ls(t_path) + rows_file = [x["path"] for x in files if x["path"].endswith("rows")] + entries_file = [x["path"] for x in files if x["path"].endswith("entries")] + # cols_file = [x['path'] for x in files if x['path'].endswith('cols')] + success_file = [ + x["modification_time"] for x in files if x["path"].endswith("SUCCESS") + ] + + data_type = "Table" + + metadata_file = [x["path"] for x in files if x["path"].endswith("metadata.json.gz")] + if not metadata_file: + logger.warning("No metadata file found. Exiting...") + return None + + with hl.hadoop_open(metadata_file[0], "rb") as f: + overall_meta = json.loads(f.read()) + rows_per_partition = overall_meta["components"]["partition_counts"]["counts"] + + if not rows_file: + logger.warning("No rows directory found. Exiting...") + return None + rows_files = hl.hadoop_ls(rows_file[0]) + + if entries_file: + data_type = "MatrixTable" + rows_file = [x["path"] for x in rows_files if x["path"].endswith("rows")] + rows_files = hl.hadoop_ls(rows_file[0]) + row_partition_bounds, row_file_sizes = get_rows_data(rows_files) + + total_file_size, row_file_sizes, row_scale = scale_file_sizes(row_file_sizes) + + if not row_partition_bounds: + logger.warning("Table is not partitioned. Only plotting file sizes") + row_file_sizes_hist, row_file_sizes_edges = np.histogram( + row_file_sizes, bins=50 + ) + p_file_size = figure(plot_width=panel_size, plot_height=panel_size) + p_file_size.quad( + right=row_file_sizes_hist, + left=0, + bottom=row_file_sizes_edges[:-1], + top=row_file_sizes_edges[1:], + fill_color="#036564", + line_color="#033649", + ) + p_file_size.yaxis.axis_label = f"File size ({row_scale}B)" + return p_file_size + + all_data = { + "partition_widths": [ + -1 if x[0] != x[2] else x[3] - x[1] for x in row_partition_bounds + ], + "partition_bounds": [ + f"{x[0]}:{x[1]}-{x[2]}:{x[3]}" for x in row_partition_bounds + ], + "spans_chromosome": [ + "Spans chromosomes" if x[0] != x[2] else "Within chromosome" + for x in row_partition_bounds + ], + "row_file_sizes": row_file_sizes, + "row_file_sizes_human": [f"{x:.1f} {row_scale}B" for x in row_file_sizes], + "rows_per_partition": rows_per_partition, + "index": list(range(len(rows_per_partition))), + } + + if entries_file: + entries_rows_files = hl.hadoop_ls(entries_file[0]) + entries_rows_file = [ + x["path"] for x in entries_rows_files if x["path"].endswith("rows") + ] + if entries_rows_file: + entries_files = hl.hadoop_ls(entries_rows_file[0]) + entry_partition_bounds, entry_file_sizes = get_rows_data(entries_files) + total_entry_file_size, entry_file_sizes, entry_scale = scale_file_sizes( + entry_file_sizes + ) + all_data["entry_file_sizes"] = entry_file_sizes + all_data["entry_file_sizes_human"] = [ + f"{x:.1f} {entry_scale}B" for x in row_file_sizes + ] + + title = f"{data_type}: {t_path}" + + msg = ( + f"Rows: {sum(all_data['rows_per_partition']):,}<br/>Partitions:" + f" {len(all_data['rows_per_partition']):,}<br/>Size: {total_file_size}<br/>" + ) + if success_file[0]: + msg += success_file[0] + + source = ColumnDataSource(pd.DataFrame(all_data)) + p = figure(tools=TOOLS, 
plot_width=panel_size, plot_height=panel_size) + p.title.text = title + p.xaxis.axis_label = "Number of rows" + p.yaxis.axis_label = f"File size ({row_scale}B)" + color_map = factor_cmap( + "spans_chromosome", + palette=Spectral8, + factors=list(set(all_data["spans_chromosome"])), + ) + p.scatter( + "rows_per_partition", + "row_file_sizes", + color=color_map, + legend="spans_chromosome", + source=source, + ) + p.legend.location = "bottom_right" + p.select_one(HoverTool).tooltips = [ + (x, f"@{x}") + for x in ( + "rows_per_partition", + "row_file_sizes_human", + "partition_bounds", + "index", + ) + ] + + p_stats = Div(text=msg) + p_rows_per_partition = figure( + x_range=p.x_range, plot_width=panel_size, plot_height=subpanel_size + ) + p_file_size = figure( + y_range=p.y_range, plot_width=subpanel_size, plot_height=panel_size + ) + + rows_per_partition_hist, rows_per_partition_edges = np.histogram( + all_data["rows_per_partition"], bins=50 + ) + p_rows_per_partition.quad( + top=rows_per_partition_hist, + bottom=0, + left=rows_per_partition_edges[:-1], + right=rows_per_partition_edges[1:], + fill_color="#036564", + line_color="#033649", + ) + row_file_sizes_hist, row_file_sizes_edges = np.histogram( + all_data["row_file_sizes"], bins=50 + ) + p_file_size.quad( + right=row_file_sizes_hist, + left=0, + bottom=row_file_sizes_edges[:-1], + top=row_file_sizes_edges[1:], + fill_color="#036564", + line_color="#033649", + ) + + rows_grid = gridplot([[p_rows_per_partition, p_stats], [p, p_file_size]]) + + if "entry_file_sizes" in all_data: + title = f"Statistics for {data_type}: {t_path}" + + msg = ( + f"Rows: {sum(all_data['rows_per_partition']):,}<br/>Partitions:" + f" {len(all_data['rows_per_partition']):,}<br/>Size:" + f" {total_entry_file_size}<br/>" + ) + if success_file[0]: + msg += success_file[0] + + source = ColumnDataSource(pd.DataFrame(all_data)) + panel_size = 600 + subpanel_size = 150 + p = figure(tools=TOOLS, plot_width=panel_size, plot_height=panel_size) + p.title.text = title + p.xaxis.axis_label = "Number of rows" + p.yaxis.axis_label = f"File size ({entry_scale}B)" + color_map = factor_cmap( + "spans_chromosome", + palette=Spectral8, + factors=list(set(all_data["spans_chromosome"])), + ) + p.scatter( + "rows_per_partition", + "entry_file_sizes", + color=color_map, + legend="spans_chromosome", + source=source, + ) + p.legend.location = "bottom_right" + p.select_one(HoverTool).tooltips = [ + (x, f"@{x}") + for x in ( + "rows_per_partition", + "entry_file_sizes_human", + "partition_bounds", + "index", + ) + ] + + p_stats = Div(text=msg) + p_rows_per_partition = figure( + x_range=p.x_range, plot_width=panel_size, plot_height=subpanel_size + ) + p_rows_per_partition.quad( + top=rows_per_partition_hist, + bottom=0, + left=rows_per_partition_edges[:-1], + right=rows_per_partition_edges[1:], + fill_color="#036564", + line_color="#033649", + ) + p_file_size = figure( + y_range=p.y_range, plot_width=subpanel_size, plot_height=panel_size + ) + + row_file_sizes_hist, row_file_sizes_edges = np.histogram( + all_data["entry_file_sizes"], bins=50 + ) + p_file_size.quad( + right=row_file_sizes_hist, + left=0, + bottom=row_file_sizes_edges[:-1], + top=row_file_sizes_edges[1:], + fill_color="#036564", + line_color="#033649", + ) + entries_grid = gridplot([[p_rows_per_partition, p_stats], [p, p_file_size]]) + + return Tabs( + tabs=[ + TabPanel(child=entries_grid, title="Entries"), + TabPanel(child=rows_grid, title="Rows"), + ] + ) + else: + return rows_grid
+ + +
[docs]def scale_file_sizes(file_sizes): # noqa: D103 + min_file_size = min(file_sizes) * 1.1 + total_file_size = sum(file_sizes) + all_scales = [("T", 1e12), ("G", 1e9), ("M", 1e6), ("K", 1e3), ("", 1e0)] + for overall_scale, overall_factor in all_scales: + if total_file_size > overall_factor: + total_file_size /= overall_factor + break + for scale, factor in all_scales: + if min_file_size > factor: + file_sizes = [x / factor for x in file_sizes] + break + total_file_size = f"{total_file_size:.1f} {overall_scale}B" + return total_file_size, file_sizes, scale
+ + +
[docs]def get_rows_data(rows_files): # noqa: D103 + file_sizes = [] + partition_bounds = [] + parts_file = [x["path"] for x in rows_files if x["path"].endswith("parts")] + if parts_file: + parts = hl.hadoop_ls(parts_file[0]) + for i, x in enumerate(parts): + index = x["path"].split(f"{parts_file[0]}/part-")[1].split("-")[0] + if i < len(parts) - 1: + test_index = ( + parts[i + 1]["path"] + .split(f"{parts_file[0]}/part-")[1] + .split("-")[0] + ) + if test_index == index: + continue + file_sizes.append(x["size_bytes"]) + metadata_file = [ + x["path"] for x in rows_files if x["path"].endswith("metadata.json.gz") + ] + if metadata_file: + with hl.hadoop_open(metadata_file[0], "rb") as f: + rows_meta = json.loads(f.read()) + try: + partition_bounds = [ + ( + x["start"]["locus"]["contig"], + x["start"]["locus"]["position"], + x["end"]["locus"]["contig"], + x["end"]["locus"]["position"], + ) + for x in rows_meta["jRangeBounds"] + ] + except KeyError: + pass + return partition_bounds, file_sizes
+ + +
[docs]def pair_plot( + data: pd.DataFrame, + label_col: str = None, + colors: Union[List[str], Dict[str, str]] = None, + tools: str = "save,pan,box_zoom,reset,wheel_zoom,box_select,lasso_select,help", + tooltip_cols: List[str] = None, +) -> Column: + """ + Plot each column of `data` against each other and returns a grid of plots. + + The diagonal contains a histogram of each column, or a density plot if labels are provided. + The lower diagonal contains scatter plots of each column against each other. + The upper diagonal is empty. + + All columns should be numerical with the exception of the `label_col` if provided. + A dict mapping labels to specific colors can be provided using `colors`. + + :param data: Dataframe to plot + :param label_col: Column of the DataFrame containing the labels + :param colors: RGB hex colors. If a dict is provided, it should contain the mapping of label to colors. + :param tools: Tools for the resulting plots + :param tooltip_cols: Additional columns that should be displayed in tooltip + :return: Grid of plots (column of rows) + """ + if tooltip_cols is None: + tooltip_cols = [] if label_col is None else [label_col] + elif label_col not in tooltip_cols: + tooltip_cols.append(label_col) + + if label_col is None and colors is not None: + logger.warning("`colors` ignored since no `label_col` specified") + + colors_col = "__pair_plot_color" + + colors_dict = {} + if label_col is None: + data[colors_col] = viridis(1) * len(data) + else: + if not isinstance(colors, dict): + labels = set(data[label_col]) + color_palette = viridis(len(labels)) if colors is None else colors + colors_dict = {l: color_palette[i] for i, l in enumerate(labels)} + else: + colors_dict = colors + data[colors_col] = [colors_dict.get(l, "gray") for l in data[label_col]] + tools = "hover," + tools + + data_cols = [ + c for c in data.columns if c not in [colors_col, label_col] + tooltip_cols + ] + data_ranges = [ + DataRange1d( + start=rmin - (abs(rmin - rmax) * 0.05), end=rmax + (abs(rmin - rmax) * 0.05) + ) + for rmin, rmax in zip(data[data_cols].min(axis=0), data[data_cols].max(axis=0)) + ] + data_source = ColumnDataSource(data={c: data[c] for c in data.columns}) + + n_cols = len(data_cols) + + plot_grid = [] + for i in range(n_cols): + row = [None] * n_cols + for j in range(i + 1): + p = figure( + x_axis_label=data_cols[j] if i == n_cols - 1 else "", + y_axis_label=data_cols[i] if j == 0 else "", + tools=tools, + ) + p.x_range = data_ranges[j] + + if i == j: + if label_col is None: + hist, edges = np.histogram( + data[data_cols[i]], density=False, bins=50 + ) + p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:]) + else: + density_data = ( + data[[label_col, data_cols[i]]] + .groupby(label_col) + .apply( + lambda df: np.histogram( + df[data_cols[i]], density=True, bins=20 + ) + ) + ) + for label, (hist, edges) in density_data.iteritems(): + line_source = ColumnDataSource( + { + "edges": edges[:-1], + "hist": hist, + label_col: [label] * len(hist), + } + ) + p.line( + "edges", + "hist", + color=colors_dict[label], + legend=label_col, + source=line_source, + ) + p.select_one(HoverTool).tooltips = [ + (label_col, f"@{label_col}") + ] + else: + p.y_range = data_ranges[i] + if label_col is not None: + p.circle( + data_cols[j], + data_cols[i], + source=data_source, + color=colors_col, + legend=label_col, + ) + else: + p.circle( + data_cols[j], data_cols[i], source=data_source, color=colors_col + ) + if tooltip_cols: + p.select_one(HoverTool).tooltips = 
[ + (x, f"@{x}") for x in tooltip_cols + ] + + row[j] = p + plot_grid.append(row) + + return gridplot(plot_grid, toolbar_location="left")
+
+ +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/_modules/gnomad/utils/reference_genome.html b/_modules/gnomad/utils/reference_genome.html new file mode 100644 index 000000000..9d1061fe3 --- /dev/null +++ b/_modules/gnomad/utils/reference_genome.html @@ -0,0 +1,244 @@ + + + + + + gnomad.utils.reference_genome — gnomad master documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+
+
+
+
+
+
+
+
+
+ +

Source code for gnomad.utils.reference_genome

+# noqa: D100
+
+import logging
+from typing import List, Optional, Union
+
+import hail as hl
+
+logging.basicConfig(
+    format="%(asctime)s (%(name)s %(lineno)s): %(message)s",
+    datefmt="%m/%d/%Y %I:%M:%S %p",
+)
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+
+
[docs]def get_reference_ht( + ref: hl.ReferenceGenome, + contigs: Optional[List[str]] = None, + excluded_intervals: Optional[List[hl.Interval]] = None, + add_all_substitutions: bool = False, + filter_n: bool = True, +) -> hl.Table: + """ + Create a reference Table with locus and alleles (containing only the reference allele by default) from the given reference genome. + + .. note:: + + If the `contigs` argument is not provided, all contigs (including obscure ones) will be added to the table. + This can be slow as contigs are added one by one. + + :param ref: Input reference genome + :param contigs: An optional list of contigs that the Table should include + :param excluded_intervals: An optional list of intervals to exclude + :param add_all_substitutions: If set, then all possible substitutions are added in the alleles array + :param filter_n: If set, bases where the reference is unknown (n) are filtered. + :return: + """ + if not ref.has_sequence(): + add_reference_sequence(ref) + + if not contigs: + contigs = ref.contigs + + if add_all_substitutions: + SUBSTITUTIONS_TABLE = hl.literal( + { + "a": ["c", "g", "t"], + "c": ["a", "g", "t"], + "g": ["a", "c", "t"], + "t": ["a", "c", "g"], + } + ) + + context = [] + for contig in contigs: + n_partitions = max(1, int(ref.contig_length(contig) / 5000000)) + logger.info( + "Creating reference contig %s with %d partitions.", contig, n_partitions + ) + _context = hl.utils.range_table( + ref.contig_length(contig), n_partitions=n_partitions + ) + + locus_expr = hl.locus(contig=contig, pos=_context.idx + 1, reference_genome=ref) + ref_allele_expr = locus_expr.sequence_context().lower() + if add_all_substitutions: + alleles_expr = hl.array([ref_allele_expr]).extend( + SUBSTITUTIONS_TABLE.get(ref_allele_expr, hl.empty_array(hl.tstr)) + ) + else: + alleles_expr = [ref_allele_expr] + + _context = ( + _context.select(locus=locus_expr, alleles=alleles_expr) + .key_by("locus", "alleles") + .drop("idx") + ) + + if excluded_intervals is not None: + _context = hl.filter_intervals(_context, excluded_intervals, keep=False) + + if filter_n: + _context = _context.filter(_context.alleles[0] == "n", keep=False) + + context.append(_context) + + return context.pop().union(*context)
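+# Example use (a minimal sketch; builds a single-contig reference Table with
+# all possible substitutions in the alleles array):
+#
+#     ref = hl.get_reference("GRCh38")
+#     ref_ht = get_reference_ht(ref, contigs=["chr20"], add_all_substitutions=True)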
+ + +
[docs]def add_reference_sequence(ref: hl.ReferenceGenome) -> hl.ReferenceGenome: + """ + Add the fasta sequence to a Hail reference genome. + + Only GRCh37 and GRCh38 references are supported. + + :param ref: Input reference genome. + :return: + """ + if not ref.has_sequence(): + if ref.name == "GRCh38": + ref.add_sequence( + "gs://hail-common/references/Homo_sapiens_assembly38.fasta.gz", + "gs://hail-common/references/Homo_sapiens_assembly38.fasta.fai", + ) + elif ref.name == "GRCh37": + ref.add_sequence( + "gs://hail-common/references/human_g1k_v37.fasta.gz", + "gs://hail-common/references/human_g1k_v37.fasta.fai", + ) + else: + raise NotImplementedError( + f"No known location for the fasta/fai files for genome {ref.name}. Only" + " GRCh37 and GRCh38 are supported at this time." + ) + else: + logger.info( + "Reference genome sequence already present. Ignoring" + " add_reference_sequence." + ) + + return ref
+ + +
[docs]def get_reference_genome( + locus: Union[hl.expr.LocusExpression, hl.expr.IntervalExpression], + add_sequence: bool = False, +) -> hl.ReferenceGenome: + """ + Return the reference genome associated with the input Locus expression. + + :param locus: Input locus + :param add_sequence: If set, the fasta sequence is added to the reference genome + :return: Reference genome + """ + if isinstance(locus, hl.expr.LocusExpression): + ref = locus.dtype.reference_genome + else: + assert isinstance(locus, hl.expr.IntervalExpression) + ref = locus.dtype.point_type.reference_genome + if add_sequence: + ref = add_reference_sequence(ref) + return ref
+
+ +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/_modules/gnomad/utils/release.html b/_modules/gnomad/utils/release.html new file mode 100644 index 000000000..1a81391b9 --- /dev/null +++ b/_modules/gnomad/utils/release.html @@ -0,0 +1,255 @@ + + + + + + gnomad.utils.release — gnomad master documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for gnomad.utils.release

+# noqa: D100
+
+import logging
+from typing import Dict, List, Optional
+
+import hail as hl
+
+from gnomad.resources.grch38.gnomad import (
+    CURRENT_MAJOR_RELEASE,
+    GROUPS,
+    POPS,
+    SEXES,
+    SUBSETS,
+)
+from gnomad.utils.vcf import SORT_ORDER, index_globals
+
+logging.basicConfig(
+    format="%(asctime)s (%(name)s %(lineno)s): %(message)s",
+    datefmt="%m/%d/%Y %I:%M:%S %p",
+)
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+
+
[docs]def make_faf_index_dict( + faf_meta: List[Dict[str, str]], + groups: List[str] = ["adj"], + pops: List[str] = POPS[CURRENT_MAJOR_RELEASE]["exomes"], + sexes: List[str] = SEXES, + label_delimiter: str = "_", +) -> Dict[str, int]: + """ + Create a look-up Dictionary for entries contained in the filter allele frequency annotation array. + + :param faf_meta: Global annotation containing the set of groupings for each element of the faf array + (e.g., [{'group': 'adj'}, {'group': 'adj', 'pop': 'nfe'}]) + :param groups: List of sample groups [adj, raw]. Default is ["adj"]. + :param pops: List of sample global population names for gnomAD data type. Default is POPS[CURRENT_MAJOR_RELEASE]["exomes"]. + :param sexes: List of sample sexes used in VCF export. Default is SEXES + :param label_delimiter: String used as delimiter when making group label combinations + :return: Dictionary of faf annotation population groupings, where values are the corresponding 0-based indices for the + groupings in the faf_meta array + """ + + def _get_index(label_groups): + return index_globals(faf_meta, label_groups, label_delimiter) + + index_dict = { + **_get_index(dict(group=groups)), + **_get_index(dict(group=groups, pop=pops)), + **_get_index(dict(group=groups, sex=sexes)), + **_get_index(dict(group=groups, pop=pops, sex=sexes)), + } + return index_dict
+ + +
[docs]def make_freq_index_dict( + freq_meta: List[Dict[str, str]], + groups: List[str] = GROUPS, + pops: List[str] = POPS[CURRENT_MAJOR_RELEASE]["exomes"], + sexes: List[str] = SEXES, + subsets: List[str] = SUBSETS[CURRENT_MAJOR_RELEASE], + downsamplings: Optional[List[int]] = None, + label_delimiter: str = "_", +) -> Dict[str, int]: + """ + Create a look-up Dictionary for entries contained in the frequency annotation array. + + .. note:: + + Downsampling groupings are only computed on 'adj'-filtered genotypes. + + :param freq_meta: List containing the set of groupings for each element of the freq array + (e.g., [{'group': 'adj'}, {'group': 'adj', 'pop': 'nfe'}]) + :param groups: List of sample groups [adj, raw]. Default is GROUPS + :param pops: List of sample global population names for gnomAD data type. Default is POPS[CURRENT_MAJOR_RELEASE]["exomes"]. + :param sexes: List of sample sexes used in VCF export. Default is SEXES + :param subsets: List of sample subsets in dataset. Default is SUBSETS[CURRENT_MAJOR_RELEASE] + :param downsamplings: List of downsampling cohort sizes present in global frequency array + :param label_delimiter: String used as delimiter when making group label combinations + :return: Dictionary keyed by the grouping combinations found in the frequency array, where values are the corresponding + 0-based indices for the groupings in the freq_meta array + """ + + def _get_index(label_groups): + return index_globals(freq_meta, label_groups, label_delimiter) + + index_dict = { + **_get_index(dict(group=groups)), + **_get_index(dict(group=groups, pop=pops)), + **_get_index(dict(group=groups, sex=sexes)), + **_get_index(dict(group=groups, pop=pops, sex=sexes)), + **_get_index(dict(group=groups, subset=subsets)), + **_get_index(dict(group=groups, subset=subsets, pop=pops)), + **_get_index(dict(group=groups, subset=subsets, sex=sexes)), + **_get_index(dict(group=groups, subset=subsets, pop=pops, sex=sexes)), + } + + if downsamplings: + index_dict.update( + {**_get_index(dict(downsampling=downsamplings, group=["adj"], pop=pops))} + ) + + return index_dict
+ + +
[docs]def make_freq_index_dict_from_meta( + freq_meta: List[Dict[str, str]], + label_delimiter: str = "_", + sort_order: Optional[List[str]] = SORT_ORDER, +) -> Dict[str, int]: + """ + Create a dictionary for accessing frequency array. + + The dictionary is keyed by the grouping combinations found in the frequency metadata + array, where values are the corresponding 0-based indices for the groupings in the + frequency array. For example, if the `freq_meta` entry [{'pop': 'nfe'}, {'sex': 'XX'}] + corresponds to the 5th entry in the frequency array, the returned dictionary entry + would be {'nfe_XX': 4}. + + :param freq_meta: List of dictionaries containing frequency metadata. + :param label_delimiter: Delimiter to use when joining frequency metadata labels. + :param sort_order: List of frequency metadata labels to use when sorting the dictionary. + :return: Dictionary of frequency metadata. + """ + # Confirm all groups in freq_meta are in sort_order. Warn user if not. + if sort_order is not None: + diff = hl.eval(hl.set(freq_meta.flatmap(lambda i: i.keys()))) - set(sort_order) + if diff: + logger.warning( + "Found unexpected frequency metadata groupings: %s. These groupings" + " are not present in the provided sort_order: %s. These groupings" + " will not be included in the returned dictionary.", + diff, + sort_order, + ) + + index_dict = {} + for i, f in enumerate(hl.eval(freq_meta)): + if sort_order is None or len(set(f.keys()) - set(sort_order)) < 1: + index_dict[ + label_delimiter.join( + [ + f[g] + for g in sorted( + f.keys(), + key=(lambda x: sort_order.index(x)) if sort_order else None, + ) + ] + ) + ] = i + + return index_dict
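A small sketch using a literal `freq_meta` expression (hypothetical values):

.. code-block:: python

    import hail as hl

    freq_meta = hl.literal(
        [
            {"group": "adj"},
            {"group": "raw"},
            {"group": "adj", "pop": "nfe", "sex": "XX"},
        ]
    )

    freq_index_dict = make_freq_index_dict_from_meta(freq_meta)
    # Keys are label_delimiter-joined groupings ordered by sort_order; with the
    # default SORT_ORDER the third entry maps to index 2 under a key such as
    # "nfe_XX_adj".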
+
+ +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/_modules/gnomad/utils/slack.html b/_modules/gnomad/utils/slack.html new file mode 100644 index 000000000..41498e59f --- /dev/null +++ b/_modules/gnomad/utils/slack.html @@ -0,0 +1,280 @@ + + + + + + gnomad.utils.slack — gnomad master documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for gnomad.utils.slack

+# noqa: D100
+
+import os
+import sys
+import time
+import traceback
+import typing
+from contextlib import contextmanager
+
+from slack import WebClient
+
+
+
[docs]class SlackClient: + """ + Slack API client. + + :param token: Slack API token + """ + + def __init__(self, token: str): + self._client = WebClient(token=token) + self._display_name_map = None + + def _load_display_name_map(self): + display_name_map = {} + response = self._client.users_list(limit=100) + for user in response["members"]: + if not (user["deleted"] or user["is_bot"]): + display_name_map[user["profile"]["display_name"]] = user["id"] + + while response["response_metadata"]["next_cursor"]: + next_cursor = response["response_metadata"]["next_cursor"] + response = self._client.users_list(cursor=next_cursor, limit=100) + for user in response["members"]: + if not (user["deleted"] or user["is_bot"]): + display_name_map[user["profile"]["display_name"]] = user["id"] + + self._display_name_map = display_name_map + + def _get_direct_message_channel(self, user: str): + if not self._display_name_map: + self._load_display_name_map() + + if user.startswith("@"): + user = user[1:] + + try: + user_id = self._display_name_map[user] + except KeyError: + raise ValueError(f"User '{user}' not found in this workspace") + else: + response = self._client.conversations_open(users=[user_id]) + return response["channel"]["id"] + +
[docs] def send_file( + self, + to: typing.Union[str, typing.Iterable[str]], + file: typing.Optional[str] = None, + content: typing.Optional[str] = None, + filename: str = "data.txt", + filetype: str = "text", + comment: typing.Optional[str] = None, + ): + """ + Send a file to Slack channel(s) and/or user(s). + + :param to: Channel(s) (prefixed with '#') and/or user(s) (prefixed with '@') to send message to + :param file: Path of file to upload + :param content: File content to upload + :param filename: Filename of file + :param filetype: File type identifier + :param comment: Text for message sharing file + """ + if not (content or file) or (content and file): + raise ValueError( + "One, but not both, of 'content' or 'file' must be provided" + ) + + if isinstance(to, str): + to = [to] + + for channel_or_user in to: + if channel_or_user.startswith("@"): + channel = self._get_direct_message_channel(channel_or_user) + else: + channel = channel_or_user + + optional_args = {} + if file: + optional_args["file"] = file + else: + optional_args["content"] = content + + if comment: + optional_args["initial_comment"] = comment + + self._client.files_upload( + channels=channel, + filename=filename, + filetype=filetype, + **optional_args, + )
+ +
[docs] def send_message( + self, + to: typing.Union[str, typing.Iterable[str]], + message: str, + icon_emoji: typing.Optional[str] = None, + ): + """ + Send a message to Slack channel(s) and/or user(s). + + :param to: Channel(s) (prefixed with '#') and/or user(s) (prefixed with '@') to send message to + :param message: Message content (long messages will be converted to snippets) + :param icon_emoji: Emoji to use as icon for message + """ + if isinstance(to, str): + to = [to] + + for channel_or_user in to: + if channel_or_user.startswith("@"): + channel = self._get_direct_message_channel(channel_or_user) + else: + channel = channel_or_user + + if len(message) > 4000: + self._client.files_upload( + channels=channel, + content=message, + filename="message.txt", + filetype="text", + ) + else: + optional_args = {} + if icon_emoji: + optional_args["icon_emoji"] = icon_emoji + + self._client.chat_postMessage( + channel=channel, text=message, parse="full", **optional_args + )
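A hedged usage sketch (the token and recipients are placeholders):

.. code-block:: python

    client = SlackClient("xoxb-...")  # placeholder API token

    # Post a short message to a channel and to a user.
    client.send_message(["#my-channel", "@some.user"], "Pipeline finished!")

    # Share a small text snippet as a file with a comment.
    client.send_file(
        "@some.user",
        content="step1: ok\nstep2: ok",
        filename="status.txt",
        comment="Run summary",
    )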
+ + +
[docs]@contextmanager +def slack_notifications(token: str, to: typing.Union[str, typing.Iterable[str]]): + """ + Send a Slack notification after some code runs. + + If the wrapped code block raises an exception, the notification will include the exception and stack trace. + + Example usage: + + .. code-block:: python + + with slack_notifications(token, "@username"): + run_analysis() + + :param token: Slack API token + :param to: Channel(s) (prefixed with '#') and/or user(s) (prefixed with '@') to send notification to + """ + process = os.path.basename(sys.argv[0]) + try: + yield + + slack_client = SlackClient(token) + slack_client.send_message( + to, f":white_check_mark: Success! {process} finished!" + ) + except Exception as e: + slack_client = SlackClient(token) + slack_client.send_file( + to, + content=traceback.format_exc(), + filename=f"error_{process}_{time.strftime('%Y-%m-%d_%H:%M')}.log", + filetype="text", + comment=f":x: Error in {process}", + ) + + raise
+
+ +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/_modules/gnomad/utils/sparse_mt.html b/_modules/gnomad/utils/sparse_mt.html new file mode 100644 index 000000000..ec209e548 --- /dev/null +++ b/_modules/gnomad/utils/sparse_mt.html @@ -0,0 +1,1507 @@ + + + + + + gnomad.utils.sparse_mt — gnomad master documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for gnomad.utils.sparse_mt

+# noqa: D100
+
+import logging
+from typing import Callable, Dict, List, Optional, Set, Tuple, Union
+
+import hail as hl
+
+from gnomad.sample_qc.sex import adjusted_sex_ploidy_expr
+from gnomad.utils.annotations import (
+    agg_by_strata,
+    annotate_adj,
+    fs_from_sb,
+    generate_freq_group_membership_array,
+    get_adj_expr,
+    get_lowqual_expr,
+    pab_max_expr,
+    sor_from_sb,
+)
+from gnomad.utils.intervals import interval_length, union_intervals
+from gnomad.utils.reference_genome import get_reference_genome
+
+logging.basicConfig(
+    format="%(asctime)s (%(name)s %(lineno)s): %(message)s",
+    datefmt="%m/%d/%Y %I:%M:%S %p",
+)
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+INFO_AGG_FIELDS = {
+    "sum_agg_fields": ["QUALapprox"],
+    "int32_sum_agg_fields": ["VarDP"],
+    "median_agg_fields": ["ReadPosRankSum", "MQRankSum"],
+    "array_sum_agg_fields": ["SB", "RAW_MQandDP"],
+}
+
+AS_INFO_AGG_FIELDS = {
+    "sum_agg_fields": ["AS_QUALapprox", "AS_RAW_MQ"],
+    "int32_sum_agg_fields": ["AS_VarDP"],
+    "median_agg_fields": ["AS_RAW_ReadPosRankSum", "AS_RAW_MQRankSum"],
+    "array_sum_agg_fields": ["AS_SB_TABLE"],
+}
+
+
+
[docs]def compute_last_ref_block_end(mt: hl.MatrixTable) -> hl.Table: + """ + Compute the genomic position of the most upstream reference block overlapping each row on a sparse MT. + + Note that since reference blocks do not extend beyond contig boundaries, only the position is kept. + + This function returns a Table with that annotation. (`last_END_position`). + + :param mt: Input MatrixTable + :return: Output Table with `last_END_position` annotation + """ + mt = mt.select_entries("END") + + # Localize entries, so that they can be viewed as an array and scanned + # over using hl.scan.array_agg + ht = mt._localize_entries("__entries", "__cols") + + # Compute the position by using hl.scan._prev_nonnull. + # This was inspired by hl.experimental.densify + # _prev_non_null is an aggregator that keeps the previous record in memory + # and updates it with the given value at the row if it's not null (missing) + # The following code computes the following annotation for each row: + # 1. Keep a scan of the entries using _prev_nonnull, keeping the start (ht.locus) and end (entry.END) of each ref block (1.1) + # 2. For the current row locus, record the start of the block that starts the furthest away, + # that is the minimum position in the current scan for any block that + # overlaps the current locus (2.1) + ht = ht.select( + last_END_position=hl.or_else( + hl.min( # 2. For the current row locus, record the start of the block that starts the furthest away + hl.scan.array_agg( + lambda entry: hl.scan._prev_nonnull( # 1. Keep a scan of the entries using _prev_nonnull + hl.or_missing( + hl.is_defined( + entry.END + ), # Update the scan whenever a new ref block is encountered + hl.tuple( + [ # 1.1 keep the start (ht.locus) and end (entry.END) of each ref block + ht.locus, + entry.END, + ] + ), + ) + ), + ht.__entries, + ).map( + lambda x: hl.or_missing( # 2.1 get the start position of blocks that overlap the current locus + (x[1] >= ht.locus.position) & (x[0].contig == ht.locus.contig), + x[0].position, + ) + ) + ), + ht.locus.position, + ) + ) + return ht.select_globals().key_by("locus")
+ + +
[docs]def densify_sites( + mt: hl.MatrixTable, + sites_ht: hl.Table, + last_END_positions_ht: hl.Table, + semi_join_rows: bool = True, +) -> hl.MatrixTable: + """ + Create a dense version of the input sparse MT at the sites in `sites_ht` reading the minimal amount of data required. + + Note that only rows that appear both in `mt` and `sites_ht` are returned. + + :param mt: Input sparse MT + :param sites_ht: Desired sites to densify + :param last_END_positions_ht: Table storing positions of the furthest ref block (END tag) + :param semi_join_rows: Whether to filter the MT rows based on semi-join (default, better if sites_ht is large) or based on filter_intervals (better if sites_ht only contains a few sites) + :return: Dense MT filtered to the sites in `sites_ht` + """ + logger.info("Computing intervals to densify from sites Table.") + sites_ht = sites_ht.key_by("locus") + sites_ht = sites_ht.annotate( + interval=hl.locus_interval( + sites_ht.locus.contig, + last_END_positions_ht[sites_ht.key].last_END_position, + end=sites_ht.locus.position, + includes_end=True, + reference_genome=sites_ht.locus.dtype.reference_genome, + ) + ) + sites_ht = sites_ht.filter(hl.is_defined(sites_ht.interval)) + + if semi_join_rows: + mt = mt.filter_rows(hl.is_defined(sites_ht.key_by("interval")[mt.locus])) + else: + logger.info("Collecting intervals to densify.") + intervals = sites_ht.interval.collect() + + print( + "Found {0} intervals, totalling {1} bp in the dense Matrix.".format( + len(intervals), + sum( + [ + interval_length(interval) + for interval in union_intervals(intervals) + ] + ), + ) + ) + + mt = hl.filter_intervals(mt, intervals) + + mt = hl.experimental.densify(mt) + + return mt.filter_rows(hl.is_defined(sites_ht[mt.locus]))
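A sketch of how these two functions chain together (paths are hypothetical):

.. code-block:: python

    import hail as hl

    mt = hl.read_matrix_table("gs://my-bucket/sparse.mt")  # hypothetical path
    sites_ht = hl.read_table("gs://my-bucket/sites.ht")  # hypothetical path

    # Scan the sparse MT once to record how far upstream reference blocks extend.
    last_end_ht = compute_last_ref_block_end(mt)

    # Densify only at the requested sites, reading the minimal set of rows.
    dense_mt = densify_sites(mt, sites_ht, last_end_ht)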
+ + +def _get_info_agg_expr( + mt: hl.MatrixTable, + sum_agg_fields: Union[ + List[str], Dict[str, hl.expr.NumericExpression] + ] = INFO_AGG_FIELDS["sum_agg_fields"], + int32_sum_agg_fields: Union[ + List[str], Dict[str, hl.expr.NumericExpression] + ] = INFO_AGG_FIELDS["int32_sum_agg_fields"], + median_agg_fields: Union[ + List[str], Dict[str, hl.expr.NumericExpression] + ] = INFO_AGG_FIELDS["median_agg_fields"], + array_sum_agg_fields: Union[ + List[str], Dict[str, hl.expr.ArrayNumericExpression] + ] = INFO_AGG_FIELDS["array_sum_agg_fields"], + prefix: str = "", + treat_fields_as_allele_specific: bool = False, +) -> Dict[str, hl.expr.Aggregation]: + """ + Create aggregators for either site-level or allele-specific (AS) info expression aggregations. + + .. note:: + + - If `SB` is specified in array_sum_agg_fields, it will be aggregated as + `AS_SB_TABLE`, according to GATK standard nomenclature. + - If `RAW_MQandDP` is specified in array_sum_agg_fields, it will be used for + the `MQ` calculation and then dropped according to GATK recommendation. + - If `RAW_MQ` and `MQ_DP` are given, they will be used for the `MQ` calculation + and then dropped according to GATK recommendation. + - If the fields to be aggregated (`sum_agg_fields`, `int32_sum_agg_fields`, + `median_agg_fields`) are passed as list of str, then they should correspond + to entry fields in `mt` or in `mt.gvcf_info`. + - Priority is given to entry fields in `mt` over those in `mt.gvcf_info` in + case of a name clash. + + :param mt: Input MT + :param sum_agg_fields: Fields to aggregate using sum. + :param int32_sum_agg_fields: Fields to aggregate using sum and cast to int32. + :param median_agg_fields: Fields to aggregate using (approximate) median. + :param array_sum_agg_fields: Fields to aggregate using element-wise summing over an + array. + :param prefix: Optional prefix for the fields. Used for adding 'AS_' in the AS case. + :param treat_fields_as_allele_specific: Treat info fields as allele-specific. Defaults to False. + :return: Dictionary of expression names and their corresponding aggregation + Expression. + """ + + def _agg_list_to_dict( + mt: hl.MatrixTable, fields: List[str] + ) -> Dict[str, hl.expr.NumericExpression]: + out_fields = {} + if "gvcf_info" in mt.entry: + out_fields = {f: mt.gvcf_info[f] for f in fields if f in mt.gvcf_info} + + out_fields.update({f: mt[f] for f in fields if f in mt.entry}) + + # Check that all fields were found. + missing_fields = [f for f in fields if f not in out_fields] + if missing_fields: + raise ValueError( + "Could not find the following field(s) in the MT entry schema (or nested" + " under mt.gvcf_info): {}".format(",".join(missing_fields)) + ) + + if treat_fields_as_allele_specific: + # TODO: Change to use hl.vds.local_to_global when fill_value can accept + # missing (error in v0.2.119). + out_fields = { + f: hl.bind( + lambda x: hl.if_else(f == "AS_SB_TABLE", x, x[1:]), + hl.range(hl.len(mt.alleles)).map( + lambda i: hl.or_missing( + mt.LA.contains(i), out_fields[f][mt.LA.index(i)] + ) + ), + ) + for f in fields + } + + return out_fields + + # Map str to expressions where needed. 
+ if isinstance(sum_agg_fields, list): + sum_agg_fields = _agg_list_to_dict(mt, sum_agg_fields) + + if isinstance(int32_sum_agg_fields, list): + int32_sum_agg_fields = _agg_list_to_dict(mt, int32_sum_agg_fields) + + if isinstance(median_agg_fields, list): + median_agg_fields = _agg_list_to_dict(mt, median_agg_fields) + + if isinstance(array_sum_agg_fields, list): + array_sum_agg_fields = _agg_list_to_dict(mt, array_sum_agg_fields) + + aggs = [ + (median_agg_fields, lambda x: hl.agg.approx_quantiles(x, 0.5)), + (sum_agg_fields, hl.agg.sum), + (int32_sum_agg_fields, lambda x: hl.int32(hl.agg.sum(x))), + (array_sum_agg_fields, hl.agg.array_sum), + ] + + # Create aggregators. + agg_expr = {} + for agg_fields, agg_func in aggs: + for k, expr in agg_fields.items(): + if treat_fields_as_allele_specific: + # If annotation is of the form 'AS_RAW_*_RankSum' it has a histogram + # representation where keys give the per-variant rank sum value to one + # decimal place followed by a comma and the corresponding count for + # that value, so we want to sum the rank sum value (first element). + # Rename annotation in the form 'AS_RAW_*_RankSum' to 'AS_*_RankSum'. + if k.startswith("AS_RAW_") and k.endswith("RankSum"): + agg_expr[f"{prefix}{k.replace('_RAW', '')}"] = hl.agg.array_agg( + lambda x: agg_func(hl.or_missing(hl.is_defined(x), x[0])), expr + ) + else: + agg_expr[f"{prefix}{k}"] = hl.agg.array_agg( + lambda x: agg_func(x), expr + ) + else: + agg_expr[f"{prefix}{k}"] = agg_func(expr) + + if treat_fields_as_allele_specific: + prefix = "AS_" + + # Handle annotations combinations and casting for specific annotations + # If RAW_MQandDP is in agg_expr or if both MQ_DP and RAW_MQ are, compute MQ instead + mq_tuple = None + if f"{prefix}RAW_MQandDP" in agg_expr: + logger.info( + "Computing %sMQ as sqrt(%sRAW_MQandDP[0]/%sRAW_MQandDP[1]). " + "Note that %sMQ will be set to 0 if %sRAW_MQandDP[1] == 0.", + *[prefix] * 5, + ) + mq_tuple = agg_expr.pop(f"{prefix}RAW_MQandDP") + elif "AS_RAW_MQ" in agg_expr and treat_fields_as_allele_specific: + logger.info( + "Computing AS_MQ as sqrt(AS_RAW_MQ[i]/AD[i+1]). " + "Note that AS_MQ will be set to 0 if AS_RAW_MQ == 0." + ) + ad_expr = hl.vds.local_to_global( + mt.LAD, mt.LA, hl.len(mt.alleles), fill_value=0, number="R" + ) + mq_tuple = hl.zip(agg_expr.pop("AS_RAW_MQ"), hl.agg.array_sum(ad_expr[1:])) + elif f"{prefix}RAW_MQ" in agg_expr and f"{prefix}MQ_DP" in agg_expr: + logger.info( + "Computing %sMQ as sqrt(%sRAW_MQ/%sMQ_DP). " + "Note that MQ will be set to 0 if %sRAW_MQ == 0.", + *[prefix] * 4, + ) + mq_tuple = (agg_expr.pop(f"{prefix}RAW_MQ"), agg_expr.pop(f"{prefix}MQ_DP")) + + if mq_tuple is not None: + if treat_fields_as_allele_specific: + agg_expr[f"{prefix}MQ"] = mq_tuple.map( + lambda x: hl.if_else(x[1] > 0, hl.sqrt(x[0] / x[1]), 0) + ) + else: + agg_expr[f"{prefix}MQ"] = hl.if_else( + mq_tuple[1] > 0, hl.sqrt(mq_tuple[0] / mq_tuple[1]), 0 + ) + + # If both VarDP and QUALapprox are present, also compute QD. + if f"{prefix}VarDP" in agg_expr and f"{prefix}QUALapprox" in agg_expr: + logger.info( + "Computing %sQD as %sQUALapprox/%sVarDP. 
" + "Note that %sQD will be set to 0 if %sVarDP == 0.", + *[prefix] * 5, + ) + var_dp = agg_expr[f"{prefix}VarDP"] + qual_approx = agg_expr[f"{prefix}QUALapprox"] + if treat_fields_as_allele_specific: + agg_expr[f"{prefix}QD"] = hl.map( + lambda x: hl.if_else(x[1] > 0, x[0] / x[1], 0), + hl.zip(qual_approx, var_dp), + ) + else: + agg_expr[f"{prefix}QD"] = hl.if_else(var_dp > 0, qual_approx / var_dp, 0) + + # SB needs to be cast to int32 for FS down the line. + if f"{prefix}SB" in agg_expr: + agg_expr[f"{prefix}SB"] = agg_expr[f"{prefix}SB"].map(lambda x: hl.int32(x)) + + # SB needs to be cast to int32 for FS down the line. + if "AS_SB_TABLE" in agg_expr: + agg_expr["AS_SB_TABLE"] = agg_expr["AS_SB_TABLE"].map( + lambda x: x.map(lambda y: hl.int32(y)) + ) + + return agg_expr + + +
[docs]def get_as_info_expr( + mt: hl.MatrixTable, + sum_agg_fields: Union[ + List[str], Dict[str, hl.expr.NumericExpression] + ] = INFO_AGG_FIELDS["sum_agg_fields"], + int32_sum_agg_fields: Union[ + List[str], Dict[str, hl.expr.NumericExpression] + ] = INFO_AGG_FIELDS["int32_sum_agg_fields"], + median_agg_fields: Union[ + List[str], Dict[str, hl.expr.NumericExpression] + ] = INFO_AGG_FIELDS["median_agg_fields"], + array_sum_agg_fields: Union[ + List[str], Dict[str, hl.expr.ArrayNumericExpression] + ] = INFO_AGG_FIELDS["array_sum_agg_fields"], + alt_alleles_range_array_field: str = "alt_alleles_range_array", + treat_fields_as_allele_specific: bool = False, +) -> hl.expr.StructExpression: + """ + Return an allele-specific annotation Struct containing typical VCF INFO fields from GVCF INFO fields stored in the MT entries. + + .. note:: + + - If `SB` is specified in array_sum_agg_fields, it will be aggregated as + `AS_SB_TABLE`, according to GATK standard nomenclature. + - If `RAW_MQandDP` is specified in array_sum_agg_fields, it will be used for + the `MQ` calculation and then dropped according to GATK recommendation. + - If `RAW_MQ` and `MQ_DP` are given, they will be used for the `MQ` calculation + and then dropped according to GATK recommendation. + - If the fields to be aggregated (`sum_agg_fields`, `int32_sum_agg_fields`, + `median_agg_fields`) are passed as list of str, then they should correspond + to entry fields in `mt` or in `mt.gvcf_info`. + - Priority is given to entry fields in `mt` over those in `mt.gvcf_info` in + case of a name clash. + - If `treat_fields_as_allele_specific` is False, it's expected that there is a + single value for each entry field to be aggregated. Then when performing the + aggregation per global alternate allele, that value is included in the + aggregation if the global allele is present in the entry's list of local + alleles. If `treat_fields_as_allele_specific` is True, it's expected that + each entry field to be aggregated has one value per local allele, and each + of those is mapped to a global allele for aggregation. + + :param mt: Input Matrix Table + :param sum_agg_fields: Fields to aggregate using sum. + :param int32_sum_agg_fields: Fields to aggregate using sum and cast to int32. + :param median_agg_fields: Fields to aggregate using (approximate) median. + :param array_sum_agg_fields: Fields to aggregate using array sum. + :param alt_alleles_range_array_field: Annotation containing an array of the range + of alternate alleles e.g., `hl.range(1, hl.len(mt.alleles))` + :param treat_fields_as_allele_specific: Treat info fields as allele-specific. + Defaults to False. + :return: Expression containing the AS info fields + """ + if "DP" in list(sum_agg_fields) + list(int32_sum_agg_fields): + logger.warning( + "`DP` was included in allele-specific aggregation, however `DP` is" + " typically not aggregated by allele; `VarDP` is. Note that the resulting" + " `AS_DP` field will NOT include reference genotypes." 
+ ) + + agg_expr = _get_info_agg_expr( + mt=mt, + sum_agg_fields=sum_agg_fields, + int32_sum_agg_fields=int32_sum_agg_fields, + median_agg_fields=median_agg_fields, + array_sum_agg_fields=array_sum_agg_fields, + prefix="" if treat_fields_as_allele_specific else "AS_", + treat_fields_as_allele_specific=treat_fields_as_allele_specific, + ) + + if alt_alleles_range_array_field not in mt.row or mt[ + alt_alleles_range_array_field + ].dtype != hl.dtype("array<int32>"): + msg = ( + f"'get_as_info_expr' expected a row field '{alt_alleles_range_array_field}'" + " of type array<int32>" + ) + logger.error(msg) + raise ValueError(msg) + + if not treat_fields_as_allele_specific: + # Modify aggregations to aggregate per allele + agg_expr = { + f: hl.agg.array_agg( + lambda ai: hl.agg.filter(mt.LA.contains(ai), expr), + mt[alt_alleles_range_array_field], + ) + for f, expr in agg_expr.items() + } + + # Run aggregations + info = hl.struct(**agg_expr) + + # Add FS and SOR if SB is present. + if "AS_SB_TABLE" in info or "AS_SB" in info: + drop = [] + # Rename AS_SB to AS_SB_TABLE if present and add SB Ax2 aggregation logic. + if "AS_SB" in agg_expr: + if "AS_SB_TABLE" in agg_expr: + logger.warning( + "Both `AS_SB` and `AS_SB_TABLE` were specified for aggregation." + " `AS_SB` will be used for aggregation." + ) + as_sb_table = hl.array( + [ + info.AS_SB.filter(lambda x: hl.is_defined(x)).fold( + lambda i, j: i[:2] + j[:2], [0, 0] + ) # ref + ] + ).extend( + info.AS_SB.map(lambda x: x[2:]) # each alt + ) + drop = ["AS_SB"] + else: + as_sb_table = info.AS_SB_TABLE + info = info.annotate( + AS_SB_TABLE=as_sb_table, + AS_FS=hl.range(1, hl.len(mt.alleles)).map( + lambda i: fs_from_sb(as_sb_table[0].extend(as_sb_table[i])) + ), + AS_SOR=hl.range(1, hl.len(mt.alleles)).map( + lambda i: sor_from_sb(as_sb_table[0].extend(as_sb_table[i])) + ), + ).drop(*drop) + + return info
+ + +
[docs]def get_site_info_expr( + mt: hl.MatrixTable, + sum_agg_fields: Union[ + List[str], Dict[str, hl.expr.NumericExpression] + ] = INFO_AGG_FIELDS["sum_agg_fields"], + int32_sum_agg_fields: Union[ + List[str], Dict[str, hl.expr.NumericExpression] + ] = INFO_AGG_FIELDS["int32_sum_agg_fields"], + median_agg_fields: Union[ + List[str], Dict[str, hl.expr.NumericExpression] + ] = INFO_AGG_FIELDS["median_agg_fields"], + array_sum_agg_fields: Union[ + List[str], Dict[str, hl.expr.ArrayNumericExpression] + ] = INFO_AGG_FIELDS["array_sum_agg_fields"], +) -> hl.expr.StructExpression: + """ + Create a site-level annotation Struct aggregating typical VCF INFO fields from GVCF INFO fields stored in the MT entries. + + .. note:: + + - If `RAW_MQandDP` is specified in array_sum_agg_fields, it will be used for + the `MQ` calculation and then dropped according to GATK recommendation. + - If `RAW_MQ` and `MQ_DP` are given, they will be used for the `MQ` calculation + and then dropped according to GATK recommendation. + - If the fields to be aggregated (`sum_agg_fields`, `int32_sum_agg_fields`, + `median_agg_fields`) are passed as list of str, then they should correspond + to entry fields in `mt` or in `mt.gvcf_info`. + - Priority is given to entry fields in `mt` over those in `mt.gvcf_info` in + case of a name clash. + + :param mt: Input Matrix Table + :param sum_agg_fields: Fields to aggregate using sum. + :param int32_sum_agg_fields: Fields to aggregate using sum and cast to int32. + :param median_agg_fields: Fields to aggregate using (approximate) median. + :param array_sum_agg_fields: Fields to aggregate using element-wise summing over an + array. + :return: Expression containing the site-level info fields + """ + if "DP" in list(sum_agg_fields) + list(int32_sum_agg_fields): + logger.warning( + "`DP` was included in site-level aggregation. This requires densification" + " prior to running get_site_info_expr" + ) + + agg_expr = _get_info_agg_expr( + mt=mt, + sum_agg_fields=sum_agg_fields, + int32_sum_agg_fields=int32_sum_agg_fields, + median_agg_fields=median_agg_fields, + array_sum_agg_fields=array_sum_agg_fields, + ) + + # Add FS and SOR if SB is present + # This is done outside _get_info_agg_expr as the behavior is different + # in site vs allele-specific versions + if "SB" in agg_expr: + agg_expr["FS"] = fs_from_sb(agg_expr["SB"]) + agg_expr["SOR"] = sor_from_sb(agg_expr["SB"]) + + # Run aggregator on non-ref genotypes + info = hl.agg.filter( + mt.LGT.is_non_ref(), + hl.struct(**{k: v for k, v in agg_expr.items() if k != "DP"}), + ) + + # Add DP, computed over both ref and non-ref genotypes, if present + if "DP" in agg_expr: + info = info.annotate(DP=agg_expr["DP"]) + + return info
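The site-level counterpart can be applied to the same sparse MT (a sketch; note the densification requirement if `DP` is aggregated):

.. code-block:: python

    site_info_ht = mt.annotate_rows(site_info=get_site_info_expr(mt)).rows()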
+ + +
[docs]def default_compute_info( + mt: hl.MatrixTable, + site_annotations: bool = False, + as_annotations: bool = False, + # Set to True by default to prevent a breaking change. + quasi_as_annotations: bool = True, + n_partitions: Optional[int] = 5000, + lowqual_indel_phred_het_prior: int = 40, + ac_filter_groups: Optional[Dict[str, hl.Expression]] = None, +) -> hl.Table: + """ + Compute a HT with the typical GATK allele-specific (AS) info fields as well as ACs and lowqual fields. + + .. note:: + + - This table doesn't split multi-allelic sites. + - At least one of `site_annotations`, `as_annotations` or `quasi_as_annotations` + must be True. + + :param mt: Input MatrixTable. Note that this table should be filtered to nonref sites. + :param site_annotations: Whether to generate site level info fields. Default is False. + :param as_annotations: Whether to generate allele-specific info fields using + allele-specific annotations in gvcf_info. Default is False. + :param quasi_as_annotations: Whether to generate allele-specific info fields using + non-allele-specific annotations in gvcf_info, but performing per allele + aggregations. This method can be used in cases where genotype data doesn't + contain allele-specific annotations to approximate allele-specific annotations. + Default is True. + :param n_partitions: Optional number of desired partitions for output Table. If + specified, naive_coalesce is performed. Default is 5000. + :param lowqual_indel_phred_het_prior: Phred-scaled prior for a het genotype at a + site with a low quality indel. Default is 40. We use 1/10k bases (phred=40) to + be more consistent with the filtering used by Broad's Data Sciences Platform + for VQSR. + :param ac_filter_groups: Optional dictionary of sample filter expressions to compute + additional groupings of ACs. Default is None. + :return: Table with info fields + :rtype: Table + """ + if not site_annotations and not as_annotations and not quasi_as_annotations: + raise ValueError( + "At least one of `site_annotations`, `as_annotations`, or " + "`quasi_as_annotations` must be True!" + ) + + # Add a temporary annotation for allele count groupings. + ac_filter_groups = {"": True, **(ac_filter_groups or {})} + mt = mt.annotate_cols(_ac_filter_groups=ac_filter_groups) + + # Move gvcf info entries out from nested struct. + mt = mt.transmute_entries(**mt.gvcf_info) + + # Adding alt_alleles_range_array as a required annotation for + # get_as_info_expr to reduce memory usage. + mt = mt.annotate_rows(alt_alleles_range_array=hl.range(1, hl.len(mt.alleles))) + + info_expr = None + quasi_info_expr = None + + # Compute quasi-AS info expr. + if quasi_as_annotations: + info_expr = get_as_info_expr(mt) + + # Compute AS info expr using gvcf_info allele specific annotations. + if as_annotations: + if info_expr is not None: + quasi_info_expr = info_expr + info_expr = get_as_info_expr( + mt, + **AS_INFO_AGG_FIELDS, + treat_fields_as_allele_specific=True, + ) + + if info_expr is not None: + # Add allele specific pab_max + info_expr = info_expr.annotate( + AS_pab_max=pab_max_expr(mt.LGT, mt.LAD, mt.LA, hl.len(mt.alleles)) + ) + + if site_annotations: + site_expr = get_site_info_expr(mt) + if info_expr is None: + info_expr = site_expr + else: + info_expr = info_expr.annotate(**site_expr) + + # Add 'AC' and 'AC_raw' for each allele count filter group requested. + # First compute ACs for each non-ref allele, grouped by adj. 
+ grp_ac_expr = { + f: hl.agg.array_agg( + lambda ai: hl.agg.filter( + mt.LA.contains(ai) & mt._ac_filter_groups[f], + hl.agg.group_by( + get_adj_expr(mt.LGT, mt.GQ, mt.DP, mt.LAD), + hl.agg.sum( + mt.LGT.one_hot_alleles(mt.LA.map(lambda x: hl.str(x)))[ + mt.LA.index(ai) + ] + ), + ), + ), + mt.alt_alleles_range_array, + ) + for f in ac_filter_groups + } + + # Then, for each non-ref allele, compute + # 'AC' as the adj group + # 'AC_raw' as the sum of adj and non-adj groups + info_expr = info_expr.annotate( + **{ + f"AC{'_' + f if f else f}_raw": grp.map( + lambda i: hl.int32(i.get(True, 0) + i.get(False, 0)) + ) + for f, grp in grp_ac_expr.items() + }, + **{ + f"AC{'_' + f if f else f}": grp.map(lambda i: hl.int32(i.get(True, 0))) + for f, grp in grp_ac_expr.items() + }, + ) + + ann_expr = {"info": info_expr} + if quasi_info_expr is not None: + ann_expr["quasi_info"] = quasi_info_expr + + info_ht = mt.select_rows(**ann_expr).rows() + + # Add AS lowqual flag + info_ht = info_ht.annotate( + AS_lowqual=get_lowqual_expr( + info_ht.alleles, + info_ht.info.AS_QUALapprox, + indel_phred_het_prior=lowqual_indel_phred_het_prior, + ) + ) + + if site_annotations: + # Add lowqual flag + info_ht = info_ht.annotate( + lowqual=get_lowqual_expr( + info_ht.alleles, + info_ht.info.QUALapprox, + indel_phred_het_prior=lowqual_indel_phred_het_prior, + ) + ) + + if n_partitions is not None: + info_ht = info_ht.naive_coalesce(n_partitions) + + return info_ht
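In practice the wrapper above is usually all that's needed (a hedged sketch on a sparse MT filtered to non-ref sites):

.. code-block:: python

    info_ht = default_compute_info(mt, site_annotations=True, n_partitions=1000)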
+ + +
[docs]def split_info_annotation( + info_expr: hl.expr.StructExpression, a_index: hl.expr.Int32Expression +) -> hl.expr.StructExpression: + """ + Split multi-allelic allele-specific info fields. + + :param info_expr: Field containing info struct. + :param a_index: Allele index. Output by hl.split_multi or hl.split_multi_hts. + :return: Info struct with split annotations. + """ + # Index AS annotations + info_expr = info_expr.annotate( + **{ + f: info_expr[f][a_index - 1] + for f in info_expr + if f.startswith("AC") or (f.startswith("AS_") and not f == "AS_SB_TABLE") + } + ) + if "AS_SB_TABLE" in info_expr: + info_expr = info_expr.annotate( + AS_SB_TABLE=info_expr.AS_SB_TABLE[0].extend(info_expr.AS_SB_TABLE[a_index]) + ) + + return info_expr
+ + +
[docs]def split_lowqual_annotation( + lowqual_expr: hl.expr.ArrayExpression, a_index: hl.expr.Int32Expression +) -> hl.expr.BooleanExpression: + """ + Split multi-allelic low QUAL annotation. + + :param lowqual_expr: Field containing low QUAL annotation. + :param a_index: Allele index. Output by hl.split_multi or hl.split_multi_hts. + :return: Low QUAL expression for particular allele. + """ + return lowqual_expr[a_index - 1]
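A sketch of splitting an info Table produced by `default_compute_info` (assumes `info_ht` carries `info` and `AS_lowqual` annotations):

.. code-block:: python

    import hail as hl

    # Split multi-allelics, then index the AS annotations by allele.
    split_ht = hl.split_multi(info_ht)
    split_ht = split_ht.annotate(
        info=split_info_annotation(split_ht.info, split_ht.a_index),
        AS_lowqual=split_lowqual_annotation(split_ht.AS_lowqual, split_ht.a_index),
    )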
+ + +
[docs]def impute_sex_ploidy( + mt: hl.MatrixTable, + excluded_calling_intervals: Optional[hl.Table] = None, + included_calling_intervals: Optional[hl.Table] = None, + normalization_contig: str = "chr20", + chr_x: Optional[str] = None, + chr_y: Optional[str] = None, + use_only_variants: bool = False, +) -> hl.Table: + """ + Impute sex ploidy from a sparse MatrixTable. + + Sex ploidy is imputed by normalizing the coverage of chromosomes X and Y using the coverage of an autosomal + chromosome (by default chr20). + + Coverage is computed using the median block coverage (summed over the block size) and the non-ref coverage at + non-ref genotypes unless the `use_only_variants` argument is set to True and then it will use the mean coverage + defined by only the variants. + + :param mt: Input sparse Matrix Table + :param excluded_calling_intervals: Optional table of intervals to exclude from the computation. Used only when + determining contig size (not used when computing chromosome depth) when `use_only_variants` is False. + :param included_calling_intervals: Optional table of intervals to use in the computation. Used only when + determining contig size (not used when computing chromosome depth) when `use_only_variants` is False. + :param normalization_contig: Which chromosome to normalize by + :param chr_x: Optional X Chromosome contig name (by default uses the X contig in the reference) + :param chr_y: Optional Y Chromosome contig name (by default uses the Y contig in the reference) + :param use_only_variants: Whether to use depth of variant data within calling intervals instead of reference data. + Default will only use reference data. + + :return: Table with mean coverage over chromosomes 20, X and Y and sex chromosomes ploidy based on normalized coverage. + """ + ref = get_reference_genome(mt.locus, add_sequence=True) + if chr_x is None: + if len(ref.x_contigs) != 1: + raise NotImplementedError( + "Found {0} X chromosome contigs ({1}) in Genome reference." + " sparse_impute_sex_ploidy currently only supports a single X" + " chromosome contig. Please use the `chr_x` argument to specify which" + " X chromosome contig to use ".format( + len(ref.x_contigs), ",".join(ref.x_contigs) + ) + ) + chr_x = ref.x_contigs[0] + if chr_y is None: + if len(ref.y_contigs) != 1: + raise NotImplementedError( + "Found {0} Y chromosome contigs ({1}) in Genome reference." + " sparse_impute_sex_ploidy currently only supports a single Y" + " chromosome contig. Please use the `chr_y` argument to specify which" + " Y chromosome contig to use ".format( + len(ref.y_contigs), ",".join(ref.y_contigs) + ) + ) + chr_y = ref.y_contigs[0] + + def get_contig_size(contig: str) -> int: + """ + Compute the size of the specified `contig` using the median block coverage (summed over the block size). + + The size of the contig will be determined using only non par regions if the contig is an X or Y reference contig + and using the intervals specified by `included_calling_intervals` and excluding intervals specified by + `excluded_calling_intervals` if either is defined in the outer function. 
+ + :param contig: Contig to compute the size of + :return: Integer of the contig size + """ + logger.info("Working on %s", contig) + contig_ht = hl.utils.range_table( + ref.contig_length(contig), + n_partitions=int(ref.contig_length(contig) / 500_000), + ) + contig_ht = contig_ht.annotate( + locus=hl.locus(contig=contig, pos=contig_ht.idx + 1, reference_genome=ref) + ) + contig_ht = contig_ht.filter(contig_ht.locus.sequence_context().lower() != "n") + + if contig in ref.x_contigs: + contig_ht = contig_ht.filter(contig_ht.locus.in_x_nonpar()) + if contig in ref.y_contigs: + contig_ht = contig_ht.filter(contig_ht.locus.in_y_nonpar()) + + contig_ht = contig_ht.key_by("locus") + if included_calling_intervals is not None: + contig_ht = contig_ht.filter( + hl.is_defined(included_calling_intervals[contig_ht.key]) + ) + if excluded_calling_intervals is not None: + contig_ht = contig_ht.filter( + hl.is_missing(excluded_calling_intervals[contig_ht.key]) + ) + contig_size = contig_ht.count() + logger.info("Contig %s has %d bases for coverage.", contig, contig_size) + return contig_size + + def get_chr_dp_ann(chrom: str) -> hl.Table: + """ + Compute the mean depth of the specified chromosome. + + The total depth will be determined using the sum DP of either reference and variant data or only variant data + depending on the value of `use_only_variants` in the outer function. + + If `use_only_variants` is set to False then this value is computed using the median block coverage (summed over + the block size). If `use_only_variants` is set to True, this value is computed using the sum of DP for all + variants divided by the total number of variants. + + The depth calculations will be determined using only non par regions if the contig is an X or Y reference contig + and using the intervals specified by `included_calling_intervals` and excluding intervals specified by + `excluded_calling_intervals` if either is defined in the outer function (when `use_only_variants` is not + set this only applies to the contig size estimate and is not used when computing chromosome depth). 
+ + :param chrom: Chromosome to compute the mean depth of + :return: Table of a per sample mean depth of `chrom` + """ + contig_size = get_contig_size(chrom) + chr_mt = hl.filter_intervals(mt, [hl.parse_locus_interval(chrom)]) + + if chrom in ref.x_contigs: + chr_mt = chr_mt.filter_rows(chr_mt.locus.in_x_nonpar()) + if chrom in ref.y_contigs: + chr_mt = chr_mt.filter_rows(chr_mt.locus.in_y_nonpar()) + + if use_only_variants: + if included_calling_intervals is not None: + chr_mt = chr_mt.filter_rows( + hl.is_defined(included_calling_intervals[chr_mt.locus]) + ) + if excluded_calling_intervals is not None: + chr_mt = chr_mt.filter_rows( + hl.is_missing(excluded_calling_intervals[chr_mt.locus]) + ) + return chr_mt.select_cols( + **{ + f"{chrom}_mean_dp": hl.agg.filter( + chr_mt.LGT.is_non_ref(), + hl.agg.sum(chr_mt.DP), + ) + / hl.agg.filter(chr_mt.LGT.is_non_ref(), hl.agg.count()) + } + ).cols() + else: + return chr_mt.select_cols( + **{ + f"{chrom}_mean_dp": ( + hl.agg.sum( + hl.if_else( + chr_mt.LGT.is_hom_ref(), + chr_mt.DP * (1 + chr_mt.END - chr_mt.locus.position), + chr_mt.DP, + ) + ) + / contig_size + ) + } + ).cols() + + normalization_chrom_dp = get_chr_dp_ann(normalization_contig) + chrX_dp = get_chr_dp_ann(chr_x) + chrY_dp = get_chr_dp_ann(chr_y) + + ht = normalization_chrom_dp.annotate( + **chrX_dp[normalization_chrom_dp.key], + **chrY_dp[normalization_chrom_dp.key], + ) + + return ht.annotate( + **{ + f"{chr_x}_ploidy": ht[f"{chr_x}_mean_dp"] + / (ht[f"{normalization_contig}_mean_dp"] / 2), + f"{chr_y}_ploidy": ht[f"{chr_y}_mean_dp"] + / (ht[f"{normalization_contig}_mean_dp"] / 2), + } + )
+ + +
[docs]def densify_all_reference_sites( + mtds: Union[hl.MatrixTable, hl.vds.VariantDataset], + reference_ht: hl.Table, + interval_ht: Optional[hl.Table] = None, + row_key_fields: Union[Tuple[str], List[str], Set[str]] = ("locus",), + entry_keep_fields: Union[Tuple[str], List[str], Set[str]] = ("GT",), +) -> hl.MatrixTable: + """ + Densify a VariantDataset or Sparse MatrixTable at all sites in a reference Table. + + :param mtds: Input sparse MatrixTable or VariantDataset. + :param reference_ht: Table of reference sites. + :param interval_ht: Optional Table of intervals to filter to. + :param row_key_fields: Fields to use as row key. Defaults to locus. + :param entry_keep_fields: Fields to keep in entries before performing the + densification. Defaults to GT. + :return: Densified MatrixTable. + """ + is_vds = isinstance(mtds, hl.vds.VariantDataset) + + if interval_ht is not None and not is_vds: + raise NotImplementedError( + "Filtering to an interval list for a sparse Matrix Table is currently" + " not supported." + ) + + # Filter datasets to interval list. + if interval_ht is not None: + reference_ht = reference_ht.filter( + hl.is_defined(interval_ht[reference_ht.locus]) + ) + mtds = hl.vds.filter_intervals( + vds=mtds, intervals=interval_ht, split_reference_blocks=False + ) + + entry_keep_fields = set(entry_keep_fields) + if is_vds: + mt = mtds.variant_data + else: + mt = mtds + entry_keep_fields.add("END") + + # Get the total number of samples. + n_samples = mt.count_cols() + mt_col_key_fields = list(mt.col_key) + mt_row_key_fields = list(mt.row_key) + ht = mt.select_entries(*entry_keep_fields).select_cols() + + # Localize entries and perform an outer join with the reference HT. + ht = ht._localize_entries("__entries", "__cols") + ht = ht.key_by(*row_key_fields) + ht = ht.join(reference_ht.key_by(*row_key_fields).select(_in_ref=True), how="outer") + ht = ht.key_by(*mt_row_key_fields) + + # Fill in missing entries with missing values for each entry field. + ht = ht.annotate( + __entries=hl.or_else( + ht.__entries, + hl.range(n_samples).map( + lambda x: hl.missing(ht.__entries.dtype.element_type) + ), + ) + ) + + # Unlocalize entries to turn the HT back to a MT. + mt = ht._unlocalize_entries("__entries", "__cols", mt_col_key_fields) + + # Densify VDS/sparse MT at all sites. + if is_vds: + mt = hl.vds.to_dense_mt( + hl.vds.VariantDataset(mtds.reference_data.select_cols().select_rows(), mt) + ) + else: + mt = hl.experimental.densify(mt) + + # Remove rows where the reference is missing. + mt = mt.filter_rows(mt._in_ref) + + # Unfilter entries so that entries with no ref block overlap aren't null. + mt = mt.unfilter_entries() + + # Rekey by requested row key field and drop unused keys. + mt = mt.key_rows_by(*row_key_fields) + mt = mt.drop(*[k for k in mt_row_key_fields if k not in row_key_fields]) + + return mt
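A sketch for a VDS (assumes the variant data carries `LGT` entries):

.. code-block:: python

    dense_mt = densify_all_reference_sites(
        vds,  # hl.vds.VariantDataset
        reference_ht,  # Table of reference sites keyed by locus
        entry_keep_fields=("LGT",),
    )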
+ + +
[docs]def compute_stats_per_ref_site( + mtds: Union[hl.MatrixTable, hl.vds.VariantDataset], + reference_ht: hl.Table, + entry_agg_funcs: Dict[str, Tuple[Callable, Callable]], + row_key_fields: Union[Tuple[str], List[str]] = ("locus",), + interval_ht: Optional[hl.Table] = None, + entry_keep_fields: Union[Tuple[str], List[str], Set[str]] = None, + row_keep_fields: Union[Tuple[str], List[str], Set[str]] = None, + entry_agg_group_membership: Optional[Dict[str, List[dict[str, str]]]] = None, + strata_expr: Optional[List[Dict[str, hl.expr.StringExpression]]] = None, + group_membership_ht: Optional[hl.Table] = None, + sex_karyotype_field: Optional[str] = None, +) -> hl.Table: + """ + Compute stats per site in a reference Table. + + :param mtds: Input sparse Matrix Table or VariantDataset. + :param reference_ht: Table of reference sites. + :param entry_agg_funcs: Dict of entry aggregation functions to perform on the + VariantDataset/MatrixTable. The keys of the dict are the names of the + annotations and the values are tuples of functions. The first function is used + to transform the `mt` entries in some way, and the second function is used to + aggregate the output from the first function. + :param row_key_fields: Fields to use as row key. Defaults to locus. + :param interval_ht: Optional table of intervals to filter to. + :param entry_keep_fields: Fields to keep in entries before performing the + densification in `densify_all_reference_sites`. Should include any fields + needed for the functions in `entry_agg_funcs`. By default, only GT or LGT is + kept. + :param row_keep_fields: Fields to keep in rows after performing the stats + aggregation. By default, only the row key fields are kept. + :param entry_agg_group_membership: Optional dict indicating the subset of group + strata in 'freq_meta' to use the entry aggregation functions on. The keys of + the dict can be any of the keys in `entry_agg_funcs` and the values are lists + of dicts. Each dict in the list contains the strata in 'freq_meta' to use for + the corresponding entry aggregation function. If provided, 'freq_meta' must be + present in `group_membership_ht` and represent the same strata as those in + 'group_membership'. If not provided, all entries of the 'group_membership' + annotation will have the entry aggregation functions applied to them. + :param strata_expr: Optional list of dicts of expressions to stratify by. + :param group_membership_ht: Optional Table of group membership annotations. + :param sex_karyotype_field: Optional field to use to adjust genotypes for sex + karyotype before stats aggregation. If provided, the field must be present in + the columns of `mtds` (variant_data MT if `mtds` is a VDS) and use "XX" and + "XY" as values. If not provided, no sex karyotype adjustment is performed. + Default is None. + :return: Table of stats per site. + """ + is_vds = isinstance(mtds, hl.vds.VariantDataset) + if is_vds: + mt = mtds.variant_data + else: + mt = mtds + + if sex_karyotype_field is not None and sex_karyotype_field not in mt.col: + raise ValueError( + f"The supplied 'sex_karyotype_field', {sex_karyotype_field}, is not present" + " in the columns of the input!" + ) + + if group_membership_ht is not None and strata_expr is not None: + raise ValueError( + "Only one of 'group_membership_ht' or 'strata_expr' can be specified." 
+ ) + + g = {} if group_membership_ht is None else group_membership_ht.globals + if entry_agg_group_membership is not None and "freq_meta" not in g: + raise ValueError( + "The 'freq_meta' annotation must be present in 'group_membership_ht' if " + "'entry_agg_group_membership' is specified." + ) + + # Determine if the adj annotation is needed. It is only needed if "adj_groups" is + # in the globals of the group_membership_ht and any entry is True, or "freq_meta" + # is in the globals of the group_membership_ht and any entry has "group" == "adj". + adj = hl.eval( + hl.any(g.get("adj_groups", hl.empty_array("bool"))) + | hl.any( + g.get("freq_meta", hl.empty_array("dict<str, str>")).map( + lambda x: x.get("group", "NA") == "adj" + ) + ) + ) + + # Determine the entry fields on mt that should be densified. + # "GT" or "LGT" is required for the genotype. + # If the adj annotation is needed then "adj" must be present on mt, or AD/LAD, DP, + # and GQ must be present. + en = set(mt.entry) + gt_field = en & {"GT"} or en & {"LGT"} + ad_field = en & {"AD"} or en & {"LAD"} + adj_fields = en & {"adj"} or ({"DP", "GQ"} | ad_field) if adj else set([]) + + if not gt_field: + raise ValueError("No genotype field found in entry fields!") + + if adj and not adj_fields.issubset(en): + raise ValueError( + "No 'adj' found in entry fields, and one of AD/LAD, DP, and GQ is missing " + "so adj can't be computed!" + ) + + entry_keep_fields = set(entry_keep_fields or set([])) | gt_field | adj_fields + + # Write the sex karyotype field out to a temp HT so we can annotate the field back + # onto the MT after 'densify_all_reference_sites' removes all column annotations. + if sex_karyotype_field is not None: + sex_karyotype_ht = ( + mt.cols() + .select(sex_karyotype_field) + .checkpoint(hl.utils.new_temp_file("sex_karyotype_ht", "ht")) + ) + else: + sex_karyotype_ht = None + + # Initialize no_strata and default strata_expr if neither group_membership_ht nor + # strata_expr is provided. + no_strata = group_membership_ht is None and strata_expr is None + if no_strata: + strata_expr = {} + + if group_membership_ht is None: + logger.warning( + "'group_membership_ht' is not specified, no stats are adj filtered." + ) + + # Annotate the MT cols with each of the expressions in strata_expr and redefine + # strata_expr based on the column HT with added annotations. + ht = mt.annotate_cols( + **{k: v for d in strata_expr for k, v in d.items()} + ).cols() + strata_expr = [{k: ht[k] for k in d} for d in strata_expr] + + # Use 'generate_freq_group_membership_array' to create a group_membership Table + # that gives stratification group membership info based on 'strata_expr'. The + # returned Table has the following annotations: 'freq_meta', + # 'freq_meta_sample_count', and 'group_membership'. By default, this + # function returns annotations where the second element is a placeholder for the + # "raw" frequency of all samples, where the first 2 elements are the same sample + # set, but 'freq_meta' starts with [{"group": "adj", "group": "raw", ...]. Use + # `no_raw_group` to exclude the "raw" group so there is a single annotation + # representing the full samples set. Update all 'freq_meta' entries' "group" + # to "raw" because `generate_freq_group_membership_array` will return them all + # as "adj" since it was built for frequency computation, but for the coverage + # computation we don't want to do any filtering. 
+ group_membership_ht = generate_freq_group_membership_array( + ht, strata_expr, no_raw_group=True + ) + group_membership_ht = group_membership_ht.annotate_globals( + freq_meta=group_membership_ht.freq_meta.map( + lambda x: hl.dict( + x.items().map( + lambda m: hl.if_else(m[0] == "group", ("group", "raw"), m) + ) + ) + ) + ) + + if is_vds: + rmt = mtds.reference_data + mtds = hl.vds.VariantDataset( + rmt.select_entries(*((set(entry_keep_fields) & set(rmt.entry)) | {"END"})), + mtds.variant_data, + ) + + mt = densify_all_reference_sites( + mtds, + reference_ht, + interval_ht, + row_key_fields, + entry_keep_fields=entry_keep_fields, + ) + + if sex_karyotype_ht is not None: + logger.info("Adjusting genotype ploidy based on sex karyotype.") + gt_field = gt_field.pop() + mt = mt.annotate_cols( + sex_karyotype=sex_karyotype_ht[mt.col_key][sex_karyotype_field] + ) + mt = mt.annotate_entries( + **{ + gt_field: adjusted_sex_ploidy_expr( + mt.locus, mt[gt_field], mt.sex_karyotype + ) + } + ) + + # Annotate with adj if needed. + if adj and "adj" not in mt.entry: + logger.info("Annotating the MT with adj.") + mt = annotate_adj(mt) + + ht = agg_by_strata( + mt, + entry_agg_funcs, + group_membership_ht=group_membership_ht, + select_fields=row_keep_fields, + entry_agg_group_membership=entry_agg_group_membership, + ) + ht = ht.select_globals().checkpoint(hl.utils.new_temp_file("agg_stats", "ht")) + + group_globals = group_membership_ht.index_globals() + global_expr = {} + if no_strata: + # If there was no stratification, move aggregated annotations to the top + # level. + ht = ht.select(**{ann: ht[ann][0] for ann in entry_agg_funcs}) + global_expr["sample_count"] = group_globals.freq_meta_sample_count[0] + else: + # If there was stratification, add the metadata and sample count info for the + # stratification to the globals. + global_expr["strata_meta"] = group_globals.freq_meta + global_expr["strata_sample_count"] = group_globals.freq_meta_sample_count + + ht = ht.annotate_globals(**global_expr) + + return ht
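A sketch of the `entry_agg_funcs` contract, computing an allele number (AN) per reference site (assumes a `GT` entry field; see also `get_allele_number_agg_func` below):

.. code-block:: python

    import hail as hl

    # Each value is a (transform, aggregate) pair: the first function maps an
    # entry to a value, the second aggregates those values per site.
    entry_agg_funcs = {"AN": (lambda t: t.GT.ploidy, hl.agg.sum)}

    an_ht = compute_stats_per_ref_site(vds, reference_ht, entry_agg_funcs)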
+ + +
[docs]def compute_coverage_stats( + mtds: Union[hl.MatrixTable, hl.vds.VariantDataset], + reference_ht: hl.Table, + interval_ht: Optional[hl.Table] = None, + coverage_over_x_bins: List[int] = [1, 5, 10, 15, 20, 25, 30, 50, 100], + row_key_fields: List[str] = ["locus"], + strata_expr: Optional[List[Dict[str, hl.expr.StringExpression]]] = None, + group_membership_ht: Optional[hl.Table] = None, +) -> hl.Table: + """ + Compute coverage statistics for every base of the `reference_ht` provided. + + The following coverage stats are calculated: + - mean + - median + - total DP + - fraction of samples with coverage above X, for each x in `coverage_over_x_bins` + + The `reference_ht` is a Table that contains a row for each locus coverage that should be + computed on. It needs to be keyed by `locus`. The `reference_ht` can e.g. be + created using `get_reference_ht`. + + :param mtds: Input sparse MT or VDS + :param reference_ht: Input reference HT + :param interval_ht: Optional Table containing intervals to filter to + :param coverage_over_x_bins: List of boundaries for computing samples over X + :param row_key_fields: List of row key fields to use for joining `mtds` with + `reference_ht` + :param strata_expr: Optional list of dicts containing expressions to stratify the + coverage stats by. Only one of `group_membership_ht` or `strata_expr` can be + specified. + :param group_membership_ht: Optional Table containing group membership annotations + to stratify the coverage stats by. Only one of `group_membership_ht` or + `strata_expr` can be specified. + :return: Table with per-base coverage stats. + """ + is_vds = isinstance(mtds, hl.vds.VariantDataset) + if is_vds: + mt = mtds.variant_data + else: + mt = mtds + + # Determine the genotype field. + en = set(mt.entry) + gt_field = en & {"GT"} or en & {"LGT"} + if not gt_field: + raise ValueError("No genotype field found in entry fields!") + + gt_field = gt_field.pop() + + # Add function to compute coverage stats. + cov_bins = sorted(coverage_over_x_bins) + rev_cov_bins = list(reversed(cov_bins)) + max_cov_bin = cov_bins[-1] + cov_bins = hl.array(cov_bins) + entry_agg_funcs = { + "coverage_stats": ( + lambda t: hl.if_else(hl.is_missing(t.DP) | hl.is_nan(t.DP), 0, t.DP), + lambda dp: hl.struct( + # This expression creates a counter DP -> number of samples for DP + # between 0 and max_cov_bin. + coverage_counter=hl.agg.counter(hl.min(max_cov_bin, dp)), + mean=hl.if_else(hl.is_nan(hl.agg.mean(dp)), 0, hl.agg.mean(dp)), + median_approx=hl.or_else(hl.agg.approx_median(dp), 0), + total_DP=hl.agg.sum(dp), + ), + ) + } + + ht = compute_stats_per_ref_site( + mtds, + reference_ht, + entry_agg_funcs, + row_key_fields=row_key_fields, + interval_ht=interval_ht, + entry_keep_fields=[gt_field, "DP"], + strata_expr=strata_expr, + group_membership_ht=group_membership_ht, + ) + + # This expression aggregates the DP counter in reverse order of the cov_bins and + # computes the cumulative sum over them. It needs to be in reverse order because we + # want the sum over samples covered by > X. + def _cov_stats( + cov_stat: hl.expr.StructExpression, n: hl.expr.Int32Expression + ) -> hl.expr.StructExpression: + # The coverage was already floored to the max_coverage_bin, so no more + # aggregation is needed for the max bin. + count_expr = cov_stat.coverage_counter + max_bin_expr = hl.int32(count_expr.get(max_cov_bin, 0)) + + # For each of the other bins, coverage is summed between the boundaries. 
+ bin_expr = hl.range(hl.len(cov_bins) - 1, 0, step=-1) + bin_expr = bin_expr.map( + lambda i: hl.sum( + hl.range(cov_bins[i - 1], cov_bins[i]).map( + lambda j: hl.int32(count_expr.get(j, 0)) + ) + ) + ) + bin_expr = hl.cumulative_sum(hl.array([max_bin_expr]).extend(bin_expr)) + + bin_expr = {f"over_{x}": bin_expr[i] / n for i, x in enumerate(rev_cov_bins)} + + return cov_stat.annotate(**bin_expr).drop("coverage_counter") + + ht_globals = ht.index_globals() + if isinstance(ht.coverage_stats, hl.expr.ArrayExpression): + ht = ht.select_globals( + coverage_stats_meta=ht_globals.strata_meta.map( + lambda x: hl.dict(x.items().filter(lambda m: m[0] != "group")) + ), + coverage_stats_meta_sample_count=ht_globals.strata_sample_count, + ) + cov_stats_expr = { + "coverage_stats": hl.map( + lambda c, n: _cov_stats(c, n), + ht.coverage_stats, + ht_globals.strata_sample_count, + ) + } + else: + cov_stats_expr = _cov_stats(ht.coverage_stats, ht_globals.sample_count) + + ht = ht.transmute(**cov_stats_expr) + + return ht
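For example, with `coverage_over_x_bins=[1, 5, 10]`, 'over_5' is the fraction of samples whose (binned) DP fell in [5, 10) plus those floored at the 10+ bin, which is exactly the reverse cumulative sum computed above. A minimal usage sketch (paths are hypothetical):

from gnomad.utils.sparse_mt import compute_coverage_stats  # module path assumed
import hail as hl

vds = hl.vds.read_vds("gs://my-bucket/callset.vds")  # hypothetical path
ref_ht = hl.read_table("gs://my-bucket/reference_sites.ht")  # hypothetical path

# Per-base coverage stats with a reduced set of depth bins; the result carries
# 'mean', 'median_approx', 'total_DP', and 'over_1'/'over_5'/'over_10'.
coverage_ht = compute_coverage_stats(vds, ref_ht, coverage_over_x_bins=[1, 5, 10])
coverage_ht.describe()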
+ + +
[docs]def get_allele_number_agg_func(gt_field: str = "GT") -> Tuple[Callable, Callable]: + """ + Get a transformation and aggregation function for computing the allele number. + + Can be used as an entry aggregation function in `compute_stats_per_ref_site`. + + :param gt_field: Genotype field to use for computing the allele number. + :return: Tuple of functions to transform and aggregate the allele number. + """ + return lambda t: t[gt_field].ploidy, hl.agg.sum
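The returned pair is meant to be dropped into the `entry_agg_funcs` argument of `compute_stats_per_ref_site`: the first element maps an entry to its genotype ploidy and the second sums those ploidies over samples. A sketch of the equivalence:

transform, aggregate = get_allele_number_agg_func("GT")

# Plugged into compute_stats_per_ref_site, this computes, per reference site,
# hl.agg.sum(mt.GT.ploidy): 2 per diploid call, 1 per haploid call.
entry_agg_funcs = {"AN": (transform, aggregate)}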
+ + +
[docs]def compute_allele_number_per_ref_site( + mtds: Union[hl.MatrixTable, hl.vds.VariantDataset], + reference_ht: hl.Table, + **kwargs, +) -> hl.Table: + """ + Compute the allele number per reference site. + + :param mtds: Input sparse Matrix Table or VariantDataset. + :param reference_ht: Table of reference sites. + :param kwargs: Keyword arguments to pass to `compute_stats_per_ref_site`. + :return: Table of allele number per reference site. + """ + if isinstance(mtds, hl.vds.VariantDataset): + mt = mtds.variant_data + else: + mt = mtds + + # Determine the genotype field. + en = set(mt.entry) + gt_field = en & {"GT"} or en & {"LGT"} + if not gt_field: + raise ValueError( + "No genotype field found in entry fields, needed for ploidy calculation!" + ) + + # Use ploidy to determine the number of alleles for each sample at each site. + entry_agg_funcs = {"AN": get_allele_number_agg_func(gt_field.pop())} + + return compute_stats_per_ref_site(mtds, reference_ht, entry_agg_funcs, **kwargs)
+ + +
[docs]def filter_ref_blocks( + t: Union[hl.MatrixTable, hl.Table], +) -> Union[hl.MatrixTable, hl.Table]: + """ + Filter ref blocks out of the Table or MatrixTable. + + :param t: Input MT/HT + :return: MT/HT with ref blocks removed + """ + if isinstance(t, hl.MatrixTable): + t = t.filter_rows((hl.len(t.alleles) > 1)) + else: + t = t.filter((hl.len(t.alleles) > 1)) + + return t
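A brief usage sketch (the path is hypothetical); in a sparse MatrixTable, reference blocks carry only the reference allele, so the `hl.len(t.alleles) > 1` test keeps just the variant rows:

from gnomad.utils.sparse_mt import filter_ref_blocks  # module path assumed
import hail as hl

sparse_mt = hl.read_matrix_table("gs://my-bucket/sparse.mt")  # hypothetical path
variants_mt = filter_ref_blocks(sparse_mt)

# The same helper also accepts a Table keyed by locus/alleles.
variant_sites_ht = filter_ref_blocks(sparse_mt.rows())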
\ No newline at end of file diff --git a/_modules/gnomad/utils/transcript_annotation.html new file mode 100644 index 000000000..1de7d08d4 --- /dev/null +++ b/_modules/gnomad/utils/transcript_annotation.html @@ -0,0 +1,527 @@ + gnomad.utils.transcript_annotation — gnomad master documentation

Source code for gnomad.utils.transcript_annotation

+"""Utils module containing generic functions that are useful for adding transcript expression-aware annotations."""
+
+import logging
+from typing import Callable, List, Optional, Tuple, Union
+
+import hail as hl
+
+from gnomad.utils.filtering import filter_to_gencode_cds
+from gnomad.utils.vep import (
+    CSQ_CODING,
+    CSQ_SPLICE,
+    explode_by_vep_annotation,
+    filter_vep_transcript_csqs,
+    process_consequences,
+)
+
+logging.basicConfig(
+    format="%(asctime)s (%(name)s %(lineno)s): %(message)s",
+    datefmt="%m/%d/%Y %I:%M:%S %p",
+)
+logger = logging.getLogger("transcript_annotation_utils")
+logger.setLevel(logging.INFO)
+
+
+
[docs]def summarize_transcript_expression( + mt: hl.MatrixTable, + transcript_expression_expr: Union[ + hl.expr.NumericExpression, str + ] = "transcript_tpm", + tissue_expr: Union[hl.expr.StringExpression, str] = "tissue", + summary_agg_func: Optional[Callable] = None, +) -> hl.Table: + """ + Summarize a transcript expression MatrixTable by transcript, gene, and tissue. + + The `summary_agg_func` argument allows the user to specify a Hail aggregation + function to use to summarize the expression by tissue. By default, the median is + used. + + The returned Table has a row annotation for each tissue containing a struct with the + summarized tissue expression value ('transcript_expression') and the proportion of + expression of transcript to gene per tissue ('expression_proportion'). + + Returned Table Schema example:: + + Row fields: + 'transcript_id': str + 'gene_id': str + 'tissue_1': struct { + transcript_expression: float64, + expression_proportion: float64 + } + 'tissue_2': struct { + transcript_expression: float64, + expression_proportion: float64 + } + + Key: ['transcript_id', 'gene_id'] + + :param mt: MatrixTable of transcript (rows) expression quantifications (entry) by + sample (columns). + :param transcript_expression_expr: Entry expression indicating transcript expression + quantification. Default is 'transcript_tpm'. + :param tissue_expr: Column expression indicating tissue type. Default is 'tissue'. + :param summary_agg_func: Optional aggregation function to use to summarize the + transcript expression quantification by tissue. Example: `hl.agg.mean`. Default + is None, which will use a median aggregation. + :return: A Table of summarized transcript expression by tissue. + """ + if summary_agg_func is None: + summary_agg_func = lambda x: hl.median(hl.agg.collect(x)) + + if isinstance(transcript_expression_expr, str): + transcript_expression_expr = mt[transcript_expression_expr] + + if isinstance(tissue_expr, str): + tissue_expr = mt[tissue_expr] + + mt = mt.group_cols_by(tissue=tissue_expr).aggregate( + tx=summary_agg_func(transcript_expression_expr) + ) + ht = mt.rename({"tx": ""}).make_table().key_by("transcript_id", "gene_id") + + # Annotate with the proportion of expression of transcript to gene per tissue. + ht = ht.annotate(expression_proportion=get_expression_proportion(ht)) + ht = ht.select( + **{ + t: hl.struct( + transcript_expression=ht[t], + expression_proportion=ht.expression_proportion[t], + ) + for t in ht.expression_proportion + } + ) + + return ht
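A usage sketch, assuming a GTEx-style MatrixTable keyed by 'transcript_id' and 'gene_id' with a 'transcript_tpm' entry field and a 'tissue' column field (the path is hypothetical):

from gnomad.utils.transcript_annotation import summarize_transcript_expression
import hail as hl

tx_mt = hl.read_matrix_table("gs://my-bucket/gtex_transcript_tpm.mt")

# Default summary: median TPM per tissue.
tx_ht = summarize_transcript_expression(tx_mt)

# Summarize with a mean instead of the default median.
tx_mean_ht = summarize_transcript_expression(tx_mt, summary_agg_func=hl.agg.mean)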
+ + +
[docs]def get_expression_proportion(ht: hl.Table) -> hl.expr.StructExpression: + """ + Calculate the proportion of expression of transcript to gene per tissue. + + :param ht: Table of summarized transcript expression by tissue. + :return: StructExpression containing the proportion of expression of transcript to + gene per tissue. + """ + tissues = list(ht.row_value) + + # Calculate the sum of transcript expression by gene per tissue. + gene_ht = ht.group_by("gene_id").aggregate( + **{tissue: hl.agg.sum(ht[tissue]) for tissue in tissues} + ) + + # Return the proportion of expression of transcript to gene per tissue. + gene = gene_ht[ht.gene_id] + return hl.struct( + **{ + tissue: hl.utils.misc.divide_null(ht[tissue], gene[tissue]) + for tissue in tissues + } + )
+ + +
[docs]def filter_expression_ht_by_tissues( + ht: hl.Table, + tissues_to_keep: Optional[List[str]] = None, + tissues_to_filter: Optional[List[str]] = None, +) -> hl.Table: + """ + Filter a Table with a row annotation for each tissue to only include specified tissues. + + :param ht: Table with a row annotation for each tissue. + :param tissues_to_keep: Optional list of tissues to keep in the Table. Default is + all non-key row fields in the Table. + :param tissues_to_filter: Optional list of tissues to exclude from the Table. + :return: Table with only specified tissues. + """ + if tissues_to_keep is None and tissues_to_filter is None: + logger.info( + "No tissues_to_keep or tissues_to_filter specified. Returning input Table." + ) + return ht + + if tissues_to_keep is None: + tissues = list(ht.row_value) + else: + tissues = tissues_to_keep + + if tissues_to_filter is not None: + logger.info("Filtering tissues: %s", tissues_to_filter) + tissues = [t for t in tissues if t not in tissues_to_filter] + + ht = ht.select(*tissues) + + return ht
+ + +
[docs]def tissue_expression_ht_to_array( + ht: hl.Table, + tissues_to_keep: Optional[List[str]] = None, + tissues_to_filter: Optional[List[str]] = None, + annotations_to_extract: Optional[Union[Tuple[str], List[str]]] = ( + "transcript_expression", + "expression_proportion", + ), +) -> hl.Table: + """ + Convert a Table with a row annotation for each tissue to a Table with tissues as an array. + + The output is a Table with one of the two formats: + - An annotation of 'tissue_expression' containing an array of structs by + tissue, where each element of the array is the Table's row value for a given + tissue. + + Example:: + + tissue_expression': array<struct { + transcript_expression: float64, + expression_proportion: float64 + }> + + - One array annotation for each field defined in the 'annotations_to_extract' + argument, where each array is an array of the given field values by tissue. + + Example:: + + 'transcript_expression': array<float64> + 'expression_proportion': array<float64> + + The order of tissues in the array is indicated by the "tissues" global annotation. + + :param ht: Table with a row annotation for each tissue. + :param tissues_to_keep: Optional list of tissues to keep in the tissue expression + array. Default is all non-key rows in the Table. + :param tissues_to_filter: Optional list of tissues to exclude from the + tissue expression array. + :param annotations_to_extract: Optional list of tissue struct fields to extract + into top level array annotations. If None, the returned Table will contain a + single top level annotation 'tissue_expression' that contains an array of + structs by tissue. Default is ('transcript_expression', 'expression_proportion'). + :return: Table with requested tissue struct annotations pulled into arrays of + tissue values and a 'tissues' global annotation indicating the order of tissues + in the arrays. + """ + ht = filter_expression_ht_by_tissues(ht, tissues_to_keep, tissues_to_filter) + + tissues = list(ht.row_value) + ht = ht.select_globals(tissues=tissues) + ht = ht.select(tissue_expression=[ht[t] for t in tissues]) + + if annotations_to_extract is not None: + ht = ht.select( + **{ + a: ht.tissue_expression.map(lambda x: x[a]) + for a in annotations_to_extract + } + ) + + return ht
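A usage sketch; the tissue names are illustrative assumptions:

# Convert the per-tissue struct annotations into ordered arrays; the order of
# tissues is recorded in the 'tissues' global annotation.
array_ht = tissue_expression_ht_to_array(
    tx_ht,  # e.g. output of summarize_transcript_expression
    tissues_to_keep=["liver", "brain_cortex"],  # hypothetical tissue names
)
# array_ht.transcript_expression: array<float64>, one element per tissue
# array_ht.expression_proportion: array<float64>, same tissue order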
+ + +
[docs]def tx_filter_variants_by_csqs( + ht: hl.Table, + filter_to_cds: bool = True, + gencode_ht: Optional[hl.Table] = None, + filter_to_genes: Optional[List[str]] = None, + match_by_gene_symbol: bool = False, + filter_to_csqs: Optional[List[str]] = None, + ignore_splicing: bool = True, + filter_to_protein_coding: bool = True, + vep_root: str = "vep", +) -> hl.Table: + """ + Prepare a Table of variants with VEP transcript consequences for annotation. + + .. note:: + + When `filter_to_cds` is set to True, the returned Table is additionally + filtered to consequences with a defined 'amino_acids' annotation. This removes + certain consequences, such as 'stop_retained_variant', that fall within CDS + intervals but do not belong to the CDS of the transcript they are annotated on. + + :param ht: Table of variants with 'vep' annotations. + :param gencode_ht: Optional Gencode resource Table containing CDS interval + information. This is only used when `filter_to_cds` is set to True. Default is + None, which will use the default version of the Gencode Table resource for + the reference build of the input Table `ht`. + :param filter_to_cds: Whether to filter to CDS regions. Default is True. When True, + the result is further filtered to consequences with a defined 'amino_acids' + annotation. + :param filter_to_genes: Optional list of genes to filter to. Default is None. + :param match_by_gene_symbol: Whether to match by gene symbol instead of gene ID. + Default is False. + :param filter_to_csqs: Optional list of consequences to filter to. Default is None. + :param ignore_splicing: If True, ignore splice consequences. Default is True. + :param filter_to_protein_coding: Whether to filter to protein coding transcripts. + Default is True. + :param vep_root: Name used for root VEP annotation. Default is 'vep'. + :return: Table of variants with preprocessed/filtered transcript consequences + prepared for annotation. + """ + additional_filtering_criteria = None + if filter_to_cds: + logger.info("Filtering to CDS regions...") + ht = filter_to_gencode_cds(ht, gencode_ht=gencode_ht) + additional_filtering_criteria = [ + lambda csq: hl.is_defined(csq.amino_acids) & (csq.amino_acids != "*") + ] + + keep_csqs = True + if ignore_splicing: + if filter_to_csqs is not None: + filter_to_csqs = [csq for csq in filter_to_csqs if csq not in CSQ_SPLICE] + else: + filter_to_csqs = CSQ_SPLICE + keep_csqs = False + + if filter_to_csqs is not None: + logger.info("Adding most severe consequence to VEP transcript consequences...") + ht = process_consequences(ht, vep_root=vep_root) + + return filter_vep_transcript_csqs( + ht, + vep_root=vep_root, + synonymous=False, + canonical=False, + protein_coding=filter_to_protein_coding, + csqs=filter_to_csqs, + keep_csqs=keep_csqs, + genes=filter_to_genes, + match_by_gene_symbol=match_by_gene_symbol, + additional_filtering_criteria=additional_filtering_criteria, + )
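A usage sketch (the gene symbol is an illustrative assumption):

# Restrict a VEP-annotated variant Table to coding consequences on
# protein-coding transcripts of a single gene, within CDS regions.
filtered_ht = tx_filter_variants_by_csqs(
    variant_ht,  # hypothetical Table carrying a 'vep' annotation
    filter_to_genes=["PCSK9"],  # hypothetical gene of interest
    match_by_gene_symbol=True,
)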
+ + +
[docs]def tx_annotate_variants( + ht: hl.Table, + tx_ht: hl.Table, + tissues_to_filter: Optional[List[str]] = None, + vep_root: str = "vep", + vep_annotation: str = "transcript_consequences", +) -> hl.Table: + """ + Annotate variants with transcript-based expression values or expression proportion from GTEx. + + :param ht: Table of variants to annotate; it should contain the nested fields + `{vep_root}.{vep_annotation}`. + :param tx_ht: Table of transcript expression information. + :param tissues_to_filter: Optional list of tissues to exclude from the output. + Default is None. + :param vep_root: Name used for root VEP annotation. Default is 'vep'. + :param vep_annotation: Name of annotation under vep_root, one of the processed + consequences: ["transcript_consequences", "worst_csq_by_gene", + "worst_csq_for_variant", "worst_csq_by_gene_canonical", + "worst_csq_for_variant_canonical"]. For example, if you want to annotate + each variant with the worst consequence in each gene it falls on and the + transcript expression, you would use "worst_csq_by_gene". Default is + "transcript_consequences". + :return: Input Table with transcript expression information annotated. + """ + # Filter to tissues of interest. + tx_ht = filter_expression_ht_by_tissues(tx_ht, tissues_to_filter=tissues_to_filter) + tissues = list(tx_ht.row_value) + + # Calculate the mean expression proportion across all tissues. + tx_ht = tx_ht.annotate( + exp_prop_mean=hl.mean([tx_ht[t].expression_proportion for t in tissues]) + ) + + # Explode the processed transcript consequences to be able to key by + # transcript ID. + ht = explode_by_vep_annotation(ht, vep_annotation=vep_annotation, vep_root=vep_root) + ht = ht.transmute( + **ht[vep_annotation], + **tx_ht[ht[vep_annotation].transcript_id, ht[vep_annotation].gene_id], + ) + ht = ht.annotate_globals(tissues=tissues) + + return ht
+ + +
[docs]def tx_aggregate_variants( + ht: hl.Table, + additional_group_by: Optional[Union[Tuple[str], List[str]]] = ( + "alleles", + "gene_symbol", + "most_severe_consequence", + "lof", + "lof_flags", + ), +) -> hl.Table: + """ + Aggregate transcript-based expression values or expression proportion from GTEx. + + :param ht: Table of variants annotated with transcript expression information. + :param additional_group_by: Optional list of additional fields to group by before + sum aggregation. If None, the returned Table will be grouped by only "locus" + and "gene_id" before the sum aggregation. + :return: Table of variants with transcript expression information aggregated. + """ + tissues = hl.eval(ht.tissues) + + grouping = ["locus", "gene_id"] + if additional_group_by is not None: + grouping = grouping + list(additional_group_by) + + # Aggregate the transcript expression information by locus, gene_id and + # annotations in additional_group_by. + ht = ht.group_by(*grouping).aggregate( + exp_prop_mean=hl.agg.sum(ht.exp_prop_mean), + **{t: hl.struct(**{a: hl.agg.sum(ht[t][a]) for a in ht[t]}) for t in tissues}, + ) + + # If 'alleles' is in the Table, key by 'locus' and 'alleles'. + keys = ["locus"] + if "alleles" in ht.row: + keys.append("alleles") + + ht = ht.key_by(*keys) + + return ht
+ + +
[docs]def perform_tx_annotation_pipeline( + ht: hl.Table, + tx_ht: hl.Table, + tissues_to_filter: Optional[List[str]] = None, + vep_root: str = "vep", + vep_annotation: str = "transcript_consequences", + filter_to_csqs: Optional[List[str]] = CSQ_CODING, + additional_group_by: Optional[Union[Tuple[str], List[str]]] = ( + "alleles", + "gene_symbol", + "most_severe_consequence", + "lof", + "lof_flags", + ), + **kwargs, +) -> hl.Table: + """ + One-stop usage of `tx_filter_variants_by_csqs`, `tx_annotate_variants` and `tx_aggregate_variants`. + + :param ht: Table of variants to annotate; it should contain the nested fields + `{vep_root}.{vep_annotation}`. + :param tx_ht: Table of transcript expression information. + :param tissues_to_filter: Optional list of tissues to exclude from the output. + :param vep_root: Name used for root VEP annotation. Default is 'vep'. + :param vep_annotation: Name of annotation under vep_root. Default is + 'transcript_consequences'. + :param filter_to_csqs: Optional list of consequences to filter to. Default is + `CSQ_CODING`. + :param additional_group_by: Optional list of additional fields to group by before + sum aggregation. If None, the returned Table will be grouped by only "locus" + and "gene_id" before the sum aggregation. + :return: Table of variants with transcript expression information aggregated. + """ + tx_ht = tx_annotate_variants( + tx_filter_variants_by_csqs( + ht, vep_root=vep_root, filter_to_csqs=filter_to_csqs, **kwargs + ), + tx_ht, + tissues_to_filter=tissues_to_filter, + vep_root=vep_root, + vep_annotation=vep_annotation, + ) + + tx_ht = tx_aggregate_variants(tx_ht, additional_group_by=additional_group_by) + + return tx_ht
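An end-to-end usage sketch of the pipeline (input paths and the filtered tissue are assumptions):

from gnomad.utils.transcript_annotation import perform_tx_annotation_pipeline
import hail as hl

variant_ht = hl.read_table("gs://my-bucket/variants_vep.ht")  # hypothetical path
tx_ht = hl.read_table("gs://my-bucket/tx_expression.ht")  # hypothetical path

# Filter to coding consequences, annotate per-tissue expression, then sum per
# locus/gene (plus the default additional groupings such as gene_symbol).
tx_annotated_ht = perform_tx_annotation_pipeline(
    variant_ht,
    tx_ht,
    tissues_to_filter=["testis"],  # hypothetical tissue to exclude
)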
\ No newline at end of file diff --git a/_modules/gnomad/utils/vcf.html new file mode 100644 index 000000000..59f8da209 --- /dev/null +++ b/_modules/gnomad/utils/vcf.html @@ -0,0 +1,1618 @@ + gnomad.utils.vcf — gnomad master documentation

Source code for gnomad.utils.vcf

+# noqa: D100
+
+import copy
+import itertools
+import logging
+from typing import Dict, List, Optional, Union
+
+import hail as hl
+
+from gnomad.sample_qc.ancestry import POP_NAMES
+
+logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s")
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+SORT_ORDER = [
+    "subset",
+    "downsampling",
+    "popmax",
+    "grpmax",
+    "pop",
+    "gen_anc",
+    "subpop",
+    "sex",
+    "group",
+]
+"""
+Order to sort subgroupings during VCF export.
+Ensures that INFO labels in VCF are in desired order (e.g., raw_AC_afr_female).
+"""
+
+GROUPS = ["adj", "raw"]
+"""
+Group names used to generate labels for high quality genotypes and all raw genotypes. Used in VCF export.
+"""
+
+HISTS = ["gq_hist_alt", "gq_hist_all", "dp_hist_alt", "dp_hist_all", "ab_hist_alt"]
+"""
+Quality histograms used in VCF export.
+"""
+
+FAF_POPS = {
+    "v3": ["afr", "amr", "eas", "nfe", "sas"],
+    "v4": ["afr", "amr", "eas", "mid", "nfe", "sas"],
+}
+"""
+Global populations that are included in filtering allele frequency (faf) calculations. Used in VCF export.
+"""
+
+SEXES = ["XX", "XY"]
+"""
+Sample sexes used in VCF export.
+
+Used to stratify frequency annotations (AC, AN, AF) for each sex.
+Note that sample sexes in gnomAD v3 and earlier were 'male' and 'female'.
+"""
+
+AS_FIELDS = [
+    "AS_FS",
+    "AS_MQ",
+    "AS_MQRankSum",
+    "AS_pab_max",
+    "AS_QUALapprox",
+    "AS_QD",
+    "AS_ReadPosRankSum",
+    "AS_SB_TABLE",
+    "AS_SOR",
+    "AS_VarDP",
+    "InbreedingCoeff",
+]
+"""
+Allele-specific variant annotations.
+"""
+
+SITE_FIELDS = [
+    "FS",
+    "MQ",
+    "MQRankSum",
+    "QUALapprox",
+    "QD",
+    "ReadPosRankSum",
+    "SB",
+    "SOR",
+    "VarDP",
+]
+"""
+Site level variant annotations.
+"""
+
+ALLELE_TYPE_FIELDS = [
+    "allele_type",
+    "has_star",
+    "n_alt_alleles",
+    "original_alleles",
+    "variant_type",
+    "was_mixed",
+]
+"""
+Allele-type annotations.
+"""
+
+REGION_FLAG_FIELDS = ["decoy", "lcr", "nonpar", "non_par", "segdup"]
+"""
+Annotations about variant region type.
+
+.. note::
+    decoy resource files do not currently exist for GRCh38/hg38.
+"""
+
+JOINT_REGION_FLAG_FIELDS = [
+    "fail_interval_qc",
+    "outside_broad_capture_region",
+    "outside_ukb_capture_region",
+    "outside_broad_calling_region",
+    "outside_ukb_calling_region",
+    "not_called_in_exomes",
+    "not_called_in_genomes",
+]
+"""
+Annotations about variant region type that are specifically created for the joint dataset of exomes and genomes from gnomAD v4.1.
+"""
+
+RF_FIELDS = [
+    "rf_positive_label",
+    "rf_negative_label",
+    "rf_label",
+    "rf_train",
+    "rf_tp_probability",
+]
+"""
+Annotations specific to the variant QC using a random forest model.
+"""
+
+AS_VQSR_FIELDS = ["AS_culprit", "AS_VQSLOD"]
+"""
+Allele-specific VQSR annotations.
+"""
+
+VQSR_FIELDS = AS_VQSR_FIELDS + ["NEGATIVE_TRAIN_SITE", "POSITIVE_TRAIN_SITE"]
+"""
+Annotations specific to VQSR.
+"""
+
+INFO_VCF_AS_PIPE_DELIMITED_FIELDS = [
+    "AS_QUALapprox",
+    "AS_VarDP",
+    "AS_MQ_DP",
+    "AS_RAW_MQ",
+    "AS_SB_TABLE",
+]
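+"""
+INFO fields that are arrays and are exported as pipe-delimited strings during VCF export (see `adjust_vcf_incompatible_types`).
+"""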
+
+INFO_DICT = {
+    "FS": {
+        "Description": "Phred-scaled p-value of Fisher's exact test for strand bias"
+    },
+    "InbreedingCoeff": {
+        "Number": "A",
+        "Description": (
+            "Inbreeding coefficient, the excess heterozygosity at a variant site,"
+            " computed as 1 - (the number of heterozygous genotypes)/(the number of"
+            " heterozygous genotypes expected under Hardy-Weinberg equilibrium)"
+        ),
+    },
+    "inbreeding_coeff": {
+        "Number": "A",
+        "Description": (
+            "Inbreeding coefficient, the excess heterozygosity at a variant site,"
+            " computed as 1 - (the number of heterozygous genotypes)/(the number of"
+            " heterozygous genotypes expected under Hardy-Weinberg equilibrium)"
+        ),
+    },
+    "MQ": {
+        "Description": (
+            "Root mean square of the mapping quality of reads across all samples"
+        )
+    },
+    "MQRankSum": {
+        "Description": (
+            "Z-score from Wilcoxon rank sum test of alternate vs. reference read"
+            " mapping qualities"
+        )
+    },
+    "QD": {
+        "Description": (
+            "Variant call confidence normalized by depth of sample reads supporting a"
+            " variant"
+        )
+    },
+    "ReadPosRankSum": {
+        "Description": (
+            "Z-score from Wilcoxon rank sum test of alternate vs. reference read"
+            " position bias"
+        )
+    },
+    "SOR": {"Description": "Strand bias estimated by the symmetric odds ratio test"},
+    "POSITIVE_TRAIN_SITE": {
+        "Description": (
+            "Variant was used to build the positive training set of high-quality"
+            " variants for VQSR"
+        )
+    },
+    "NEGATIVE_TRAIN_SITE": {
+        "Description": (
+            "Variant was used to build the negative training set of low-quality"
+            " variants for VQSR"
+        )
+    },
+    "positive_train_site": {
+        "Description": (
+            "Variant was used to build the positive training set of high-quality"
+            " variants for VQSR"
+        )
+    },
+    "negative_train_site": {
+        "Description": (
+            "Variant was used to build the negative training set of low-quality"
+            " variants for VQSR"
+        )
+    },
+    "BaseQRankSum": {
+        "Description": (
+            "Z-score from Wilcoxon rank sum test of alternate vs. reference base"
+            " qualities"
+        ),
+    },
+    "VarDP": {
+        "Description": (
+            "Depth over variant genotypes (does not include depth of reference samples)"
+        )
+    },
+    "VQSLOD": {
+        "Description": (
+            "Log-odds ratio of being a true variant versus being a false positive under"
+            " the trained VQSR Gaussian mixture model"
+        ),
+    },
+    "culprit": {
+        "Description": "Worst-performing annotation in the VQSR Gaussian mixture model",
+    },
+    "decoy": {"Description": "Variant falls within a reference decoy region"},
+    "lcr": {"Description": "Variant falls within a low complexity region"},
+    "nonpar": {
+        "Description": (
+            "Variant (on sex chromosome) falls outside a pseudoautosomal region"
+        )
+    },
+    "non_par": {
+        "Description": (
+            "Variant (on sex chromosome) falls outside a pseudoautosomal region"
+        )
+    },
+    "segdup": {"Description": "Variant falls within a segmental duplication region"},
+    "fail_interval_qc": {
+        "Description": (
+            "Less than 85 percent of samples meet 20X coverage if variant is in"
+            " autosomal or PAR regions or 10X coverage for non-PAR regions of"
+            " chromosomes X and Y."
+        )
+    },
+    "outside_ukb_capture_region": {
+        "Description": "Variant falls outside of UK Biobank exome capture regions."
+    },
+    "outside_broad_capture_region": {
+        "Description": "Variant falls outside of Broad exome capture regions."
+    },
+    "rf_positive_label": {
+        "Description": (
+            "Variant was labelled as a positive example for training of random forest"
+            " model"
+        )
+    },
+    "rf_negative_label": {
+        "Description": (
+            "Variant was labelled as a negative example for training of random forest"
+            " model"
+        )
+    },
+    "rf_label": {"Description": "Random forest training label"},
+    "rf_train": {"Description": "Variant was used in training random forest model"},
+    "rf_tp_probability": {
+        "Description": (
+            "Probability of a called variant being a true variant as determined by"
+            " random forest model"
+        )
+    },
+    "transmitted_singleton": {
+        "Description": (
+            "Variant was a callset-wide doubleton that was transmitted within a family"
+            " from a parent to a child (i.e., a singleton amongst unrelated samples in"
+            " cohort)"
+        )
+    },
+    "sibling_singleton": {
+        "Description": (
+            "Variant was a callset-wide doubleton that was present only in two siblings"
+            " (i.e., a singleton amongst unrelated samples in cohort)."
+        )
+    },
+    "original_alleles": {"Description": "Alleles before splitting multiallelics"},
+    "variant_type": {
+        "Description": "Variant type (snv, indel, multi-snv, multi-indel, or mixed)"
+    },
+    "allele_type": {
+        "Description": "Allele type (snv, insertion, deletion, or mixed)",
+    },
+    "n_alt_alleles": {
+        "Number": "1",
+        "Description": "Total number of alternate alleles observed at variant locus",
+    },
+    "was_mixed": {"Description": "Variant type was mixed"},
+    "has_star": {
+        "Description": (
+            "Variant locus coincides with a spanning deletion (represented by a star)"
+            " observed elsewhere in the callset"
+        )
+    },
+    "AS_pab_max": {
+        "Number": "A",
+        "Description": (
+            "Maximum p-value over callset for binomial test of observed allele balance"
+            " for a heterozygous genotype, given expectation of 0.5"
+        ),
+    },
+    "monoallelic": {
+        "Description": "All samples are homozygous alternate for the variant"
+    },
+    "only_het": {"Description": "All samples are heterozygous for the variant"},
+    "QUALapprox": {
+        "Number": "1",
+        "Description": "Sum of PL[0] values; used to approximate the QUAL score",
+    },
+    "AS_SB_TABLE": {
+        "Number": ".",
+        "Description": (
+            "Allele-specific forward/reverse read counts for strand bias tests"
+        ),
+    },
+}
+"""
+Dictionary used during VCF export to export row (variant) annotations.
+"""
+
+JOINT_REGION_FLAGS_INFO_DICT = {
+    "fail_interval_qc": {
+        "Description": (
+            "Less than 85 percent of samples meet 20X coverage if variant is in"
+            " autosomal or PAR regions or 10X coverage for non-PAR regions of"
+            " chromosomes X and Y."
+        )
+    },
+    "outside_ukb_capture_region": {
+        "Description": "Variant falls outside of the UK Biobank exome capture regions."
+    },
+    "outside_broad_capture_region": {
+        "Description": "Variant falls outside of the Broad exome capture regions."
+    },
+    "outside_ukb_calling_region": {
+        "Description": (
+            "Variant falls outside of the UK Biobank exome capture regions plus 150 bp"
+            " padding."
+        )
+    },
+    "outside_broad_calling_region": {
+        "Description": (
+            "Variant falls outside of the Broad exome capture regions plus 150 bp"
+            " padding."
+        )
+    },
+    "not_called_in_exomes": {
+        "Description": "Variant was not called in the gnomAD exomes."
+    },
+    "not_called_in_genomes": {
+        "Description": "Variant was not called in the gnomAD genomes."
+    },
+}
+
+
+IN_SILICO_ANNOTATIONS_INFO_DICT = {
+    "cadd_raw_score": {
+        "Number": "1",
+        "Description": (
+            "Raw CADD scores are interpretable as the extent to which the annotation"
+            " profile for a given variant suggests that the variant is likely to be"
+            " 'observed' (negative values) vs 'simulated' (positive values). Larger"
+            " values are more deleterious."
+        ),
+    },
+    "cadd_phred": {
+        "Number": "1",
+        "Description": (
+            "Cadd Phred-like scores ('scaled C-scores') ranging from 1 to 99, based on"
+            " the rank of each variant relative to all possible 8.6 billion"
+            " substitutions in the human reference genome. Larger values are more"
+            " deleterious."
+        ),
+    },
+    "revel_max": {
+        "Number": "1",
+        "Description": (
+            "The maximum REVEL score at a site's MANE Select or canonical"
+            " transcript. It's an ensemble score for predicting the pathogenicity of"
+            " missense variants (based on 13 other variant predictors). Scores ranges"
+            " from 0 to 1. Variants with higher scores are predicted to be more likely"
+            " to be deleterious."
+        ),
+    },
+    "spliceai_ds_max": {
+        "Number": "1",
+        "Description": (
+            "Illumina's SpliceAI max delta score; interpreted as the probability of the"
+            " variant being splice-altering."
+        ),
+    },
+    "pangolin_largest_ds": {
+        "Number": "1",
+        "Description": (
+            "Pangolin's largest delta score across 2 splicing consequences, which"
+            " reflects the probability of the variant being splice-altering"
+        ),
+    },
+    "phylop": {
+        "Number": "1",
+        "Description": (
+            "Base-wise conservation score across the 241 placental mammals in the"
+            " Zoonomia project. Score ranges from -20 to 9.28, and reflects"
+            " acceleration (faster evolution than expected under neutral drift,"
+            " assigned negative scores) as well as conservation (slower than expected"
+            " evolution, assigned positive scores)."
+        ),
+    },
+    "sift_max": {
+        "Number": "1",
+        "Description": (
+            "Score reflecting the scaled probability of the amino acid substitution"
+            " being tolerated, ranging from 0 to 1. Scores below 0.05 are predicted to"
+            " impact protein function. We prioritize max scores for MANE Select"
+            " transcripts where possible and otherwise report a score for the canonical"
+            " transcript."
+        ),
+    },
+    "polyphen_max": {
+        "Number": "1",
+        "Description": (
+            "Score that predicts the possible impact of an amino acid substitution on"
+            " the structure and function of a human protein, ranging from 0.0"
+            " (tolerated) to 1.0 (deleterious).  We prioritize max scores for MANE"
+            " Select transcripts where possible and otherwise report a score for the"
+            " canonical transcript."
+        ),
+    },
+}
+"""
+Dictionary with in silico score descriptions to include in the VCF INFO header.
+"""
+
+
+VRS_FIELDS_DICT = {
+    "VRS_Allele_IDs": {
+        "Number": "R",
+        "Description": (
+            "The computed identifiers for the GA4GH VRS Alleles corresponding to the"
+            " values in the REF and ALT fields"
+        ),
+    },
+    "VRS_Starts": {
+        "Number": "R",
+        "Description": (
+            "Interresidue coordinates used as the location starts for the GA4GH VRS"
+            " Alleles corresponding to the values in the REF and ALT fields"
+        ),
+    },
+    "VRS_Ends": {
+        "Number": "R",
+        "Description": (
+            "Interresidue coordinates used as the location ends for the GA4GH VRS"
+            " Alleles corresponding to the values in the REF and ALT fields"
+        ),
+    },
+    "VRS_States": {
+        "Number": ".",
+        "Description": (
+            "The literal sequence states used for the GA4GH VRS Alleles corresponding"
+            " to the values in the REF and ALT fields"
+        ),
+    },
+}
+"""
+Dictionary with VRS annotations to include in the VCF INFO field and VCF header.
+"""
+
+
+ENTRIES = ["GT", "GQ", "DP", "AD", "MIN_DP", "PGT", "PID", "PL", "SB"]
+"""
+Densified entries to be selected during VCF export.
+"""
+
+SPARSE_ENTRIES = [
+    "END",
+    "DP",
+    "GQ",
+    "LA",
+    "LAD",
+    "LGT",
+    "LPGT",
+    "LPL",
+    "MIN_DP",
+    "PID",
+    "RGQ",
+    "SB",
+]
+"""
+Sparse entries to be selected and densified during VCF export.
+"""
+
+FORMAT_DICT = {
+    "GT": {"Description": "Genotype", "Number": "1", "Type": "String"},
+    "AD": {
+        "Description": "Allelic depths for the ref and alt alleles in the order listed",
+        "Number": "R",
+        "Type": "Integer",
+    },
+    "DP": {
+        "Description": (
+            "Approximate read depth (reads with MQ=255 or with bad mates are filtered)"
+        ),
+        "Number": "1",
+        "Type": "Integer",
+    },
+    "GQ": {
+        "Description": (
+            "Phred-scaled confidence that the genotype assignment is correct. Value is"
+            " the difference between the second lowest PL and the lowest PL (always"
+            " normalized to 0)."
+        ),
+        "Number": "1",
+        "Type": "Integer",
+    },
+    "MIN_DP": {
+        "Description": "Minimum DP observed within the GVCF block",
+        "Number": "1",
+        "Type": "Integer",
+    },
+    "PGT": {
+        "Description": (
+            "Physical phasing haplotype information, describing how the alternate"
+            " alleles are phased in relation to one another"
+        ),
+        "Number": "1",
+        "Type": "String",
+    },
+    "PID": {
+        "Description": (
+            "Physical phasing ID information, where each unique ID within a given"
+            " sample (but not across samples) connects records within a phasing group"
+        ),
+        "Number": "1",
+        "Type": "String",
+    },
+    "PL": {
+        "Description": (
+            "Normalized, phred-scaled likelihoods for genotypes as defined in the VCF"
+            " specification"
+        ),
+        "Number": "G",
+        "Type": "Integer",
+    },
+    "SB": {
+        "Description": (
+            "Per-sample component statistics which comprise the Fisher's exact test to"
+            " detect strand bias. Values are: depth of reference allele on forward"
+            " strand, depth of reference allele on reverse strand, depth of alternate"
+            " allele on forward strand, depth of alternate allele on reverse strand."
+        ),
+        "Number": "4",
+        "Type": "Integer",
+    },
+}
+"""
+Dictionary used during VCF export to export MatrixTable entries.
+"""
+
+
+
[docs]def adjust_vcf_incompatible_types( + ht: hl.Table, + pipe_delimited_annotations: List[str] = INFO_VCF_AS_PIPE_DELIMITED_FIELDS, +) -> hl.Table: + """ + Create a Table ready for VCF export. + + In particular, the following conversions are done: + - All int64 fields are coerced to int32 + - Fields specified by `pipe_delimited_annotations` are converted from arrays to pipe-delimited strings + + :param ht: Input Table. + :param pipe_delimited_annotations: List of info fields (they must be fields of the ht.info Struct). + :return: Table ready for VCF export. + """ + + def get_pipe_expr(array_expr: hl.expr.ArrayExpression) -> hl.expr.StringExpression: + return hl.delimit(array_expr.map(lambda x: hl.or_else(hl.str(x), "")), "|") + + # Make sure the HT is keyed by locus, alleles + ht = ht.key_by("locus", "alleles") + + info_type_convert_expr = {} + # Convert int64 fields to int32 (int64 isn't supported by VCF) + for f, ft in ht.info.dtype.items(): + if ft == hl.dtype("int64"): + logger.warning( + "Coercing field info.%s from int64 to int32 for VCF output. Value" + " will be capped at int32 max value.", + f, + ) + info_type_convert_expr.update( + { + f: hl.or_missing( + hl.is_defined(ht.info[f]), + hl.int32(hl.min(2**31 - 1, ht.info[f])), + ) + } + ) + elif ft == hl.dtype("array<int64>"): + logger.warning( + "Coercing field info.%s from array<int64> to array<int32> for VCF" + " output. Array values will be capped at int32 max value.", + f, + ) + info_type_convert_expr.update( + { + f: ht.info[f].map( + lambda x: hl.or_missing( + hl.is_defined(x), hl.int32(hl.min(2**31 - 1, x)) + ) + ) + } + ) + + ht = ht.annotate(info=ht.info.annotate(**info_type_convert_expr)) + + info_expr = {} + + # Pipe-delimit the fields that require it. + # Note: the expr needs to be prefixed by "|" because GATK expects one value for the ref (always empty) + # Note2: this doesn't produce the correct annotation for AS_SB_TABLE; it + # is handled below + for f in pipe_delimited_annotations: + if f in ht.info and f != "AS_SB_TABLE": + info_expr[f] = "|" + get_pipe_expr(ht.info[f]) + + # Flatten SB if it is an array of arrays + if "SB" in ht.info and not isinstance(ht.info.SB, hl.expr.ArrayNumericExpression): + info_expr["SB"] = ht.info.SB[0].extend(ht.info.SB[1]) + + if "AS_SB_TABLE" in ht.info: + info_expr["AS_SB_TABLE"] = get_pipe_expr( + ht.info.AS_SB_TABLE.map(lambda x: hl.delimit(x, ",")) + ) + + # Annotate with new expression + ht = ht.annotate(info=ht.info.annotate(**info_expr)) + + return ht
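A usage sketch (the release Table and output path are assumptions):

import hail as hl

# Coerce int64 info fields to int32 and pipe-delimit the allele-specific
# array fields before writing the VCF.
export_ht = adjust_vcf_incompatible_types(release_ht)  # hypothetical input HT
hl.export_vcf(export_ht, "gs://my-bucket/release.vcf.bgz")  # hypothetical path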
+ + +
[docs]def make_label_combos( + label_groups: Dict[str, List[str]], + sort_order: List[str] = SORT_ORDER, + label_delimiter: str = "_", +) -> List[str]: + """ + Make combinations of all possible labels for a supplied dictionary of label groups. + + For example, if label_groups is `{"sex": ["male", "female"], "pop": ["afr", "nfe", "amr"]}`, + this function will return `["afr_male", "afr_female", "nfe_male", "nfe_female", "amr_male", "amr_female"]` + + :param label_groups: Dictionary containing an entry for each label group, where key is the name of the grouping, + e.g. "sex" or "pop", and value is a list of all possible values for that grouping (e.g. ["male", "female"] or ["afr", "nfe", "amr"]). + :param sort_order: List containing order to sort label group combinations. Default is SORT_ORDER. + :param label_delimiter: String to use as delimiter when making group label combinations. + :return: list of all possible combinations of values for the supplied label groupings. + """ + copy_label_groups = copy.deepcopy(label_groups) + if len(copy_label_groups) == 1: + return [item for sublist in copy_label_groups.values() for item in sublist] + anchor_group = sorted(copy_label_groups.keys(), key=lambda x: sort_order.index(x))[ + 0 + ] + anchor_val = copy_label_groups.pop(anchor_group) + combos = [] + for x, y in itertools.product( + anchor_val, + make_label_combos(copy_label_groups, label_delimiter=label_delimiter), + ): + combos.append(f"{x}{label_delimiter}{y}") + return combos
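For instance, combining populations and sexes (the anchor group follows SORT_ORDER, so 'pop' values lead the label):

combos = make_label_combos({"sex": ["XX", "XY"], "pop": ["afr", "nfe"]})
# ['afr_XX', 'afr_XY', 'nfe_XX', 'nfe_XY']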
+ + +
[docs]def index_globals( + globals_array: List[Dict[str, str]], + label_groups: Dict[str, List[str]], + label_delimiter: str = "_", +) -> Dict[str, int]: + """ + Create a dictionary keyed by the specified label groupings with values describing the corresponding index of each grouping entry in the `globals_array` annotation. + + :param globals_array: Ordered list containing dictionary entries describing all the grouping combinations contained in the globals_array annotation. + Keys are the grouping type (e.g., 'group', 'pop', 'sex') and values are the grouping attribute (e.g., 'adj', 'eas', 'XY'). + :param label_groups: Dictionary containing an entry for each label group, where key is the name of the grouping, + e.g. "sex" or "pop", and value is a list of all possible values for that grouping (e.g. ["male", "female"] or ["afr", "nfe", "amr"]) + :param label_delimiter: String used as delimiter when making group label combinations. + :return: Dictionary keyed by specified label grouping combinations, with values describing the corresponding index + of each grouping entry in the globals array. + """ + combos = make_label_combos(label_groups, label_delimiter=label_delimiter) + index_dict = {} + + for combo in combos: + combo_fields = combo.split(label_delimiter) + for i, v in enumerate(globals_array): + if set(v.values()) == set(combo_fields): + index_dict.update({f"{combo}": i}) + return index_dict
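A worked sketch of the index lookup (the freq_meta contents are illustrative):

freq_meta = [
    {"group": "adj"},
    {"group": "adj", "pop": "afr"},
    {"group": "adj", "pop": "afr", "sex": "XX"},
]
idx = index_globals(freq_meta, dict(group=["adj"], pop=["afr"], sex=["XX"]))
# {'afr_XX_adj': 2}: the combo's values match the value set at index 2.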
+ + +
[docs]def make_combo_header_text( + preposition: str, + combo_dict: Dict[str, str], + pop_names: Dict[str, str], +) -> str: + """ + Programmatically generate text to populate the VCF header description for a given variant annotation with specific groupings and subset. + + For example, if `preposition` is "for" and `combo_dict` is {"group": "adj", "pop": "afr", "sex": "female"}, + this function will return the string " for female samples in the African-American/African genetic ancestry group". + + :param preposition: Relevant preposition to precede automatically generated text. + :param combo_dict: Dict with grouping types as keys and values for grouping type as values. This function generates text for these values. + Possible grouping types are: "group", "pop", "sex", and "subpop". + Example input: {"pop": "afr", "sex": "female"} + :param pop_names: Dict with global population names (keys) and population descriptions (values). + :return: String with automatically generated description text for a given set of combo fields. + """ + header_text = " " + preposition + + if len(combo_dict) == 1: + if combo_dict["group"] == "adj": + return "" + + if "sex" in combo_dict: + header_text = header_text + " " + combo_dict["sex"] + + header_text = header_text + " samples" + + if "subpop" in combo_dict or "pop" in combo_dict: + if "subpop" in combo_dict: + header_text = ( + header_text + + f" in the {pop_names[combo_dict['subpop']]} genetic ancestry subgroup" + ) + + else: + header_text = ( + header_text + + f" in the {pop_names[combo_dict['pop']]} genetic ancestry group" + ) + + if "group" in combo_dict: + if combo_dict["group"] == "raw": + header_text = header_text + ", before removing low-confidence genotypes" + + return header_text
+ + +
[docs]def create_label_groups( + pops: List[str], + sexes: List[str] = SEXES, + all_groups: List[str] = GROUPS, + pop_sex_groups: List[str] = ["adj"], +) -> List[Dict[str, List[str]]]: + """ + Generate a list of label group dictionaries needed to populate info dictionary. + + Label dictionaries are passed as input to `make_info_dict`. + + :param pops: List of population names. + :param sexes: List of sample sexes. + :param all_groups: List of data types (raw, adj). Default is `GROUPS`, which is ["raw", "adj"]. + :param pop_sex_groups: List of data types (raw, adj) to populate with pops and sexes. Default is ["adj"]. + :return: List of label group dictionaries. + """ + return [ + # This is to capture raw frequency fields, which are + # not stratified by sex or population (e.g., only AC_raw exists, not AC_XX_raw) + dict(group=all_groups), + dict(group=pop_sex_groups, sex=sexes), + dict(group=pop_sex_groups, pop=pops), + dict(group=pop_sex_groups, pop=pops, sex=sexes), + ]
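For example (populations are illustrative):

label_groups = create_label_groups(pops=["afr", "nfe"])
# [
#     {'group': ['adj', 'raw']},
#     {'group': ['adj'], 'sex': ['XX', 'XY']},
#     {'group': ['adj'], 'pop': ['afr', 'nfe']},
#     {'group': ['adj'], 'pop': ['afr', 'nfe'], 'sex': ['XX', 'XY']},
# ]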
+ + +
[docs]def make_info_dict( + prefix: str = "", + suffix: str = "", + prefix_before_metric: bool = True, + pop_names: Dict[str, str] = POP_NAMES, + label_groups: Dict[str, List[str]] = None, + label_delimiter: str = "_", + bin_edges: Dict[str, str] = None, + faf: bool = False, + popmax: bool = False, + grpmax: bool = False, + fafmax: bool = False, + callstats: bool = False, + freq_ctt: bool = False, + freq_cmh: bool = False, + description_text: str = "", + age_hist_distribution: str = None, + sort_order: List[str] = SORT_ORDER, +) -> Dict[str, Dict[str, str]]: + """ + Generate dictionary of Number and Description attributes of VCF INFO fields. + + Used to populate the INFO fields of the VCF header during export. + + Creates: + - INFO fields for age histograms (bin freq, n_smaller, and n_larger for heterozygous and homozygous variant carriers) + - INFO fields for popmax AC, AN, AF, nhomalt, and popmax population + - INFO fields for AC, AN, AF, nhomalt for each combination of sample population, sex, and subpopulation, both for adj and raw data + - INFO fields for filtering allele frequency (faf) annotations + + :param prefix: Prefix string for data, e.g. "gnomAD". Default is empty string. + :param suffix: Suffix string for data, e.g. "gnomAD". Default is empty string. + :param prefix_before_metric: Whether prefix should be added before the metric (AC, AN, AF, nhomalt, faf95, faf99) in INFO field. Default is True. + :param pop_names: Dict with global population names (keys) and population descriptions (values). Default is POP_NAMES. + :param label_groups: Dictionary containing an entry for each label group, where key is the name of the grouping, + e.g. "sex" or "pop", and value is a list of all possible values for that grouping (e.g. ["male", "female"] or ["afr", "nfe", "amr"]). + :param label_delimiter: String to use as delimiter when making group label combinations. + :param bin_edges: Dictionary keyed by annotation type, with values that reflect the bin edges corresponding to the annotation. + :param faf: If True, use alternate logic to auto-populate dictionary values associated with filter allele frequency annotations. + :param popmax: If True, use alternate logic to auto-populate dictionary values associated with popmax annotations. + :param grpmax: If True, use alternate logic to auto-populate dictionary values associated with grpmax annotations. + :param fafmax: If True, use alternate logic to auto-populate dictionary values associated with fafmax annotations. + :param callstats: If True, use alternate logic to auto-populate dictionary values associated with callstats annotations. + :param freq_ctt: If True, use alternate logic to auto-populate dictionary values associated with frequency contingency table test (CTT) annotations. + :param freq_cmh: If True, use alternate logic to auto-populate dictionary values associated with frequency Cochran-Mantel-Haenszel (CMH) annotations. + :param description_text: Optional text to append to the end of descriptions. Needs to start with a space if specified. + :param str age_hist_distribution: Pipe-delimited string of overall age distribution. + :param sort_order: List containing order to sort label group combinations. Default is SORT_ORDER. + :return: Dictionary keyed by VCF INFO annotations, where values are dictionaries of Number and Description attributes. 
+ """ + if prefix != "": + prefix = f"{prefix}{label_delimiter}" + if suffix != "": + suffix = f"{label_delimiter}{suffix}" + + info_dict = dict() + + if age_hist_distribution: + age_hist_dict = { + f"{prefix}age_hist_het_bin_freq{suffix}": { + "Number": "A", + "Description": ( + f"Histogram of ages of heterozygous individuals{description_text};" + f" bin edges are: {bin_edges['het']}; total number of individuals" + f" of any genotype bin: {age_hist_distribution}" + ), + }, + f"{prefix}age_hist_het_n_smaller{suffix}": { + "Number": "A", + "Description": ( + "Count of age values falling below lowest histogram bin edge for" + f" heterozygous individuals{description_text}" + ), + }, + f"{prefix}age_hist_het_n_larger{suffix}": { + "Number": "A", + "Description": ( + "Count of age values falling above highest histogram bin edge for" + f" heterozygous individuals{description_text}" + ), + }, + f"{prefix}age_hist_hom_bin_freq{suffix}": { + "Number": "A", + "Description": ( + "Histogram of ages of homozygous alternate" + f" individuals{description_text}; bin edges are:" + f" {bin_edges['hom']}; total number of individuals of any genotype" + f" bin: {age_hist_distribution}" + ), + }, + f"{prefix}age_hist_hom_n_smaller{suffix}": { + "Number": "A", + "Description": ( + "Count of age values falling below lowest histogram bin edge for" + f" homozygous alternate individuals{description_text}" + ), + }, + f"{prefix}age_hist_hom_n_larger{suffix}": { + "Number": "A", + "Description": ( + "Count of age values falling above highest histogram bin edge for" + f" homozygous alternate individuals{description_text}" + ), + }, + } + info_dict.update(age_hist_dict) + + if popmax: + popmax_dict = { + f"{prefix}popmax{suffix}": { + "Number": "A", + "Description": ( + f"Population with the maximum allele frequency{description_text}" + ), + }, + f"{prefix}AC{label_delimiter}popmax{suffix}": { + "Number": "A", + "Description": ( + "Allele count in the population with the maximum allele" + f" frequency{description_text}" + ), + }, + f"{prefix}AN{label_delimiter}popmax{suffix}": { + "Number": "A", + "Description": ( + "Total number of alleles in the population with the maximum allele" + f" frequency{description_text}" + ), + }, + f"{prefix}AF{label_delimiter}popmax{suffix}": { + "Number": "A", + "Description": ( + f"Maximum allele frequency across populations{description_text}" + ), + }, + f"{prefix}nhomalt{label_delimiter}popmax{suffix}": { + "Number": "A", + "Description": ( + "Count of homozygous individuals in the population with the" + f" maximum allele frequency{description_text}" + ), + }, + f"{prefix}faf95{label_delimiter}popmax{suffix}": { + "Number": "A", + "Description": ( + "Filtering allele frequency (using Poisson 95% CI) for the" + f" population with the maximum allele frequency{description_text}" + ), + }, + } + info_dict.update(popmax_dict) + if grpmax: + grpmax_dict = { + f"{prefix}grpmax{suffix}": { + "Number": "A", + "Description": ( + "Genetic ancestry group with the maximum allele" + f" frequency{description_text}" + ), + }, + f"{prefix}AC{label_delimiter}grpmax{suffix}": { + "Number": "A", + "Description": ( + "Allele count in the genetic ancestry group with the maximum allele" + f" frequency{description_text}" + ), + }, + f"{prefix}AN{label_delimiter}grpmax{suffix}": { + "Number": "A", + "Description": ( + "Total number of alleles in the genetic ancestry group with the" + f" maximum allele frequency{description_text}" + ), + }, + f"{prefix}AF{label_delimiter}grpmax{suffix}": { + "Number": "A", + 
"Description": ( + "Maximum allele frequency across genetic ancestry" + f" groups{description_text}" + ), + }, + f"{prefix}nhomalt{label_delimiter}grpmax{suffix}": { + "Number": "A", + "Description": ( + "Count of homozygous individuals in the genetic ancestry group" + f" with the maximum allele frequency{description_text}" + ), + }, + } + info_dict.update(grpmax_dict) + + if fafmax: + fafmax_dict = { + f"{prefix}fafmax{label_delimiter}faf95{label_delimiter}max{suffix}": { + "Number": "A", + "Description": ( + "Maximum filtering allele frequency (using Poisson 95% CI)" + f" across genetic ancestry groups{description_text}" + ), + }, + f"{prefix}fafmax{label_delimiter}faf95{label_delimiter}max{label_delimiter}gen{label_delimiter}anc{suffix}": { + "Number": "A", + "Description": ( + "Genetic ancestry group with maximum filtering allele" + f" frequency (using Poisson 95% CI){description_text}" + ), + }, + f"{prefix}fafmax{label_delimiter}faf99{label_delimiter}max{suffix}": { + "Number": "A", + "Description": ( + "Maximum filtering allele frequency (using Poisson 99% CI)" + f" across genetic ancestry groups{description_text}" + ), + }, + f"{prefix}fafmax{label_delimiter}faf99{label_delimiter}max{label_delimiter}gen{label_delimiter}anc{suffix}": { + "Number": "A", + "Description": ( + "Genetic ancestry group with maximum filtering allele" + f" frequency (using Poisson 99% CI){description_text}" + ), + }, + } + + info_dict.update(fafmax_dict) + + if callstats or faf or freq_ctt: + group_types = sorted(label_groups.keys(), key=lambda x: sort_order.index(x)) + combos = make_label_combos(label_groups, label_delimiter=label_delimiter) + + for combo in combos: + combo_fields = combo.split(label_delimiter) + group_dict = dict(zip(group_types, combo_fields)) + + for_combo = make_combo_header_text("for", group_dict, pop_names) + in_combo = make_combo_header_text("in", group_dict, pop_names) + + metrics = ["AC", "AN", "AF", "nhomalt", "faf95", "faf99"] + if freq_ctt: + metrics += ["CTT_odds_ratio", "CTT_p_value"] + if prefix_before_metric: + metric_label_dict = { + metric: f"{prefix}{metric}{label_delimiter}{combo}{suffix}" + for metric in metrics + } + else: + metric_label_dict = { + metric: f"{metric}{label_delimiter}{prefix}{combo}{suffix}" + for metric in metrics + } + + if callstats: + combo_dict = { + metric_label_dict["AC"]: { + "Number": "A", + "Description": ( + f"Alternate allele count{for_combo}{description_text}" + ), + }, + metric_label_dict["AN"]: { + "Number": "1", + "Description": ( + f"Total number of alleles{in_combo}{description_text}" + ), + }, + metric_label_dict["AF"]: { + "Number": "A", + "Description": ( + f"Alternate allele frequency{in_combo}{description_text}" + ), + }, + metric_label_dict["nhomalt"]: { + "Number": "A", + "Description": ( + "Count of homozygous" + f" individuals{in_combo}{description_text}" + ), + }, + } + elif faf: + if ("XX" in combo_fields) | ("XY" in combo_fields): + faf_description_text = ( + description_text + " in non-PAR regions of sex chromosomes only" + ) + else: + faf_description_text = description_text + combo_dict = { + metric_label_dict["faf95"]: { + "Number": "A", + "Description": ( + "Filtering allele frequency (using Poisson 95%" + f" CI){for_combo}{faf_description_text}" + ), + }, + metric_label_dict["faf99"]: { + "Number": "A", + "Description": ( + "Filtering allele frequency (using Poisson 99%" + f" CI){for_combo}{faf_description_text}" + ), + }, + } + else: + combo_dict = { + metric_label_dict["CTT_odds_ratio"]: { + "Number": "A", + 
"Description": ( + "Odds ratio from from Hail's contingency_table_test with" + " `min_cell_count=100` comparing allele frequencies" + f" between exomes and genomes{for_combo}{description_text}" + ), + }, + metric_label_dict["CTT_p_value"]: { + "Number": "A", + "Description": ( + "P-value from Hail's contingency_table_test with" + " `min_cell_count=100` comparing allele frequencies" + f" between exomes and genomes{for_combo}{description_text}" + ), + }, + } + info_dict.update(combo_dict) + if freq_cmh: + cmh_dict = { + f"{prefix}CMH_chisq{suffix}": { + "Number": "A", + "Description": ( + "Chi-squared test statistic from the Cochran-Mantel-Haenszel test" + " comparing allele frequencies between exomes and genomes" + f" stratified by genetic ancestry group{description_text}" + ), + }, + f"{prefix}CMH_p_value{suffix}": { + "Number": "A", + "Description": ( + "Odds ratio from Cochran-Mantel-Haenszel test comparing allele" + " frequencies between exomes and genomes stratified by genetic" + f" ancestry group{description_text}" + ), + }, + } + info_dict.update(cmh_dict) + + return info_dict
+ + +
[docs]def add_as_info_dict( + info_dict: Dict[str, Dict[str, str]] = INFO_DICT, as_fields: List[str] = AS_FIELDS +) -> Dict[str, Dict[str, str]]: + """ + Update info dictionary with allele-specific terms and their descriptions. + + Used in VCF export. + + :param info_dict: Dictionary containing site-level annotations and their descriptions. Default is INFO_DICT. + :param as_fields: List containing allele-specific fields to be added to info_dict. Default is AS_FIELDS. + :return: Dictionary with allele specific annotations, their descriptions, and their VCF number field. + """ + as_dict = {} + for field in as_fields: + try: + # Strip AS_ from field name + site_field = field[3:] + + # Get site description from info dictionary and make first letter lower case + first_letter = info_dict[site_field]["Description"][0].lower() + rest_of_description = info_dict[site_field]["Description"][1:] + + as_dict[field] = {} + as_dict[field]["Number"] = "A" + as_dict[field][ + "Description" + ] = f"Allele-specific {first_letter}{rest_of_description}" + + except KeyError: + logger.warning("%s is not present in input info dictionary!", field) + + return as_dict
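For example, deriving the allele-specific FS description from the site-level entry in INFO_DICT:

as_dict = add_as_info_dict(info_dict=INFO_DICT, as_fields=["AS_FS"])
# {'AS_FS': {'Number': 'A', 'Description': "Allele-specific phred-scaled
#  p-value of Fisher's exact test for strand bias"}}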
+ + +
[docs]def make_vcf_filter_dict( + snp_cutoff: float, + indel_cutoff: float, + inbreeding_cutoff: float, + variant_qc_filter: str = "RF", +) -> Dict[str, str]: + """ + Generate dictionary of Number and Description attributes to be used in the VCF header, specifically for FILTER annotations. + + Generates descriptions for: + - AC0 filter + - InbreedingCoeff filter + - Variant QC filter (RF or AS_VQSR) + - PASS (passed all variant filters) + + :param snp_cutoff: Minimum SNP cutoff score from random forest model. + :param indel_cutoff: Minimum indel cutoff score from random forest model. + :param inbreeding_cutoff: Inbreeding coefficient hard cutoff. + :param variant_qc_filter: Method used for variant QC filter. One of 'RF' or 'AS_VQSR'. Default is 'RF'. + :return: Dictionary keyed by VCF FILTER annotations, where values are Dictionaries of Number and Description attributes. + """ + variant_qc_filter_dict = { + "RF": { + "Description": ( + f"Failed random forest filtering thresholds of {snp_cutoff} for SNPs" + f" and {indel_cutoff} for indels (probabilities of being a true" + " positive variant)" + ) + }, + "AS_VQSR": { + "Description": ( + f"Failed VQSR filtering thresholds of {snp_cutoff} for SNPs and" + f" {indel_cutoff} for indels" + ) + }, + } + + if variant_qc_filter not in variant_qc_filter_dict: + raise ValueError( + f"{variant_qc_filter} is not a valid value for 'variant_qc_filter'. It must" + " be 'RF' or 'AS_VQSR'" + ) + + filter_dict = { + "AC0": { + "Description": ( + "Allele count is zero after filtering out low-confidence genotypes (GQ" + " < 20; DP < 10; and AB < 0.2 for het calls)" + ) + }, + "InbreedingCoeff": { + "Description": f"Inbreeding coefficient < {inbreeding_cutoff}" + }, + "PASS": {"Description": "Passed all variant filters"}, + variant_qc_filter: variant_qc_filter_dict[variant_qc_filter], + } + + return filter_dict
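A usage sketch wiring the result into a VCF header via Hail's export metadata (the cutoff values are placeholders, not gnomAD's production thresholds; `mt` is assumed to be a MatrixTable ready for export):

.. code-block:: python

    filter_dict = make_vcf_filter_dict(
        snp_cutoff=0.91,
        indel_cutoff=0.80,
        inbreeding_cutoff=-0.3,
        variant_qc_filter="RF",
    )
    hl.export_vcf(mt, "gs://my-bucket/out.vcf.bgz", metadata={"filter": filter_dict})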
+ + +
[docs]def make_hist_bin_edges_expr( + ht: hl.Table, + hists: List[str] = HISTS, + ann_with_hists: Optional[str] = None, + prefix: str = "", + label_delimiter: str = "_", + include_age_hists: bool = True, +) -> Dict[str, str]: + """ + Create dictionaries containing variant histogram annotations and their associated bin edges, formatted into a string separated by pipe delimiters. + + :param ht: Table containing histogram variant annotations. + :param hists: List of variant histogram annotations. Default is HISTS. + :param ann_with_hists: Name of row annotation containing histogram data. In exomes or + genomes release HT, `histograms` is a row, but in the joint release HT, it's + under the row of `exomes`, `genomes`, or `joint`. + :param prefix: Prefix text for age histogram bin edges. Default is empty string. + :param label_delimiter: String used as delimiter between prefix and histogram annotation. + :param include_age_hists: Include age histogram annotations. + :return: Dictionary keyed by histogram annotation name, with corresponding + reformatted bin edges for values. + """ + # Add underscore to prefix if it isn't empty + if prefix: + prefix += label_delimiter + + edges_dict = {} + + if include_age_hists: + for call_type in ["het", "hom"]: + if ann_with_hists: + bin_edges = ( + ht.filter( + hl.is_defined( + ht[ann_with_hists] + .histograms.age_hists[f"age_hist_{call_type}"] + .bin_edges + ) + )[ann_with_hists] + .histograms.age_hists[f"age_hist_{call_type}"] + .bin_edges.take(1)[0] + ) + else: + bin_edges = ( + ht.filter( + hl.is_defined( + ht.histograms.age_hists[f"age_hist_{call_type}"].bin_edges + ) + ) + .histograms.age_hists[f"age_hist_{call_type}"] + .bin_edges.take(1)[0] + ) + + if bin_edges: + edges_dict[f"{prefix}{call_type}"] = "|".join( + map(lambda x: f"{x:.1f}", bin_edges) + ) + + for hist in hists: + # Parse hists calculated on both raw and adj-filtered data + for hist_type in [f"{prefix}raw_qual_hists", f"{prefix}qual_hists"]: + hist_name = hist if "raw" not in hist_type else f"{prefix}{hist}_raw" + + if ann_with_hists: + bin_edges = ( + ht.filter( + hl.is_defined( + ht[ann_with_hists].histograms[hist_type][hist].bin_edges + ) + )[ann_with_hists] + .histograms[hist_type][hist] + .bin_edges.take(1)[0] + ) + else: + bin_edges = ( + ht.filter(hl.is_defined(ht.histograms[hist_type][hist].bin_edges)) + .histograms[hist_type][hist] + .bin_edges.take(1)[0] + ) + if bin_edges: + edges_dict[hist_name] = "|".join( + map( + lambda x: f"{x:.2f}" if "ab" in hist else str(int(x)), bin_edges + ) + ) + + return edges_dict
+ + +
[docs]def make_hist_dict( + bin_edges: Dict[str, Dict[str, str]], + adj: bool, + hist_metric_list: List[str] = HISTS, + label_delimiter: str = "_", + drop_n_smaller_larger: bool = False, + prefix: str = "", + suffix: str = "", + description_text: str = "", +) -> Dict[str, str]: + """ + Generate dictionary of Number and Description attributes to be used in the VCF header, specifically for histogram annotations. + + :param bin_edges: Dictionary keyed by histogram annotation name, with corresponding string-reformatted bin edges for values. + :param adj: Whether to create a header dict for raw or adj quality histograms. + :param hist_metric_list: List of hists for which to build hist info dict + :param label_delimiter: String used as delimiter in values stored in hist_metric_list. + :param drop_n_smaller_larger: Whether to drop n_smaller and n_larger annotations from header dict. Default is False. + :param prefix: Prefix text for histogram annotations. Default is empty string. + :param suffix: Suffix text for histogram annotations. Default is empty string. + :param description_text: Optional text to append to the end of descriptions. Needs to start with a space if specified. + :return: Dictionary keyed by VCF INFO annotations, where values are Dictionaries of Number and Description attributes. + """ + if prefix != "": + prefix = f"{prefix}{label_delimiter}" + if suffix != "": + suffix = f"{label_delimiter}{suffix}" + + header_hist_dict = {} + for hist in hist_metric_list: + # Get hists for both raw and adj data + # Add "_raw" to quality histograms calculated on raw data + if not adj: + hist = f"{hist}_raw" + + edges = bin_edges[hist] + hist_fields = hist.split(label_delimiter) + hist_text = hist_fields[0].upper() + + if hist_fields[2] == "alt": + hist_text = hist_text + " in heterozygous individuals" + if adj: + hist_text = hist_text + " calculated on high quality genotypes" + + hist_dict = { + f"{prefix}{hist}_bin_freq{suffix}": { + "Number": "A", + "Description": ( + f"Histogram for {hist_text}{description_text}; bin edges are:" + f" {edges}" + ), + }, + } + # These annotations are frequently zero and are dropped from gnomad + # releases for most histograms. + if not drop_n_smaller_larger: + hist_dict.update( + { + f"{prefix}{hist}_n_smaller{suffix}": { + "Number": "A", + "Description": ( + f"Count of {hist_fields[0].upper()} values falling below" + f" lowest histogram bin edge {hist_text}{description_text}" + ), + }, + f"{prefix}{hist}_n_larger{suffix}": { + "Number": "A", + "Description": ( + f"Count of {hist_fields[0].upper()} values falling above" + f" highest histogram bin edge {hist_text}{description_text}" + ), + }, + } + ) + # Only add n_larger for dp qual histograms. + if "dp" in hist: + hist_dict.update( + { + f"{prefix}{hist}_n_larger{suffix}": { + "Number": "A", + "Description": ( + f"Count of {hist_fields[0].upper()} values falling above" + f" highest histogram bin edge {hist_text}{description_text}" + ), + }, + } + ) + + header_hist_dict.update(hist_dict) + + return header_hist_dict
+ + +
[docs]def set_female_y_metrics_to_na( + t: Union[hl.Table, hl.MatrixTable], +) -> Dict[str, hl.expr.Int32Expression]: + """ + Set AC, AN, and nhomalt chrY variant annotations for females to NA (instead of 0). + + :param t: Table/MatrixTable containing female variant annotations. + :return: Dictionary with reset annotations + """ + metrics = list(t.row.info) + female_metrics = [x for x in metrics if "_female" in x or "_XX" in x] + + female_metrics_dict = {} + for metric in female_metrics: + female_metrics_dict.update( + { + f"{metric}": hl.or_missing( + (~t.locus.in_y_nonpar() & ~t.locus.in_y_par()), + t.info[f"{metric}"], + ) + } + ) + return female_metrics_dict
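Because this function returns a dictionary of expressions rather than modifying its input, it is applied by re-annotating `info` (a sketch; `ht` is assumed to be a release Table with XX- or female-suffixed metrics under `info`):

.. code-block:: python

    ht = ht.annotate(info=ht.info.annotate(**set_female_y_metrics_to_na(ht)))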
+ + +
[docs]def build_vcf_export_reference( + name: str, + build: str = "GRCh38", + keep_contigs: List[str] = [f"chr{i}" for i in range(1, 23)] + ["chrX", "chrY"], + keep_chrM: bool = True, +) -> hl.ReferenceGenome: + """ + Create export reference based on reference genome defined by `build`. + + By default this will return a new reference with all non-standard contigs eliminated. Keeps chr 1-22, X, Y, and M. + + An example of a non-standard contig is: ##contig=<ID=chr3_GL000221v1_random,length=155397,assembly=GRCh38> + + :param name: Name to use for new reference. + :param build: Reference genome build to use as starting reference genome. + :param keep_contigs: Contigs to keep from reference genome defined by `build`. Default is autosomes and sex chromosomes. + :param keep_chrM: Whether to keep chrM. Default is True. + :return: Reference genome for VCF export containing only contigs in `keep_contigs`. + """ + ref = hl.get_reference(build) + ref_args = {} + + # Copy the list to avoid mutating the shared mutable default argument across calls. + keep_contigs = list(keep_contigs) + + if keep_chrM: + keep_contigs.extend(ref.mt_contigs) + ref_args.update({"mt_contigs": ref.mt_contigs}) + + ref_args.update( + { + "name": name, + "contigs": keep_contigs, + "lengths": {contig: ref.lengths[contig] for contig in keep_contigs}, + "x_contigs": ref.x_contigs, + "y_contigs": ref.y_contigs, + "par": [ + (interval.start.contig, interval.start.position, interval.end.position) + for interval in ref.par + ], + } + ) + + export_reference = hl.ReferenceGenome(**ref_args) + + return export_reference
+ + +
[docs]def rekey_new_reference( + t: Union[hl.Table, hl.MatrixTable], reference: hl.ReferenceGenome +) -> Union[hl.Table, hl.MatrixTable]: + """ + Re-key Table or MatrixTable with a new reference genome. + + :param t: Input Table/MatrixTable. + :param reference: Reference genome to re-key with. + :return: Re-keyed Table/MatrixTable + """ + t = t.rename({"locus": "locus_original"}) + locus_expr = hl.locus( + t.locus_original.contig, + t.locus_original.position, + reference_genome=reference, + ) + + if isinstance(t, hl.MatrixTable): + t = t.annotate_rows(locus=locus_expr) + t = t.key_rows_by("locus", "alleles").drop("locus_original") + else: + t = t.annotate(locus=locus_expr) + t = t.key_by("locus", "alleles").drop("locus_original") + + return t
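`build_vcf_export_reference` and `rekey_new_reference` can be used together: build a trimmed export reference, then re-key the Table onto it before export (a sketch; the reference name is arbitrary):

.. code-block:: python

    export_ref = build_vcf_export_reference("gnomAD_GRCh38", keep_chrM=True)
    ht = rekey_new_reference(ht, export_ref)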
\ No newline at end of file diff --git a/_modules/gnomad/utils/vep.html b/_modules/gnomad/utils/vep.html new file mode 100644 index 000000000..c9a9b46ed --- /dev/null +++ b/_modules/gnomad/utils/vep.html @@ -0,0 +1,979 @@ gnomad.utils.vep — gnomad master documentation

Source code for gnomad.utils.vep

+# noqa: D100
+
+import json
+import logging
+import os
+import subprocess
+from typing import Callable, List, Optional, Union
+
+import hail as hl
+
+from gnomad.resources.resource_utils import VersionedTableResource
+from gnomad.utils.filtering import combine_functions
+
+logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s")
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+VEP_VERSIONS = ["101", "105"]
+CURRENT_VEP_VERSION = VEP_VERSIONS[-1]
+"""
+Versions of VEP used in gnomAD data; the latest version is 105.
+"""
+
+# Note that this list is current as of VEP v81, with some terms included for
+# backwards compatibility (VEP <= 75).
+CSQ_CODING_HIGH_IMPACT = [
+    "transcript_ablation",
+    "splice_acceptor_variant",
+    "splice_donor_variant",
+    "stop_gained",
+    "frameshift_variant",
+    "stop_lost",
+]
+
+CSQ_CODING_MEDIUM_IMPACT = [
+    "start_lost",  # new in v81
+    "initiator_codon_variant",  # deprecated
+    "transcript_amplification",
+    "inframe_insertion",
+    "inframe_deletion",
+    "missense_variant",
+    "protein_altering_variant",  # new in v79
+    "splice_region_variant",
+]
+
+CSQ_CODING_LOW_IMPACT = [
+    "incomplete_terminal_codon_variant",
+    "start_retained_variant",  # new in v92
+    "stop_retained_variant",
+    "synonymous_variant",
+    "coding_sequence_variant",
+]
+
+CSQ_NON_CODING = [
+    "mature_miRNA_variant",
+    "5_prime_UTR_variant",
+    "3_prime_UTR_variant",
+    "non_coding_transcript_exon_variant",
+    "non_coding_exon_variant",  # deprecated
+    "intron_variant",
+    "NMD_transcript_variant",
+    "non_coding_transcript_variant",
+    "nc_transcript_variant",  # deprecated
+    "upstream_gene_variant",
+    "downstream_gene_variant",
+    "TFBS_ablation",
+    "TFBS_amplification",
+    "TF_binding_site_variant",
+    "regulatory_region_ablation",
+    "regulatory_region_amplification",
+    "feature_elongation",
+    "regulatory_region_variant",
+    "feature_truncation",
+    "intergenic_variant",
+]
+
+CSQ_ORDER = (
+    CSQ_CODING_HIGH_IMPACT
+    + CSQ_CODING_MEDIUM_IMPACT
+    + CSQ_CODING_LOW_IMPACT
+    + CSQ_NON_CODING
+)
+
+CSQ_CODING = CSQ_CODING_HIGH_IMPACT + CSQ_CODING_MEDIUM_IMPACT + CSQ_CODING_LOW_IMPACT
+"""
+Constant containing all coding consequences.
+"""
+
+CSQ_SPLICE = [
+    "splice_acceptor_variant",
+    "splice_donor_variant",
+    "splice_region_variant",
+]
+"""
+Constant containing all splice consequences.
+"""
+
+POSSIBLE_REFS = ("GRCh37", "GRCh38")
+"""
+Constant containing supported references.
+"""
+
+VEP_CONFIG_PATH = "file:///vep_data/vep-gcloud.json"
+"""
+Constant that contains the local path to the VEP config file.
+"""
+
+VEP_CSQ_FIELDS = {
+    "101": "Allele|Consequence|IMPACT|SYMBOL|Gene|Feature_type|Feature|BIOTYPE|EXON|INTRON|HGVSc|HGVSp|cDNA_position|CDS_position|Protein_position|Amino_acids|Codons|ALLELE_NUM|DISTANCE|STRAND|VARIANT_CLASS|MINIMISED|SYMBOL_SOURCE|HGNC_ID|CANONICAL|TSL|APPRIS|CCDS|ENSP|SWISSPROT|TREMBL|UNIPARC|GENE_PHENO|SIFT|PolyPhen|DOMAINS|HGVS_OFFSET|MOTIF_NAME|MOTIF_POS|HIGH_INF_POS|MOTIF_SCORE_CHANGE|LoF|LoF_filter|LoF_flags|LoF_info",
+    "105": "Allele|Consequence|IMPACT|SYMBOL|Gene|Feature_type|Feature|BIOTYPE|EXON|INTRON|HGVSc|HGVSp|cDNA_position|CDS_position|Protein_position|Amino_acids|Codons|ALLELE_NUM|DISTANCE|STRAND|FLAGS|VARIANT_CLASS|SYMBOL_SOURCE|HGNC_ID|CANONICAL|MANE_SELECT|MANE_PLUS_CLINICAL|TSL|APPRIS|CCDS|ENSP|UNIPROT_ISOFORM|SOURCE|SIFT|PolyPhen|DOMAINS|miRNA|HGVS_OFFSET|PUBMED|MOTIF_NAME|MOTIF_POS|HIGH_INF_POS|MOTIF_SCORE_CHANGE|TRANSCRIPTION_FACTORS|LoF|LoF_filter|LoF_flags|LoF_info",
+}
+"""
+Constant that defines the order of VEP annotations used in VCF export, currently stored in a dictionary with the VEP version as the key.
+"""
+
+VEP_CSQ_HEADER = (
+    "Consequence annotations from Ensembl VEP. Format:"
+    f" {VEP_CSQ_FIELDS[CURRENT_VEP_VERSION]}"
+)
+"""
+Constant that contains description for VEP used in VCF export.
+"""
+
+LOFTEE_LABELS = ["HC", "LC", "OS"]
+"""
+Constant that contains annotations added by LOFTEE.
+"""
+
+LOF_CSQ_SET = {
+    "splice_acceptor_variant",
+    "splice_donor_variant",
+    "stop_gained",
+    "frameshift_variant",
+}
+"""
+Set containing loss-of-function consequence strings.
+"""
+
+
+
[docs]def get_vep_help(vep_config_path: Optional[str] = None): + """ + Return the output of vep --help which includes the VEP version. + + .. warning:: + If no `vep_config_path` is supplied, this function will only work for Dataproc clusters + created with `hailctl dataproc start --vep`. It assumes that the command is `/path/to/vep`. + + :param vep_config_path: Optional path to use as the VEP config file. If None, `VEP_CONFIG_URI` environment variable is used + :return: VEP help string + """ + if vep_config_path is None: + vep_config_path = os.environ["VEP_CONFIG_URI"] + + with hl.hadoop_open(vep_config_path) as vep_config_file: + vep_config = json.load(vep_config_file) + vep_command = vep_config["command"] + vep_help = subprocess.check_output([vep_command[0]]).decode("utf-8") + return vep_help
+ + +
[docs]def get_vep_context(ref: Optional[str] = None) -> VersionedTableResource: + """ + Get VEP context resource for the genome build `ref`. + + :param ref: Genome build. If None, `hl.default_reference` is used + :return: VEPed context resource + """ + import gnomad.resources.grch37.reference_data as grch37 + import gnomad.resources.grch38.reference_data as grch38 + + if ref is None: + ref = hl.default_reference().name + + if ref not in POSSIBLE_REFS: + raise ValueError( + f'get_vep_context passed {ref}. Expected one of {", ".join(POSSIBLE_REFS)}' + ) + + vep_context = grch37.vep_context if ref == "GRCh37" else grch38.vep_context + return vep_context
+ + +
[docs]def vep_or_lookup_vep( + ht, reference_vep_ht=None, reference=None, vep_config_path=None, vep_version=None +): + """ + VEP a table, or lookup variants in a reference database. + + .. warning:: + If `reference_vep_ht` is supplied, no check is performed to confirm `reference_vep_ht` was + generated with the same version of VEP / VEP configuration as the VEP referenced in `vep_config_path`. + + :param ht: Input Table + :param reference_vep_ht: A reference database with VEP annotations (must be in top-level `vep`) + :param reference: If reference_vep_ht is not specified, find a suitable one in reference (if None, grabs from hl.default_reference) + :param vep_config_path: vep_config to pass to hl.vep (if None, a suitable one for `reference` is chosen) + :param vep_version: Version of VEPed context Table to use (if None, the default `vep_context` resource will be used) + :return: VEPed Table + """ + if reference is None: + reference = hl.default_reference().name + + if vep_config_path is None: + vep_config_path = VEP_CONFIG_PATH + + vep_help = get_vep_help(vep_config_path) + + with hl.hadoop_open(vep_config_path) as vep_config_file: + vep_config = vep_config_file.read() + + if reference_vep_ht is None: + if reference not in POSSIBLE_REFS: + raise ValueError( + f"vep_or_lookup_vep got {reference}. Expected one of" + f" {', '.join(POSSIBLE_REFS)}" + ) + + vep_context = get_vep_context(reference) + if vep_version is None: + vep_version = vep_context.default_version + + if vep_version not in vep_context.versions: + logger.warning( + "No VEPed context Table available for genome build %s and VEP" + " version %s, all variants will be VEPed using the following" + " VEP:\n%s", + reference, + vep_version, + vep_help, + ) + return hl.vep(ht, vep_config_path) + + logger.info( + "Using VEPed context Table from genome build %s and VEP version %s", + reference, + vep_version, + ) + + reference_vep_ht = vep_context.versions[vep_version].ht() + vep_context_help = hl.eval(reference_vep_ht.vep_help) + vep_context_config = hl.eval(reference_vep_ht.vep_config) + + assert vep_help == vep_context_help, ( + "The VEP context HT version does not match the version referenced in the" + f" VEP config file.\nVEP context:\n{vep_context_help}\n\n VEP" + f" config:\n{vep_help}" + ) + + assert vep_config == vep_context_config, ( + "The VEP context HT configuration does not match the configuration in" + f" {vep_config_path}.\nVEP context:\n{vep_context_config}\n\n Current" + f" config:\n{vep_config}" + ) + + ht = ht.annotate(vep=reference_vep_ht[ht.key].vep) + + vep_ht = ht.filter(hl.is_defined(ht.vep)) + revep_ht = ht.filter(hl.is_missing(ht.vep)) + revep_ht = hl.vep(revep_ht, vep_config_path) + if "vep_proc_id" in list(revep_ht.row): + revep_ht = revep_ht.drop("vep_proc_id") + if "vep_proc_id" in list(vep_ht.row): + vep_ht = vep_ht.drop("vep_proc_id") + + vep_ht = vep_ht.annotate_globals( + vep_version=f"v{vep_version}", vep_help=vep_help, vep_config=vep_config + ) + + return vep_ht.union(revep_ht)
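A minimal usage sketch (assuming a variant-keyed Table `ht` and, per the warning in `get_vep_help`, a cluster configured for VEP):

.. code-block:: python

    ht = vep_or_lookup_vep(ht, reference="GRCh38")
    # Variants present in the vep_context resource reuse its annotations;
    # only the remaining variants are passed to hl.vep.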
+ + +
[docs]def add_most_severe_consequence_to_consequence( + tc: hl.expr.StructExpression, +) -> hl.expr.StructExpression: + """ + Add most_severe_consequence annotation to transcript consequences. + + This is for a given transcript, as there are often multiple annotations for a single transcript: + e.g. splice_region_variant&intron_variant -> splice_region_variant + """ + csqs = hl.literal(CSQ_ORDER) + + return tc.annotate( + most_severe_consequence=csqs.find(lambda c: tc.consequence_terms.contains(c)) + )
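The core of this helper is an ordered `find` over `CSQ_ORDER`; a toy evaluation of that idea (illustrative only, not library code):

.. code-block:: python

    terms = hl.literal(["intron_variant", "splice_region_variant"])
    hl.eval(hl.literal(CSQ_ORDER).find(lambda c: terms.contains(c)))
    # 'splice_region_variant' -- the higher-impact of the two terms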
+ + +
[docs]def process_consequences( + mt: Union[hl.MatrixTable, hl.Table], + vep_root: str = "vep", + penalize_flags: bool = True, + csq_order: Optional[List[str]] = None, + has_polyphen: bool = True, +) -> Union[hl.MatrixTable, hl.Table]: + """ + Add most_severe_consequence into [vep_root].transcript_consequences, and worst_csq_by_gene, any_lof into [vep_root]. + + `most_severe_consequence` is the worst consequence for a transcript. + + .. note:: + From gnomAD v4.0 on, the PolyPhen annotation was removed from the VEP Struct + in the release HTs. When using this function with gnomAD v4.0 or later, + set `has_polyphen` to False. + + :param mt: Input Table or MatrixTable. + :param vep_root: Root for VEP annotation (probably "vep"). + :param penalize_flags: Whether to penalize LOFTEE flagged variants, or treat them + as equal to HC. + :param csq_order: Optional list indicating the order of VEP consequences, sorted + from high to low impact. Default is None, which uses the value of the + `CSQ_ORDER` global. + :param has_polyphen: Whether the input VEP Struct has a PolyPhen annotation which + will be used to modify the consequence score. Default is True. + :return: MT with better formatted consequences. + """ + if csq_order is None: + csq_order = CSQ_ORDER + csqs = hl.literal(csq_order) + csq_dict = hl.literal(dict(zip(csq_order, range(len(csq_order))))) + + def _csq_score(tc: hl.expr.StructExpression) -> int: + return csq_dict[tc.most_severe_consequence] + + flag_score = 500 + no_flag_score = flag_score * (1 + penalize_flags) + + def _csq_score_modifier(tc: hl.expr.StructExpression) -> float: + modifier = _csq_score(tc) + flag_condition = (tc.lof == "HC") & (tc.lof_flags != "") + modifier -= hl.if_else(flag_condition, flag_score, no_flag_score) + modifier -= hl.if_else(tc.lof == "OS", 20, 0) + modifier -= hl.if_else(tc.lof == "LC", 10, 0) + if has_polyphen: + modifier -= ( + hl.case() + .when(tc.polyphen_prediction == "probably_damaging", 0.5) + .when(tc.polyphen_prediction == "possibly_damaging", 0.25) + .when(tc.polyphen_prediction == "benign", 0.1) + .default(0) + ) + return modifier + + def find_worst_transcript_consequence( + tcl: hl.expr.ArrayExpression, + ) -> hl.expr.StructExpression: + tcl = tcl.map( + lambda tc: tc.annotate(csq_score=_csq_score(tc) - _csq_score_modifier(tc)) + ) + return hl.or_missing(hl.len(tcl) > 0, hl.sorted(tcl, lambda x: x.csq_score)[0]) + + transcript_csqs = mt[vep_root].transcript_consequences.map( + add_most_severe_consequence_to_consequence + ) + + gene_dict = transcript_csqs.group_by(lambda tc: tc.gene_symbol) + worst_csq_gene = gene_dict.map_values(find_worst_transcript_consequence).values() + sorted_scores = hl.sorted(worst_csq_gene, key=lambda tc: tc.csq_score) + + canonical = transcript_csqs.filter(lambda csq: csq.canonical == 1) + gene_canonical_dict = canonical.group_by(lambda tc: tc.gene_symbol) + worst_csq_gene_canonical = gene_canonical_dict.map_values( + find_worst_transcript_consequence + ).values() + sorted_canonical_scores = hl.sorted( + worst_csq_gene_canonical, key=lambda tc: tc.csq_score + ) + + vep_data = mt[vep_root].annotate( + transcript_consequences=transcript_csqs, + worst_consequence_term=csqs.find( + lambda c: transcript_csqs.map( + lambda csq: csq.most_severe_consequence + ).contains(c) + ), + worst_csq_by_gene=sorted_scores, + worst_csq_for_variant=hl.or_missing( + hl.len(sorted_scores) > 0, sorted_scores[0] + ), + worst_csq_by_gene_canonical=sorted_canonical_scores, + worst_csq_for_variant_canonical=hl.or_missing( + 
hl.len(sorted_canonical_scores) > 0, sorted_canonical_scores[0] + ), + ) + + return ( + mt.annotate_rows(**{vep_root: vep_data}) + if isinstance(mt, hl.MatrixTable) + else mt.annotate(**{vep_root: vep_data}) + )
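A usage sketch (per the note above, set `has_polyphen=False` for gnomAD v4.0+ release Tables; `ht` is assumed to be VEP-annotated):

.. code-block:: python

    ht = process_consequences(ht, vep_root="vep", has_polyphen=False)
    worst = ht.vep.worst_csq_for_variant
    # worst.most_severe_consequence, worst.lof, worst.csq_score, etc.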
+ + +
[docs]def filter_vep_to_canonical_transcripts( + mt: Union[hl.MatrixTable, hl.Table], + vep_root: str = "vep", + filter_empty_csq: bool = False, +) -> Union[hl.MatrixTable, hl.Table]: + """ + Filter VEP transcript consequences to those in the canonical transcript. + + :param mt: Input Table or MatrixTable. + :param vep_root: Name used for VEP annotation. Default is 'vep'. + :param filter_empty_csq: Whether to filter out rows where 'transcript_consequences' is empty. Default is False. + :return: Table or MatrixTable with VEP transcript consequences filtered. + """ + return filter_vep_transcript_csqs( + mt, vep_root, synonymous=False, filter_empty_csq=filter_empty_csq + )
+ + +
[docs]def filter_vep_to_mane_select_transcripts( + mt: Union[hl.MatrixTable, hl.Table], + vep_root: str = "vep", + filter_empty_csq: bool = False, +) -> Union[hl.MatrixTable, hl.Table]: + """ + Filter VEP transcript consequences to those in the MANE Select transcript. + + :param mt: Input Table or MatrixTable. + :param vep_root: Name used for VEP annotation. Default is 'vep'. + :param filter_empty_csq: Whether to filter out rows where 'transcript_consequences' is empty. Default is False. + :return: Table or MatrixTable with VEP transcript consequences filtered. + """ + return filter_vep_transcript_csqs( + mt, + vep_root, + synonymous=False, + canonical=False, + mane_select=True, + filter_empty_csq=filter_empty_csq, + )
+ + +
[docs]def filter_vep_to_synonymous_variants( + mt: Union[hl.MatrixTable, hl.Table], + vep_root: str = "vep", + filter_empty_csq: bool = False, +) -> Union[hl.MatrixTable, hl.Table]: + """ + Filter VEP transcript consequences to those with a most severe consequence of 'synonymous_variant'. + + :param mt: Input Table or MatrixTable. + :param vep_root: Name used for VEP annotation. Default is 'vep'. + :param filter_empty_csq: Whether to filter out rows where 'transcript_consequences' is empty. Default is False. + :return: Table or MatrixTable with VEP transcript consequences filtered. + """ + return filter_vep_transcript_csqs( + mt, vep_root, canonical=False, filter_empty_csq=filter_empty_csq + )
+ + +
[docs]def filter_vep_to_gene_list( + t: Union[hl.MatrixTable, hl.Table], + genes: List[str], + match_by_gene_symbol: bool = False, + vep_root: str = "vep", + filter_empty_csq: bool = False, +): + """ + Filter VEP transcript consequences to those in a set of genes. + + .. note:: + + Filtering to a list of genes by their 'gene_id' or 'gene_symbol' will filter to + all variants that are annotated to the gene, including + ['upstream_gene_variant', 'downstream_gene_variant'], which will not be the + same as if you filter to a gene interval. If you only want variants inside + certain gene boundaries and a faster filter, you can first filter `t` to an + interval list and then apply this filter. + + :param t: Input Table or MatrixTable. + :param genes: Genes of interest to filter VEP transcript consequences to. + :param match_by_gene_symbol: Whether to match values in `genes` to VEP transcript + consequences by 'gene_symbol' instead of 'gene_id'. Default is False. + :param vep_root: Name used for VEP annotation. Default is 'vep'. + :param filter_empty_csq: Whether to filter out rows where 'transcript_consequences' + is empty. Default is False. + :return: Table or MatrixTable with VEP transcript consequences filtered. + """ + return filter_vep_transcript_csqs( + t, + vep_root, + synonymous=False, + canonical=False, + filter_empty_csq=filter_empty_csq, + genes=genes, + match_by_gene_symbol=match_by_gene_symbol, + )
+ + +
[docs]def vep_struct_to_csq( + vep_expr: hl.expr.StructExpression, + csq_fields: str = VEP_CSQ_FIELDS[CURRENT_VEP_VERSION], + has_polyphen_sift: bool = True, +) -> hl.expr.ArrayExpression: + """ + Given a VEP Struct, return an array of VEP VCF CSQ strings (one per consequence in the struct). + + The fields and their order will correspond to those passed in `csq_fields`, which corresponds to the + VCF header that is required to interpret the VCF CSQ INFO field. + + Note that the order is flexible and that all fields that are in the default value are supported. + These fields will be formatted in the same way that their VEP CSQ counterparts are. + + Other fields can be added if their names are the same as those in the struct; their values will be the result of calling + hl.str(), so they may differ from their usual VEP CSQ representation. + + :param vep_expr: The input VEP Struct + :param csq_fields: The | delimited list of fields to include in the CSQ (in that order), default is the CSQ fields of the CURRENT_VEP_VERSION. + :param has_polyphen_sift: Whether the input VEP Struct has PolyPhen and SIFT annotations. Default is True. + :return: The corresponding CSQ strings + """ + _csq_fields = [f.lower() for f in csq_fields.split("|")] + + def get_csq_from_struct( + element: hl.expr.StructExpression, feature_type: str + ) -> hl.expr.StringExpression: + # Most fields are 1-1, just lowercase + fields = dict(element) + + # Add general exceptions + fields.update( + { + "allele": element.variant_allele, + "consequence": hl.delimit(element.consequence_terms, delimiter="&"), + "feature_type": feature_type, + "feature": ( + element.transcript_id + if "transcript_id" in element + else ( + element.regulatory_feature_id + if "regulatory_feature_id" in element + else ( + element.motif_feature_id + if "motif_feature_id" in element + else "" + ) + ) + ), + "variant_class": vep_expr.variant_class, + } + ) + + # Add exception for transcripts + if feature_type == "Transcript": + transcript_dict = { + "canonical": hl.if_else(element.canonical == 1, "YES", ""), + "ensp": element.protein_id, + "gene": element.gene_id, + "symbol": element.gene_symbol, + "symbol_source": element.gene_symbol_source, + "cdna_position": hl.str(element.cdna_start) + + hl.if_else( + element.cdna_start == element.cdna_end, + "", + "-" + hl.str(element.cdna_end), + ), + "cds_position": hl.str(element.cds_start) + + hl.if_else( + element.cds_start == element.cds_end, + "", + "-" + hl.str(element.cds_end), + ), + "mirna": hl.delimit(element.mirna, "&") if "mirna" in element else None, + "protein_position": hl.str(element.protein_start) + + hl.if_else( + element.protein_start == element.protein_end, + "", + "-" + hl.str(element.protein_end), + ), + "uniprot_isoform": ( + hl.delimit(element.uniprot_isoform, "&") + if "uniprot_isoform" in element + else None + ), + } + # Retain transcript dict updates only for fields that exist in the csq + # fields.
+ transcript_dict = { + k: v + for k, v in transcript_dict.items() + if k in [x.lower() for x in csq_fields.split("|")] + } + fields.update(transcript_dict) + + if has_polyphen_sift: + fields.update( + { + "sift": ( + element.sift_prediction + + "(" + + hl.format("%.3f", element.sift_score) + + ")" + ), + "polyphen": ( + element.polyphen_prediction + + "(" + + hl.format("%.3f", element.polyphen_score) + + ")" + ), + } + ) + fields.update( + { + "domains": hl.delimit( + element.domains.map(lambda d: d.db + ":" + d.name), "&" + ), + } + ) + elif feature_type == "MotifFeature": + fields["motif_score_change"] = hl.format("%.3f", element.motif_score_change) + if "transcription_factors" in element: + fields["transcription_factors"] = hl.delimit( + element.transcription_factors, "&" + ) + + return hl.delimit( + [hl.or_else(hl.str(fields.get(f, "")), "") for f in _csq_fields], "|" + ) + + csq = hl.empty_array(hl.tstr) + for feature_field, feature_type in [ + ("transcript_consequences", "Transcript"), + ("regulatory_feature_consequences", "RegulatoryFeature"), + ("motif_feature_consequences", "MotifFeature"), + ("intergenic_consequences", "Intergenic"), + ]: + csq = csq.extend( + hl.or_else( + vep_expr[feature_field].map( + lambda x: get_csq_from_struct(x, feature_type=feature_type) + ), + hl.empty_array(hl.tstr), + ) + ) + + return hl.or_missing(hl.len(csq) > 0, csq)
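A sketch of typical export wiring: collapse the VEP struct to CSQ strings and document the field order in the header with `VEP_CSQ_HEADER` (the output path and `info` layout are placeholders):

.. code-block:: python

    ht = ht.annotate(info=ht.info.annotate(vep=vep_struct_to_csq(ht.vep)))
    header = {
        "info": {"vep": {"Description": VEP_CSQ_HEADER, "Number": ".", "Type": "String"}}
    }
    hl.export_vcf(ht, "gs://my-bucket/out.vcf.bgz", metadata=header)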
+ + +
[docs]def get_most_severe_consequence_for_summary( + ht: hl.Table, + csq_order: List[str] = CSQ_ORDER, + loftee_labels: List[str] = LOFTEE_LABELS, +) -> hl.Table: + """ + Prepare a hail Table for summary statistics generation. + + Adds the following annotations: + - most_severe_csq: Most severe consequence for variant + - protein_coding: Whether the variant is present on a protein-coding transcript + - lof: Whether the variant is a loss-of-function variant + - no_lof_flags: Whether the variant has any LOFTEE flags (True if no flags) + + Assumes input Table is annotated with VEP and that VEP annotations have been filtered to canonical transcripts. + + :param ht: Input Table. + :param csq_order: Order of VEP consequences, sorted from high to low impact. Default is CSQ_ORDER. + :param loftee_labels: Annotations added by LOFTEE. Default is LOFTEE_LABELS. + :return: Table annotated with VEP summary annotations. + """ + + def _get_most_severe_csq( + csq_list: hl.expr.ArrayExpression, protein_coding: bool + ) -> hl.expr.StructExpression: + """ + Process VEP consequences to generate summary annotations. + + :param csq_list: VEP consequences list to be processed. + :param protein_coding: Whether variant is in a protein-coding transcript. + :return: Struct containing summary annotations. + """ + lof = hl.null(hl.tstr) + no_lof_flags = hl.null(hl.tbool) + if protein_coding: + all_lofs = csq_list.map(lambda x: x.lof) + lof = hl.literal(loftee_labels).find(lambda x: all_lofs.contains(x)) + csq_list = hl.if_else( + hl.is_defined(lof), csq_list.filter(lambda x: x.lof == lof), csq_list + ) + no_lof_flags = hl.or_missing( + hl.is_defined(lof), + csq_list.any(lambda x: (x.lof == lof) & hl.is_missing(x.lof_flags)), + ) + all_csq_terms = csq_list.flatmap(lambda x: x.consequence_terms) + most_severe_csq = hl.literal(csq_order).find( + lambda x: all_csq_terms.contains(x) + ) + return hl.struct( + most_severe_csq=most_severe_csq, + protein_coding=protein_coding, + lof=lof, + no_lof_flags=no_lof_flags, + ) + + protein_coding = ht.vep.transcript_consequences.filter( + lambda x: x.biotype == "protein_coding" + ) + return ht.annotate( + **hl.case(missing_false=True) + .when(hl.len(protein_coding) > 0, _get_most_severe_csq(protein_coding, True)) + .when( + hl.len(ht.vep.transcript_consequences) > 0, + _get_most_severe_csq(ht.vep.transcript_consequences, False), + ) + .when( + hl.len(ht.vep.regulatory_feature_consequences) > 0, + _get_most_severe_csq(ht.vep.regulatory_feature_consequences, False), + ) + .when( + hl.len(ht.vep.motif_feature_consequences) > 0, + _get_most_severe_csq(ht.vep.motif_feature_consequences, False), + ) + .default(_get_most_severe_csq(ht.vep.intergenic_consequences, False)) + )
+ + +
[docs]def filter_vep_transcript_csqs( + t: Union[hl.Table, hl.MatrixTable], + vep_root: str = "vep", + synonymous: bool = True, + canonical: bool = True, + mane_select: bool = False, + filter_empty_csq: bool = True, + ensembl_only: bool = True, + protein_coding: bool = False, + csqs: List[str] = None, + keep_csqs: bool = True, + genes: Optional[List[str]] = None, + keep_genes: bool = True, + match_by_gene_symbol: bool = False, + additional_filtering_criteria: Optional[List[Callable]] = None, +) -> Union[hl.Table, hl.MatrixTable]: + """ + Filter VEP transcript consequences based on specified criteria, and optionally filter to variants where 'transcript_consequences' is not empty after filtering. + + Transcript consequences can be filtered to those where 'most_severe_consequence' is + 'synonymous_variant' and/or the transcript is the canonical transcript, if the + `synonymous` and `canonical` parameters are set to True, respectively. + + If the `filter_empty_csq` parameter is set to True, the Table/MatrixTable is filtered + to variants where 'transcript_consequences' within the VEP annotation is not empty + after the specified filtering criteria are applied. + + :param t: Input Table or MatrixTable. + :param vep_root: Name used for VEP annotation. Default is 'vep'. + :param synonymous: Whether to filter to variants where the most severe consequence + is 'synonymous_variant'. Default is True. + :param canonical: Whether to filter to only canonical transcripts. Default is True. + :param mane_select: Whether to filter to only MANE Select transcripts. Default is + False. + :param filter_empty_csq: Whether to filter out rows where 'transcript_consequences' + is empty, after filtering 'transcript_consequences' to the specified criteria. + Default is True. + :param ensembl_only: Whether to filter to only Ensembl transcripts. This option is + useful for deduplicating transcripts that are the same between RefSeq and + Ensembl. Default is True. + :param protein_coding: Whether to filter to only protein-coding transcripts. + Default is False. + :param csqs: Optional list of consequence terms to filter to. Transcript + consequences are filtered to those where 'most_severe_consequence' is in the + list of consequence terms `csqs`. Default is None. + :param keep_csqs: Whether to keep transcript consequences that are in `csqs`. If + set to False, transcript consequences that are in `csqs` will be removed. + Default is True. + :param genes: Optional list of genes to filter VEP transcript consequences to. + Default is None. + :param keep_genes: Whether to keep transcript consequences that are in `genes`. If + set to False, transcript consequences that are in `genes` will be removed. + Default is True. + :param match_by_gene_symbol: Whether to match values in `genes` to VEP transcript + consequences by 'gene_symbol' instead of 'gene_id'. Default is False. + :param additional_filtering_criteria: Optional list of additional filtering + criteria to apply to the VEP transcript consequences. + :return: Table or MatrixTable filtered to specified criteria.
+ """ + if not synonymous and not (canonical or mane_select) and not filter_empty_csq: + logger.warning("No changes have been made to input Table/MatrixTable!") + return t + + transcript_csqs = t[vep_root].transcript_consequences + criteria = [lambda csq: True] + if synonymous: + logger.info("Filtering to most severe consequence of synonymous_variant...") + csqs = ["synonymous_variant"] + if csqs is not None: + csqs = hl.literal(csqs) + if keep_csqs: + criteria.append(lambda csq: csqs.contains(csq.most_severe_consequence)) + else: + criteria.append(lambda csq: ~csqs.contains(csq.most_severe_consequence)) + if canonical: + logger.info("Filtering to canonical transcripts") + criteria.append(lambda csq: csq.canonical == 1) + if mane_select: + logger.info("Filtering to MANE Select transcripts...") + criteria.append(lambda csq: hl.is_defined(csq.mane_select)) + if ensembl_only: + logger.info("Filtering to Ensembl transcripts...") + criteria.append(lambda csq: csq.transcript_id.startswith("ENST")) + if protein_coding: + logger.info("Filtering to protein coding transcripts...") + criteria.append(lambda csq: csq.biotype == "protein_coding") + if genes is not None: + logger.info("Filtering to genes of interest...") + genes = hl.literal(genes) + gene_field = "gene_symbol" if match_by_gene_symbol else "gene_id" + if keep_genes: + criteria.append(lambda csq: genes.contains(csq[gene_field])) + else: + criteria.append(lambda csq: ~genes.contains(csq[gene_field])) + if additional_filtering_criteria is not None: + logger.info("Filtering to variants with additional criteria...") + criteria = criteria + additional_filtering_criteria + + transcript_csqs = transcript_csqs.filter(lambda x: combine_functions(criteria, x)) + is_mt = isinstance(t, hl.MatrixTable) + vep_data = {vep_root: t[vep_root].annotate(transcript_consequences=transcript_csqs)} + t = t.annotate_rows(**vep_data) if is_mt else t.annotate(**vep_data) + + if filter_empty_csq: + transcript_csq_expr = t[vep_root].transcript_consequences + filter_expr = hl.is_defined(transcript_csq_expr) & ( + hl.len(transcript_csq_expr) > 0 + ) + t = t.filter_rows(filter_expr) if is_mt else t.filter(filter_expr) + + return t
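A usage sketch restricting to canonical, protein-coding missense consequences in a gene of interest (the gene symbol is a placeholder):

.. code-block:: python

    ht = filter_vep_transcript_csqs(
        ht,
        synonymous=False,
        canonical=True,
        protein_coding=True,
        csqs=["missense_variant"],
        genes=["PCSK9"],
        match_by_gene_symbol=True,
    )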
+ + +
[docs]def add_most_severe_csq_to_tc_within_vep_root( + t: Union[hl.Table, hl.MatrixTable], vep_root: str = "vep" +) -> Union[hl.Table, hl.MatrixTable]: + """ + Add most_severe_consequence annotation to 'transcript_consequences' within the vep root annotation. + + :param t: Input Table or MatrixTable. + :param vep_root: Root for vep annotation (probably vep). + :return: Table or MatrixTable with most_severe_consequence annotation added. + """ + annotation = t[vep_root].annotate( + transcript_consequences=t[vep_root].transcript_consequences.map( + add_most_severe_consequence_to_consequence + ) + ) + return ( + t.annotate_rows(**{vep_root: annotation}) + if isinstance(t, hl.MatrixTable) + else t.annotate(**{vep_root: annotation}) + )
+ + +
[docs]def explode_by_vep_annotation( + t: Union[hl.Table, hl.MatrixTable], + vep_annotation: str = "transcript_consequences", + vep_root: str = "vep", +) -> Union[hl.Table, hl.MatrixTable]: + """ + Explode the specified VEP annotation on the input Table/MatrixTable. + + :param t: Input Table or MatrixTable. + :param vep_annotation: Name of annotation in `vep_root` to explode. + :param vep_root: Name used for root VEP annotation. Default is 'vep'. + :return: Table or MatrixTable with exploded VEP annotation. + """ + if vep_annotation not in t[vep_root].keys(): + raise ValueError( + f"{vep_annotation} is not a row field of the {vep_root} annotation in" + " Table/MatrixTable!" + ) + # Create top-level annotation for `vep_annotation` and explode it. + if isinstance(t, hl.Table): + t = t.transmute(**{vep_annotation: t[vep_root][vep_annotation]}) + t = t.explode(t[vep_annotation]) + else: + t = t.transmute_rows(**{vep_annotation: t[vep_root][vep_annotation]}) + t = t.explode_rows(t[vep_annotation]) + + return t
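A usage sketch producing one row per transcript consequence (assumes `ht` carries a standard `vep` annotation):

.. code-block:: python

    ht = explode_by_vep_annotation(ht, vep_annotation="transcript_consequences")
    # `transcript_consequences` is now a top-level row field with one element per row.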
\ No newline at end of file diff --git a/_modules/gnomad/variant_qc/evaluation.html b/_modules/gnomad/variant_qc/evaluation.html new file mode 100644 index 000000000..48a449adc --- /dev/null +++ b/_modules/gnomad/variant_qc/evaluation.html @@ -0,0 +1,532 @@ gnomad.variant_qc.evaluation — gnomad master documentation

Source code for gnomad.variant_qc.evaluation

+# noqa: D100
+
+import logging
+from typing import Dict, Optional
+
+import hail as hl
+
+logging.basicConfig(
+    format="%(asctime)s (%(name)s %(lineno)s): %(message)s",
+    datefmt="%m/%d/%Y %I:%M:%S %p",
+)
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+
+
[docs]def compute_ranked_bin( + ht: hl.Table, + score_expr: hl.expr.NumericExpression, + bin_expr: Dict[str, hl.expr.BooleanExpression] = {"bin": True}, + compute_snv_indel_separately: bool = True, + n_bins: int = 100, + desc: bool = True, +) -> hl.Table: + r""" + Return a table with a bin for each row based on the ranking of `score_expr`. + + The bin is computed by dividing the `score_expr` into `n_bins` bins containing approximately equal numbers of elements. + This is done by ranking the rows by `score_expr` (and a random number in cases where multiple variants have the same score) + and then assigning the variant to a bin based on its ranking. + + If `compute_snv_indel_separately` is True all items in `bin_expr` will be stratified by snv / indels for the ranking and + bin calculation. Because SNV and indel rows are mutually exclusive, they are re-combined into a single annotation. For + example if we have the following four variants and scores and `n_bins` of 2: + + ======== ======= ====== ================= ================= + Variant Type Score bin - `compute_snv_indel_separately`: + -------- ------- ------ ------------------------------------- + \ \ \ False True + ======== ======= ====== ================= ================= + Var1 SNV 0.1 1 1 + Var2 SNV 0.2 1 2 + Var3 Indel 0.3 2 1 + Var4 Indel 0.4 2 2 + ======== ======= ====== ================= ================= + + .. note:: + + The `bin_expr` defines which data the bin(s) should be computed on. E.g., to get biallelic specific binning + and singleton specific binning, the following could be used: + + .. code-block:: python + + bin_expr={ + 'biallelic_bin': ~ht.was_split, + 'singleton_bin': ht.singleton + } + + :param ht: Input Table + :param score_expr: Expression containing the score + :param bin_expr: Specific row grouping(s) to perform ranking and binning on (see note) + :param compute_snv_indel_separately: Should all `bin_expr` items be stratified by SNVs / indels + :param n_bins: Number of bins to bin the data into + :param desc: Whether to bin the score in descending order + :return: Table with the requested bin annotations + """ + if compute_snv_indel_separately: + # For each bin, add a SNV / indel stratification + bin_expr = { + f"{bin_id}_{snv}": bin_expr & snv_expr + for bin_id, bin_expr in bin_expr.items() + for snv, snv_expr in [ + ("snv", hl.is_snp(ht.alleles[0], ht.alleles[1])), + ("indel", ~hl.is_snp(ht.alleles[0], ht.alleles[1])), + ] + } + + bin_ht = ht.select( + **{f"_filter_{bin_id}": bin_expr for bin_id, bin_expr in bin_expr.items()}, + _score=score_expr, + snv=hl.is_snp(ht.alleles[0], ht.alleles[1]), + _rand=hl.rand_unif(0, 1), + ) + + # Checkpoint bin Table prior to variant count aggregation. + bin_ht = bin_ht.checkpoint(hl.utils.new_temp_file("bin", "ht")) + + # Compute variant counts per group defined by bin_expr. This is used to determine + # bin assignment. + bin_group_variant_counts = bin_ht.aggregate( + hl.Struct( + **{ + bin_id: hl.agg.filter( + bin_ht[f"_filter_{bin_id}"], + hl.agg.count(), + ) + for bin_id in bin_expr + } + ) + ) + + logger.info( + "Sorting the HT by score_expr followed by a random float between 0 and 1. " + "Then adding a row index per grouping defined by bin_expr..." 
+ ) + bin_ht = bin_ht.key_by("_score", "_rand") + bin_ht = bin_ht.annotate( + **{ + f"{bin_id}_rank": hl.or_missing( + bin_ht[f"_filter_{bin_id}"], + hl.scan.count_where(bin_ht[f"_filter_{bin_id}"]), + ) + for bin_id in bin_expr + } + ) + bin_ht = bin_ht.key_by("locus", "alleles") + + logger.info("Binning ranked rows into %d bins...", n_bins) + bin_ht = bin_ht.select( + "snv", + **{ + bin_id: hl.int( + hl.floor( + ( + n_bins + * ( + bin_ht[f"{bin_id}_rank"] + / hl.float64(bin_group_variant_counts[bin_id]) + ) + ) + + 1 + ) + ) + for bin_id in bin_expr + }, + ) + + if desc: + bin_ht = bin_ht.annotate( + **{bin_id: n_bins - bin_ht[bin_id] + 1 for bin_id in bin_expr} + ) + + # Because SNV and indel rows are mutually exclusive, re-combine them into a single bin. + # Update the global bin_group_variant_counts struct to reflect the change + # in bin names in the table + if compute_snv_indel_separately: + bin_expr_no_snv = { + bin_id.rsplit("_", 1)[0] for bin_id in bin_group_variant_counts + } + bin_group_variant_counts = hl.struct( + **{ + bin_id: hl.struct( + **{ + snv: bin_group_variant_counts[f"{bin_id}_{snv}"] + for snv in ["snv", "indel"] + } + ) + for bin_id in bin_expr_no_snv + } + ) + + bin_ht = bin_ht.transmute( + **{ + bin_id: hl.if_else( + bin_ht.snv, + bin_ht[f"{bin_id}_snv"], + bin_ht[f"{bin_id}_indel"], + ) + for bin_id in bin_expr_no_snv + } + ) + + bin_ht = bin_ht.annotate_globals(bin_group_variant_counts=bin_group_variant_counts) + + return bin_ht
+ + +
[docs]def compute_grouped_binned_ht( + bin_ht: hl.Table, + checkpoint_path: Optional[str] = None, +) -> hl.GroupedTable: + """ + Group a Table that has been annotated with bins (`compute_ranked_bin` or `create_binned_ht`). + + The table will be grouped by bin_id (bin, biallelic, etc.), contig, snv, bi_allelic and singleton. + + .. note:: + + If performing an aggregation following this grouping (such as `score_bin_agg`) then the aggregation + function will need to use `ht._parent` to get the origin Table from the GroupedTable for the aggregation. + + :param bin_ht: Input Table with a `bin_id` annotation + :param checkpoint_path: If provided, an intermediate checkpoint table is created with all required annotations before shuffling. + :return: Table grouped by bin(s) + """ + # Explode the rank table by bin_id + bin_ht = bin_ht.annotate( + bin_groups=hl.array( + [ + hl.Struct(bin_id=bin_name, bin=bin_ht[bin_name]) + for bin_name in bin_ht.bin_group_variant_counts + ] + ) + ) + bin_ht = bin_ht.explode(bin_ht.bin_groups) + bin_ht = bin_ht.transmute( + bin_id=bin_ht.bin_groups.bin_id, bin=bin_ht.bin_groups.bin + ) + bin_ht = bin_ht.filter(hl.is_defined(bin_ht.bin)) + + if checkpoint_path is not None: + bin_ht.checkpoint(checkpoint_path, overwrite=True) + else: + bin_ht = bin_ht.persist() + + # Group by bin_id, bin and additional stratification desired and compute + # QC metrics per bin + return bin_ht.group_by( + bin_id=bin_ht.bin_id, + contig=bin_ht.locus.contig, + snv=hl.is_snp(bin_ht.alleles[0], bin_ht.alleles[1]), + bi_allelic=~bin_ht.was_split, + singleton=bin_ht.singleton, + release_adj=bin_ht.ac > 0, + bin=bin_ht.bin, + )._set_buffer_size(20000)
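A usage sketch; per the note above, a downstream aggregation reaches the original rows through `ht._parent`. Here `bin_ht` is assumed to carry the `was_split`, `singleton`, and `ac` annotations the grouping expects, and the simple count is illustrative only:

.. code-block:: python

    grouped_ht = compute_grouped_binned_ht(bin_ht)
    per_bin_ht = grouped_ht.aggregate(n=hl.agg.count())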
+ + +
[docs]def compute_binned_truth_sample_concordance( + ht: hl.Table, + binned_score_ht: hl.Table, + n_bins: int = 100, + add_bins: Dict[str, hl.expr.BooleanExpression] = {}, +) -> hl.Table: + """ + Determine the concordance (TP, FP, FN) between a truth sample within the callset and the sample's truth data grouped by bins computed using `compute_ranked_bin`. + + .. note:: + The input `ht` should contain three row fields: + - score: value to use for binning + - GT: a CallExpression containing the genotype of the evaluation data for the sample + - truth_GT: a CallExpression containing the genotype of the truth sample + The input `binned_score_ht` should contain: + - score: value used to bin the full callset + - bin: the full callset bin + + `add_bins` can be used to add additional global and truth sample binning to the final binned truth sample + concordance HT. The keys in `add_bins` must be present in `binned_score_ht` and the values in `add_bins` + should be expressions on `ht` that define a subset of variants to bin in the truth sample. For example, to + look at the global and truth sample binning on only bi-allelic variants, `add_bins` could be set to + {'biallelic_bin': ht.biallelic}. + + The table is grouped by global/truth sample bin and variant type and contains TP, FP and FN. + + :param ht: Input HT + :param binned_score_ht: Table with the bin annotation for each variant + :param n_bins: Number of bins to bin the data into + :param add_bins: Dictionary of additional global bin columns (key) and the expr to use for binning the truth sample (value) + :return: Binned truth sample concordance HT + """ + # Annotate score and global bin + indexed_binned_score_ht = binned_score_ht[ht.key] + ht = ht.annotate( + **{f"global_{bin_id}": indexed_binned_score_ht[bin_id] for bin_id in add_bins}, + **{f"_{bin_id}": bin_expr for bin_id, bin_expr in add_bins.items()}, + score=indexed_binned_score_ht.score, + global_bin=indexed_binned_score_ht.bin, + ) + ht = ht.checkpoint(hl.utils.new_temp_file("pre_bin", "ht")) + # Annotate the truth sample bin + bin_ht = compute_ranked_bin( + ht, + score_expr=ht.score, + bin_expr={ + "truth_sample_bin": hl.expr.bool(True), + **{f"truth_sample_{bin_id}": ht[f"_{bin_id}"] for bin_id in add_bins}, + }, + n_bins=n_bins, + ) + ht = ht.join(bin_ht, how="left") + + bin_list = [ + hl.tuple(["global_bin", ht.global_bin]), + hl.tuple(["truth_sample_bin", ht.truth_sample_bin]), + ] + bin_list.extend( + [hl.tuple([f"global_{bin_id}", ht[f"global_{bin_id}"]]) for bin_id in add_bins] + ) + bin_list.extend( + [ + hl.tuple([f"truth_sample_{bin_id}", ht[f"truth_sample_{bin_id}"]]) + for bin_id in add_bins + ] + ) + + # Explode the global and truth sample bins + ht = ht.annotate(bin=bin_list) + + ht = ht.explode(ht.bin) + ht = ht.annotate(bin_id=ht.bin[0], bin=hl.int(ht.bin[1])) + + # Compute TP, FP and FN by bin_id, variant type and bin + return ( + ht.group_by("bin_id", "snv", "bin") + .aggregate( + # TP => allele is found in both data sets + tp=hl.agg.count_where(ht.GT.is_non_ref() & ht.truth_GT.is_non_ref()), + # FP => allele is found only in test data set + fp=hl.agg.count_where( + ht.GT.is_non_ref() & hl.or_else(ht.truth_GT.is_hom_ref(), True) + ), + # FN => allele is found in truth data only + fn=hl.agg.count_where( + hl.or_else(ht.GT.is_hom_ref(), True) & ht.truth_GT.is_non_ref() + ), + min_score=hl.agg.min(ht.score), + max_score=hl.agg.max(ht.score), + n_alleles=hl.agg.count(), + ) + .repartition(5) + )
+ + +
[docs]def create_truth_sample_ht( + mt: hl.MatrixTable, truth_mt: hl.MatrixTable, high_confidence_intervals_ht: hl.Table +) -> hl.Table: + """ + Compute a table comparing a truth sample in callset vs the truth. + + :param mt: MT of truth sample from callset to be compared to truth + :param truth_mt: MT of truth sample + :param high_confidence_intervals_ht: High confidence interval HT + :return: Table containing both the callset truth sample and the truth data + """ + + def split_filter_and_flatten_ht( + truth_mt: hl.MatrixTable, high_confidence_intervals_ht: hl.Table + ) -> hl.Table: + """ + Split a truth sample MT, filter it to the given high confidence intervals, and then "flatten" it as a HT by annotating GT in a row field. + + :param truth_mt: Truth sample MT + :param high_confidence_intervals_ht: High confidence intervals + :return: Truth sample table with GT as a row annotation + """ + assert truth_mt.count_cols() == 1 + + if "was_split" not in truth_mt.row: + truth_mt = hl.split_multi_hts(truth_mt) + + truth_mt = truth_mt.filter_rows( + hl.is_defined(high_confidence_intervals_ht[truth_mt.locus]) + ) + rename_entries = {"GT": "_GT"} + if "adj" in truth_mt.entry: + rename_entries.update({"adj": "_adj"}) + + truth_mt = truth_mt.rename(rename_entries) + return truth_mt.annotate_rows( + **{x: hl.agg.take(truth_mt[f"_{x}"], 1)[0] for x in rename_entries} + ).rows() + + # Load truth sample MT, + # restrict it to high confidence intervals + # and flatten it to a HT by annotating GT in a row annotation + truth_ht = split_filter_and_flatten_ht(truth_mt, high_confidence_intervals_ht) + truth_ht = truth_ht.rename({f: f"truth_{f}" for f in truth_ht.row_value}) + + # Similarly load, filter and flatten callset truth sample MT + ht = split_filter_and_flatten_ht(mt, high_confidence_intervals_ht) + + # Outer join of truth and callset truth and annotate the score and global bin + ht = truth_ht.join(ht, how="outer") + ht = ht.annotate(snv=hl.is_snp(ht.alleles[0], ht.alleles[1])) + + return ht
+ + +
[docs]def add_rank( + ht: hl.Table, + score_expr: hl.expr.NumericExpression, + subrank_expr: Optional[Dict[str, hl.expr.BooleanExpression]] = None, +) -> hl.Table: + """ + Add rank based on the `score_expr`. Rank is added for SNVs and indels separately. + + If one or more `subrank_expr` are provided, then subrank is added based on all sites for which the boolean expression is true. + + In addition, variant counts (snv, indel separately) are added as a global (`rank_variant_counts`). + + :param ht: input Hail Table containing variants (with QC annotations) to be ranked + :param score_expr: the Table annotation by which ranking should be scored + :param subrank_expr: Any subranking to be added in the form name_of_subrank: subrank_filtering_expr + :return: Table with rankings added + """ + key = ht.key + if subrank_expr is None: + subrank_expr = {} + + temp_expr = {"_score": score_expr} + temp_expr.update({f"_{name}": expr for name, expr in subrank_expr.items()}) + rank_ht = ht.select(**temp_expr, is_snv=hl.is_snp(ht.alleles[0], ht.alleles[1])) + + rank_ht = rank_ht.key_by("_score").persist() + scan_expr = { + "rank": hl.if_else( + rank_ht.is_snv, + hl.scan.count_where(rank_ht.is_snv), + hl.scan.count_where(~rank_ht.is_snv), + ) + } + scan_expr.update( + { + name: hl.or_missing( + rank_ht[f"_{name}"], + hl.if_else( + rank_ht.is_snv, + hl.scan.count_where(rank_ht.is_snv & rank_ht[f"_{name}"]), + hl.scan.count_where(~rank_ht.is_snv & rank_ht[f"_{name}"]), + ), + ) + for name in subrank_expr + } + ) + rank_ht = rank_ht.annotate(**scan_expr) + + rank_ht = rank_ht.key_by(*key).persist() + rank_ht = rank_ht.select(*scan_expr.keys()) + + ht = ht.annotate(**rank_ht[key]) + return ht
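A usage sketch adding an overall rank plus a biallelic subrank (`ht.score` and `ht.was_split` are placeholder annotations):

.. code-block:: python

    ht = add_rank(
        ht,
        score_expr=ht.score,
        subrank_expr={"biallelic_rank": ~ht.was_split},
    )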
\ No newline at end of file diff --git a/_modules/gnomad/variant_qc/ld.html b/_modules/gnomad/variant_qc/ld.html new file mode 100644 index 000000000..9894a6bd4 --- /dev/null +++ b/_modules/gnomad/variant_qc/ld.html @@ -0,0 +1,217 @@ gnomad.variant_qc.ld — gnomad master documentation

Source code for gnomad.variant_qc.ld

+# noqa: D100
+
+import hail as hl
+from hail.linalg import BlockMatrix
+
+from gnomad.resources.grch37.gnomad import public_release
+from gnomad.resources.grch37.gnomad_ld import ld_index, ld_matrix
+
+
+
[docs]def get_r_human_readable( + pop: str, var1: str, var2: str, ref_genome: str = "GRCh37" +): # noqa: D103 + bm = ld_matrix(pop).bm() + ht = ld_index(pop).ht() + chrom, pos, ref, alt = var1.split("-") + var1 = (hl.parse_locus(f"{chrom}:{pos}", ref_genome), [ref, alt]) + chrom, pos, ref, alt = var2.split("-") + var2 = (hl.parse_locus(f"{chrom}:{pos}", ref_genome), [ref, alt]) + return get_r_for_pair_of_variants(bm, ht, var1, var2)
+ + +# TODO: find LD proxies + + +
[docs]def get_r_for_pair_of_variants( + bm: BlockMatrix, + ld_index: hl.Table, + var1: (hl.tlocus, hl.tarray(hl.tstr)), + var2: (hl.tlocus, hl.tarray(hl.tstr)), +): + """ + Get `r` value (LD) for a pair of variants `var1` and `var2`. + + .. code-block:: python + + bm = ld_matrix('nfe').bm() + ht = ld_index('nfe').ht() + var1 = (hl.parse_locus('1:10146', 'GRCh37'), ['AC', 'A']) + var2 = (hl.parse_locus('1:10151', 'GRCh37'), ['TA', 'T']) + get_r_for_pair_of_variants(bm, ht, var1, var2) + # 0.01789767935482124 + + :param bm: Input BlockMatrix + :param ld_index: Corresponding index table + :param var1: Tuple of locus and alleles + :param var2: Tuple of locus and alleles + :return: Correlation (r) between two variants + """ + idx1 = ld_index.filter( + (ld_index.locus == var1[0]) & (ld_index.alleles == var1[1]) + ).idx.collect()[0] + idx2 = ld_index.filter( + (ld_index.locus == var2[0]) & (ld_index.alleles == var2[1]) + ).idx.collect()[0] + + # Ensure idx1 <= idx2 before indexing into the matrix. + if idx1 > idx2: + idx1, idx2 = idx2, idx1 + + return bm[idx1, idx2]
+ + +
[docs]def get_r_within_gene_in_pop(pop: str, gene: str): + """ + Get LD information (`r`) for all pairs of variants within `gene` for a given `pop`. + + Warning: this returns a table quadratic in number of variants. Exercise caution with large genes. + + :param pop: Population for which to get LD information + :param gene: Gene symbol as string + :return: Table with pairs of variants + """ + return get_r_within_gene( + ld_matrix(pop).bm(), ld_index(pop).ht(), gene, None, "GRCh37" + )
+ + +
[docs]def get_r_within_gene( + bm: BlockMatrix, + ld_index: hl.Table, + gene: str, + vep_ht: hl.Table = None, + reference_genome: str = None, +): + """ + Get LD information (`r`) for all pairs of variants within `gene`. + + Warning: this returns a table quadratic in number of variants. Exercise caution with large genes. + + :param bm: Input Block Matrix + :param ld_index: Corresponding index table + :param gene: Gene symbol as string + :param vep_ht: Table with VEP annotations (if None, uses the gnomAD exomes public release) + :param reference_genome: Reference genome to pass to get_gene_intervals for fast filtering to gene + :return: Table with pairs of variants + """ + if vep_ht is None: + vep_ht = public_release("exomes").ht() + if reference_genome is None: + reference_genome = hl.default_reference().name + intervals = hl.experimental.get_gene_intervals( + gene_symbols=[gene], reference_genome=reference_genome + ) + ld_index = hl.filter_intervals(ld_index, intervals) + ld_index = ld_index.annotate(vep=vep_ht[ld_index.key].vep) + ld_index = ld_index.filter( + hl.any(lambda tc: tc.gene_symbol == gene, ld_index.vep.transcript_consequences) + ) + + indices_to_keep = ld_index.idx.collect() + filt_bm = bm.filter(indices_to_keep, indices_to_keep) + ht = filt_bm.entries() + ld_index = ld_index.add_index("new_idx").key_by("new_idx") + return ht.transmute(r=ht.entry, i_variant=ld_index[ht.i], j_variant=ld_index[ht.j])
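For example, a sketch of pulling pairwise LD within one gene for one population (the gene and population shown are illustrative):

.. code-block:: python

    # Pairwise r for all variants in PCSK9 in the gnomAD NFE population.
    ld_ht = get_r_within_gene_in_pop("nfe", "PCSK9")
    ld_ht = ld_ht.annotate(r2=ld_ht.r ** 2)
    ld_ht.show()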
+
+ +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/_modules/gnomad/variant_qc/pipeline.html b/_modules/gnomad/variant_qc/pipeline.html new file mode 100644 index 000000000..dfd9ae360 --- /dev/null +++ b/_modules/gnomad/variant_qc/pipeline.html @@ -0,0 +1,582 @@ + + + + + + gnomad.variant_qc.pipeline — gnomad master documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for gnomad.variant_qc.pipeline

+# noqa: D100
+
+import logging
+from typing import Dict, List, Optional, Tuple, Union
+
+import hail as hl
+import pyspark.ml
+import pyspark.sql
+
+import gnomad.resources.grch37 as grch37_resources
+import gnomad.resources.grch38 as grch38_resources
+from gnomad.sample_qc.relatedness import (
+    SIBLINGS,
+    generate_sib_stats_expr,
+    generate_trio_stats_expr,
+)
+from gnomad.utils.annotations import annotate_adj, bi_allelic_expr
+from gnomad.utils.filtering import filter_to_autosomes, filter_to_clinvar_pathogenic
+from gnomad.utils.reference_genome import get_reference_genome
+from gnomad.variant_qc.evaluation import compute_ranked_bin
+from gnomad.variant_qc.random_forest import (
+    get_features_importance,
+    test_model,
+    train_rf,
+)
+from gnomad.variant_qc.training import sample_training_examples
+
+logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s")
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+INBREEDING_COEFF_HARD_CUTOFF = -0.3
+
+
+
[docs]def create_binned_ht( + ht: hl.Table, + n_bins: int = 100, + singleton: bool = True, + biallelic: bool = True, + adj: bool = True, + add_substrat: Optional[Dict[str, hl.expr.BooleanExpression]] = None, +) -> hl.Table: + """ + Annotate each row of `ht` with a bin based on binning the score annotation into `n_bins` equally-sized bins. + + This is meant as a default wrapper for `compute_ranked_bin`. + + .. note:: + + The following fields should be present: + - score + - ac - expected that this is the adj filtered allele count + - ac_raw - expected that this is the raw allele count before adj filtering + + Computes bin numbers stratified by SNVs / indels and with the following optional sub bins + - singletons + - biallelics + - biallelic singletons + - adj + - adj biallelics + - adj singletons + - adj biallelic singletons + + :param ht: Input table + :param n_bins: Number of bins to bin into + :param singleton: Should bins be stratified by singletons + :param biallelic: Should bins be stratified by bi-allelic variants + :param adj: Should bins be stratified by adj filtering + :param add_substrat: Any additional stratifications for adding bins + :return: Table with bin number for each variant + """ + + def _new_bin_expr( + bin_expr: Union[Dict[str, hl.expr.BooleanExpression], Dict[str, bool]], + new_expr: hl.expr.BooleanExpression, + new_id: str, + update: bool = False, + ) -> Dict[str, hl.expr.BooleanExpression]: + """ + Update a dictionary of expressions to add another stratification. + + :param bin_expr: Dictionary of expressions to add another stratification to + :param new_expr: New Boolean expression to add to `bin_expr` + :param new_id: Name to add to each current key in `bin_expr` to indicate the new stratification + :param update: If True, add the new stratified expressions to `bin_expr` and return the combined dictionary; otherwise return only the new expressions + :return: Dictionary of `bin_expr` updated with `new_expr` added as an additional stratification to all + expressions already in `bin_expr` + """ + new_bin_expr = { + f"{new_id}_{bin_id}": expr & new_expr + for bin_id, expr in bin_expr.items() + } + if update: + bin_expr.update(new_bin_expr) + return bin_expr + else: + return new_bin_expr + + # Desired bins and sub-bins + bin_expr = {"bin": True} + + if singleton: + bin_expr = _new_bin_expr(bin_expr, ht.ac_raw == 1, "singleton", update=True) + + if biallelic: + bin_expr = _new_bin_expr(bin_expr, ~ht.was_split, "biallelic", update=True) + + if adj: + bin_expr = _new_bin_expr(bin_expr, (ht.ac > 0), "adj", update=True) + + if add_substrat is not None: + new_bin_expr = {} + for add_id, add_expr in add_substrat.items(): + new_bin_expr.update(_new_bin_expr(bin_expr, add_expr, add_id)) + bin_expr.update(new_bin_expr) + + bin_ht = compute_ranked_bin( + ht, score_expr=ht.score, bin_expr=bin_expr, n_bins=n_bins + ) + + ht = ht.select_globals() + ht = ht.join(bin_ht, how="left") + + return ht
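A minimal sketch, assuming `ht` already carries the required `score`, `ac`, and `ac_raw` fields (plus `was_split` for the biallelic stratification); the ClinVar sub-stratification and its path are hypothetical:

.. code-block:: python

    clinvar_ht = hl.read_table("gs://my-bucket/clinvar.ht")  # hypothetical path
    binned_ht = create_binned_ht(
        ht,
        n_bins=100,
        add_substrat={"clinvar": hl.is_defined(clinvar_ht[ht.key])},
    )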
+ + +
[docs]def score_bin_agg( + ht: hl.GroupedTable, fam_stats_ht: hl.Table +) -> Dict[str, hl.expr.Aggregation]: + """ + Make dict of aggregations for min/max of score, number of ClinVar variants, number of truth variants, and family statistics. + + .. note:: + + This function uses `ht._parent` to get the originating Table from the GroupedTable for the aggregation. + + This can easily be combined with the GroupedTable returned by `compute_grouped_binned_ht`. For example: + + .. code-block:: python + + binned_ht = create_binned_ht(...) + grouped_binned_ht = compute_grouped_binned_ht(binned_ht) + agg_ht = grouped_binned_ht.aggregate(score_bin_agg(**grouped_binned_ht, ...)) + + .. note:: + + The following annotations should be present: + + In ht: + - score + - singleton + - positive_train_site + - negative_train_site + - ac_raw - expected that this is the raw allele count before adj filtering + - ac - expected that this is the allele count after adj filtering + - ac_qc_samples_unrelated_raw - allele count before adj filtering for unrelated samples passing sample QC + - info - struct that includes QD, FS, and MQ in order to add an annotation for fail_hard_filters + + In truth_ht: + - omni + - mills + - hapmap + - kgp_phase1_hc + + In fam_stats_ht: + - n_de_novos_adj + - n_de_novos_raw + - n_transmitted_raw + - n_untransmitted_raw + - ac_parents_adj / an_parents_adj + - ac_parents_raw / an_parents_raw + + Automatic aggregations that will be done are: + - `min_score` - minimum of score annotation per group + - `max_score` - maximum of score annotation per group + - `n` - count of variants per group + - `n_ins` - count of insertions per group + - `n_del` - count of deletions per group + - `n_ti` - count of transitions per group + - `n_tv` - count of transversions per group + - `n_1bp_indel` - count of one base pair indels per group + - `n_1bp_ins` / `n_2bp_ins` / `n_3bp_ins` - counts of insertions of length one, two and three base pairs per group + - `n_1bp_del` / `n_2bp_del` / `n_3bp_del` - counts of deletions of length one, two and three base pairs per group + - `n_mod3bp_indel` - count of indels with a length divisible by three per group + - `n_singleton` - count of singletons per group + - `fail_hard_filters` - count of variants per group with QD < 2 | FS > 60 | MQ < 30 + - `n_pos_train` - count of variants that were a VQSR positive train site per group + - `n_neg_train` - count of variants that were a VQSR negative train site per group + - `n_clinvar` - count of ClinVar variants + - `n_clinvar_path` - count of ClinVar pathogenic or likely pathogenic variants + - `n_de_novos_singleton_adj` - count of singleton de novo variants after adj filtration + - `n_de_novo_singleton` - count of raw unfiltered singleton de novo variants + - `n_de_novos_adj` - count of adj filtered de novo variants + - `n_de_novo` - count of raw unfiltered de novo variants + - `n_de_novos_AF_001_adj` / `n_de_novos_AF_001` - counts of adj filtered / raw de novo variants with parental allele frequency below 0.1% + - `n_trans_singletons` - count of transmitted singletons + - `n_untrans_singletons` - count of untransmitted singletons + - `n_train_trans_singletons` - count of transmitted singletons at positive train sites + - `n_omni` - count of omni truth variants + - `n_mills` - count of mills truth variants + - `n_hapmap` - count of hapmap truth variants + - `n_kgp_phase1_hc` - count of 1000 genomes phase 1 high confidence truth variants + + :param ht: Table that aggregation will be performed on + :param fam_stats_ht: Family statistics HT + :return: a dictionary containing aggregations to perform on ht + """ + # Annotate binned table with the evaluation data + ht = ht._parent + indel_length = hl.abs(ht.alleles[0].length() - ht.alleles[1].length()) + # Load external evaluation data + build = get_reference_genome(ht.locus).name + clinvar_ht = ( + grch37_resources.reference_data.clinvar + if build == "GRCh37" + else grch38_resources.reference_data.clinvar + ).ht() + # Filter to ClinVar pathogenic data. 
+ clinvar_path = filter_to_clinvar_pathogenic(clinvar_ht)[ht.key] + clinvar = clinvar_ht[ht.key] + truth_data = ( + grch37_resources.reference_data.get_truth_ht() + if build == "GRCh37" + else grch38_resources.reference_data.get_truth_ht() + )[ht.key] + fam = fam_stats_ht[ht.key] + + if "fail_hard_filters" in ht.row: + fail_hard_filters_expr = ht.fail_hard_filters + elif "info" in ht.row: + fail_hard_filters_expr = ( + (ht.info.QD < 2) | (ht.info.FS > 60) | (ht.info.MQ < 30) + ) + else: + raise ValueError( + "Either 'fail_hard_filters' or 'info' must be present in the input Table!" + ) + + ins_expr = hl.is_insertion(ht.alleles[0], ht.alleles[1]) + del_expr = hl.is_deletion(ht.alleles[0], ht.alleles[1]) + indel_1bp_expr = indel_length == 1 + count_where_expr = { + "n_ins": ins_expr, + "n_del": del_expr, + "n_ti": hl.is_transition(ht.alleles[0], ht.alleles[1]), + "n_tv": hl.is_transversion(ht.alleles[0], ht.alleles[1]), + "n_1bp_indel": indel_1bp_expr, + "n_1bp_ins": ins_expr & indel_1bp_expr, + "n_2bp_ins": ins_expr & (indel_length == 2), + "n_3bp_ins": ins_expr & (indel_length == 3), + "n_1bp_del": del_expr & indel_1bp_expr, + "n_2bp_del": del_expr & (indel_length == 2), + "n_3bp_del": del_expr & (indel_length == 3), + "n_mod3bp_indel": (indel_length % 3) == 0, + "n_singleton": ht.singleton, + "fail_hard_filters": fail_hard_filters_expr, + "n_pos_train": ht.positive_train_site, + "n_neg_train": ht.negative_train_site, + "n_clinvar": hl.is_defined(clinvar), + "n_clinvar_path": hl.is_defined(clinvar_path), + "n_omni": truth_data.omni, + "n_mills": truth_data.mills, + "n_hapmap": truth_data.hapmap, + "n_kgp_phase1_hc": truth_data.kgp_phase1_hc, + } + + return dict( + min_score=hl.agg.min(ht.score), + max_score=hl.agg.max(ht.score), + n=hl.agg.count(), + **{k: hl.agg.count_where(v) for k, v in count_where_expr.items()}, + n_de_novos_singleton_adj=hl.agg.filter( + ht.ac == 1, hl.agg.sum(fam.n_de_novos_adj) + ), + n_de_novo_singleton=hl.agg.filter( + ht.ac_raw == 1, hl.agg.sum(fam.n_de_novos_raw) + ), + n_de_novos_adj=hl.agg.sum(fam.n_de_novos_adj), + n_de_novo=hl.agg.sum(fam.n_de_novos_raw), + n_de_novos_AF_001_adj=hl.agg.filter( + hl.if_else( + fam.ac_parents_adj == 0, 0.0, fam.ac_parents_adj / fam.an_parents_adj + ) + < 0.001, + hl.agg.sum(fam.n_de_novos_adj), + ), + n_de_novos_AF_001=hl.agg.filter( + hl.if_else( + fam.ac_parents_raw == 0, 0.0, fam.ac_parents_raw / fam.an_parents_raw + ) + < 0.001, + hl.agg.sum(fam.n_de_novos_raw), + ), + n_trans_singletons=hl.agg.filter( + ht.ac_raw == 2, hl.agg.sum(fam.n_transmitted_raw) + ), + n_untrans_singletons=hl.agg.filter( + (ht.ac_raw < 3) & (ht.ac_qc_samples_unrelated_raw == 1), + hl.agg.sum(fam.n_untransmitted_raw), + ), + n_train_trans_singletons=hl.agg.filter( + (ht.ac_raw == 2) & ht.positive_train_site, hl.agg.sum(fam.n_transmitted_raw) + ), + )
+ + +
[docs]def generate_trio_stats( + mt: hl.MatrixTable, autosomes_only: bool = True, bi_allelic_only: bool = True +) -> hl.Table: + """ + Run `generate_trio_stats_expr` with variant QC pipeline defaults to get trio stats stratified by raw and adj. + + .. note:: + + Expects that `mt` is a trio matrix table annotated with adj. If dealing with + a sparse MT, `hl.experimental.densify` must be run first. + + By default this pipeline function will filter `mt` to only autosomes and bi-allelic sites. + + :param mt: A Trio Matrix Table returned from `hl.trio_matrix`. Must be dense + :param autosomes_only: If set, only autosomal intervals are used. + :param bi_allelic_only: If set, only bi-allelic sites are used for the computation + :return: Table with trio stats + """ + if autosomes_only: + mt = filter_to_autosomes(mt) + if bi_allelic_only: + mt = mt.filter_rows(bi_allelic_expr(mt)) + + logger.info("Generating trio stats using %d trios.", mt.count_cols()) + trio_adj = mt.proband_entry.adj & mt.father_entry.adj & mt.mother_entry.adj + + ht = mt.select_rows( + **generate_trio_stats_expr( + mt, + transmitted_strata={"raw": True, "adj": trio_adj}, + de_novo_strata={"raw": True, "adj": trio_adj}, + ac_strata={"raw": True, "adj": trio_adj}, + ) + ).rows() + + return ht
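A sketch of preparing the input, assuming a dense MT and a hypothetical pedigree file:

.. code-block:: python

    ped = hl.Pedigree.read("gs://my-bucket/trios.fam")  # hypothetical path
    mt = annotate_adj(mt)  # the adj entry annotation is required
    trio_mt = hl.trio_matrix(mt, ped, complete_trios=True)
    trio_stats_ht = generate_trio_stats(trio_mt)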
+ + +
[docs]def generate_sib_stats( + mt: hl.MatrixTable, + relatedness_ht: hl.Table, + i_col: str = "i", + j_col: str = "j", + relationship_col: str = "relationship", + autosomes_only: bool = True, + bi_allelic_only: bool = True, +) -> hl.Table: + """ + Generate a hail table with counts of variants shared by pairs of siblings in `relatedness_ht`. + + This is meant as a default wrapper for `generate_sib_stats_expr`. + + This function takes a hail Table with a row for each pair of individuals i,j in the data that are related + (it's OK to have unrelated samples too). + + The `relationship_col` should be a column specifying the relationship between each pair of samples as defined by + the constants in `gnomad.sample_qc.relatedness`. This `relationship_col` will be used to filter to only pairs of + samples that are annotated as `SIBLINGS`. + + .. note:: + + By default this pipeline function will filter `mt` to only autosomes and bi-allelic sites. + + :param mt: Input Matrix table + :param relatedness_ht: Input relationship table + :param i_col: Column containing the 1st sample of the pair in the relationship table + :param j_col: Column containing the 2nd sample of the pair in the relationship table + :param relationship_col: Column containing the relationship for the sample pair as defined by the constants in `gnomad.sample_qc.relatedness`. + :param autosomes_only: If set, only autosomal intervals are used. + :param bi_allelic_only: If set, only bi-allelic sites are used for the computation + :return: A Table with the sibling shared variant counts + """ + if autosomes_only: + mt = filter_to_autosomes(mt) + if bi_allelic_only: + mt = mt.filter_rows(bi_allelic_expr(mt)) + + sib_ht = relatedness_ht.filter(relatedness_ht[relationship_col] == SIBLINGS) + s_to_keep = sib_ht.aggregate( + hl.agg.explode( + lambda s: hl.agg.collect_as_set(s), [sib_ht[i_col].s, sib_ht[j_col].s] + ), + _localize=False, + ) + mt = mt.filter_cols(s_to_keep.contains(mt.s)) + if "adj" not in mt.entry: + mt = annotate_adj(mt) + + sib_stats_ht = mt.select_rows( + **generate_sib_stats_expr( + mt, + sib_ht, + i_col=i_col, + j_col=j_col, + strata={"raw": True, "adj": mt.adj}, + ) + ).rows() + + return sib_stats_ht
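A minimal sketch, assuming `relatedness_ht` is a pairwise relatedness Table (e.g. from `pc_relate`) already annotated with a `relationship` column and using the default `i`/`j` keys:

.. code-block:: python

    sib_stats_ht = generate_sib_stats(mt, relatedness_ht)
    sib_stats_ht.describe()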
+ + +
[docs]def train_rf_model( + ht: hl.Table, + rf_features: List[str], + tp_expr: hl.expr.BooleanExpression, + fp_expr: hl.expr.BooleanExpression, + fp_to_tp: float = 1.0, + num_trees: int = 500, + max_depth: int = 5, + test_expr: hl.expr.BooleanExpression = False, +) -> Tuple[hl.Table, pyspark.ml.PipelineModel]: + """ + Perform random forest (RF) training using a Table annotated with features and training data. + + .. note:: + + This function uses `train_rf` and extends it by: + - Adding an option to apply the resulting model to test variants which are withheld from training. + - Uses a false positive (FP) to true positive (TP) ratio to determine what variants to use for RF training. + + The returned Table includes the following annotations: + - rf_train: indicates if the variant was used for training of the RF model. + - rf_label: indicates if the variant is a TP or FP. + - rf_test: indicates if the variant was used in testing of the RF model. + - features: global annotation of the features used for the RF model. + - features_importance: global annotation of the importance of each feature in the model. + - test_results: results from testing the model on variants defined by `test_expr`. + + :param ht: Table annotated with features for the RF model and the positive and negative training data. + :param rf_features: List of column names to use as features in the RF training. + :param tp_expr: TP training expression. + :param fp_expr: FP training expression. + :param fp_to_tp: Ratio of FPs to TPs for creating the RF model. If set to 0, all training examples are used. + :param num_trees: Number of trees in the RF model. + :param max_depth: Maximum tree depth in the RF model. + :param test_expr: An expression specifying variants to hold out for testing and use for evaluation only. + :return: Table with TP and FP training sets used in the RF training and the resulting RF model. + """ + ht = ht.annotate(_tp=tp_expr, _fp=fp_expr, rf_test=test_expr) + + rf_ht = sample_training_examples( + ht, tp_expr=ht._tp, fp_expr=ht._fp, fp_to_tp=fp_to_tp, test_expr=ht.rf_test + ) + ht = ht.annotate(rf_train=rf_ht[ht.key].train, rf_label=rf_ht[ht.key].label) + + summary = ht.group_by("_tp", "_fp", "rf_train", "rf_label", "rf_test").aggregate( + n=hl.agg.count() + ) + logger.info("Summary of TP/FP and RF training data:") + summary.show(n=20) + + logger.info( + "Training RF model:\nfeatures: %s\nnum_trees: %d\nmax_depth: %d", + ",".join(rf_features), + num_trees, + max_depth, + ) + + rf_model = train_rf( + ht.filter(ht.rf_train), + features=rf_features, + label="rf_label", + num_trees=num_trees, + max_depth=max_depth, + ) + + test_results = None + if test_expr is not None: + logger.info("Testing model on specified variants or intervals...") + test_ht = ht.filter(hl.is_defined(ht.rf_label) & ht.rf_test) + test_results = test_model( + test_ht, rf_model, features=rf_features, label="rf_label" + ) + + features_importance = get_features_importance(rf_model) + ht = ht.select_globals( + features_importance=features_importance, + features=rf_features, + test_results=test_results, + ) + + return ht.select("rf_train", "rf_label", "rf_test"), rf_model
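A sketch with hypothetical feature and truth annotations, withholding chromosome 20 for testing:

.. code-block:: python

    rf_ht, rf_model = train_rf_model(
        ht,
        rf_features=["info_QD", "info_FS", "info_MQ"],  # hypothetical features
        tp_expr=ht.omni | ht.mills,                     # hypothetical truth annotations
        fp_expr=ht.fail_hard_filters,                   # hypothetical FP annotation
        test_expr=ht.locus.contig == "20",
    )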
+
+ +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/_modules/gnomad/variant_qc/random_forest.html b/_modules/gnomad/variant_qc/random_forest.html new file mode 100644 index 000000000..d2e127fee --- /dev/null +++ b/_modules/gnomad/variant_qc/random_forest.html @@ -0,0 +1,699 @@ + + + + + + gnomad.variant_qc.random_forest — gnomad master documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+
    +
  • + + +
  • +
  • +
+
+
+
+
+ +

Source code for gnomad.variant_qc.random_forest

+# noqa: D100
+
+import json
+import logging
+import pprint
+from pprint import pformat
+from typing import Dict, List, Optional, Tuple
+
+import hail as hl
+import pandas as pd
+import pyspark.sql
+from pyspark.ml import Pipeline
+from pyspark.ml.classification import RandomForestClassifier
+from pyspark.ml.feature import IndexToString, StringIndexer, VectorAssembler
+from pyspark.ml.functions import vector_to_array
+from pyspark.sql import SparkSession
+from pyspark.sql.functions import col  # pylint: disable=no-name-in-module
+
+from gnomad.utils.file_utils import file_exists
+
+logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s")
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+
+
[docs]def run_rf_test( + mt: hl.MatrixTable, output: str = "/tmp" +) -> Tuple[pyspark.ml.PipelineModel, hl.Table]: + """ + Run a dummy test RF on a given MT. + + 1. Creates row annotations and labels to run model on + 2. Trains a RF pipeline model (including median imputation of missing values in created annotations) + 3. Saves the RF pipeline model + 4. Applies the model to the MT and prints features importance + + :param mt: Input MT + :param output: Output files prefix to save the RF model + :return: RF model and MatrixTable after applying RF model + """ + mt = mt.annotate_rows( + feature1=hl.rand_bool(0.1), + feature2=hl.rand_norm(0.0, 1.0), + feature3=hl.or_missing(hl.rand_bool(0.5), hl.rand_norm(0.0, 1.0)), + ) + + mt = mt.annotate_rows( + label=hl.if_else(mt["feature1"] & (mt["feature2"] > 0), "TP", "FP") + ) + ht = mt.rows() + + def f3stats(ht): + return ht.aggregate( + hl.struct( + n=hl.agg.count_where(hl.is_defined(ht["feature3"])), + med=hl.median(hl.agg.collect(ht["feature3"])), + ) + ) + + f3_before_imputation = f3stats(ht) + logger.info("Feature3 defined values before imputation: %d", f3_before_imputation.n) + logger.info("Feature3 median: %f", f3_before_imputation.med) + + features_to_impute = ["feature3"] + quantiles = get_columns_quantiles(ht, features_to_impute, [0.5]) + quantiles = {k: v[0] for k, v in quantiles.items()} + + logger.info("Features median:\n%s", [f"{k}: {v}\n" for k, v in quantiles.items()]) + ht = ht.annotate(**{f: hl.or_else(ht[f], quantiles[f]) for f in features_to_impute}) + ht = ht.annotate_globals(medians=quantiles) + + f3_after_imputation = f3stats(ht) + logger.info("Feature3 defined values after imputation: %d", f3_after_imputation.n) + logger.info("Feature3 median: %f", f3_after_imputation.med) + + ht = ht.select("label", "feature1", "feature2", "feature3") + + label = "label" + features = ["feature1", "feature2", "feature3"] + + rf_model = train_rf(ht, features, label) + save_model(rf_model, out_path=output + "/rf.model", overwrite=True) + rf_model = load_model(output + "/rf.model") + + return rf_model, apply_rf_model(ht, rf_model, features, label)
+ + +
[docs]def check_ht_fields_for_spark(ht: hl.Table, fields: List[str]) -> None: + """ + Check specified fields of a hail table for Spark DataFrame conversion (type and name). + + :param ht: input Table + :param fields: Fields to test + :return: None + """ + allowed_types = [ + hl.tfloat, + hl.tfloat32, + hl.tfloat64, + hl.tint, + hl.tint32, + hl.tint64, + hl.tstr, + hl.tbool, + ] + + bad_field_names = [c for c in fields if "." in c] + + bad_types = [ + c[0] + for c in ht.key_by().select(*fields).row.items() + if c[1].dtype not in allowed_types + ] + + if bad_field_names or bad_types: + raise ValueError( + "Only basic type fields can be converted from Hail to Spark. In addition," + " `.` are not allowed in field names in Spark.\n" + f"Offending fields (non basic type): {bad_types}\n" + f"Offending fields (bad field name): {', '.join(bad_field_names)}\n" + ) + + return
+ + +
[docs]def get_columns_quantiles( + ht: hl.Table, fields: List[str], quantiles: List[float], relative_error: float = 0.001 +) -> Dict[str, List[float]]: + """ + Compute approximate quantiles of specified numeric fields from non-missing values. + + This function returns a Dict of column name -> list of quantiles in the same order specified. + If a column only has NAs, None is returned. + + :param ht: input HT + :param fields: List of fields to compute quantiles for + :param quantiles: list of quantiles to return (e.g. [0.5] would return the median) + :param relative_error: The relative error on the quantile approximation + :return: Dict of column -> quantiles + """ + check_ht_fields_for_spark(ht, fields) + + df = ht.key_by().select(*fields).to_spark() + + res = {} + for f in fields: + logger.info("Computing quantiles for column: %s", f) + col_no_na = df.select(f).dropna() + if col_no_na.first() is not None: + res[f] = col_no_na.approxQuantile(str(f), quantiles, relative_error) + else: + res[f] = None + + return res
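A quick sketch computing quartiles for two hypothetical numeric row fields:

.. code-block:: python

    quantiles = get_columns_quantiles(ht, ["QD", "MQ"], [0.25, 0.5, 0.75])
    median_qd = quantiles["QD"][1]  # the 0.5 quantile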
+ + +
[docs]def median_impute_features( + ht: hl.Table, strata: Optional[Dict[str, hl.expr.Expression]] = None +) -> hl.Table: + """ + Median-impute numerical features in the Table using Hail's `approx_median`. + + If a `strata` dict is given, imputation is done based on the median of each stratum. + + The annotations that are added to the Table are: + - feature_imputed - A row annotation indicating if each numerical feature was imputed or not. + - feature_medians - A global annotation containing the median of the numerical features. If `strata` is given, + this struct will also be broken down by the given strata. + - variants_by_strata - An additional global annotation with the variant counts by strata that will only be + added if imputing by a given `strata`. + + :param ht: Table containing all samples and features for median imputation. + :param strata: Optional dict whose keys are the row fields to stratify the imputation by. If None, a single global median is used per feature. + :return: Feature Table imputed using approximate median values. + """ + logger.info("Computing feature medians for imputation of missing numeric values") + numerical_features = [ + k for k, v in ht.row.dtype.items() if v == hl.tint or v == hl.tfloat + ] + + median_agg_expr = hl.struct( + **{feature: hl.agg.approx_median(ht[feature]) for feature in numerical_features} + ) + + if strata: + ht = ht.annotate_globals( + feature_medians=ht.aggregate( + hl.agg.group_by(hl.tuple([ht[x] for x in strata]), median_agg_expr), + _localize=False, + ), + variants_by_strata=ht.aggregate( + hl.agg.counter(hl.tuple([ht[x] for x in strata])), _localize=False + ), + ) + feature_median_expr = ht.feature_medians[hl.tuple([ht[x] for x in strata])] + logger.info( + "Variant count by strata:\n%s", + "\n".join( + [ + "{}: {}".format(k, v) + for k, v in hl.eval(ht.variants_by_strata).items() + ] + ), + ) + + else: + ht = ht.annotate_globals( + feature_medians=ht.aggregate(median_agg_expr, _localize=False) + ) + feature_median_expr = ht.feature_medians + + ht = ht.annotate( + **{f: hl.or_else(ht[f], feature_median_expr[f]) for f in numerical_features}, + feature_imputed=hl.struct( + **{f: hl.is_missing(ht[f]) for f in numerical_features} + ), + ) + + return ht
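A short sketch stratifying the imputation by a hypothetical `variant_type` row field:

.. code-block:: python

    ht = median_impute_features(ht, strata={"variant_type": ht.variant_type})
    # Per-stratum medians are stored in the `feature_medians` global.
    print(hl.eval(ht.feature_medians))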
+ + +
[docs]def ht_to_rf_df( + ht: hl.Table, features: List[str], label: str = None, index: str = None +) -> pyspark.sql.DataFrame: + """ + Create a Spark dataframe ready for RF from an HT. + + Rows with any missing features are dropped. + Missing labels are replaced with 'NA'. + + .. note:: + + Only basic types are supported! + + :param ht: Input HT + :param features: Features that will be used for RF + :param label: Optional label column that will be predicted by RF + :param index: Optional index column to keep (e.g. for joining results back at a later stage) + :return: Spark Dataframe + """ + cols_to_keep = features[:] + + if label: + cols_to_keep.append(label) + if index: + cols_to_keep.append(index) + + df = ht.key_by().select(*cols_to_keep).to_spark() + df = df.dropna(subset=features) + + if label: + df = df.fillna("NA", subset=label) + + return df
+ + +
[docs]def get_features_importance( + rf_pipeline: pyspark.ml.PipelineModel, rf_index: int = -2, assembler_index: int = -3 +) -> Dict[str, float]: + """ + Extract the features importance from a Pipeline model containing a RandomForestClassifier stage. + + :param rf_pipeline: Input pipeline + :param rf_index: index of the RandomForestClassifier stage + :param assembler_index: index of the VectorAssembler stage + :return: feature importance for each feature in the RF model + """ + feature_names = [ + x[: -len("_indexed")] if x.endswith("_indexed") else x + for x in rf_pipeline.stages[assembler_index].getInputCols() + ] + + return dict(zip(feature_names, rf_pipeline.stages[rf_index].featureImportances))
+ + +
[docs]def get_labels(rf_pipeline: pyspark.ml.PipelineModel) -> List[str]: + """ + Return the labels from the StringIndexer stage at index 0 from an RF pipeline model. + + :param rf_pipeline: Input pipeline + :return: labels + """ + return rf_pipeline.stages[0].labels
+ + +
[docs]def test_model( + ht: hl.Table, + rf_model: pyspark.ml.PipelineModel, + features: List[str], + label: str, + prediction_col_name: str = "rf_prediction", +) -> List[hl.tstruct]: + """ + A wrapper to test a model on a set of examples with known labels. + + 1) Runs the model on the data + 2) Prints confusion matrix and accuracy + 3) Returns confusion matrix as a list of struct + + :param ht: Input table + :param rf_model: RF Model + :param features: Columns containing features that were used in the model + :param label: Column containing label to be predicted + :param prediction_col_name: Where to store the prediction + :return: A list containing structs with {label, prediction, n} + """ + ht = apply_rf_model( + ht.filter(hl.is_defined(ht[label])), + rf_model, + features, + label, + prediction_col_name=prediction_col_name, + ) + + test_results = ( + ht.group_by(ht[prediction_col_name], ht[label]) + .aggregate(n=hl.agg.count()) + .collect() + ) + + # Print results + df = pd.DataFrame(test_results) + df = df.pivot(index=label, columns=prediction_col_name, values="n") + logger.info("Testing results:\n%s", pprint.pformat(df)) + logger.info( + "Accuracy: %f", + sum([x.n for x in test_results if x[label] == x[prediction_col_name]]) + / sum([x.n for x in test_results]), + ) + + return test_results
+ + +
[docs]def apply_rf_model( + ht: hl.Table, + rf_model: pyspark.ml.PipelineModel, + features: List[str], + label: str = None, + probability_col_name: str = "rf_probability", + prediction_col_name: str = "rf_prediction", +) -> hl.Table: + """ + Apply a Random Forest (RF) pipeline model to a Table and annotate the RF probabilities and predictions. + + :param ht: Input HT + :param rf_model: Random Forest pipeline model + :param features: List of feature columns in the pipeline. !Should match the model list of features! + :param label: Optional column containing labels. !Should match the model labels! + :param probability_col_name: Name of the column that will store the RF probabilities + :param prediction_col_name: Name of the column that will store the RF predictions + :return: Table with RF columns + """ + logger.info("Applying RF model.") + + check_fields = features[:] + if label: + check_fields.append(label) + check_ht_fields_for_spark(ht, check_fields) + + index_name = "rf_idx" + while index_name in ht.row: + index_name += "_tmp" + ht = ht.add_index(name=index_name) + + ht_keys = ht.key + ht = ht.key_by(index_name) + + df = ht_to_rf_df(ht, features, label, index_name) + + rf_df = rf_model.transform(df) + prob_cols = get_labels(rf_model) + + rf_df = rf_df.withColumn("probability", vector_to_array("probability")).select( + [index_name, "predictedLabel"] + + [col("probability")[i] for i in range(len(prob_cols))] + ) + new_colnames = [index_name, "predictedLabel"] + prob_cols + rf_df = rf_df.toDF(*new_colnames) + + # Note: SparkSession is needed to write DF to disk before converting to HT; + # the resulting HT sometimes has missing and/or duplicate rows without + # the intermediate write. + spark = SparkSession.builder.getOrCreate() + rf_df.write.mode("overwrite").save("rf_probs.parquet") + rf_df = spark.read.parquet("rf_probs.parquet") + + rf_ht = hl.Table.from_spark(rf_df) + + rf_ht = rf_ht.key_by(index_name) + ht = ht.annotate( + **{ + probability_col_name: {c: rf_ht[ht[index_name]][c] for c in prob_cols}, + prediction_col_name: rf_ht[ht[index_name]]["predictedLabel"], + } + ) + + ht = ht.key_by(*ht_keys) + ht = ht.drop(index_name) + + return ht
+ + +
[docs]def save_model( + rf_pipeline: pyspark.ml.PipelineModel, out_path: str, overwrite: bool = False +) -> None: + """ + Save a Random Forest pipeline model. + + :param rf_pipeline: Pipeline to save + :param out_path: Output path + :param overwrite: If set, will overwrite existing file(s) at output location + :return: Nothing + """ + logger.info("Saving model to %s", out_path) + if overwrite: + rf_pipeline.write().overwrite().save(out_path) + else: + rf_pipeline.save(out_path)
+ + +
[docs]def load_model(input_path: str) -> pyspark.ml.PipelineModel: + """ + Load a Random Forest pipeline model. + + :param input_path: Location of model to load + :return: Random Forest pipeline model + """ + logger.info("Loading model from %s", input_path) + return pyspark.ml.PipelineModel.load(input_path)
+ + +
[docs]def train_rf( + ht: hl.Table, + features: List[str], + label: str, + num_trees: int = 500, + max_depth: int = 5, +) -> pyspark.ml.PipelineModel: + """ + Train a Random Forest (RF) pipeline model. + + :param ht: Input HT + :param features: List of columns to be used as features + :param label: Column containing the label to predict + :param num_trees: Number of trees to use + :param max_depth: Maximum tree depth + :return: Random Forest pipeline model + """ + logger.info( + "Training RF model using:\nfeatures: %s\nlabels: %s\nnum_trees:" + " %d\nmax_depth: %d", + ",".join(features), + label, + num_trees, + max_depth, + ) + + check_ht_fields_for_spark(ht, features + [label]) + + df = ht_to_rf_df(ht, features, label) + + label_indexer = ( + StringIndexer(inputCol=label, outputCol=label + "_indexed") + .setHandleInvalid("keep") + .fit(df) + ) + labels = label_indexer.labels + logger.info("Found labels: %s", labels) + + string_features = [x[0] for x in df.dtypes if x[0] != label and x[1] == "string"] + if string_features: + logger.info("Indexing string features: %s", ",".join(string_features)) + # Build one indexer per string feature (an empty list if there are none). + string_features_indexers = [ + StringIndexer(inputCol=x, outputCol=x + "_indexed") + .setHandleInvalid("keep") + .fit(df) + for x in string_features + ] + + assembler = VectorAssembler( + inputCols=[ + x[0] + "_indexed" if x[1] == "string" else x[0] + for x in df.dtypes + if x[0] != label + ], + outputCol="features", + ) + + rf = RandomForestClassifier( + labelCol=label + "_indexed", + featuresCol="features", + maxDepth=max_depth, + numTrees=num_trees, + ) + + label_converter = IndexToString( + inputCol="prediction", outputCol="predictedLabel", labels=labels + ) + + pipeline = Pipeline( + stages=[label_indexer] + + string_features_indexers + + [assembler, rf, label_converter] + ) + + # Train model + logger.info("Training RF model") + rf_model = pipeline.fit(df) + + feature_importance = get_features_importance(rf_model) + + logger.info( + "RF features importance:\n%s", + "\n".join([f"{f}: {i}" for f, i in feature_importance.items()]), + ) + + return rf_model
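A compact sketch tying training, persistence, and application together (the path is hypothetical):

.. code-block:: python

    rf_model = train_rf(ht, features=["feature1", "feature2"], label="label")
    save_model(rf_model, out_path="gs://my-bucket/rf.model", overwrite=True)
    rf_model = load_model("gs://my-bucket/rf.model")
    ht = apply_rf_model(ht, rf_model, features=["feature1", "feature2"], label="label")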
+ + +
[docs]def get_rf_runs(rf_json_fp: str) -> Dict: + """ + Load RF run data from JSON file. + + :param rf_json_fp: File path to rf json file. + :return: Dictionary containing the content of the JSON file, or an empty dictionary if the file wasn't found. + """ + if file_exists(rf_json_fp): + with hl.hadoop_open(rf_json_fp) as f: + return json.load(f) + else: + logger.warning( + "File %s could not be found. Returning empty RF run hash dict.", rf_json_fp + ) + return {}
+ + +
[docs]def get_run_data( + input_args: Dict[str, bool], + test_intervals: List[str], + features_importance: Dict[str, float], + test_results: List[hl.tstruct], +) -> Dict: + """ + Create a Dict containing information about the RF input arguments and feature importance. + + :param input_args: Dictionary of model input arguments + :param test_intervals: Intervals withheld from training to be used in testing + :param features_importance: Feature importance returned by the RF + :param test_results: Accuracy results from applying RF model to the test intervals + :return: Dict of RF information + """ + run_data = { + "input_args": input_args, + "features_importance": features_importance, + "test_intervals": test_intervals, + } + + if test_results is not None: + tps = 0 + total = 0 + for row in test_results: + values = list(row.values()) + # Note: values[0] is the TP/FP label and values[1] is the prediction + if values[0] == values[1]: + tps += values[2] + total += values[2] + run_data["test_results"] = [dict(x) for x in test_results] + run_data["test_accuracy"] = tps / total + + return run_data
+ + +
[docs]def pretty_print_runs( + runs: Dict, label_col: str = "rf_label", prediction_col_name: str = "rf_prediction" +) -> None: + """ + Print the information for the RF runs loaded from the JSON file storing the RF run hashes -> info. + + :param runs: Dictionary containing JSON input loaded from RF run file + :param label_col: Name of the RF label column + :param prediction_col_name: Name of the RF prediction column + :return: Nothing -- only prints information + """ + for run_hash, run_data in runs.items(): + print(f"\n=== {run_hash} ===") + testing_results = ( + run_data.pop("test_results") if "test_results" in run_data else None + ) + print(json.dumps(run_data, sort_keys=True, indent=4, separators=(",", ": "))) + if testing_results is not None: + # Print results + res_pd = pd.DataFrame(testing_results) + res_pd = res_pd.pivot( + index=label_col, columns=prediction_col_name, values="n" + ) + logger.info("Testing results:\n%s", pformat(res_pd))
+
+ +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/_modules/gnomad/variant_qc/training.html b/_modules/gnomad/variant_qc/training.html new file mode 100644 index 000000000..518e127de --- /dev/null +++ b/_modules/gnomad/variant_qc/training.html @@ -0,0 +1,228 @@ + + + + + + gnomad.variant_qc.training — gnomad master documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for gnomad.variant_qc.training

+# noqa: D100
+
+import logging
+from pprint import pformat
+from typing import Optional, Tuple
+
+import hail as hl
+
+logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s")
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+
+
[docs]def sample_training_examples( + ht: hl.Table, + tp_expr: hl.BooleanExpression, + fp_expr: hl.BooleanExpression, + fp_to_tp: float = 1.0, + test_expr: Optional[hl.expr.BooleanExpression] = None, +) -> hl.Table: + """ + Return a Table of all positive and negative training examples in `ht` with an annotation indicating those that should be used for training. + + If `fp_to_tp` is greater than 0, this false positive (FP) to true positive (TP) ratio will be used to determine + sampling of training variants. + + The returned Table has the following annotations: + - train: indicates if the variant should be used for training. A row is given False for the annotation if True + for `test_expr`, True for both `tp_expr` and `fp_expr`, or it is pruned out to obtain the desired `fp_to_tp` ratio. + - label: indicates if a variant is a 'TP' or 'FP' and will also be labeled as such for variants defined by `test_expr`. + + .. note:: + + - This function does not support multi-allelic variants. + - The function will give some stats about the TPs/FPs provided (Ti, Tv, indels). + + :param ht: Input Table. + :param tp_expr: Expression for TP examples. + :param fp_expr: Expression for FP examples. + :param fp_to_tp: FP to TP ratio. If set to <= 0, all training examples are used. + :param test_expr: Optional expression to exclude a set of variants from training set. Still contains TP/FP label annotation. + :return: Table subset with corresponding TP and FP examples with desired FP to TP ratio. + """ + ht = ht.select( + _tp=hl.or_else(tp_expr, False), + _fp=hl.or_else(fp_expr, False), + _exclude=False if test_expr is None else test_expr, + ) + ht = ht.filter(ht._tp | ht._fp).persist() + + # Get stats about TP / FP sets + def _get_train_counts(ht: hl.Table) -> Tuple[int, int]: + """ + Determine the number of TP and FP variants in the input Table and report some stats on Ti, Tv, indels. 
 + + :param ht: Input Table + :return: Counts of TP and FP variants in the table + """ + train_stats = hl.struct(n=hl.agg.count()) + + if "alleles" in ht.row and ht.row.alleles.dtype == hl.tarray(hl.tstr): + train_stats = train_stats.annotate( + ti=hl.agg.count_where( + hl.expr.is_transition(ht.alleles[0], ht.alleles[1]) + ), + tv=hl.agg.count_where( + hl.expr.is_transversion(ht.alleles[0], ht.alleles[1]) + ), + indel=hl.agg.count_where( + hl.expr.is_indel(ht.alleles[0], ht.alleles[1]) + ), + ) + + # Aggregate training counts by contig and TP/FP status + pd_stats = ( + ht.group_by(**{"contig": ht.locus.contig, "tp": ht._tp, "fp": ht._fp}) + .aggregate(**train_stats) + .to_pandas() + ) + + logger.info(pformat(pd_stats)) + pd_stats = pd_stats.fillna(False) + + # Number of true positive and false positive variants to be sampled for + # the training set + n_tp = pd_stats[pd_stats["tp"] & ~pd_stats["fp"]]["n"].sum() + n_fp = pd_stats[~pd_stats["tp"] & pd_stats["fp"]]["n"].sum() + + return n_tp, n_fp + + n_tp, n_fp = _get_train_counts(ht.filter(~ht._exclude)) + + prob_tp = prob_fp = 1.0 + if fp_to_tp > 0: + desired_fp = fp_to_tp * n_tp + if desired_fp < n_fp: + prob_fp = desired_fp / n_fp + else: + prob_tp = n_fp / desired_fp + + logger.info( + "Training examples sampling: tp=%f*%d, fp=%f*%d", + prob_tp, + n_tp, + prob_fp, + n_fp, + ) + + train_expr = ( + hl.case(missing_false=True) + .when(ht._fp & hl.or_else(~ht._tp, True), hl.rand_bool(prob_fp)) + .when(ht._tp & hl.or_else(~ht._fp, True), hl.rand_bool(prob_tp)) + .default(False) + ) + else: + train_expr = ~(ht._tp & ht._fp) + logger.info("Using all %d TP and %d FP training examples.", n_tp, n_fp) + + label_expr = ( + hl.case(missing_false=True) + .when(ht._tp & hl.or_else(~ht._fp, True), "TP") + .when(ht._fp & hl.or_else(~ht._tp, True), "FP") + .default(hl.null(hl.tstr)) + ) + + return ht.select(train=train_expr & ~ht._exclude, label=label_expr)
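A short sketch, again with hypothetical truth annotations, keeping two FPs per TP and holding out chromosome 20:

.. code-block:: python

    train_ht = sample_training_examples(
        ht,
        tp_expr=ht.omni | ht.mills,      # hypothetical TP annotation
        fp_expr=ht.fail_hard_filters,    # hypothetical FP annotation
        fp_to_tp=2.0,
        test_expr=ht.locus.contig == "20",
    )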
+
+ +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/_modules/index.html b/_modules/index.html new file mode 100644 index 000000000..7fe692672 --- /dev/null +++ b/_modules/index.html @@ -0,0 +1,137 @@ + + + + + + Overview: module code — gnomad master documentation + + + + + + + + + + + + + + + + +
+ + +
+ + +
+
+ + + + \ No newline at end of file diff --git a/_sources/api_reference/assessment/index.rst.txt b/_sources/api_reference/assessment/index.rst.txt new file mode 100644 index 000000000..976a33d5a --- /dev/null +++ b/_sources/api_reference/assessment/index.rst.txt @@ -0,0 +1,10 @@ +gnomad.assessment +================= + + + +.. toctree:: + :maxdepth: 2 + + summary_stats + validity_checks diff --git a/_sources/api_reference/assessment/summary_stats.rst.txt b/_sources/api_reference/assessment/summary_stats.rst.txt new file mode 100644 index 000000000..1936e3624 --- /dev/null +++ b/_sources/api_reference/assessment/summary_stats.rst.txt @@ -0,0 +1,8 @@ +gnomad.assessment.summary_stats +=============================== + + + +.. gnomad_automodulesummary:: gnomad.assessment.summary_stats + +.. automodule:: gnomad.assessment.summary_stats diff --git a/_sources/api_reference/assessment/validity_checks.rst.txt b/_sources/api_reference/assessment/validity_checks.rst.txt new file mode 100644 index 000000000..e3b04ec3d --- /dev/null +++ b/_sources/api_reference/assessment/validity_checks.rst.txt @@ -0,0 +1,8 @@ +gnomad.assessment.validity_checks +================================= + + + +.. gnomad_automodulesummary:: gnomad.assessment.validity_checks + +.. automodule:: gnomad.assessment.validity_checks diff --git a/_sources/api_reference/index.rst.txt b/_sources/api_reference/index.rst.txt new file mode 100644 index 000000000..b88f5d4b2 --- /dev/null +++ b/_sources/api_reference/index.rst.txt @@ -0,0 +1,13 @@ +gnomad +====== + + + +.. toctree:: + :maxdepth: 2 + + variant_qc + utils + resources + sample_qc + assessment diff --git a/_sources/api_reference/resources/config.rst.txt b/_sources/api_reference/resources/config.rst.txt new file mode 100644 index 000000000..0e4e3a5a2 --- /dev/null +++ b/_sources/api_reference/resources/config.rst.txt @@ -0,0 +1,8 @@ +gnomad.resources.config +======================= + +Configuration for loading resources. + +.. gnomad_automodulesummary:: gnomad.resources.config + +.. automodule:: gnomad.resources.config diff --git a/_sources/api_reference/resources/grch37/gnomad.rst.txt b/_sources/api_reference/resources/grch37/gnomad.rst.txt new file mode 100644 index 000000000..14ec28588 --- /dev/null +++ b/_sources/api_reference/resources/grch37/gnomad.rst.txt @@ -0,0 +1,8 @@ +gnomad.resources.grch37.gnomad +============================== + + + +.. gnomad_automodulesummary:: gnomad.resources.grch37.gnomad + +.. automodule:: gnomad.resources.grch37.gnomad diff --git a/_sources/api_reference/resources/grch37/gnomad_ld.rst.txt b/_sources/api_reference/resources/grch37/gnomad_ld.rst.txt new file mode 100644 index 000000000..e089bf7bc --- /dev/null +++ b/_sources/api_reference/resources/grch37/gnomad_ld.rst.txt @@ -0,0 +1,8 @@ +gnomad.resources.grch37.gnomad_ld +================================= + + + +.. gnomad_automodulesummary:: gnomad.resources.grch37.gnomad_ld + +.. automodule:: gnomad.resources.grch37.gnomad_ld diff --git a/_sources/api_reference/resources/grch37/index.rst.txt b/_sources/api_reference/resources/grch37/index.rst.txt new file mode 100644 index 000000000..bca6f583a --- /dev/null +++ b/_sources/api_reference/resources/grch37/index.rst.txt @@ -0,0 +1,11 @@ +gnomad.resources.grch37 +======================= + + + +.. 
toctree:: + :maxdepth: 2 + + gnomad + gnomad_ld + reference_data diff --git a/_sources/api_reference/resources/grch37/reference_data.rst.txt b/_sources/api_reference/resources/grch37/reference_data.rst.txt new file mode 100644 index 000000000..62a9fb8b2 --- /dev/null +++ b/_sources/api_reference/resources/grch37/reference_data.rst.txt @@ -0,0 +1,8 @@ +gnomad.resources.grch37.reference_data +====================================== + + + +.. gnomad_automodulesummary:: gnomad.resources.grch37.reference_data + +.. automodule:: gnomad.resources.grch37.reference_data diff --git a/_sources/api_reference/resources/grch38/gnomad.rst.txt b/_sources/api_reference/resources/grch38/gnomad.rst.txt new file mode 100644 index 000000000..10c657475 --- /dev/null +++ b/_sources/api_reference/resources/grch38/gnomad.rst.txt @@ -0,0 +1,8 @@ +gnomad.resources.grch38.gnomad +============================== + + + +.. gnomad_automodulesummary:: gnomad.resources.grch38.gnomad + +.. automodule:: gnomad.resources.grch38.gnomad diff --git a/_sources/api_reference/resources/grch38/index.rst.txt b/_sources/api_reference/resources/grch38/index.rst.txt new file mode 100644 index 000000000..b69f5d4ed --- /dev/null +++ b/_sources/api_reference/resources/grch38/index.rst.txt @@ -0,0 +1,10 @@ +gnomad.resources.grch38 +======================= + + + +.. toctree:: + :maxdepth: 2 + + gnomad + reference_data diff --git a/_sources/api_reference/resources/grch38/reference_data.rst.txt b/_sources/api_reference/resources/grch38/reference_data.rst.txt new file mode 100644 index 000000000..310c936a7 --- /dev/null +++ b/_sources/api_reference/resources/grch38/reference_data.rst.txt @@ -0,0 +1,8 @@ +gnomad.resources.grch38.reference_data +====================================== + + + +.. gnomad_automodulesummary:: gnomad.resources.grch38.reference_data + +.. automodule:: gnomad.resources.grch38.reference_data diff --git a/_sources/api_reference/resources/import_resources.rst.txt b/_sources/api_reference/resources/import_resources.rst.txt new file mode 100644 index 000000000..960c276c2 --- /dev/null +++ b/_sources/api_reference/resources/import_resources.rst.txt @@ -0,0 +1,8 @@ +gnomad.resources.import_resources +================================= + + + +.. gnomad_automodulesummary:: gnomad.resources.import_resources + +.. automodule:: gnomad.resources.import_resources diff --git a/_sources/api_reference/resources/index.rst.txt b/_sources/api_reference/resources/index.rst.txt new file mode 100644 index 000000000..b03e65cc5 --- /dev/null +++ b/_sources/api_reference/resources/index.rst.txt @@ -0,0 +1,13 @@ +gnomad.resources +================ + + + +.. toctree:: + :maxdepth: 2 + + config + grch37 + grch38 + import_resources + resource_utils diff --git a/_sources/api_reference/resources/resource_utils.rst.txt b/_sources/api_reference/resources/resource_utils.rst.txt new file mode 100644 index 000000000..5006e07af --- /dev/null +++ b/_sources/api_reference/resources/resource_utils.rst.txt @@ -0,0 +1,8 @@ +gnomad.resources.resource_utils +=============================== + + + +.. gnomad_automodulesummary:: gnomad.resources.resource_utils + +.. automodule:: gnomad.resources.resource_utils diff --git a/_sources/api_reference/sample_qc/ancestry.rst.txt b/_sources/api_reference/sample_qc/ancestry.rst.txt new file mode 100644 index 000000000..8861c0e36 --- /dev/null +++ b/_sources/api_reference/sample_qc/ancestry.rst.txt @@ -0,0 +1,8 @@ +gnomad.sample_qc.ancestry +========================= + + + +.. 
gnomad_automodulesummary:: gnomad.sample_qc.ancestry + +.. automodule:: gnomad.sample_qc.ancestry diff --git a/_sources/api_reference/sample_qc/filtering.rst.txt b/_sources/api_reference/sample_qc/filtering.rst.txt new file mode 100644 index 000000000..ed6bf609f --- /dev/null +++ b/_sources/api_reference/sample_qc/filtering.rst.txt @@ -0,0 +1,8 @@ +gnomad.sample_qc.filtering +========================== + + + +.. gnomad_automodulesummary:: gnomad.sample_qc.filtering + +.. automodule:: gnomad.sample_qc.filtering diff --git a/_sources/api_reference/sample_qc/index.rst.txt b/_sources/api_reference/sample_qc/index.rst.txt new file mode 100644 index 000000000..427194953 --- /dev/null +++ b/_sources/api_reference/sample_qc/index.rst.txt @@ -0,0 +1,14 @@ +gnomad.sample_qc +================ + + + +.. toctree:: + :maxdepth: 2 + + ancestry + filtering + pipeline + platform + relatedness + sex diff --git a/_sources/api_reference/sample_qc/pipeline.rst.txt b/_sources/api_reference/sample_qc/pipeline.rst.txt new file mode 100644 index 000000000..b7f2f152c --- /dev/null +++ b/_sources/api_reference/sample_qc/pipeline.rst.txt @@ -0,0 +1,8 @@ +gnomad.sample_qc.pipeline +========================= + + + +.. gnomad_automodulesummary:: gnomad.sample_qc.pipeline + +.. automodule:: gnomad.sample_qc.pipeline diff --git a/_sources/api_reference/sample_qc/platform.rst.txt b/_sources/api_reference/sample_qc/platform.rst.txt new file mode 100644 index 000000000..68a908e57 --- /dev/null +++ b/_sources/api_reference/sample_qc/platform.rst.txt @@ -0,0 +1,8 @@ +gnomad.sample_qc.platform +========================= + + + +.. gnomad_automodulesummary:: gnomad.sample_qc.platform + +.. automodule:: gnomad.sample_qc.platform diff --git a/_sources/api_reference/sample_qc/relatedness.rst.txt b/_sources/api_reference/sample_qc/relatedness.rst.txt new file mode 100644 index 000000000..f75b1353a --- /dev/null +++ b/_sources/api_reference/sample_qc/relatedness.rst.txt @@ -0,0 +1,8 @@ +gnomad.sample_qc.relatedness +============================ + + + +.. gnomad_automodulesummary:: gnomad.sample_qc.relatedness + +.. automodule:: gnomad.sample_qc.relatedness diff --git a/_sources/api_reference/sample_qc/sex.rst.txt b/_sources/api_reference/sample_qc/sex.rst.txt new file mode 100644 index 000000000..aa8b949b6 --- /dev/null +++ b/_sources/api_reference/sample_qc/sex.rst.txt @@ -0,0 +1,8 @@ +gnomad.sample_qc.sex +==================== + + + +.. gnomad_automodulesummary:: gnomad.sample_qc.sex + +.. automodule:: gnomad.sample_qc.sex diff --git a/_sources/api_reference/utils/annotations.rst.txt b/_sources/api_reference/utils/annotations.rst.txt new file mode 100644 index 000000000..161e28df9 --- /dev/null +++ b/_sources/api_reference/utils/annotations.rst.txt @@ -0,0 +1,8 @@ +gnomad.utils.annotations +======================== + + + +.. gnomad_automodulesummary:: gnomad.utils.annotations + +.. automodule:: gnomad.utils.annotations diff --git a/_sources/api_reference/utils/constraint.rst.txt b/_sources/api_reference/utils/constraint.rst.txt new file mode 100644 index 000000000..cf1329aeb --- /dev/null +++ b/_sources/api_reference/utils/constraint.rst.txt @@ -0,0 +1,8 @@ +gnomad.utils.constraint +======================= + +Script containing generic constraint functions that may be used in the constraint pipeline. + +.. gnomad_automodulesummary:: gnomad.utils.constraint + +.. 
automodule:: gnomad.utils.constraint diff --git a/_sources/api_reference/utils/file_utils.rst.txt b/_sources/api_reference/utils/file_utils.rst.txt new file mode 100644 index 000000000..6c9774928 --- /dev/null +++ b/_sources/api_reference/utils/file_utils.rst.txt @@ -0,0 +1,8 @@ +gnomad.utils.file_utils +======================= + + + +.. gnomad_automodulesummary:: gnomad.utils.file_utils + +.. automodule:: gnomad.utils.file_utils diff --git a/_sources/api_reference/utils/filtering.rst.txt b/_sources/api_reference/utils/filtering.rst.txt new file mode 100644 index 000000000..9ac71882d --- /dev/null +++ b/_sources/api_reference/utils/filtering.rst.txt @@ -0,0 +1,8 @@ +gnomad.utils.filtering +====================== + + + +.. gnomad_automodulesummary:: gnomad.utils.filtering + +.. automodule:: gnomad.utils.filtering diff --git a/_sources/api_reference/utils/gen_stats.rst.txt b/_sources/api_reference/utils/gen_stats.rst.txt new file mode 100644 index 000000000..03c57bfde --- /dev/null +++ b/_sources/api_reference/utils/gen_stats.rst.txt @@ -0,0 +1,8 @@ +gnomad.utils.gen_stats +====================== + + + +.. gnomad_automodulesummary:: gnomad.utils.gen_stats + +.. automodule:: gnomad.utils.gen_stats diff --git a/_sources/api_reference/utils/index.rst.txt b/_sources/api_reference/utils/index.rst.txt new file mode 100644 index 000000000..efe9bad03 --- /dev/null +++ b/_sources/api_reference/utils/index.rst.txt @@ -0,0 +1,23 @@ +gnomad.utils +============ + + + +.. toctree:: + :maxdepth: 2 + + annotations + constraint + file_utils + filtering + gen_stats + intervals + liftover + plotting + reference_genome + release + slack + sparse_mt + transcript_annotation + vcf + vep diff --git a/_sources/api_reference/utils/intervals.rst.txt b/_sources/api_reference/utils/intervals.rst.txt new file mode 100644 index 000000000..dbee41ca8 --- /dev/null +++ b/_sources/api_reference/utils/intervals.rst.txt @@ -0,0 +1,8 @@ +gnomad.utils.intervals +====================== + + + +.. gnomad_automodulesummary:: gnomad.utils.intervals + +.. automodule:: gnomad.utils.intervals diff --git a/_sources/api_reference/utils/liftover.rst.txt b/_sources/api_reference/utils/liftover.rst.txt new file mode 100644 index 000000000..243b96db0 --- /dev/null +++ b/_sources/api_reference/utils/liftover.rst.txt @@ -0,0 +1,8 @@ +gnomad.utils.liftover +===================== + + + +.. gnomad_automodulesummary:: gnomad.utils.liftover + +.. automodule:: gnomad.utils.liftover diff --git a/_sources/api_reference/utils/plotting.rst.txt b/_sources/api_reference/utils/plotting.rst.txt new file mode 100644 index 000000000..b1ee5ef22 --- /dev/null +++ b/_sources/api_reference/utils/plotting.rst.txt @@ -0,0 +1,8 @@ +gnomad.utils.plotting +===================== + + + +.. gnomad_automodulesummary:: gnomad.utils.plotting + +.. automodule:: gnomad.utils.plotting diff --git a/_sources/api_reference/utils/reference_genome.rst.txt b/_sources/api_reference/utils/reference_genome.rst.txt new file mode 100644 index 000000000..1e4b3b45d --- /dev/null +++ b/_sources/api_reference/utils/reference_genome.rst.txt @@ -0,0 +1,8 @@ +gnomad.utils.reference_genome +============================= + + + +.. gnomad_automodulesummary:: gnomad.utils.reference_genome + +.. 
automodule:: gnomad.utils.reference_genome diff --git a/_sources/api_reference/utils/release.rst.txt b/_sources/api_reference/utils/release.rst.txt new file mode 100644 index 000000000..fa48843cf --- /dev/null +++ b/_sources/api_reference/utils/release.rst.txt @@ -0,0 +1,8 @@ +gnomad.utils.release +==================== + + + +.. gnomad_automodulesummary:: gnomad.utils.release + +.. automodule:: gnomad.utils.release diff --git a/_sources/api_reference/utils/slack.rst.txt b/_sources/api_reference/utils/slack.rst.txt new file mode 100644 index 000000000..34a185476 --- /dev/null +++ b/_sources/api_reference/utils/slack.rst.txt @@ -0,0 +1,8 @@ +gnomad.utils.slack +================== + + + +.. gnomad_automodulesummary:: gnomad.utils.slack + +.. automodule:: gnomad.utils.slack diff --git a/_sources/api_reference/utils/sparse_mt.rst.txt b/_sources/api_reference/utils/sparse_mt.rst.txt new file mode 100644 index 000000000..ac1bafb84 --- /dev/null +++ b/_sources/api_reference/utils/sparse_mt.rst.txt @@ -0,0 +1,8 @@ +gnomad.utils.sparse_mt +====================== + + + +.. gnomad_automodulesummary:: gnomad.utils.sparse_mt + +.. automodule:: gnomad.utils.sparse_mt diff --git a/_sources/api_reference/utils/transcript_annotation.rst.txt b/_sources/api_reference/utils/transcript_annotation.rst.txt new file mode 100644 index 000000000..008dabd08 --- /dev/null +++ b/_sources/api_reference/utils/transcript_annotation.rst.txt @@ -0,0 +1,8 @@ +gnomad.utils.transcript_annotation +================================== + +Utils module containing generic functions that are useful for adding transcript expression-aware annotations. + +.. gnomad_automodulesummary:: gnomad.utils.transcript_annotation + +.. automodule:: gnomad.utils.transcript_annotation diff --git a/_sources/api_reference/utils/vcf.rst.txt b/_sources/api_reference/utils/vcf.rst.txt new file mode 100644 index 000000000..184cac449 --- /dev/null +++ b/_sources/api_reference/utils/vcf.rst.txt @@ -0,0 +1,8 @@ +gnomad.utils.vcf +================ + + + +.. gnomad_automodulesummary:: gnomad.utils.vcf + +.. automodule:: gnomad.utils.vcf diff --git a/_sources/api_reference/utils/vep.rst.txt b/_sources/api_reference/utils/vep.rst.txt new file mode 100644 index 000000000..8a8164622 --- /dev/null +++ b/_sources/api_reference/utils/vep.rst.txt @@ -0,0 +1,8 @@ +gnomad.utils.vep +================ + + + +.. gnomad_automodulesummary:: gnomad.utils.vep + +.. automodule:: gnomad.utils.vep diff --git a/_sources/api_reference/variant_qc/evaluation.rst.txt b/_sources/api_reference/variant_qc/evaluation.rst.txt new file mode 100644 index 000000000..9a07c1da4 --- /dev/null +++ b/_sources/api_reference/variant_qc/evaluation.rst.txt @@ -0,0 +1,8 @@ +gnomad.variant_qc.evaluation +============================ + + + +.. gnomad_automodulesummary:: gnomad.variant_qc.evaluation + +.. automodule:: gnomad.variant_qc.evaluation diff --git a/_sources/api_reference/variant_qc/index.rst.txt b/_sources/api_reference/variant_qc/index.rst.txt new file mode 100644 index 000000000..be81532bd --- /dev/null +++ b/_sources/api_reference/variant_qc/index.rst.txt @@ -0,0 +1,13 @@ +gnomad.variant_qc +================= + + + +.. toctree:: + :maxdepth: 2 + + evaluation + ld + pipeline + random_forest + training diff --git a/_sources/api_reference/variant_qc/ld.rst.txt b/_sources/api_reference/variant_qc/ld.rst.txt new file mode 100644 index 000000000..e9d425bd0 --- /dev/null +++ b/_sources/api_reference/variant_qc/ld.rst.txt @@ -0,0 +1,8 @@ +gnomad.variant_qc.ld +==================== + + + +.. 
gnomad_automodulesummary:: gnomad.variant_qc.ld + +.. automodule:: gnomad.variant_qc.ld diff --git a/_sources/api_reference/variant_qc/pipeline.rst.txt b/_sources/api_reference/variant_qc/pipeline.rst.txt new file mode 100644 index 000000000..d92052dca --- /dev/null +++ b/_sources/api_reference/variant_qc/pipeline.rst.txt @@ -0,0 +1,8 @@ +gnomad.variant_qc.pipeline +========================== + + + +.. gnomad_automodulesummary:: gnomad.variant_qc.pipeline + +.. automodule:: gnomad.variant_qc.pipeline diff --git a/_sources/api_reference/variant_qc/random_forest.rst.txt b/_sources/api_reference/variant_qc/random_forest.rst.txt new file mode 100644 index 000000000..6ef4cda76 --- /dev/null +++ b/_sources/api_reference/variant_qc/random_forest.rst.txt @@ -0,0 +1,8 @@ +gnomad.variant_qc.random_forest +=============================== + + + +.. gnomad_automodulesummary:: gnomad.variant_qc.random_forest + +.. automodule:: gnomad.variant_qc.random_forest diff --git a/_sources/api_reference/variant_qc/training.rst.txt b/_sources/api_reference/variant_qc/training.rst.txt new file mode 100644 index 000000000..2bb8c6292 --- /dev/null +++ b/_sources/api_reference/variant_qc/training.rst.txt @@ -0,0 +1,8 @@ +gnomad.variant_qc.training +========================== + + + +.. gnomad_automodulesummary:: gnomad.variant_qc.training + +.. automodule:: gnomad.variant_qc.training diff --git a/_sources/examples/index.rst.txt b/_sources/examples/index.rst.txt new file mode 100644 index 000000000..d6409e072 --- /dev/null +++ b/_sources/examples/index.rst.txt @@ -0,0 +1,7 @@ +Examples +======== + +.. toctree:: + :maxdepth: 1 + + vep diff --git a/_sources/examples/vep.rst.txt b/_sources/examples/vep.rst.txt new file mode 100644 index 000000000..9961e35da --- /dev/null +++ b/_sources/examples/vep.rst.txt @@ -0,0 +1,39 @@ +Variant Effect Predictor (VEP) +============================== + +To use the `Ensembl Variant Effect Predictor `_ with Hail on Google Dataproc, +the ``--vep`` flag must be included when starting the cluster. Note that a cluster's VEP configuration is +tied to a specific reference genome. + +.. code-block:: shell + + hailctl dataproc start cluster-name --vep GRCh37 --packages gnomad + +.. note:: + + VEP data is stored in requester pays buckets. Reading from these buckets will bill charges to the project + in which the cluster is created. + +Import variants into a sites-only Hail Table:: + + import hail as hl + + ds = hl.import_vcf("/path/to/data.vcf.gz", reference_genome="GRCh37", drop_samples=True).rows() + +Annotate variants with VEP consequences:: + + from gnomad.utils.vep import vep_or_lookup_vep + + ds = vep_or_lookup_vep(ds, reference="GRCh37") + +:py:func:`vep_or_lookup_vep <gnomad.utils.vep.vep_or_lookup_vep>` uses a precomputed dataset to +drastically speed up this process. + +Identify the most severe consequence for each variant:: + + from gnomad.utils.vep import process_consequences + + ds = process_consequences(ds) + +:py:func:`process_consequences <gnomad.utils.vep.process_consequences>` adds ``worst_consequence_term``, +``worst_csq_for_variant``, ``worst_csq_by_gene``, and other fields to ``ds.vep``.
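 + +As a quick check of the result, the worst consequence terms can be tallied across the table (a minimal sketch; ``worst_consequence_term`` is one of the fields added above, and the aggregation triggers a Spark job):: + + # tally variants by the worst consequence term added by process_consequences + variant_counts = ds.aggregate(hl.agg.counter(ds.vep.worst_consequence_term)) + print(variant_counts) diff --git a/_sources/getting_started.rst.txt b/_sources/getting_started.rst.txt new file mode 100644 index 000000000..5f93c9ec6 --- /dev/null +++ b/_sources/getting_started.rst.txt @@ -0,0 +1,40 @@ +Getting Started +=============== + +1. `Install Hail `_:: + + pip install hail + +2. 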
Use ``hailctl`` to start a `Google Dataproc `_ cluster with the + ``gnomad`` package installed (see `Hail on the Cloud `_ for more detail on ``hailctl``):: + + hailctl dataproc start cluster-name --packages gnomad + +3. Connect to a `Jupyter Notebook `_ on the cluster:: + + hailctl dataproc connect cluster-name notebook + +4. Import gnomAD data in `Hail Table `_ format (a quick sanity check on these tables is sketched after this list): + + * gnomAD v2.1.1 variants:: + + from gnomad.resources.grch37 import gnomad + + gnomad_v2_exomes = gnomad.public_release("exomes") + exomes_ht = gnomad_v2_exomes.ht() + exomes_ht.describe() + + gnomad_v2_genomes = gnomad.public_release("genomes") + genomes_ht = gnomad_v2_genomes.ht() + genomes_ht.describe() + + * gnomAD v3 variants:: + + from gnomad.resources.grch38 import gnomad + + gnomad_v3_genomes = gnomad.public_release("genomes") + ht = gnomad_v3_genomes.ht() + ht.describe() + +5. Shut down the cluster when finished with it:: + + hailctl dataproc stop cluster-name
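 + +As referenced in step 4, a quick sanity check that the release data loaded correctly is to count the variants that pass all variant QC filters (a minimal sketch, assuming the release table carries the gnomAD ``filters`` set annotation; ``count`` triggers a Spark job):: + + import hail as hl + + # assumes the release HT's `filters` set field, where PASS == empty set + n_pass = exomes_ht.filter(hl.len(exomes_ht.filters) == 0).count() + print(n_pass, "PASS variants in the gnomAD v2.1.1 exomes") diff --git a/_sources/index.rst.txt b/_sources/index.rst.txt new file mode 100644 index 000000000..3178814ff --- /dev/null +++ b/_sources/index.rst.txt @@ -0,0 +1,18 @@ +gnomad +====== + +This package contains a number of `Hail `_ utility functions +and scripts for the `gnomAD project `_ and +the `Translational Genomics Group `_. + +Contents +======== + +.. toctree:: + :maxdepth: 3 + + Getting Started + Examples + API Reference + Resource Sources + Change Log diff --git a/_sources/resource_sources.rst.txt b/_sources/resource_sources.rst.txt new file mode 100644 index 000000000..ce59dad1e --- /dev/null +++ b/_sources/resource_sources.rst.txt @@ -0,0 +1,58 @@ +Resource Sources +================ + +gnomAD data is available through `multiple cloud providers' public datasets programs `_. + +The functions in the :doc:`gnomad.resources ` package can be configured to load data from different sources. + +If Hail determines that it is running in a cloud provider's Spark environment, resources will default to being read from that cloud provider's datasets program. +For example, resources will be read from Azure Open Datasets if Hail determines that it is running on an Azure HDInsight cluster. +Otherwise, resources will default to being read from Google Cloud Public Datasets. +This can be configured using the ``GNOMAD_DEFAULT_PUBLIC_RESOURCE_SOURCE`` environment variable. + +To load resources from a different source (for example, the gnomAD project's public GCS bucket), use: + +.. code-block:: python + + from gnomad.resources.config import gnomad_public_resource_configuration, GnomadPublicResourceSource + + gnomad_public_resource_configuration.source = GnomadPublicResourceSource.GNOMAD + +To see all available public sources for gnomAD resources, use: + +.. code-block:: python + + from gnomad.resources.config import GnomadPublicResourceSource + + list(GnomadPublicResourceSource) + +.. note:: + + The gnomAD project's bucket (``gs://gnomad-public-requester-pays``) is `requester pays `_, meaning that charges for data access and transfer will be billed to your Google Cloud project. + + Clusters must be configured to read requester pays buckets during creation. For example, + + .. code-block:: shell + + hailctl dataproc start cluster-name --packages gnomad --requester-pays-allow-buckets gnomad-public-requester-pays + +Custom Sources +-------------- + +Alternatively, instead of using one of the pre-defined public sources, a custom source can be provided. + +.. 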
code-block:: python + + from gnomad.resources.config import gnomad_public_resource_configuration + + gnomad_public_resource_configuration.source = "gs://my-bucket/gnomad-resources" + +Environment Configuration +------------------------- + +The default source can be configured through the ``GNOMAD_DEFAULT_PUBLIC_RESOURCE_SOURCE`` environment variable. This variable can be set to either the name of one of the public datasets programs or the URL of a custom source. + +Examples: + +- ``GNOMAD_DEFAULT_PUBLIC_RESOURCE_SOURCE="Google Cloud Public Datasets"`` +- ``GNOMAD_DEFAULT_PUBLIC_RESOURCE_SOURCE="gs://my-bucket/gnomad-resources"`` diff --git a/_static/_sphinx_javascript_frameworks_compat.js b/_static/_sphinx_javascript_frameworks_compat.js new file mode 100644 index 000000000..81415803e --- /dev/null +++ b/_static/_sphinx_javascript_frameworks_compat.js @@ -0,0 +1,123 @@ +/* Compatability shim for jQuery and underscores.js. + * + * Copyright Sphinx contributors + * Released under the two clause BSD licence + */ + +/** + * small helper function to urldecode strings + * + * See https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/decodeURIComponent#Decoding_query_parameters_from_a_URL + */ +jQuery.urldecode = function(x) { + if (!x) { + return x + } + return decodeURIComponent(x.replace(/\+/g, ' ')); +}; + +/** + * small helper function to urlencode strings + */ +jQuery.urlencode = encodeURIComponent; + +/** + * This function returns the parsed url parameters of the + * current request. Multiple values per key are supported, + * it will always return arrays of strings for the value parts. + */ +jQuery.getQueryParameters = function(s) { + if (typeof s === 'undefined') + s = document.location.search; + var parts = s.substr(s.indexOf('?') + 1).split('&'); + var result = {}; + for (var i = 0; i < parts.length; i++) { + var tmp = parts[i].split('=', 2); + var key = jQuery.urldecode(tmp[0]); + var value = jQuery.urldecode(tmp[1]); + if (key in result) + result[key].push(value); + else + result[key] = [value]; + } + return result; +}; + +/** + * highlight a given string on a jquery object by wrapping it in + * span elements with the given class name. 
+ */ +jQuery.fn.highlightText = function(text, className) { + function highlight(node, addItems) { + if (node.nodeType === 3) { + var val = node.nodeValue; + var pos = val.toLowerCase().indexOf(text); + if (pos >= 0 && + !jQuery(node.parentNode).hasClass(className) && + !jQuery(node.parentNode).hasClass("nohighlight")) { + var span; + var isInSVG = jQuery(node).closest("body, svg, foreignObject").is("svg"); + if (isInSVG) { + span = document.createElementNS("http://www.w3.org/2000/svg", "tspan"); + } else { + span = document.createElement("span"); + span.className = className; + } + span.appendChild(document.createTextNode(val.substr(pos, text.length))); + node.parentNode.insertBefore(span, node.parentNode.insertBefore( + document.createTextNode(val.substr(pos + text.length)), + node.nextSibling)); + node.nodeValue = val.substr(0, pos); + if (isInSVG) { + var rect = document.createElementNS("http://www.w3.org/2000/svg", "rect"); + var bbox = node.parentElement.getBBox(); + rect.x.baseVal.value = bbox.x; + rect.y.baseVal.value = bbox.y; + rect.width.baseVal.value = bbox.width; + rect.height.baseVal.value = bbox.height; + rect.setAttribute('class', className); + addItems.push({ + "parent": node.parentNode, + "target": rect}); + } + } + } + else if (!jQuery(node).is("button, select, textarea")) { + jQuery.each(node.childNodes, function() { + highlight(this, addItems); + }); + } + } + var addItems = []; + var result = this.each(function() { + highlight(this, addItems); + }); + for (var i = 0; i < addItems.length; ++i) { + jQuery(addItems[i].parent).before(addItems[i].target); + } + return result; +}; + +/* + * backward compatibility for jQuery.browser + * This will be supported until firefox bug is fixed. + */ +if (!jQuery.browser) { + jQuery.uaMatch = function(ua) { + ua = ua.toLowerCase(); + + var match = /(chrome)[ \/]([\w.]+)/.exec(ua) || + /(webkit)[ \/]([\w.]+)/.exec(ua) || + /(opera)(?:.*version|)[ \/]([\w.]+)/.exec(ua) || + /(msie) ([\w.]+)/.exec(ua) || + ua.indexOf("compatible") < 0 && /(mozilla)(?:.*? rv:([\w.]+)|)/.exec(ua) || + []; + + return { + browser: match[ 1 ] || "", + version: match[ 2 ] || "0" + }; + }; + jQuery.browser = {}; + jQuery.browser[jQuery.uaMatch(navigator.userAgent).browser] = true; +} diff --git a/_static/basic.css b/_static/basic.css new file mode 100644 index 000000000..7577acb1a --- /dev/null +++ b/_static/basic.css @@ -0,0 +1,903 @@ +/* + * basic.css + * ~~~~~~~~~ + * + * Sphinx stylesheet -- basic theme. + * + * :copyright: Copyright 2007-2023 by the Sphinx team, see AUTHORS. + * :license: BSD, see LICENSE for details. 
+ * + */ + +/* -- main layout ----------------------------------------------------------- */ + +div.clearer { + clear: both; +} + +div.section::after { + display: block; + content: ''; + clear: left; +} + +/* -- relbar ---------------------------------------------------------------- */ + +div.related { + width: 100%; + font-size: 90%; +} + +div.related h3 { + display: none; +} + +div.related ul { + margin: 0; + padding: 0 0 0 10px; + list-style: none; +} + +div.related li { + display: inline; +} + +div.related li.right { + float: right; + margin-right: 5px; +} + +/* -- sidebar --------------------------------------------------------------- */ + +div.sphinxsidebarwrapper { + padding: 10px 5px 0 10px; +} + +div.sphinxsidebar { + float: left; + width: 230px; + margin-left: -100%; + font-size: 90%; + word-wrap: break-word; + overflow-wrap : break-word; +} + +div.sphinxsidebar ul { + list-style: none; +} + +div.sphinxsidebar ul ul, +div.sphinxsidebar ul.want-points { + margin-left: 20px; + list-style: square; +} + +div.sphinxsidebar ul ul { + margin-top: 0; + margin-bottom: 0; +} + +div.sphinxsidebar form { + margin-top: 10px; +} + +div.sphinxsidebar input { + border: 1px solid #98dbcc; + font-family: sans-serif; + font-size: 1em; +} + +div.sphinxsidebar #searchbox form.search { + overflow: hidden; +} + +div.sphinxsidebar #searchbox input[type="text"] { + float: left; + width: 80%; + padding: 0.25em; + box-sizing: border-box; +} + +div.sphinxsidebar #searchbox input[type="submit"] { + float: left; + width: 20%; + border-left: none; + padding: 0.25em; + box-sizing: border-box; +} + + +img { + border: 0; + max-width: 100%; +} + +/* -- search page ----------------------------------------------------------- */ + +ul.search { + margin: 10px 0 0 20px; + padding: 0; +} + +ul.search li { + padding: 5px 0 5px 20px; + background-image: url(file.png); + background-repeat: no-repeat; + background-position: 0 7px; +} + +ul.search li a { + font-weight: bold; +} + +ul.search li p.context { + color: #888; + margin: 2px 0 0 30px; + text-align: left; +} + +ul.keywordmatches li.goodmatch a { + font-weight: bold; +} + +/* -- index page ------------------------------------------------------------ */ + +table.contentstable { + width: 90%; + margin-left: auto; + margin-right: auto; +} + +table.contentstable p.biglink { + line-height: 150%; +} + +a.biglink { + font-size: 1.3em; +} + +span.linkdescr { + font-style: italic; + padding-top: 5px; + font-size: 90%; +} + +/* -- general index --------------------------------------------------------- */ + +table.indextable { + width: 100%; +} + +table.indextable td { + text-align: left; + vertical-align: top; +} + +table.indextable ul { + margin-top: 0; + margin-bottom: 0; + list-style-type: none; +} + +table.indextable > tbody > tr > td > ul { + padding-left: 0em; +} + +table.indextable tr.pcap { + height: 10px; +} + +table.indextable tr.cap { + margin-top: 10px; + background-color: #f2f2f2; +} + +img.toggler { + margin-right: 3px; + margin-top: 3px; + cursor: pointer; +} + +div.modindex-jumpbox { + border-top: 1px solid #ddd; + border-bottom: 1px solid #ddd; + margin: 1em 0 1em 0; + padding: 0.4em; +} + +div.genindex-jumpbox { + border-top: 1px solid #ddd; + border-bottom: 1px solid #ddd; + margin: 1em 0 1em 0; + padding: 0.4em; +} + +/* -- domain module index --------------------------------------------------- */ + +table.modindextable td { + padding: 2px; + border-collapse: collapse; +} + +/* -- general body styles --------------------------------------------------- */ + 
+div.body { + min-width: 360px; + max-width: 800px; +} + +div.body p, div.body dd, div.body li, div.body blockquote { + -moz-hyphens: auto; + -ms-hyphens: auto; + -webkit-hyphens: auto; + hyphens: auto; +} + +a.headerlink { + visibility: hidden; +} + +h1:hover > a.headerlink, +h2:hover > a.headerlink, +h3:hover > a.headerlink, +h4:hover > a.headerlink, +h5:hover > a.headerlink, +h6:hover > a.headerlink, +dt:hover > a.headerlink, +caption:hover > a.headerlink, +p.caption:hover > a.headerlink, +div.code-block-caption:hover > a.headerlink { + visibility: visible; +} + +div.body p.caption { + text-align: inherit; +} + +div.body td { + text-align: left; +} + +.first { + margin-top: 0 !important; +} + +p.rubric { + margin-top: 30px; + font-weight: bold; +} + +img.align-left, figure.align-left, .figure.align-left, object.align-left { + clear: left; + float: left; + margin-right: 1em; +} + +img.align-right, figure.align-right, .figure.align-right, object.align-right { + clear: right; + float: right; + margin-left: 1em; +} + +img.align-center, figure.align-center, .figure.align-center, object.align-center { + display: block; + margin-left: auto; + margin-right: auto; +} + +img.align-default, figure.align-default, .figure.align-default { + display: block; + margin-left: auto; + margin-right: auto; +} + +.align-left { + text-align: left; +} + +.align-center { + text-align: center; +} + +.align-default { + text-align: center; +} + +.align-right { + text-align: right; +} + +/* -- sidebars -------------------------------------------------------------- */ + +div.sidebar, +aside.sidebar { + margin: 0 0 0.5em 1em; + border: 1px solid #ddb; + padding: 7px; + background-color: #ffe; + width: 40%; + float: right; + clear: right; + overflow-x: auto; +} + +p.sidebar-title { + font-weight: bold; +} + +nav.contents, +aside.topic, +div.admonition, div.topic, blockquote { + clear: left; +} + +/* -- topics ---------------------------------------------------------------- */ + +nav.contents, +aside.topic, +div.topic { + border: 1px solid #ccc; + padding: 7px; + margin: 10px 0 10px 0; +} + +p.topic-title { + font-size: 1.1em; + font-weight: bold; + margin-top: 10px; +} + +/* -- admonitions ----------------------------------------------------------- */ + +div.admonition { + margin-top: 10px; + margin-bottom: 10px; + padding: 7px; +} + +div.admonition dt { + font-weight: bold; +} + +p.admonition-title { + margin: 0px 10px 5px 0px; + font-weight: bold; +} + +div.body p.centered { + text-align: center; + margin-top: 25px; +} + +/* -- content of sidebars/topics/admonitions -------------------------------- */ + +div.sidebar > :last-child, +aside.sidebar > :last-child, +nav.contents > :last-child, +aside.topic > :last-child, +div.topic > :last-child, +div.admonition > :last-child { + margin-bottom: 0; +} + +div.sidebar::after, +aside.sidebar::after, +nav.contents::after, +aside.topic::after, +div.topic::after, +div.admonition::after, +blockquote::after { + display: block; + content: ''; + clear: both; +} + +/* -- tables ---------------------------------------------------------------- */ + +table.docutils { + margin-top: 10px; + margin-bottom: 10px; + border: 0; + border-collapse: collapse; +} + +table.align-center { + margin-left: auto; + margin-right: auto; +} + +table.align-default { + margin-left: auto; + margin-right: auto; +} + +table caption span.caption-number { + font-style: italic; +} + +table caption span.caption-text { +} + +table.docutils td, table.docutils th { + padding: 1px 8px 1px 5px; + border-top: 0; + 
border-left: 0; + border-right: 0; + border-bottom: 1px solid #aaa; +} + +th { + text-align: left; + padding-right: 5px; +} + +table.citation { + border-left: solid 1px gray; + margin-left: 1px; +} + +table.citation td { + border-bottom: none; +} + +th > :first-child, +td > :first-child { + margin-top: 0px; +} + +th > :last-child, +td > :last-child { + margin-bottom: 0px; +} + +/* -- figures --------------------------------------------------------------- */ + +div.figure, figure { + margin: 0.5em; + padding: 0.5em; +} + +div.figure p.caption, figcaption { + padding: 0.3em; +} + +div.figure p.caption span.caption-number, +figcaption span.caption-number { + font-style: italic; +} + +div.figure p.caption span.caption-text, +figcaption span.caption-text { +} + +/* -- field list styles ----------------------------------------------------- */ + +table.field-list td, table.field-list th { + border: 0 !important; +} + +.field-list ul { + margin: 0; + padding-left: 1em; +} + +.field-list p { + margin: 0; +} + +.field-name { + -moz-hyphens: manual; + -ms-hyphens: manual; + -webkit-hyphens: manual; + hyphens: manual; +} + +/* -- hlist styles ---------------------------------------------------------- */ + +table.hlist { + margin: 1em 0; +} + +table.hlist td { + vertical-align: top; +} + +/* -- object description styles --------------------------------------------- */ + +.sig { + font-family: 'Consolas', 'Menlo', 'DejaVu Sans Mono', 'Bitstream Vera Sans Mono', monospace; +} + +.sig-name, code.descname { + background-color: transparent; + font-weight: bold; +} + +.sig-name { + font-size: 1.1em; +} + +code.descname { + font-size: 1.2em; +} + +.sig-prename, code.descclassname { + background-color: transparent; +} + +.optional { + font-size: 1.3em; +} + +.sig-paren { + font-size: larger; +} + +.sig-param.n { + font-style: italic; +} + +/* C++ specific styling */ + +.sig-inline.c-texpr, +.sig-inline.cpp-texpr { + font-family: unset; +} + +.sig.c .k, .sig.c .kt, +.sig.cpp .k, .sig.cpp .kt { + color: #0033B3; +} + +.sig.c .m, +.sig.cpp .m { + color: #1750EB; +} + +.sig.c .s, .sig.c .sc, +.sig.cpp .s, .sig.cpp .sc { + color: #067D17; +} + + +/* -- other body styles ----------------------------------------------------- */ + +ol.arabic { + list-style: decimal; +} + +ol.loweralpha { + list-style: lower-alpha; +} + +ol.upperalpha { + list-style: upper-alpha; +} + +ol.lowerroman { + list-style: lower-roman; +} + +ol.upperroman { + list-style: upper-roman; +} + +:not(li) > ol > li:first-child > :first-child, +:not(li) > ul > li:first-child > :first-child { + margin-top: 0px; +} + +:not(li) > ol > li:last-child > :last-child, +:not(li) > ul > li:last-child > :last-child { + margin-bottom: 0px; +} + +ol.simple ol p, +ol.simple ul p, +ul.simple ol p, +ul.simple ul p { + margin-top: 0; +} + +ol.simple > li:not(:first-child) > p, +ul.simple > li:not(:first-child) > p { + margin-top: 0; +} + +ol.simple p, +ul.simple p { + margin-bottom: 0; +} + +aside.footnote > span, +div.citation > span { + float: left; +} +aside.footnote > span:last-of-type, +div.citation > span:last-of-type { + padding-right: 0.5em; +} +aside.footnote > p { + margin-left: 2em; +} +div.citation > p { + margin-left: 4em; +} +aside.footnote > p:last-of-type, +div.citation > p:last-of-type { + margin-bottom: 0em; +} +aside.footnote > p:last-of-type:after, +div.citation > p:last-of-type:after { + content: ""; + clear: both; +} + +dl.field-list { + display: grid; + grid-template-columns: fit-content(30%) auto; +} + +dl.field-list > dt { + font-weight: bold; 
+ word-break: break-word; + padding-left: 0.5em; + padding-right: 5px; +} + +dl.field-list > dd { + padding-left: 0.5em; + margin-top: 0em; + margin-left: 0em; + margin-bottom: 0em; +} + +dl { + margin-bottom: 15px; +} + +dd > :first-child { + margin-top: 0px; +} + +dd ul, dd table { + margin-bottom: 10px; +} + +dd { + margin-top: 3px; + margin-bottom: 10px; + margin-left: 30px; +} + +dl > dd:last-child, +dl > dd:last-child > :last-child { + margin-bottom: 0; +} + +dt:target, span.highlighted { + background-color: #fbe54e; +} + +rect.highlighted { + fill: #fbe54e; +} + +dl.glossary dt { + font-weight: bold; + font-size: 1.1em; +} + +.versionmodified { + font-style: italic; +} + +.system-message { + background-color: #fda; + padding: 5px; + border: 3px solid red; +} + +.footnote:target { + background-color: #ffa; +} + +.line-block { + display: block; + margin-top: 1em; + margin-bottom: 1em; +} + +.line-block .line-block { + margin-top: 0; + margin-bottom: 0; + margin-left: 1.5em; +} + +.guilabel, .menuselection { + font-family: sans-serif; +} + +.accelerator { + text-decoration: underline; +} + +.classifier { + font-style: oblique; +} + +.classifier:before { + font-style: normal; + margin: 0 0.5em; + content: ":"; + display: inline-block; +} + +abbr, acronym { + border-bottom: dotted 1px; + cursor: help; +} + +/* -- code displays --------------------------------------------------------- */ + +pre { + overflow: auto; + overflow-y: hidden; /* fixes display issues on Chrome browsers */ +} + +pre, div[class*="highlight-"] { + clear: both; +} + +span.pre { + -moz-hyphens: none; + -ms-hyphens: none; + -webkit-hyphens: none; + hyphens: none; + white-space: nowrap; +} + +div[class*="highlight-"] { + margin: 1em 0; +} + +td.linenos pre { + border: 0; + background-color: transparent; + color: #aaa; +} + +table.highlighttable { + display: block; +} + +table.highlighttable tbody { + display: block; +} + +table.highlighttable tr { + display: flex; +} + +table.highlighttable td { + margin: 0; + padding: 0; +} + +table.highlighttable td.linenos { + padding-right: 0.5em; +} + +table.highlighttable td.code { + flex: 1; + overflow: hidden; +} + +.highlight .hll { + display: block; +} + +div.highlight pre, +table.highlighttable pre { + margin: 0; +} + +div.code-block-caption + div { + margin-top: 0; +} + +div.code-block-caption { + margin-top: 1em; + padding: 2px 5px; + font-size: small; +} + +div.code-block-caption code { + background-color: transparent; +} + +table.highlighttable td.linenos, +span.linenos, +div.highlight span.gp { /* gp: Generic.Prompt */ + user-select: none; + -webkit-user-select: text; /* Safari fallback only */ + -webkit-user-select: none; /* Chrome/Safari */ + -moz-user-select: none; /* Firefox */ + -ms-user-select: none; /* IE10+ */ +} + +div.code-block-caption span.caption-number { + padding: 0.1em 0.3em; + font-style: italic; +} + +div.code-block-caption span.caption-text { +} + +div.literal-block-wrapper { + margin: 1em 0; +} + +code.xref, a code { + background-color: transparent; + font-weight: bold; +} + +h1 code, h2 code, h3 code, h4 code, h5 code, h6 code { + background-color: transparent; +} + +.viewcode-link { + float: right; +} + +.viewcode-back { + float: right; + font-family: sans-serif; +} + +div.viewcode-block:target { + margin: -1px -10px; + padding: 0 10px; +} + +/* -- math display ---------------------------------------------------------- */ + +img.math { + vertical-align: middle; +} + +div.body div.math p { + text-align: center; +} + +span.eqno { + float: right; +} + 
+span.eqno a.headerlink { + position: absolute; + z-index: 1; +} + +div.math:hover a.headerlink { + visibility: visible; +} + +/* -- printout stylesheet --------------------------------------------------- */ + +@media print { + div.document, + div.documentwrapper, + div.bodywrapper { + margin: 0 !important; + width: 100%; + } + + div.sphinxsidebar, + div.related, + div.footer, + #top-link { + display: none; + } +} \ No newline at end of file diff --git a/_static/css/badge_only.css b/_static/css/badge_only.css new file mode 100644 index 000000000..c718cee44 --- /dev/null +++ b/_static/css/badge_only.css @@ -0,0 +1 @@ +.clearfix{*zoom:1}.clearfix:after,.clearfix:before{display:table;content:""}.clearfix:after{clear:both}@font-face{font-family:FontAwesome;font-style:normal;font-weight:400;src:url(fonts/fontawesome-webfont.eot?674f50d287a8c48dc19ba404d20fe713?#iefix) format("embedded-opentype"),url(fonts/fontawesome-webfont.woff2?af7ae505a9eed503f8b8e6982036873e) format("woff2"),url(fonts/fontawesome-webfont.woff?fee66e712a8a08eef5805a46892932ad) format("woff"),url(fonts/fontawesome-webfont.ttf?b06871f281fee6b241d60582ae9369b9) format("truetype"),url(fonts/fontawesome-webfont.svg?912ec66d7572ff821749319396470bde#FontAwesome) format("svg")}.fa:before{font-family:FontAwesome;font-style:normal;font-weight:400;line-height:1}.fa:before,a .fa{text-decoration:inherit}.fa:before,a .fa,li .fa{display:inline-block}li .fa-large:before{width:1.875em}ul.fas{list-style-type:none;margin-left:2em;text-indent:-.8em}ul.fas li .fa{width:.8em}ul.fas li .fa-large:before{vertical-align:baseline}.fa-book:before,.icon-book:before{content:"\f02d"}.fa-caret-down:before,.icon-caret-down:before{content:"\f0d7"}.fa-caret-up:before,.icon-caret-up:before{content:"\f0d8"}.fa-caret-left:before,.icon-caret-left:before{content:"\f0d9"}.fa-caret-right:before,.icon-caret-right:before{content:"\f0da"}.rst-versions{position:fixed;bottom:0;left:0;width:300px;color:#fcfcfc;background:#1f1d1d;font-family:Lato,proxima-nova,Helvetica Neue,Arial,sans-serif;z-index:400}.rst-versions a{color:#2980b9;text-decoration:none}.rst-versions .rst-badge-small{display:none}.rst-versions .rst-current-version{padding:12px;background-color:#272525;display:block;text-align:right;font-size:90%;cursor:pointer;color:#27ae60}.rst-versions .rst-current-version:after{clear:both;content:"";display:block}.rst-versions .rst-current-version .fa{color:#fcfcfc}.rst-versions .rst-current-version .fa-book,.rst-versions .rst-current-version .icon-book{float:left}.rst-versions .rst-current-version.rst-out-of-date{background-color:#e74c3c;color:#fff}.rst-versions .rst-current-version.rst-active-old-version{background-color:#f1c40f;color:#000}.rst-versions.shift-up{height:auto;max-height:100%;overflow-y:scroll}.rst-versions.shift-up .rst-other-versions{display:block}.rst-versions .rst-other-versions{font-size:90%;padding:12px;color:grey;display:none}.rst-versions .rst-other-versions hr{display:block;height:1px;border:0;margin:20px 0;padding:0;border-top:1px solid #413d3d}.rst-versions .rst-other-versions dd{display:inline-block;margin:0}.rst-versions .rst-other-versions dd a{display:inline-block;padding:6px;color:#fcfcfc}.rst-versions.rst-badge{width:auto;bottom:20px;right:20px;left:auto;border:none;max-width:300px;max-height:90%}.rst-versions.rst-badge .fa-book,.rst-versions.rst-badge .icon-book{float:none;line-height:30px}.rst-versions.rst-badge.shift-up .rst-current-version{text-align:right}.rst-versions.rst-badge.shift-up .rst-current-version 
.fa-book,.rst-versions.rst-badge.shift-up .rst-current-version .icon-book{float:left}.rst-versions.rst-badge>.rst-current-version{width:auto;height:30px;line-height:30px;padding:0 6px;display:block;text-align:center}@media screen and (max-width:768px){.rst-versions{width:85%;display:none}.rst-versions.shift{display:block}} \ No newline at end of file diff --git a/_static/css/fonts/Roboto-Slab-Bold.woff b/_static/css/fonts/Roboto-Slab-Bold.woff new file mode 100644 index 000000000..6cb600001 Binary files /dev/null and b/_static/css/fonts/Roboto-Slab-Bold.woff differ diff --git a/_static/css/fonts/Roboto-Slab-Bold.woff2 b/_static/css/fonts/Roboto-Slab-Bold.woff2 new file mode 100644 index 000000000..7059e2314 Binary files /dev/null and b/_static/css/fonts/Roboto-Slab-Bold.woff2 differ diff --git a/_static/css/fonts/Roboto-Slab-Regular.woff b/_static/css/fonts/Roboto-Slab-Regular.woff new file mode 100644 index 000000000..f815f63f9 Binary files /dev/null and b/_static/css/fonts/Roboto-Slab-Regular.woff differ diff --git a/_static/css/fonts/Roboto-Slab-Regular.woff2 b/_static/css/fonts/Roboto-Slab-Regular.woff2 new file mode 100644 index 000000000..f2c76e5bd Binary files /dev/null and b/_static/css/fonts/Roboto-Slab-Regular.woff2 differ diff --git a/_static/css/fonts/fontawesome-webfont.eot b/_static/css/fonts/fontawesome-webfont.eot new file mode 100644 index 000000000..e9f60ca95 Binary files /dev/null and b/_static/css/fonts/fontawesome-webfont.eot differ diff --git a/_static/css/fonts/fontawesome-webfont.svg b/_static/css/fonts/fontawesome-webfont.svg new file mode 100644 index 000000000..855c845e5 --- /dev/null +++ b/_static/css/fonts/fontawesome-webfont.svg @@ -0,0 +1,2671 @@ + + + + +Created by FontForge 20120731 at Mon Oct 24 17:37:40 2016 + By ,,, +Copyright Dave Gandy 2016. All rights reserved. 
diff --git a/_static/css/fonts/fontawesome-webfont.ttf b/_static/css/fonts/fontawesome-webfont.ttf new file mode 100644 index 000000000..35acda2fa Binary files /dev/null and b/_static/css/fonts/fontawesome-webfont.ttf differ diff --git a/_static/css/fonts/fontawesome-webfont.woff b/_static/css/fonts/fontawesome-webfont.woff new file mode 100644 index 000000000..400014a4b Binary files /dev/null and b/_static/css/fonts/fontawesome-webfont.woff differ diff --git a/_static/css/fonts/fontawesome-webfont.woff2 b/_static/css/fonts/fontawesome-webfont.woff2 new file mode 100644 index 000000000..4d13fc604 Binary files /dev/null and b/_static/css/fonts/fontawesome-webfont.woff2 differ diff --git a/_static/css/fonts/lato-bold-italic.woff b/_static/css/fonts/lato-bold-italic.woff new file mode 100644 index 000000000..88ad05b9f Binary files /dev/null and b/_static/css/fonts/lato-bold-italic.woff differ diff --git a/_static/css/fonts/lato-bold-italic.woff2 b/_static/css/fonts/lato-bold-italic.woff2 new file mode 100644 index 000000000..c4e3d804b Binary files /dev/null and b/_static/css/fonts/lato-bold-italic.woff2 differ diff --git a/_static/css/fonts/lato-bold.woff b/_static/css/fonts/lato-bold.woff new file mode 100644 index 000000000..c6dff51f0 Binary files /dev/null and b/_static/css/fonts/lato-bold.woff differ diff --git a/_static/css/fonts/lato-bold.woff2 b/_static/css/fonts/lato-bold.woff2 new file mode 100644 index 000000000..bb195043c Binary files /dev/null and b/_static/css/fonts/lato-bold.woff2 differ diff --git a/_static/css/fonts/lato-normal-italic.woff b/_static/css/fonts/lato-normal-italic.woff new file mode 100644 index 000000000..76114bc03 Binary files /dev/null and b/_static/css/fonts/lato-normal-italic.woff differ diff --git a/_static/css/fonts/lato-normal-italic.woff2 b/_static/css/fonts/lato-normal-italic.woff2 new file mode 100644 index 000000000..3404f37e2 Binary files /dev/null and b/_static/css/fonts/lato-normal-italic.woff2 differ diff --git a/_static/css/fonts/lato-normal.woff b/_static/css/fonts/lato-normal.woff new file mode 100644 index 000000000..ae1307ff5 Binary files 
/dev/null and b/_static/css/fonts/lato-normal.woff differ diff --git a/_static/css/fonts/lato-normal.woff2 b/_static/css/fonts/lato-normal.woff2 new file mode 100644 index 000000000..3bf984332 Binary files /dev/null and b/_static/css/fonts/lato-normal.woff2 differ diff --git a/_static/css/theme.css b/_static/css/theme.css new file mode 100644 index 000000000..19a446a0e --- /dev/null +++ b/_static/css/theme.css @@ -0,0 +1,4 @@ +html{box-sizing:border-box}*,:after,:before{box-sizing:inherit}article,aside,details,figcaption,figure,footer,header,hgroup,nav,section{display:block}audio,canvas,video{display:inline-block;*display:inline;*zoom:1}[hidden],audio:not([controls]){display:none}*{-webkit-box-sizing:border-box;-moz-box-sizing:border-box;box-sizing:border-box}html{font-size:100%;-webkit-text-size-adjust:100%;-ms-text-size-adjust:100%}body{margin:0}a:active,a:hover{outline:0}abbr[title]{border-bottom:1px dotted}b,strong{font-weight:700}blockquote{margin:0}dfn{font-style:italic}ins{background:#ff9;text-decoration:none}ins,mark{color:#000}mark{background:#ff0;font-style:italic;font-weight:700}.rst-content code,.rst-content tt,code,kbd,pre,samp{font-family:monospace,serif;_font-family:courier new,monospace;font-size:1em}pre{white-space:pre}q{quotes:none}q:after,q:before{content:"";content:none}small{font-size:85%}sub,sup{font-size:75%;line-height:0;position:relative;vertical-align:baseline}sup{top:-.5em}sub{bottom:-.25em}dl,ol,ul{margin:0;padding:0;list-style:none;list-style-image:none}li{list-style:none}dd{margin:0}img{border:0;-ms-interpolation-mode:bicubic;vertical-align:middle;max-width:100%}svg:not(:root){overflow:hidden}figure,form{margin:0}label{cursor:pointer}button,input,select,textarea{font-size:100%;margin:0;vertical-align:baseline;*vertical-align:middle}button,input{line-height:normal}button,input[type=button],input[type=reset],input[type=submit]{cursor:pointer;-webkit-appearance:button;*overflow:visible}button[disabled],input[disabled]{cursor:default}input[type=search]{-webkit-appearance:textfield;-moz-box-sizing:content-box;-webkit-box-sizing:content-box;box-sizing:content-box}textarea{resize:vertical}table{border-collapse:collapse;border-spacing:0}td{vertical-align:top}.chromeframe{margin:.2em 0;background:#ccc;color:#000;padding:.2em 0}.ir{display:block;border:0;text-indent:-999em;overflow:hidden;background-color:transparent;background-repeat:no-repeat;text-align:left;direction:ltr;*line-height:0}.ir br{display:none}.hidden{display:none!important;visibility:hidden}.visuallyhidden{border:0;clip:rect(0 0 0 0);height:1px;margin:-1px;overflow:hidden;padding:0;position:absolute;width:1px}.visuallyhidden.focusable:active,.visuallyhidden.focusable:focus{clip:auto;height:auto;margin:0;overflow:visible;position:static;width:auto}.invisible{visibility:hidden}.relative{position:relative}big,small{font-size:100%}@media print{body,html,section{background:none!important}*{box-shadow:none!important;text-shadow:none!important;filter:none!important;-ms-filter:none!important}a,a:visited{text-decoration:underline}.ir a:after,a[href^="#"]:after,a[href^="javascript:"]:after{content:""}blockquote,pre{page-break-inside:avoid}thead{display:table-header-group}img,tr{page-break-inside:avoid}img{max-width:100%!important}@page{margin:.5cm}.rst-content .toctree-wrapper>p.caption,h2,h3,p{orphans:3;widows:3}.rst-content .toctree-wrapper>p.caption,h2,h3{page-break-after:avoid}}.btn,.fa:before,.icon:before,.rst-content .admonition,.rst-content .admonition-title:before,.rst-content .admonition-todo,.rst-content 
.attention,.rst-content .caution,.rst-content .code-block-caption .headerlink:before,.rst-content .danger,.rst-content .eqno .headerlink:before,.rst-content .error,.rst-content .hint,.rst-content .important,.rst-content .note,.rst-content .seealso,.rst-content .tip,.rst-content .warning,.rst-content code.download span:first-child:before,.rst-content dl dt .headerlink:before,.rst-content h1 .headerlink:before,.rst-content h2 .headerlink:before,.rst-content h3 .headerlink:before,.rst-content h4 .headerlink:before,.rst-content h5 .headerlink:before,.rst-content h6 .headerlink:before,.rst-content p.caption .headerlink:before,.rst-content p .headerlink:before,.rst-content table>caption .headerlink:before,.rst-content tt.download span:first-child:before,.wy-alert,.wy-dropdown .caret:before,.wy-inline-validate.wy-inline-validate-danger .wy-input-context:before,.wy-inline-validate.wy-inline-validate-info .wy-input-context:before,.wy-inline-validate.wy-inline-validate-success .wy-input-context:before,.wy-inline-validate.wy-inline-validate-warning .wy-input-context:before,.wy-menu-vertical li.current>a button.toctree-expand:before,.wy-menu-vertical li.on a button.toctree-expand:before,.wy-menu-vertical li button.toctree-expand:before,input[type=color],input[type=date],input[type=datetime-local],input[type=datetime],input[type=email],input[type=month],input[type=number],input[type=password],input[type=search],input[type=tel],input[type=text],input[type=time],input[type=url],input[type=week],select,textarea{-webkit-font-smoothing:antialiased}.clearfix{*zoom:1}.clearfix:after,.clearfix:before{display:table;content:""}.clearfix:after{clear:both}/*! + * Font Awesome 4.7.0 by @davegandy - http://fontawesome.io - @fontawesome + * License - http://fontawesome.io/license (Font: SIL OFL 1.1, CSS: MIT License) + */@font-face{font-family:FontAwesome;src:url(fonts/fontawesome-webfont.eot?674f50d287a8c48dc19ba404d20fe713);src:url(fonts/fontawesome-webfont.eot?674f50d287a8c48dc19ba404d20fe713?#iefix&v=4.7.0) format("embedded-opentype"),url(fonts/fontawesome-webfont.woff2?af7ae505a9eed503f8b8e6982036873e) format("woff2"),url(fonts/fontawesome-webfont.woff?fee66e712a8a08eef5805a46892932ad) format("woff"),url(fonts/fontawesome-webfont.ttf?b06871f281fee6b241d60582ae9369b9) format("truetype"),url(fonts/fontawesome-webfont.svg?912ec66d7572ff821749319396470bde#fontawesomeregular) format("svg");font-weight:400;font-style:normal}.fa,.icon,.rst-content .admonition-title,.rst-content .code-block-caption .headerlink,.rst-content .eqno .headerlink,.rst-content code.download span:first-child,.rst-content dl dt .headerlink,.rst-content h1 .headerlink,.rst-content h2 .headerlink,.rst-content h3 .headerlink,.rst-content h4 .headerlink,.rst-content h5 .headerlink,.rst-content h6 .headerlink,.rst-content p.caption .headerlink,.rst-content p .headerlink,.rst-content table>caption .headerlink,.rst-content tt.download span:first-child,.wy-menu-vertical li.current>a button.toctree-expand,.wy-menu-vertical li.on a button.toctree-expand,.wy-menu-vertical li button.toctree-expand{display:inline-block;font:normal normal normal 14px/1 
FontAwesome;font-size:inherit;text-rendering:auto;-webkit-font-smoothing:antialiased;-moz-osx-font-smoothing:grayscale}.fa-lg{font-size:1.33333em;line-height:.75em;vertical-align:-15%}.fa-2x{font-size:2em}.fa-3x{font-size:3em}.fa-4x{font-size:4em}.fa-5x{font-size:5em}.fa-fw{width:1.28571em;text-align:center}.fa-ul{padding-left:0;margin-left:2.14286em;list-style-type:none}.fa-ul>li{position:relative}.fa-li{position:absolute;left:-2.14286em;width:2.14286em;top:.14286em;text-align:center}.fa-li.fa-lg{left:-1.85714em}.fa-border{padding:.2em .25em .15em;border:.08em solid #eee;border-radius:.1em}.fa-pull-left{float:left}.fa-pull-right{float:right}.fa-pull-left.icon,.fa.fa-pull-left,.rst-content .code-block-caption .fa-pull-left.headerlink,.rst-content .eqno .fa-pull-left.headerlink,.rst-content .fa-pull-left.admonition-title,.rst-content code.download span.fa-pull-left:first-child,.rst-content dl dt .fa-pull-left.headerlink,.rst-content h1 .fa-pull-left.headerlink,.rst-content h2 .fa-pull-left.headerlink,.rst-content h3 .fa-pull-left.headerlink,.rst-content h4 .fa-pull-left.headerlink,.rst-content h5 .fa-pull-left.headerlink,.rst-content h6 .fa-pull-left.headerlink,.rst-content p .fa-pull-left.headerlink,.rst-content table>caption .fa-pull-left.headerlink,.rst-content tt.download span.fa-pull-left:first-child,.wy-menu-vertical li.current>a button.fa-pull-left.toctree-expand,.wy-menu-vertical li.on a button.fa-pull-left.toctree-expand,.wy-menu-vertical li button.fa-pull-left.toctree-expand{margin-right:.3em}.fa-pull-right.icon,.fa.fa-pull-right,.rst-content .code-block-caption .fa-pull-right.headerlink,.rst-content .eqno .fa-pull-right.headerlink,.rst-content .fa-pull-right.admonition-title,.rst-content code.download span.fa-pull-right:first-child,.rst-content dl dt .fa-pull-right.headerlink,.rst-content h1 .fa-pull-right.headerlink,.rst-content h2 .fa-pull-right.headerlink,.rst-content h3 .fa-pull-right.headerlink,.rst-content h4 .fa-pull-right.headerlink,.rst-content h5 .fa-pull-right.headerlink,.rst-content h6 .fa-pull-right.headerlink,.rst-content p .fa-pull-right.headerlink,.rst-content table>caption .fa-pull-right.headerlink,.rst-content tt.download span.fa-pull-right:first-child,.wy-menu-vertical li.current>a button.fa-pull-right.toctree-expand,.wy-menu-vertical li.on a button.fa-pull-right.toctree-expand,.wy-menu-vertical li button.fa-pull-right.toctree-expand{margin-left:.3em}.pull-right{float:right}.pull-left{float:left}.fa.pull-left,.pull-left.icon,.rst-content .code-block-caption .pull-left.headerlink,.rst-content .eqno .pull-left.headerlink,.rst-content .pull-left.admonition-title,.rst-content code.download span.pull-left:first-child,.rst-content dl dt .pull-left.headerlink,.rst-content h1 .pull-left.headerlink,.rst-content h2 .pull-left.headerlink,.rst-content h3 .pull-left.headerlink,.rst-content h4 .pull-left.headerlink,.rst-content h5 .pull-left.headerlink,.rst-content h6 .pull-left.headerlink,.rst-content p .pull-left.headerlink,.rst-content table>caption .pull-left.headerlink,.rst-content tt.download span.pull-left:first-child,.wy-menu-vertical li.current>a button.pull-left.toctree-expand,.wy-menu-vertical li.on a button.pull-left.toctree-expand,.wy-menu-vertical li button.pull-left.toctree-expand{margin-right:.3em}.fa.pull-right,.pull-right.icon,.rst-content .code-block-caption .pull-right.headerlink,.rst-content .eqno .pull-right.headerlink,.rst-content .pull-right.admonition-title,.rst-content code.download span.pull-right:first-child,.rst-content dl dt 
.pull-right.headerlink,.rst-content h1 .pull-right.headerlink,.rst-content h2 .pull-right.headerlink,.rst-content h3 .pull-right.headerlink,.rst-content h4 .pull-right.headerlink,.rst-content h5 .pull-right.headerlink,.rst-content h6 .pull-right.headerlink,.rst-content p .pull-right.headerlink,.rst-content table>caption .pull-right.headerlink,.rst-content tt.download span.pull-right:first-child,.wy-menu-vertical li.current>a button.pull-right.toctree-expand,.wy-menu-vertical li.on a button.pull-right.toctree-expand,.wy-menu-vertical li button.pull-right.toctree-expand{margin-left:.3em}.fa-spin{-webkit-animation:fa-spin 2s linear infinite;animation:fa-spin 2s linear infinite}.fa-pulse{-webkit-animation:fa-spin 1s steps(8) infinite;animation:fa-spin 1s steps(8) infinite}@-webkit-keyframes fa-spin{0%{-webkit-transform:rotate(0deg);transform:rotate(0deg)}to{-webkit-transform:rotate(359deg);transform:rotate(359deg)}}@keyframes fa-spin{0%{-webkit-transform:rotate(0deg);transform:rotate(0deg)}to{-webkit-transform:rotate(359deg);transform:rotate(359deg)}}.fa-rotate-90{-ms-filter:"progid:DXImageTransform.Microsoft.BasicImage(rotation=1)";-webkit-transform:rotate(90deg);-ms-transform:rotate(90deg);transform:rotate(90deg)}.fa-rotate-180{-ms-filter:"progid:DXImageTransform.Microsoft.BasicImage(rotation=2)";-webkit-transform:rotate(180deg);-ms-transform:rotate(180deg);transform:rotate(180deg)}.fa-rotate-270{-ms-filter:"progid:DXImageTransform.Microsoft.BasicImage(rotation=3)";-webkit-transform:rotate(270deg);-ms-transform:rotate(270deg);transform:rotate(270deg)}.fa-flip-horizontal{-ms-filter:"progid:DXImageTransform.Microsoft.BasicImage(rotation=0, mirror=1)";-webkit-transform:scaleX(-1);-ms-transform:scaleX(-1);transform:scaleX(-1)}.fa-flip-vertical{-ms-filter:"progid:DXImageTransform.Microsoft.BasicImage(rotation=2, mirror=1)";-webkit-transform:scaleY(-1);-ms-transform:scaleY(-1);transform:scaleY(-1)}:root .fa-flip-horizontal,:root .fa-flip-vertical,:root .fa-rotate-90,:root .fa-rotate-180,:root .fa-rotate-270{filter:none}.fa-stack{position:relative;display:inline-block;width:2em;height:2em;line-height:2em;vertical-align:middle}.fa-stack-1x,.fa-stack-2x{position:absolute;left:0;width:100%;text-align:center}.fa-stack-1x{line-height:inherit}.fa-stack-2x{font-size:2em}.fa-inverse{color:#fff}.fa-glass:before{content:""}.fa-music:before{content:""}.fa-search:before,.icon-search:before{content:""}.fa-envelope-o:before{content:""}.fa-heart:before{content:""}.fa-star:before{content:""}.fa-star-o:before{content:""}.fa-user:before{content:""}.fa-film:before{content:""}.fa-th-large:before{content:""}.fa-th:before{content:""}.fa-th-list:before{content:""}.fa-check:before{content:""}.fa-close:before,.fa-remove:before,.fa-times:before{content:""}.fa-search-plus:before{content:""}.fa-search-minus:before{content:""}.fa-power-off:before{content:""}.fa-signal:before{content:""}.fa-cog:before,.fa-gear:before{content:""}.fa-trash-o:before{content:""}.fa-home:before,.icon-home:before{content:""}.fa-file-o:before{content:""}.fa-clock-o:before{content:""}.fa-road:before{content:""}.fa-download:before,.rst-content code.download span:first-child:before,.rst-content tt.download 
/* … minified Read the Docs theme CSS (FontAwesome glyph rules and wyrm form/table styles) omitted: vendored build artifact; glyph codepoints were stripped during extraction … */
table{margin-bottom:0!important}.wy-table-responsive table td,.wy-table-responsive table th{white-space:nowrap}a{color:#2980b9;text-decoration:none;cursor:pointer}a:hover{color:#3091d1}a:visited{color:#9b59b6}html{height:100%}body,html{overflow-x:hidden}body{font-family:Lato,proxima-nova,Helvetica Neue,Arial,sans-serif;font-weight:400;color:#404040;min-height:100%;background:#edf0f2}.wy-text-left{text-align:left}.wy-text-center{text-align:center}.wy-text-right{text-align:right}.wy-text-large{font-size:120%}.wy-text-normal{font-size:100%}.wy-text-small,small{font-size:80%}.wy-text-strike{text-decoration:line-through}.wy-text-warning{color:#e67e22!important}a.wy-text-warning:hover{color:#eb9950!important}.wy-text-info{color:#2980b9!important}a.wy-text-info:hover{color:#409ad5!important}.wy-text-success{color:#27ae60!important}a.wy-text-success:hover{color:#36d278!important}.wy-text-danger{color:#e74c3c!important}a.wy-text-danger:hover{color:#ed7669!important}.wy-text-neutral{color:#404040!important}a.wy-text-neutral:hover{color:#595959!important}.rst-content .toctree-wrapper>p.caption,h1,h2,h3,h4,h5,h6,legend{margin-top:0;font-weight:700;font-family:Roboto Slab,ff-tisa-web-pro,Georgia,Arial,sans-serif}p{line-height:24px;font-size:16px;margin:0 0 24px}h1{font-size:175%}.rst-content .toctree-wrapper>p.caption,h2{font-size:150%}h3{font-size:125%}h4{font-size:115%}h5{font-size:110%}h6{font-size:100%}hr{display:block;height:1px;border:0;border-top:1px solid #e1e4e5;margin:24px 0;padding:0}.rst-content code,.rst-content tt,code{white-space:nowrap;max-width:100%;background:#fff;border:1px solid #e1e4e5;font-size:75%;padding:0 5px;font-family:SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,Courier,monospace;color:#e74c3c;overflow-x:auto}.rst-content tt.code-large,code.code-large{font-size:90%}.rst-content .section ul,.rst-content .toctree-wrapper ul,.rst-content section ul,.wy-plain-list-disc,article ul{list-style:disc;line-height:24px;margin-bottom:24px}.rst-content .section ul li,.rst-content .toctree-wrapper ul li,.rst-content section ul li,.wy-plain-list-disc li,article ul li{list-style:disc;margin-left:24px}.rst-content .section ul li p:last-child,.rst-content .section ul li ul,.rst-content .toctree-wrapper ul li p:last-child,.rst-content .toctree-wrapper ul li ul,.rst-content section ul li p:last-child,.rst-content section ul li ul,.wy-plain-list-disc li p:last-child,.wy-plain-list-disc li ul,article ul li p:last-child,article ul li ul{margin-bottom:0}.rst-content .section ul li li,.rst-content .toctree-wrapper ul li li,.rst-content section ul li li,.wy-plain-list-disc li li,article ul li li{list-style:circle}.rst-content .section ul li li li,.rst-content .toctree-wrapper ul li li li,.rst-content section ul li li li,.wy-plain-list-disc li li li,article ul li li li{list-style:square}.rst-content .section ul li ol li,.rst-content .toctree-wrapper ul li ol li,.rst-content section ul li ol li,.wy-plain-list-disc li ol li,article ul li ol li{list-style:decimal}.rst-content .section ol,.rst-content .section ol.arabic,.rst-content .toctree-wrapper ol,.rst-content .toctree-wrapper ol.arabic,.rst-content section ol,.rst-content section ol.arabic,.wy-plain-list-decimal,article ol{list-style:decimal;line-height:24px;margin-bottom:24px}.rst-content .section ol.arabic li,.rst-content .section ol li,.rst-content .toctree-wrapper ol.arabic li,.rst-content .toctree-wrapper ol li,.rst-content section ol.arabic li,.rst-content section ol li,.wy-plain-list-decimal li,article ol 
li{list-style:decimal;margin-left:24px}.rst-content .section ol.arabic li ul,.rst-content .section ol li p:last-child,.rst-content .section ol li ul,.rst-content .toctree-wrapper ol.arabic li ul,.rst-content .toctree-wrapper ol li p:last-child,.rst-content .toctree-wrapper ol li ul,.rst-content section ol.arabic li ul,.rst-content section ol li p:last-child,.rst-content section ol li ul,.wy-plain-list-decimal li p:last-child,.wy-plain-list-decimal li ul,article ol li p:last-child,article ol li ul{margin-bottom:0}.rst-content .section ol.arabic li ul li,.rst-content .section ol li ul li,.rst-content .toctree-wrapper ol.arabic li ul li,.rst-content .toctree-wrapper ol li ul li,.rst-content section ol.arabic li ul li,.rst-content section ol li ul li,.wy-plain-list-decimal li ul li,article ol li ul li{list-style:disc}.wy-breadcrumbs{*zoom:1}.wy-breadcrumbs:after,.wy-breadcrumbs:before{display:table;content:""}.wy-breadcrumbs:after{clear:both}.wy-breadcrumbs>li{display:inline-block;padding-top:5px}.wy-breadcrumbs>li.wy-breadcrumbs-aside{float:right}.rst-content .wy-breadcrumbs>li code,.rst-content .wy-breadcrumbs>li tt,.wy-breadcrumbs>li .rst-content tt,.wy-breadcrumbs>li code{all:inherit;color:inherit}.breadcrumb-item:before{content:"/";color:#bbb;font-size:13px;padding:0 6px 0 3px}.wy-breadcrumbs-extra{margin-bottom:0;color:#b3b3b3;font-size:80%;display:inline-block}@media screen and (max-width:480px){.wy-breadcrumbs-extra,.wy-breadcrumbs li.wy-breadcrumbs-aside{display:none}}@media print{.wy-breadcrumbs li.wy-breadcrumbs-aside{display:none}}html{font-size:16px}.wy-affix{position:fixed;top:1.618em}.wy-menu a:hover{text-decoration:none}.wy-menu-horiz{*zoom:1}.wy-menu-horiz:after,.wy-menu-horiz:before{display:table;content:""}.wy-menu-horiz:after{clear:both}.wy-menu-horiz li,.wy-menu-horiz ul{display:inline-block}.wy-menu-horiz li:hover{background:hsla(0,0%,100%,.1)}.wy-menu-horiz li.divide-left{border-left:1px solid #404040}.wy-menu-horiz li.divide-right{border-right:1px solid #404040}.wy-menu-horiz a{height:32px;display:inline-block;line-height:32px;padding:0 16px}.wy-menu-vertical{width:300px}.wy-menu-vertical header,.wy-menu-vertical p.caption{color:#55a5d9;height:32px;line-height:32px;padding:0 1.618em;margin:12px 0 0;display:block;font-weight:700;text-transform:uppercase;font-size:85%;white-space:nowrap}.wy-menu-vertical ul{margin-bottom:0}.wy-menu-vertical li.divide-top{border-top:1px solid #404040}.wy-menu-vertical li.divide-bottom{border-bottom:1px solid #404040}.wy-menu-vertical li.current{background:#e3e3e3}.wy-menu-vertical li.current a{color:grey;border-right:1px solid #c9c9c9;padding:.4045em 2.427em}.wy-menu-vertical li.current a:hover{background:#d6d6d6}.rst-content .wy-menu-vertical li tt,.wy-menu-vertical li .rst-content tt,.wy-menu-vertical li code{border:none;background:inherit;color:inherit;padding-left:0;padding-right:0}.wy-menu-vertical li button.toctree-expand{display:block;float:left;margin-left:-1.2em;line-height:18px;color:#4d4d4d;border:none;background:none;padding:0}.wy-menu-vertical li.current>a,.wy-menu-vertical li.on a{color:#404040;font-weight:700;position:relative;background:#fcfcfc;border:none;padding:.4045em 1.618em}.wy-menu-vertical li.current>a:hover,.wy-menu-vertical li.on a:hover{background:#fcfcfc}.wy-menu-vertical li.current>a:hover button.toctree-expand,.wy-menu-vertical li.on a:hover button.toctree-expand{color:grey}.wy-menu-vertical li.current>a button.toctree-expand,.wy-menu-vertical li.on a 
button.toctree-expand{display:block;line-height:18px;color:#333}.wy-menu-vertical li.toctree-l1.current>a{border-bottom:1px solid #c9c9c9;border-top:1px solid #c9c9c9}.wy-menu-vertical .toctree-l1.current .toctree-l2>ul,.wy-menu-vertical .toctree-l2.current .toctree-l3>ul,.wy-menu-vertical .toctree-l3.current .toctree-l4>ul,.wy-menu-vertical .toctree-l4.current .toctree-l5>ul,.wy-menu-vertical .toctree-l5.current .toctree-l6>ul,.wy-menu-vertical .toctree-l6.current .toctree-l7>ul,.wy-menu-vertical .toctree-l7.current .toctree-l8>ul,.wy-menu-vertical .toctree-l8.current .toctree-l9>ul,.wy-menu-vertical .toctree-l9.current .toctree-l10>ul,.wy-menu-vertical .toctree-l10.current .toctree-l11>ul{display:none}.wy-menu-vertical .toctree-l1.current .current.toctree-l2>ul,.wy-menu-vertical .toctree-l2.current .current.toctree-l3>ul,.wy-menu-vertical .toctree-l3.current .current.toctree-l4>ul,.wy-menu-vertical .toctree-l4.current .current.toctree-l5>ul,.wy-menu-vertical .toctree-l5.current .current.toctree-l6>ul,.wy-menu-vertical .toctree-l6.current .current.toctree-l7>ul,.wy-menu-vertical .toctree-l7.current .current.toctree-l8>ul,.wy-menu-vertical .toctree-l8.current .current.toctree-l9>ul,.wy-menu-vertical .toctree-l9.current .current.toctree-l10>ul,.wy-menu-vertical .toctree-l10.current .current.toctree-l11>ul{display:block}.wy-menu-vertical li.toctree-l3,.wy-menu-vertical li.toctree-l4{font-size:.9em}.wy-menu-vertical li.toctree-l2 a,.wy-menu-vertical li.toctree-l3 a,.wy-menu-vertical li.toctree-l4 a,.wy-menu-vertical li.toctree-l5 a,.wy-menu-vertical li.toctree-l6 a,.wy-menu-vertical li.toctree-l7 a,.wy-menu-vertical li.toctree-l8 a,.wy-menu-vertical li.toctree-l9 a,.wy-menu-vertical li.toctree-l10 a{color:#404040}.wy-menu-vertical li.toctree-l2 a:hover button.toctree-expand,.wy-menu-vertical li.toctree-l3 a:hover button.toctree-expand,.wy-menu-vertical li.toctree-l4 a:hover button.toctree-expand,.wy-menu-vertical li.toctree-l5 a:hover button.toctree-expand,.wy-menu-vertical li.toctree-l6 a:hover button.toctree-expand,.wy-menu-vertical li.toctree-l7 a:hover button.toctree-expand,.wy-menu-vertical li.toctree-l8 a:hover button.toctree-expand,.wy-menu-vertical li.toctree-l9 a:hover button.toctree-expand,.wy-menu-vertical li.toctree-l10 a:hover button.toctree-expand{color:grey}.wy-menu-vertical li.toctree-l2.current li.toctree-l3>a,.wy-menu-vertical li.toctree-l3.current li.toctree-l4>a,.wy-menu-vertical li.toctree-l4.current li.toctree-l5>a,.wy-menu-vertical li.toctree-l5.current li.toctree-l6>a,.wy-menu-vertical li.toctree-l6.current li.toctree-l7>a,.wy-menu-vertical li.toctree-l7.current li.toctree-l8>a,.wy-menu-vertical li.toctree-l8.current li.toctree-l9>a,.wy-menu-vertical li.toctree-l9.current li.toctree-l10>a,.wy-menu-vertical li.toctree-l10.current li.toctree-l11>a{display:block}.wy-menu-vertical li.toctree-l2.current>a{padding:.4045em 2.427em}.wy-menu-vertical li.toctree-l2.current li.toctree-l3>a{padding:.4045em 1.618em .4045em 4.045em}.wy-menu-vertical li.toctree-l3.current>a{padding:.4045em 4.045em}.wy-menu-vertical li.toctree-l3.current li.toctree-l4>a{padding:.4045em 1.618em .4045em 5.663em}.wy-menu-vertical li.toctree-l4.current>a{padding:.4045em 5.663em}.wy-menu-vertical li.toctree-l4.current li.toctree-l5>a{padding:.4045em 1.618em .4045em 7.281em}.wy-menu-vertical li.toctree-l5.current>a{padding:.4045em 7.281em}.wy-menu-vertical li.toctree-l5.current li.toctree-l6>a{padding:.4045em 1.618em .4045em 8.899em}.wy-menu-vertical li.toctree-l6.current>a{padding:.4045em 
8.899em}.wy-menu-vertical li.toctree-l6.current li.toctree-l7>a{padding:.4045em 1.618em .4045em 10.517em}.wy-menu-vertical li.toctree-l7.current>a{padding:.4045em 10.517em}.wy-menu-vertical li.toctree-l7.current li.toctree-l8>a{padding:.4045em 1.618em .4045em 12.135em}.wy-menu-vertical li.toctree-l8.current>a{padding:.4045em 12.135em}.wy-menu-vertical li.toctree-l8.current li.toctree-l9>a{padding:.4045em 1.618em .4045em 13.753em}.wy-menu-vertical li.toctree-l9.current>a{padding:.4045em 13.753em}.wy-menu-vertical li.toctree-l9.current li.toctree-l10>a{padding:.4045em 1.618em .4045em 15.371em}.wy-menu-vertical li.toctree-l10.current>a{padding:.4045em 15.371em}.wy-menu-vertical li.toctree-l10.current li.toctree-l11>a{padding:.4045em 1.618em .4045em 16.989em}.wy-menu-vertical li.toctree-l2.current>a,.wy-menu-vertical li.toctree-l2.current li.toctree-l3>a{background:#c9c9c9}.wy-menu-vertical li.toctree-l2 button.toctree-expand{color:#a3a3a3}.wy-menu-vertical li.toctree-l3.current>a,.wy-menu-vertical li.toctree-l3.current li.toctree-l4>a{background:#bdbdbd}.wy-menu-vertical li.toctree-l3 button.toctree-expand{color:#969696}.wy-menu-vertical li.current ul{display:block}.wy-menu-vertical li ul{margin-bottom:0;display:none}.wy-menu-vertical li ul li a{margin-bottom:0;color:#d9d9d9;font-weight:400}.wy-menu-vertical a{line-height:18px;padding:.4045em 1.618em;display:block;position:relative;font-size:90%;color:#d9d9d9}.wy-menu-vertical a:hover{background-color:#4e4a4a;cursor:pointer}.wy-menu-vertical a:hover button.toctree-expand{color:#d9d9d9}.wy-menu-vertical a:active{background-color:#2980b9;cursor:pointer;color:#fff}.wy-menu-vertical a:active button.toctree-expand{color:#fff}.wy-side-nav-search{display:block;width:300px;padding:.809em;margin-bottom:.809em;z-index:200;background-color:#2980b9;text-align:center;color:#fcfcfc}.wy-side-nav-search input[type=text]{width:100%;border-radius:50px;padding:6px 12px;border-color:#2472a4}.wy-side-nav-search img{display:block;margin:auto auto .809em;height:45px;width:45px;background-color:#2980b9;padding:5px;border-radius:100%}.wy-side-nav-search .wy-dropdown>a,.wy-side-nav-search>a{color:#fcfcfc;font-size:100%;font-weight:700;display:inline-block;padding:4px 6px;margin-bottom:.809em;max-width:100%}.wy-side-nav-search .wy-dropdown>a:hover,.wy-side-nav-search>a:hover{background:hsla(0,0%,100%,.1)}.wy-side-nav-search .wy-dropdown>a img.logo,.wy-side-nav-search>a img.logo{display:block;margin:0 auto;height:auto;width:auto;border-radius:0;max-width:100%;background:transparent}.wy-side-nav-search .wy-dropdown>a.icon img.logo,.wy-side-nav-search>a.icon img.logo{margin-top:.85em}.wy-side-nav-search>div.version{margin-top:-.4045em;margin-bottom:.809em;font-weight:400;color:hsla(0,0%,100%,.3)}.wy-nav .wy-menu-vertical header{color:#2980b9}.wy-nav .wy-menu-vertical a{color:#b3b3b3}.wy-nav .wy-menu-vertical a:hover{background-color:#2980b9;color:#fff}[data-menu-wrap]{-webkit-transition:all .2s ease-in;-moz-transition:all .2s ease-in;transition:all .2s 
ease-in;position:absolute;opacity:1;width:100%;opacity:0}[data-menu-wrap].move-center{left:0;right:auto;opacity:1}[data-menu-wrap].move-left{right:auto;left:-100%;opacity:0}[data-menu-wrap].move-right{right:-100%;left:auto;opacity:0}.wy-body-for-nav{background:#fcfcfc}.wy-grid-for-nav{position:absolute;width:100%;height:100%}.wy-nav-side{position:fixed;top:0;bottom:0;left:0;padding-bottom:2em;width:300px;overflow-x:hidden;overflow-y:hidden;min-height:100%;color:#9b9b9b;background:#343131;z-index:200}.wy-side-scroll{width:320px;position:relative;overflow-x:hidden;overflow-y:scroll;height:100%}.wy-nav-top{display:none;background:#2980b9;color:#fff;padding:.4045em .809em;position:relative;line-height:50px;text-align:center;font-size:100%;*zoom:1}.wy-nav-top:after,.wy-nav-top:before{display:table;content:""}.wy-nav-top:after{clear:both}.wy-nav-top a{color:#fff;font-weight:700}.wy-nav-top img{margin-right:12px;height:45px;width:45px;background-color:#2980b9;padding:5px;border-radius:100%}.wy-nav-top i{font-size:30px;float:left;cursor:pointer;padding-top:inherit}.wy-nav-content-wrap{margin-left:300px;background:#fcfcfc;min-height:100%}.wy-nav-content{padding:1.618em 3.236em;height:100%;max-width:800px;margin:auto}.wy-body-mask{position:fixed;width:100%;height:100%;background:rgba(0,0,0,.2);display:none;z-index:499}.wy-body-mask.on{display:block}footer{color:grey}footer p{margin-bottom:12px}.rst-content footer span.commit tt,footer span.commit .rst-content tt,footer span.commit code{padding:0;font-family:SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,Courier,monospace;font-size:1em;background:none;border:none;color:grey}.rst-footer-buttons{*zoom:1}.rst-footer-buttons:after,.rst-footer-buttons:before{width:100%;display:table;content:""}.rst-footer-buttons:after{clear:both}.rst-breadcrumbs-buttons{margin-top:12px;*zoom:1}.rst-breadcrumbs-buttons:after,.rst-breadcrumbs-buttons:before{display:table;content:""}.rst-breadcrumbs-buttons:after{clear:both}#search-results .search li{margin-bottom:24px;border-bottom:1px solid #e1e4e5;padding-bottom:24px}#search-results .search li:first-child{border-top:1px solid #e1e4e5;padding-top:24px}#search-results .search li a{font-size:120%;margin-bottom:12px;display:inline-block}#search-results .context{color:grey;font-size:90%}.genindextable li>ul{margin-left:24px}@media screen and (max-width:768px){.wy-body-for-nav{background:#fcfcfc}.wy-nav-top{display:block}.wy-nav-side{left:-300px}.wy-nav-side.shift{width:85%;left:0}.wy-menu.wy-menu-vertical,.wy-side-nav-search,.wy-side-scroll{width:auto}.wy-nav-content-wrap{margin-left:0}.wy-nav-content-wrap .wy-nav-content{padding:1.618em}.wy-nav-content-wrap.shift{position:fixed;min-width:100%;left:85%;top:0;height:100%;overflow:hidden}}@media screen and (min-width:1100px){.wy-nav-content-wrap{background:rgba(0,0,0,.05)}.wy-nav-content{margin:0;background:#fcfcfc}}@media print{.rst-versions,.wy-nav-side,footer{display:none}.wy-nav-content-wrap{margin-left:0}}.rst-versions{position:fixed;bottom:0;left:0;width:300px;color:#fcfcfc;background:#1f1d1d;font-family:Lato,proxima-nova,Helvetica Neue,Arial,sans-serif;z-index:400}.rst-versions a{color:#2980b9;text-decoration:none}.rst-versions .rst-badge-small{display:none}.rst-versions .rst-current-version{padding:12px;background-color:#272525;display:block;text-align:right;font-size:90%;cursor:pointer;color:#27ae60;*zoom:1}.rst-versions .rst-current-version:after,.rst-versions .rst-current-version:before{display:table;content:""}.rst-versions 
.rst-current-version:after{clear:both}.rst-content .code-block-caption .rst-versions .rst-current-version .headerlink,.rst-content .eqno .rst-versions .rst-current-version .headerlink,.rst-content .rst-versions .rst-current-version .admonition-title,.rst-content code.download .rst-versions .rst-current-version span:first-child,.rst-content dl dt .rst-versions .rst-current-version .headerlink,.rst-content h1 .rst-versions .rst-current-version .headerlink,.rst-content h2 .rst-versions .rst-current-version .headerlink,.rst-content h3 .rst-versions .rst-current-version .headerlink,.rst-content h4 .rst-versions .rst-current-version .headerlink,.rst-content h5 .rst-versions .rst-current-version .headerlink,.rst-content h6 .rst-versions .rst-current-version .headerlink,.rst-content p .rst-versions .rst-current-version .headerlink,.rst-content table>caption .rst-versions .rst-current-version .headerlink,.rst-content tt.download .rst-versions .rst-current-version span:first-child,.rst-versions .rst-current-version .fa,.rst-versions .rst-current-version .icon,.rst-versions .rst-current-version .rst-content .admonition-title,.rst-versions .rst-current-version .rst-content .code-block-caption .headerlink,.rst-versions .rst-current-version .rst-content .eqno .headerlink,.rst-versions .rst-current-version .rst-content code.download span:first-child,.rst-versions .rst-current-version .rst-content dl dt .headerlink,.rst-versions .rst-current-version .rst-content h1 .headerlink,.rst-versions .rst-current-version .rst-content h2 .headerlink,.rst-versions .rst-current-version .rst-content h3 .headerlink,.rst-versions .rst-current-version .rst-content h4 .headerlink,.rst-versions .rst-current-version .rst-content h5 .headerlink,.rst-versions .rst-current-version .rst-content h6 .headerlink,.rst-versions .rst-current-version .rst-content p .headerlink,.rst-versions .rst-current-version .rst-content table>caption .headerlink,.rst-versions .rst-current-version .rst-content tt.download span:first-child,.rst-versions .rst-current-version .wy-menu-vertical li button.toctree-expand,.wy-menu-vertical li .rst-versions .rst-current-version button.toctree-expand{color:#fcfcfc}.rst-versions .rst-current-version .fa-book,.rst-versions .rst-current-version .icon-book{float:left}.rst-versions .rst-current-version.rst-out-of-date{background-color:#e74c3c;color:#fff}.rst-versions .rst-current-version.rst-active-old-version{background-color:#f1c40f;color:#000}.rst-versions.shift-up{height:auto;max-height:100%;overflow-y:scroll}.rst-versions.shift-up .rst-other-versions{display:block}.rst-versions .rst-other-versions{font-size:90%;padding:12px;color:grey;display:none}.rst-versions .rst-other-versions hr{display:block;height:1px;border:0;margin:20px 0;padding:0;border-top:1px solid #413d3d}.rst-versions .rst-other-versions dd{display:inline-block;margin:0}.rst-versions .rst-other-versions dd a{display:inline-block;padding:6px;color:#fcfcfc}.rst-versions.rst-badge{width:auto;bottom:20px;right:20px;left:auto;border:none;max-width:300px;max-height:90%}.rst-versions.rst-badge .fa-book,.rst-versions.rst-badge .icon-book{float:none;line-height:30px}.rst-versions.rst-badge.shift-up .rst-current-version{text-align:right}.rst-versions.rst-badge.shift-up .rst-current-version .fa-book,.rst-versions.rst-badge.shift-up .rst-current-version .icon-book{float:left}.rst-versions.rst-badge>.rst-current-version{width:auto;height:30px;line-height:30px;padding:0 6px;display:block;text-align:center}@media screen and 
(max-width:768px){.rst-versions{width:85%;display:none}.rst-versions.shift{display:block}}.rst-content .toctree-wrapper>p.caption,.rst-content h1,.rst-content h2,.rst-content h3,.rst-content h4,.rst-content h5,.rst-content h6{margin-bottom:24px}.rst-content img{max-width:100%;height:auto}.rst-content div.figure,.rst-content figure{margin-bottom:24px}.rst-content div.figure .caption-text,.rst-content figure .caption-text{font-style:italic}.rst-content div.figure p:last-child.caption,.rst-content figure p:last-child.caption{margin-bottom:0}.rst-content div.figure.align-center,.rst-content figure.align-center{text-align:center}.rst-content .section>a>img,.rst-content .section>img,.rst-content section>a>img,.rst-content section>img{margin-bottom:24px}.rst-content abbr[title]{text-decoration:none}.rst-content.style-external-links a.reference.external:after{font-family:FontAwesome;content:"\f08e";color:#b3b3b3;vertical-align:super;font-size:60%;margin:0 .2em}.rst-content blockquote{margin-left:24px;line-height:24px;margin-bottom:24px}.rst-content pre.literal-block{white-space:pre;margin:0;padding:12px;font-family:SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,Courier,monospace;display:block;overflow:auto}.rst-content div[class^=highlight],.rst-content pre.literal-block{border:1px solid #e1e4e5;overflow-x:auto;margin:1px 0 24px}.rst-content div[class^=highlight] div[class^=highlight],.rst-content pre.literal-block div[class^=highlight]{padding:0;border:none;margin:0}.rst-content div[class^=highlight] td.code{width:100%}.rst-content .linenodiv pre{border-right:1px solid #e6e9ea;margin:0;padding:12px;font-family:SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,Courier,monospace;user-select:none;pointer-events:none}.rst-content div[class^=highlight] pre{white-space:pre;margin:0;padding:12px;display:block;overflow:auto}.rst-content div[class^=highlight] pre .hll{display:block;margin:0 -12px;padding:0 12px}.rst-content .linenodiv pre,.rst-content div[class^=highlight] pre,.rst-content pre.literal-block{font-family:SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,Courier,monospace;font-size:12px;line-height:1.4}.rst-content div.highlight .gp,.rst-content div.highlight span.linenos{user-select:none;pointer-events:none}.rst-content div.highlight span.linenos{display:inline-block;padding-left:0;padding-right:12px;margin-right:12px;border-right:1px solid #e6e9ea}.rst-content .code-block-caption{font-style:italic;font-size:85%;line-height:1;padding:1em 0;text-align:center}@media print{.rst-content .codeblock,.rst-content div[class^=highlight],.rst-content div[class^=highlight] pre{white-space:pre-wrap}}.rst-content .admonition,.rst-content .admonition-todo,.rst-content .attention,.rst-content .caution,.rst-content .danger,.rst-content .error,.rst-content .hint,.rst-content .important,.rst-content .note,.rst-content .seealso,.rst-content .tip,.rst-content .warning{clear:both}.rst-content .admonition-todo .last,.rst-content .admonition-todo>:last-child,.rst-content .admonition .last,.rst-content .admonition>:last-child,.rst-content .attention .last,.rst-content .attention>:last-child,.rst-content .caution .last,.rst-content .caution>:last-child,.rst-content .danger .last,.rst-content .danger>:last-child,.rst-content .error .last,.rst-content .error>:last-child,.rst-content .hint .last,.rst-content .hint>:last-child,.rst-content .important .last,.rst-content .important>:last-child,.rst-content .note .last,.rst-content .note>:last-child,.rst-content .seealso 
.last,.rst-content .seealso>:last-child,.rst-content .tip .last,.rst-content .tip>:last-child,.rst-content .warning .last,.rst-content .warning>:last-child{margin-bottom:0}.rst-content .admonition-title:before{margin-right:4px}.rst-content .admonition table{border-color:rgba(0,0,0,.1)}.rst-content .admonition table td,.rst-content .admonition table th{background:transparent!important;border-color:rgba(0,0,0,.1)!important}.rst-content .section ol.loweralpha,.rst-content .section ol.loweralpha>li,.rst-content .toctree-wrapper ol.loweralpha,.rst-content .toctree-wrapper ol.loweralpha>li,.rst-content section ol.loweralpha,.rst-content section ol.loweralpha>li{list-style:lower-alpha}.rst-content .section ol.upperalpha,.rst-content .section ol.upperalpha>li,.rst-content .toctree-wrapper ol.upperalpha,.rst-content .toctree-wrapper ol.upperalpha>li,.rst-content section ol.upperalpha,.rst-content section ol.upperalpha>li{list-style:upper-alpha}.rst-content .section ol li>*,.rst-content .section ul li>*,.rst-content .toctree-wrapper ol li>*,.rst-content .toctree-wrapper ul li>*,.rst-content section ol li>*,.rst-content section ul li>*{margin-top:12px;margin-bottom:12px}.rst-content .section ol li>:first-child,.rst-content .section ul li>:first-child,.rst-content .toctree-wrapper ol li>:first-child,.rst-content .toctree-wrapper ul li>:first-child,.rst-content section ol li>:first-child,.rst-content section ul li>:first-child{margin-top:0}.rst-content .section ol li>p,.rst-content .section ol li>p:last-child,.rst-content .section ul li>p,.rst-content .section ul li>p:last-child,.rst-content .toctree-wrapper ol li>p,.rst-content .toctree-wrapper ol li>p:last-child,.rst-content .toctree-wrapper ul li>p,.rst-content .toctree-wrapper ul li>p:last-child,.rst-content section ol li>p,.rst-content section ol li>p:last-child,.rst-content section ul li>p,.rst-content section ul li>p:last-child{margin-bottom:12px}.rst-content .section ol li>p:only-child,.rst-content .section ol li>p:only-child:last-child,.rst-content .section ul li>p:only-child,.rst-content .section ul li>p:only-child:last-child,.rst-content .toctree-wrapper ol li>p:only-child,.rst-content .toctree-wrapper ol li>p:only-child:last-child,.rst-content .toctree-wrapper ul li>p:only-child,.rst-content .toctree-wrapper ul li>p:only-child:last-child,.rst-content section ol li>p:only-child,.rst-content section ol li>p:only-child:last-child,.rst-content section ul li>p:only-child,.rst-content section ul li>p:only-child:last-child{margin-bottom:0}.rst-content .section ol li>ol,.rst-content .section ol li>ul,.rst-content .section ul li>ol,.rst-content .section ul li>ul,.rst-content .toctree-wrapper ol li>ol,.rst-content .toctree-wrapper ol li>ul,.rst-content .toctree-wrapper ul li>ol,.rst-content .toctree-wrapper ul li>ul,.rst-content section ol li>ol,.rst-content section ol li>ul,.rst-content section ul li>ol,.rst-content section ul li>ul{margin-bottom:12px}.rst-content .section ol.simple li>*,.rst-content .section ol.simple li ol,.rst-content .section ol.simple li ul,.rst-content .section ul.simple li>*,.rst-content .section ul.simple li ol,.rst-content .section ul.simple li ul,.rst-content .toctree-wrapper ol.simple li>*,.rst-content .toctree-wrapper ol.simple li ol,.rst-content .toctree-wrapper ol.simple li ul,.rst-content .toctree-wrapper ul.simple li>*,.rst-content .toctree-wrapper ul.simple li ol,.rst-content .toctree-wrapper ul.simple li ul,.rst-content section ol.simple li>*,.rst-content section ol.simple li ol,.rst-content section ol.simple li 
ul,.rst-content section ul.simple li>*,.rst-content section ul.simple li ol,.rst-content section ul.simple li ul{margin-top:0;margin-bottom:0}.rst-content .line-block{margin-left:0;margin-bottom:24px;line-height:24px}.rst-content .line-block .line-block{margin-left:24px;margin-bottom:0}.rst-content .topic-title{font-weight:700;margin-bottom:12px}.rst-content .toc-backref{color:#404040}.rst-content .align-right{float:right;margin:0 0 24px 24px}.rst-content .align-left{float:left;margin:0 24px 24px 0}.rst-content .align-center{margin:auto}.rst-content .align-center:not(table){display:block}.rst-content .code-block-caption .headerlink,.rst-content .eqno .headerlink,.rst-content .toctree-wrapper>p.caption .headerlink,.rst-content dl dt .headerlink,.rst-content h1 .headerlink,.rst-content h2 .headerlink,.rst-content h3 .headerlink,.rst-content h4 .headerlink,.rst-content h5 .headerlink,.rst-content h6 .headerlink,.rst-content p.caption .headerlink,.rst-content p .headerlink,.rst-content table>caption .headerlink{opacity:0;font-size:14px;font-family:FontAwesome;margin-left:.5em}.rst-content .code-block-caption .headerlink:focus,.rst-content .code-block-caption:hover .headerlink,.rst-content .eqno .headerlink:focus,.rst-content .eqno:hover .headerlink,.rst-content .toctree-wrapper>p.caption .headerlink:focus,.rst-content .toctree-wrapper>p.caption:hover .headerlink,.rst-content dl dt .headerlink:focus,.rst-content dl dt:hover .headerlink,.rst-content h1 .headerlink:focus,.rst-content h1:hover .headerlink,.rst-content h2 .headerlink:focus,.rst-content h2:hover .headerlink,.rst-content h3 .headerlink:focus,.rst-content h3:hover .headerlink,.rst-content h4 .headerlink:focus,.rst-content h4:hover .headerlink,.rst-content h5 .headerlink:focus,.rst-content h5:hover .headerlink,.rst-content h6 .headerlink:focus,.rst-content h6:hover .headerlink,.rst-content p.caption .headerlink:focus,.rst-content p.caption:hover .headerlink,.rst-content p .headerlink:focus,.rst-content p:hover .headerlink,.rst-content table>caption .headerlink:focus,.rst-content table>caption:hover .headerlink{opacity:1}.rst-content p a{overflow-wrap:anywhere}.rst-content .wy-table td p,.rst-content .wy-table td ul,.rst-content .wy-table th p,.rst-content .wy-table th ul,.rst-content table.docutils td p,.rst-content table.docutils td ul,.rst-content table.docutils th p,.rst-content table.docutils th ul,.rst-content table.field-list td p,.rst-content table.field-list td ul,.rst-content table.field-list th p,.rst-content table.field-list th ul{font-size:inherit}.rst-content .btn:focus{outline:2px solid}.rst-content table>caption .headerlink:after{font-size:12px}.rst-content .centered{text-align:center}.rst-content .sidebar{float:right;width:40%;display:block;margin:0 0 24px 24px;padding:24px;background:#f3f6f6;border:1px solid #e1e4e5}.rst-content .sidebar dl,.rst-content .sidebar p,.rst-content .sidebar ul{font-size:90%}.rst-content .sidebar .last,.rst-content .sidebar>:last-child{margin-bottom:0}.rst-content .sidebar .sidebar-title{display:block;font-family:Roboto Slab,ff-tisa-web-pro,Georgia,Arial,sans-serif;font-weight:700;background:#e1e4e5;padding:6px 12px;margin:-24px -24px 24px;font-size:100%}.rst-content .highlighted{background:#f1c40f;box-shadow:0 0 0 2px #f1c40f;display:inline;font-weight:700}.rst-content .citation-reference,.rst-content .footnote-reference{vertical-align:baseline;position:relative;top:-.4em;line-height:0;font-size:90%}.rst-content .citation-reference>span.fn-bracket,.rst-content 
.footnote-reference>span.fn-bracket{display:none}.rst-content .hlist{width:100%}.rst-content dl dt span.classifier:before{content:" : "}.rst-content dl dt span.classifier-delimiter{display:none!important}html.writer-html4 .rst-content table.docutils.citation,html.writer-html4 .rst-content table.docutils.footnote{background:none;border:none}html.writer-html4 .rst-content table.docutils.citation td,html.writer-html4 .rst-content table.docutils.citation tr,html.writer-html4 .rst-content table.docutils.footnote td,html.writer-html4 .rst-content table.docutils.footnote tr{border:none;background-color:transparent!important;white-space:normal}html.writer-html4 .rst-content table.docutils.citation td.label,html.writer-html4 .rst-content table.docutils.footnote td.label{padding-left:0;padding-right:0;vertical-align:top}html.writer-html5 .rst-content dl.citation,html.writer-html5 .rst-content dl.field-list,html.writer-html5 .rst-content dl.footnote{display:grid;grid-template-columns:auto minmax(80%,95%)}html.writer-html5 .rst-content dl.citation>dt,html.writer-html5 .rst-content dl.field-list>dt,html.writer-html5 .rst-content dl.footnote>dt{display:inline-grid;grid-template-columns:max-content auto}html.writer-html5 .rst-content aside.citation,html.writer-html5 .rst-content aside.footnote,html.writer-html5 .rst-content div.citation{display:grid;grid-template-columns:auto auto minmax(.65rem,auto) minmax(40%,95%)}html.writer-html5 .rst-content aside.citation>span.label,html.writer-html5 .rst-content aside.footnote>span.label,html.writer-html5 .rst-content div.citation>span.label{grid-column-start:1;grid-column-end:2}html.writer-html5 .rst-content aside.citation>span.backrefs,html.writer-html5 .rst-content aside.footnote>span.backrefs,html.writer-html5 .rst-content div.citation>span.backrefs{grid-column-start:2;grid-column-end:3;grid-row-start:1;grid-row-end:3}html.writer-html5 .rst-content aside.citation>p,html.writer-html5 .rst-content aside.footnote>p,html.writer-html5 .rst-content div.citation>p{grid-column-start:4;grid-column-end:5}html.writer-html5 .rst-content dl.citation,html.writer-html5 .rst-content dl.field-list,html.writer-html5 .rst-content dl.footnote{margin-bottom:24px}html.writer-html5 .rst-content dl.citation>dt,html.writer-html5 .rst-content dl.field-list>dt,html.writer-html5 .rst-content dl.footnote>dt{padding-left:1rem}html.writer-html5 .rst-content dl.citation>dd,html.writer-html5 .rst-content dl.citation>dt,html.writer-html5 .rst-content dl.field-list>dd,html.writer-html5 .rst-content dl.field-list>dt,html.writer-html5 .rst-content dl.footnote>dd,html.writer-html5 .rst-content dl.footnote>dt{margin-bottom:0}html.writer-html5 .rst-content dl.citation,html.writer-html5 .rst-content dl.footnote{font-size:.9rem}html.writer-html5 .rst-content dl.citation>dt,html.writer-html5 .rst-content dl.footnote>dt{margin:0 .5rem .5rem 0;line-height:1.2rem;word-break:break-all;font-weight:400}html.writer-html5 .rst-content dl.citation>dt>span.brackets:before,html.writer-html5 .rst-content dl.footnote>dt>span.brackets:before{content:"["}html.writer-html5 .rst-content dl.citation>dt>span.brackets:after,html.writer-html5 .rst-content dl.footnote>dt>span.brackets:after{content:"]"}html.writer-html5 .rst-content dl.citation>dt>span.fn-backref,html.writer-html5 .rst-content dl.footnote>dt>span.fn-backref{text-align:left;font-style:italic;margin-left:.65rem;word-break:break-word;word-spacing:-.1rem;max-width:5rem}html.writer-html5 .rst-content dl.citation>dt>span.fn-backref>a,html.writer-html5 
.rst-content dl.footnote>dt>span.fn-backref>a{word-break:keep-all}html.writer-html5 .rst-content dl.citation>dt>span.fn-backref>a:not(:first-child):before,html.writer-html5 .rst-content dl.footnote>dt>span.fn-backref>a:not(:first-child):before{content:" "}html.writer-html5 .rst-content dl.citation>dd,html.writer-html5 .rst-content dl.footnote>dd{margin:0 0 .5rem;line-height:1.2rem}html.writer-html5 .rst-content dl.citation>dd p,html.writer-html5 .rst-content dl.footnote>dd p{font-size:.9rem}html.writer-html5 .rst-content aside.citation,html.writer-html5 .rst-content aside.footnote,html.writer-html5 .rst-content div.citation{padding-left:1rem;padding-right:1rem;font-size:.9rem;line-height:1.2rem}html.writer-html5 .rst-content aside.citation p,html.writer-html5 .rst-content aside.footnote p,html.writer-html5 .rst-content div.citation p{font-size:.9rem;line-height:1.2rem;margin-bottom:12px}html.writer-html5 .rst-content aside.citation span.backrefs,html.writer-html5 .rst-content aside.footnote span.backrefs,html.writer-html5 .rst-content div.citation span.backrefs{text-align:left;font-style:italic;margin-left:.65rem;word-break:break-word;word-spacing:-.1rem;max-width:5rem}html.writer-html5 .rst-content aside.citation span.backrefs>a,html.writer-html5 .rst-content aside.footnote span.backrefs>a,html.writer-html5 .rst-content div.citation span.backrefs>a{word-break:keep-all}html.writer-html5 .rst-content aside.citation span.backrefs>a:not(:first-child):before,html.writer-html5 .rst-content aside.footnote span.backrefs>a:not(:first-child):before,html.writer-html5 .rst-content div.citation span.backrefs>a:not(:first-child):before{content:" "}html.writer-html5 .rst-content aside.citation span.label,html.writer-html5 .rst-content aside.footnote span.label,html.writer-html5 .rst-content div.citation span.label{line-height:1.2rem}html.writer-html5 .rst-content aside.citation-list,html.writer-html5 .rst-content aside.footnote-list,html.writer-html5 .rst-content div.citation-list{margin-bottom:24px}html.writer-html5 .rst-content dl.option-list kbd{font-size:.9rem}.rst-content table.docutils.footnote,html.writer-html4 .rst-content table.docutils.citation,html.writer-html5 .rst-content aside.footnote,html.writer-html5 .rst-content aside.footnote-list aside.footnote,html.writer-html5 .rst-content div.citation-list>div.citation,html.writer-html5 .rst-content dl.citation,html.writer-html5 .rst-content dl.footnote{color:grey}.rst-content table.docutils.footnote code,.rst-content table.docutils.footnote tt,html.writer-html4 .rst-content table.docutils.citation code,html.writer-html4 .rst-content table.docutils.citation tt,html.writer-html5 .rst-content aside.footnote-list aside.footnote code,html.writer-html5 .rst-content aside.footnote-list aside.footnote tt,html.writer-html5 .rst-content aside.footnote code,html.writer-html5 .rst-content aside.footnote tt,html.writer-html5 .rst-content div.citation-list>div.citation code,html.writer-html5 .rst-content div.citation-list>div.citation tt,html.writer-html5 .rst-content dl.citation code,html.writer-html5 .rst-content dl.citation tt,html.writer-html5 .rst-content dl.footnote code,html.writer-html5 .rst-content dl.footnote tt{color:#555}.rst-content .wy-table-responsive.citation,.rst-content .wy-table-responsive.footnote{margin-bottom:0}.rst-content .wy-table-responsive.citation+:not(.citation),.rst-content .wy-table-responsive.footnote+:not(.footnote){margin-top:24px}.rst-content .wy-table-responsive.citation:last-child,.rst-content 
.wy-table-responsive.footnote:last-child{margin-bottom:24px}.rst-content table.docutils th{border-color:#e1e4e5}html.writer-html5 .rst-content table.docutils th{border:1px solid #e1e4e5}html.writer-html5 .rst-content table.docutils td>p,html.writer-html5 .rst-content table.docutils th>p{line-height:1rem;margin-bottom:0;font-size:.9rem}.rst-content table.docutils td .last,.rst-content table.docutils td .last>:last-child{margin-bottom:0}.rst-content table.field-list,.rst-content table.field-list td{border:none}.rst-content table.field-list td p{line-height:inherit}.rst-content table.field-list td>strong{display:inline-block}.rst-content table.field-list .field-name{padding-right:10px;text-align:left;white-space:nowrap}.rst-content table.field-list .field-body{text-align:left}.rst-content code,.rst-content tt{color:#000;font-family:SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,Courier,monospace;padding:2px 5px}.rst-content code big,.rst-content code em,.rst-content tt big,.rst-content tt em{font-size:100%!important;line-height:normal}.rst-content code.literal,.rst-content tt.literal{color:#e74c3c;white-space:normal}.rst-content code.xref,.rst-content tt.xref,a .rst-content code,a .rst-content tt{font-weight:700;color:#404040;overflow-wrap:normal}.rst-content kbd,.rst-content pre,.rst-content samp{font-family:SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,Courier,monospace}.rst-content a code,.rst-content a tt{color:#2980b9}.rst-content dl{margin-bottom:24px}.rst-content dl dt{font-weight:700;margin-bottom:12px}.rst-content dl ol,.rst-content dl p,.rst-content dl table,.rst-content dl ul{margin-bottom:12px}.rst-content dl dd{margin:0 0 12px 24px;line-height:24px}.rst-content dl dd>ol:last-child,.rst-content dl dd>p:last-child,.rst-content dl dd>table:last-child,.rst-content dl dd>ul:last-child{margin-bottom:0}html.writer-html4 .rst-content dl:not(.docutils),html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple){margin-bottom:24px}html.writer-html4 .rst-content dl:not(.docutils)>dt,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple)>dt{display:table;margin:6px 0;font-size:90%;line-height:normal;background:#e7f2fa;color:#2980b9;border-top:3px solid #6ab0de;padding:6px;position:relative}html.writer-html4 .rst-content dl:not(.docutils)>dt:before,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple)>dt:before{color:#6ab0de}html.writer-html4 .rst-content dl:not(.docutils)>dt .headerlink,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple)>dt .headerlink{color:#404040;font-size:100%!important}html.writer-html4 .rst-content dl:not(.docutils) dl:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple)>dt,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) dl:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple)>dt{margin-bottom:6px;border:none;border-left:3px solid #ccc;background:#f0f0f0;color:#555}html.writer-html4 .rst-content dl:not(.docutils) dl:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple)>dt .headerlink,html.writer-html5 .rst-content 
dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) dl:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple)>dt .headerlink{color:#404040;font-size:100%!important}html.writer-html4 .rst-content dl:not(.docutils)>dt:first-child,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple)>dt:first-child{margin-top:0}html.writer-html4 .rst-content dl:not(.docutils) code.descclassname,html.writer-html4 .rst-content dl:not(.docutils) code.descname,html.writer-html4 .rst-content dl:not(.docutils) tt.descclassname,html.writer-html4 .rst-content dl:not(.docutils) tt.descname,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) code.descclassname,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) code.descname,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) tt.descclassname,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) tt.descname{background-color:transparent;border:none;padding:0;font-size:100%!important}html.writer-html4 .rst-content dl:not(.docutils) code.descname,html.writer-html4 .rst-content dl:not(.docutils) tt.descname,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) code.descname,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) tt.descname{font-weight:700}html.writer-html4 .rst-content dl:not(.docutils) .optional,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) .optional{display:inline-block;padding:0 4px;color:#000;font-weight:700}html.writer-html4 .rst-content dl:not(.docutils) .property,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) .property{display:inline-block;padding-right:8px;max-width:100%}html.writer-html4 .rst-content dl:not(.docutils) .k,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) .k{font-style:italic}html.writer-html4 .rst-content dl:not(.docutils) .descclassname,html.writer-html4 .rst-content dl:not(.docutils) .descname,html.writer-html4 .rst-content dl:not(.docutils) .sig-name,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) .descclassname,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) .descname,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) .sig-name{font-family:SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,Courier,monospace;color:#000}.rst-content .viewcode-back,.rst-content .viewcode-link{display:inline-block;color:#27ae60;font-size:80%;padding-left:24px}.rst-content .viewcode-back{display:block;float:right}.rst-content p.rubric{margin-bottom:12px;font-weight:700}.rst-content 
code.download,.rst-content tt.download{background:inherit;padding:inherit;font-weight:400;font-family:inherit;font-size:inherit;color:inherit;border:inherit;white-space:inherit}.rst-content code.download span:first-child,.rst-content tt.download span:first-child{-webkit-font-smoothing:subpixel-antialiased}.rst-content code.download span:first-child:before,.rst-content tt.download span:first-child:before{margin-right:4px}.rst-content .guilabel,.rst-content .menuselection{font-size:80%;font-weight:700;border-radius:4px;padding:2.4px 6px;margin:auto 2px}.rst-content .guilabel,.rst-content .menuselection{border:1px solid #7fbbe3;background:#e7f2fa}.rst-content :not(dl.option-list)>:not(dt):not(kbd):not(.kbd)>.kbd,.rst-content :not(dl.option-list)>:not(dt):not(kbd):not(.kbd)>kbd{color:inherit;font-size:80%;background-color:#fff;border:1px solid #a6a6a6;border-radius:4px;box-shadow:0 2px grey;padding:2.4px 6px;margin:auto 0}.rst-content .versionmodified{font-style:italic}@media screen and (max-width:480px){.rst-content .sidebar{width:100%}}span[id*=MathJax-Span]{color:#404040}.math{text-align:center}@font-face{font-family:Lato;src:url(fonts/lato-normal.woff2?bd03a2cc277bbbc338d464e679fe9942) format("woff2"),url(fonts/lato-normal.woff?27bd77b9162d388cb8d4c4217c7c5e2a) format("woff");font-weight:400;font-style:normal;font-display:block}@font-face{font-family:Lato;src:url(fonts/lato-bold.woff2?cccb897485813c7c256901dbca54ecf2) format("woff2"),url(fonts/lato-bold.woff?d878b6c29b10beca227e9eef4246111b) format("woff");font-weight:700;font-style:normal;font-display:block}@font-face{font-family:Lato;src:url(fonts/lato-bold-italic.woff2?0b6bb6725576b072c5d0b02ecdd1900d) format("woff2"),url(fonts/lato-bold-italic.woff?9c7e4e9eb485b4a121c760e61bc3707c) format("woff");font-weight:700;font-style:italic;font-display:block}@font-face{font-family:Lato;src:url(fonts/lato-normal-italic.woff2?4eb103b4d12be57cb1d040ed5e162e9d) format("woff2"),url(fonts/lato-normal-italic.woff?f28f2d6482446544ef1ea1ccc6dd5892) format("woff");font-weight:400;font-style:italic;font-display:block}@font-face{font-family:Roboto Slab;font-style:normal;font-weight:400;src:url(fonts/Roboto-Slab-Regular.woff2?7abf5b8d04d26a2cafea937019bca958) format("woff2"),url(fonts/Roboto-Slab-Regular.woff?c1be9284088d487c5e3ff0a10a92e58c) format("woff");font-display:block}@font-face{font-family:Roboto Slab;font-style:normal;font-weight:700;src:url(fonts/Roboto-Slab-Bold.woff2?9984f4a9bda09be08e83f2506954adbe) format("woff2"),url(fonts/Roboto-Slab-Bold.woff?bed5564a116b05148e3b3bea6fb1162a) format("woff");font-display:block} \ No newline at end of file diff --git a/_static/doctools.js b/_static/doctools.js new file mode 100644 index 000000000..d06a71d75 --- /dev/null +++ b/_static/doctools.js @@ -0,0 +1,156 @@ +/* + * doctools.js + * ~~~~~~~~~~~ + * + * Base JavaScript utilities for all Sphinx HTML documentation. + * + * :copyright: Copyright 2007-2023 by the Sphinx team, see AUTHORS. + * :license: BSD, see LICENSE for details. + * + */ +"use strict"; + +const BLACKLISTED_KEY_CONTROL_ELEMENTS = new Set([ + "TEXTAREA", + "INPUT", + "SELECT", + "BUTTON", +]); + +const _ready = (callback) => { + if (document.readyState !== "loading") { + callback(); + } else { + document.addEventListener("DOMContentLoaded", callback); + } +}; + +/** + * Small JavaScript module for the documentation. 
+ */ +const Documentation = { + init: () => { + Documentation.initDomainIndexTable(); + Documentation.initOnKeyListeners(); + }, + + /** + * i18n support + */ + TRANSLATIONS: {}, + PLURAL_EXPR: (n) => (n === 1 ? 0 : 1), + LOCALE: "unknown", + + // gettext and ngettext don't access this so that the functions + // can safely bound to a different name (_ = Documentation.gettext) + gettext: (string) => { + const translated = Documentation.TRANSLATIONS[string]; + switch (typeof translated) { + case "undefined": + return string; // no translation + case "string": + return translated; // translation exists + default: + return translated[0]; // (singular, plural) translation tuple exists + } + }, + + ngettext: (singular, plural, n) => { + const translated = Documentation.TRANSLATIONS[singular]; + if (typeof translated !== "undefined") + return translated[Documentation.PLURAL_EXPR(n)]; + return n === 1 ? singular : plural; + }, + + addTranslations: (catalog) => { + Object.assign(Documentation.TRANSLATIONS, catalog.messages); + Documentation.PLURAL_EXPR = new Function( + "n", + `return (${catalog.plural_expr})` + ); + Documentation.LOCALE = catalog.locale; + }, + + /** + * helper function to focus on search bar + */ + focusSearchBar: () => { + document.querySelectorAll("input[name=q]")[0]?.focus(); + }, + + /** + * Initialise the domain index toggle buttons + */ + initDomainIndexTable: () => { + const toggler = (el) => { + const idNumber = el.id.substr(7); + const toggledRows = document.querySelectorAll(`tr.cg-${idNumber}`); + if (el.src.substr(-9) === "minus.png") { + el.src = `${el.src.substr(0, el.src.length - 9)}plus.png`; + toggledRows.forEach((el) => (el.style.display = "none")); + } else { + el.src = `${el.src.substr(0, el.src.length - 8)}minus.png`; + toggledRows.forEach((el) => (el.style.display = "")); + } + }; + + const togglerElements = document.querySelectorAll("img.toggler"); + togglerElements.forEach((el) => + el.addEventListener("click", (event) => toggler(event.currentTarget)) + ); + togglerElements.forEach((el) => (el.style.display = "")); + if (DOCUMENTATION_OPTIONS.COLLAPSE_INDEX) togglerElements.forEach(toggler); + }, + + initOnKeyListeners: () => { + // only install a listener if it is really needed + if ( + !DOCUMENTATION_OPTIONS.NAVIGATION_WITH_KEYS && + !DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS + ) + return; + + document.addEventListener("keydown", (event) => { + // bail for input elements + if (BLACKLISTED_KEY_CONTROL_ELEMENTS.has(document.activeElement.tagName)) return; + // bail with special keys + if (event.altKey || event.ctrlKey || event.metaKey) return; + + if (!event.shiftKey) { + switch (event.key) { + case "ArrowLeft": + if (!DOCUMENTATION_OPTIONS.NAVIGATION_WITH_KEYS) break; + + const prevLink = document.querySelector('link[rel="prev"]'); + if (prevLink && prevLink.href) { + window.location.href = prevLink.href; + event.preventDefault(); + } + break; + case "ArrowRight": + if (!DOCUMENTATION_OPTIONS.NAVIGATION_WITH_KEYS) break; + + const nextLink = document.querySelector('link[rel="next"]'); + if (nextLink && nextLink.href) { + window.location.href = nextLink.href; + event.preventDefault(); + } + break; + } + } + + // some keyboard layouts may need Shift to get / + switch (event.key) { + case "/": + if (!DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS) break; + Documentation.focusSearchBar(); + event.preventDefault(); + } + }); + }, +}; + +// quick alias for translations +const _ = Documentation.gettext; + +_ready(Documentation.init); diff --git 
a/_static/documentation_options.js b/_static/documentation_options.js new file mode 100644 index 000000000..ac1f83182 --- /dev/null +++ b/_static/documentation_options.js @@ -0,0 +1,14 @@ +var DOCUMENTATION_OPTIONS = { + URL_ROOT: document.getElementById("documentation_options").getAttribute('data-url_root'), + VERSION: 'master', + LANGUAGE: 'en', + COLLAPSE_INDEX: false, + BUILDER: 'html', + FILE_SUFFIX: '.html', + LINK_SUFFIX: '.html', + HAS_SOURCE: true, + SOURCELINK_SUFFIX: '.txt', + NAVIGATION_WITH_KEYS: false, + SHOW_SEARCH_SUMMARY: true, + ENABLE_SEARCH_SHORTCUTS: true, +}; \ No newline at end of file diff --git a/_static/file.png b/_static/file.png new file mode 100644 index 000000000..a858a410e Binary files /dev/null and b/_static/file.png differ diff --git a/_static/jquery.js b/_static/jquery.js new file mode 100644 index 000000000..c4c6022f2 --- /dev/null +++ b/_static/jquery.js @@ -0,0 +1,2 @@ +/*! jQuery v3.6.0 | (c) OpenJS Foundation and other contributors | jquery.org/license */ +!function(e,t){"use strict";"object"==typeof module&&"object"==typeof module.exports?module.exports=e.document?t(e,!0):function(e){if(!e.document)throw new Error("jQuery requires a window with a document");return t(e)}:t(e)}("undefined"!=typeof window?window:this,function(C,e){"use strict";var t=[],r=Object.getPrototypeOf,s=t.slice,g=t.flat?function(e){return t.flat.call(e)}:function(e){return t.concat.apply([],e)},u=t.push,i=t.indexOf,n={},o=n.toString,v=n.hasOwnProperty,a=v.toString,l=a.call(Object),y={},m=function(e){return"function"==typeof e&&"number"!=typeof e.nodeType&&"function"!=typeof e.item},x=function(e){return null!=e&&e===e.window},E=C.document,c={type:!0,src:!0,nonce:!0,noModule:!0};function b(e,t,n){var r,i,o=(n=n||E).createElement("script");if(o.text=e,t)for(r in c)(i=t[r]||t.getAttribute&&t.getAttribute(r))&&o.setAttribute(r,i);n.head.appendChild(o).parentNode.removeChild(o)}function w(e){return null==e?e+"":"object"==typeof e||"function"==typeof e?n[o.call(e)]||"object":typeof e}var f="3.6.0",S=function(e,t){return new S.fn.init(e,t)};function p(e){var t=!!e&&"length"in e&&e.length,n=w(e);return!m(e)&&!x(e)&&("array"===n||0===t||"number"==typeof t&&0+~]|"+M+")"+M+"*"),U=new RegExp(M+"|>"),X=new RegExp(F),V=new RegExp("^"+I+"$"),G={ID:new RegExp("^#("+I+")"),CLASS:new RegExp("^\\.("+I+")"),TAG:new RegExp("^("+I+"|[*])"),ATTR:new RegExp("^"+W),PSEUDO:new RegExp("^"+F),CHILD:new RegExp("^:(only|first|last|nth|nth-last)-(child|of-type)(?:\\("+M+"*(even|odd|(([+-]|)(\\d*)n|)"+M+"*(?:([+-]|)"+M+"*(\\d+)|))"+M+"*\\)|)","i"),bool:new RegExp("^(?:"+R+")$","i"),needsContext:new RegExp("^"+M+"*[>+~]|:(even|odd|eq|gt|lt|nth|first|last)(?:\\("+M+"*((?:-\\d)?\\d*)"+M+"*\\)|)(?=[^-]|$)","i")},Y=/HTML$/i,Q=/^(?:input|select|textarea|button)$/i,J=/^h\d$/i,K=/^[^{]+\{\s*\[native \w/,Z=/^(?:#([\w-]+)|(\w+)|\.([\w-]+))$/,ee=/[+~]/,te=new RegExp("\\\\[\\da-fA-F]{1,6}"+M+"?|\\\\([^\\r\\n\\f])","g"),ne=function(e,t){var n="0x"+e.slice(1)-65536;return t||(n<0?String.fromCharCode(n+65536):String.fromCharCode(n>>10|55296,1023&n|56320))},re=/([\0-\x1f\x7f]|^-?\d)|^-$|[^\0-\x1f\x7f-\uFFFF\w-]/g,ie=function(e,t){return t?"\0"===e?"\ufffd":e.slice(0,-1)+"\\"+e.charCodeAt(e.length-1).toString(16)+" ":"\\"+e},oe=function(){T()},ae=be(function(e){return!0===e.disabled&&"fieldset"===e.nodeName.toLowerCase()},{dir:"parentNode",next:"legend"});try{H.apply(t=O.call(p.childNodes),p.childNodes),t[p.childNodes.length].nodeType}catch(e){H={apply:t.length?function(e,t){L.apply(e,O.call(t))}:function(e,t){var 
n=e.length,r=0;while(e[n++]=t[r++]);e.length=n-1}}}function se(t,e,n,r){var i,o,a,s,u,l,c,f=e&&e.ownerDocument,p=e?e.nodeType:9;if(n=n||[],"string"!=typeof t||!t||1!==p&&9!==p&&11!==p)return n;if(!r&&(T(e),e=e||C,E)){if(11!==p&&(u=Z.exec(t)))if(i=u[1]){if(9===p){if(!(a=e.getElementById(i)))return n;if(a.id===i)return n.push(a),n}else if(f&&(a=f.getElementById(i))&&y(e,a)&&a.id===i)return n.push(a),n}else{if(u[2])return H.apply(n,e.getElementsByTagName(t)),n;if((i=u[3])&&d.getElementsByClassName&&e.getElementsByClassName)return H.apply(n,e.getElementsByClassName(i)),n}if(d.qsa&&!N[t+" "]&&(!v||!v.test(t))&&(1!==p||"object"!==e.nodeName.toLowerCase())){if(c=t,f=e,1===p&&(U.test(t)||z.test(t))){(f=ee.test(t)&&ye(e.parentNode)||e)===e&&d.scope||((s=e.getAttribute("id"))?s=s.replace(re,ie):e.setAttribute("id",s=S)),o=(l=h(t)).length;while(o--)l[o]=(s?"#"+s:":scope")+" "+xe(l[o]);c=l.join(",")}try{return H.apply(n,f.querySelectorAll(c)),n}catch(e){N(t,!0)}finally{s===S&&e.removeAttribute("id")}}}return g(t.replace($,"$1"),e,n,r)}function ue(){var r=[];return function e(t,n){return r.push(t+" ")>b.cacheLength&&delete e[r.shift()],e[t+" "]=n}}function le(e){return e[S]=!0,e}function ce(e){var t=C.createElement("fieldset");try{return!!e(t)}catch(e){return!1}finally{t.parentNode&&t.parentNode.removeChild(t),t=null}}function fe(e,t){var n=e.split("|"),r=n.length;while(r--)b.attrHandle[n[r]]=t}function pe(e,t){var n=t&&e,r=n&&1===e.nodeType&&1===t.nodeType&&e.sourceIndex-t.sourceIndex;if(r)return r;if(n)while(n=n.nextSibling)if(n===t)return-1;return e?1:-1}function de(t){return function(e){return"input"===e.nodeName.toLowerCase()&&e.type===t}}function he(n){return function(e){var t=e.nodeName.toLowerCase();return("input"===t||"button"===t)&&e.type===n}}function ge(t){return function(e){return"form"in e?e.parentNode&&!1===e.disabled?"label"in e?"label"in e.parentNode?e.parentNode.disabled===t:e.disabled===t:e.isDisabled===t||e.isDisabled!==!t&&ae(e)===t:e.disabled===t:"label"in e&&e.disabled===t}}function ve(a){return le(function(o){return o=+o,le(function(e,t){var n,r=a([],e.length,o),i=r.length;while(i--)e[n=r[i]]&&(e[n]=!(t[n]=e[n]))})})}function ye(e){return e&&"undefined"!=typeof e.getElementsByTagName&&e}for(e in d=se.support={},i=se.isXML=function(e){var t=e&&e.namespaceURI,n=e&&(e.ownerDocument||e).documentElement;return!Y.test(t||n&&n.nodeName||"HTML")},T=se.setDocument=function(e){var t,n,r=e?e.ownerDocument||e:p;return r!=C&&9===r.nodeType&&r.documentElement&&(a=(C=r).documentElement,E=!i(C),p!=C&&(n=C.defaultView)&&n.top!==n&&(n.addEventListener?n.addEventListener("unload",oe,!1):n.attachEvent&&n.attachEvent("onunload",oe)),d.scope=ce(function(e){return a.appendChild(e).appendChild(C.createElement("div")),"undefined"!=typeof e.querySelectorAll&&!e.querySelectorAll(":scope fieldset div").length}),d.attributes=ce(function(e){return e.className="i",!e.getAttribute("className")}),d.getElementsByTagName=ce(function(e){return e.appendChild(C.createComment("")),!e.getElementsByTagName("*").length}),d.getElementsByClassName=K.test(C.getElementsByClassName),d.getById=ce(function(e){return a.appendChild(e).id=S,!C.getElementsByName||!C.getElementsByName(S).length}),d.getById?(b.filter.ID=function(e){var t=e.replace(te,ne);return function(e){return e.getAttribute("id")===t}},b.find.ID=function(e,t){if("undefined"!=typeof t.getElementById&&E){var n=t.getElementById(e);return n?[n]:[]}}):(b.filter.ID=function(e){var n=e.replace(te,ne);return function(e){var t="undefined"!=typeof 
e.getAttributeNode&&e.getAttributeNode("id");return t&&t.value===n}},b.find.ID=function(e,t){if("undefined"!=typeof t.getElementById&&E){var n,r,i,o=t.getElementById(e);if(o){if((n=o.getAttributeNode("id"))&&n.value===e)return[o];i=t.getElementsByName(e),r=0;while(o=i[r++])if((n=o.getAttributeNode("id"))&&n.value===e)return[o]}return[]}}),b.find.TAG=d.getElementsByTagName?function(e,t){return"undefined"!=typeof t.getElementsByTagName?t.getElementsByTagName(e):d.qsa?t.querySelectorAll(e):void 0}:function(e,t){var n,r=[],i=0,o=t.getElementsByTagName(e);if("*"===e){while(n=o[i++])1===n.nodeType&&r.push(n);return r}return o},b.find.CLASS=d.getElementsByClassName&&function(e,t){if("undefined"!=typeof t.getElementsByClassName&&E)return t.getElementsByClassName(e)},s=[],v=[],(d.qsa=K.test(C.querySelectorAll))&&(ce(function(e){var t;a.appendChild(e).innerHTML="",e.querySelectorAll("[msallowcapture^='']").length&&v.push("[*^$]="+M+"*(?:''|\"\")"),e.querySelectorAll("[selected]").length||v.push("\\["+M+"*(?:value|"+R+")"),e.querySelectorAll("[id~="+S+"-]").length||v.push("~="),(t=C.createElement("input")).setAttribute("name",""),e.appendChild(t),e.querySelectorAll("[name='']").length||v.push("\\["+M+"*name"+M+"*="+M+"*(?:''|\"\")"),e.querySelectorAll(":checked").length||v.push(":checked"),e.querySelectorAll("a#"+S+"+*").length||v.push(".#.+[+~]"),e.querySelectorAll("\\\f"),v.push("[\\r\\n\\f]")}),ce(function(e){e.innerHTML="";var t=C.createElement("input");t.setAttribute("type","hidden"),e.appendChild(t).setAttribute("name","D"),e.querySelectorAll("[name=d]").length&&v.push("name"+M+"*[*^$|!~]?="),2!==e.querySelectorAll(":enabled").length&&v.push(":enabled",":disabled"),a.appendChild(e).disabled=!0,2!==e.querySelectorAll(":disabled").length&&v.push(":enabled",":disabled"),e.querySelectorAll("*,:x"),v.push(",.*:")})),(d.matchesSelector=K.test(c=a.matches||a.webkitMatchesSelector||a.mozMatchesSelector||a.oMatchesSelector||a.msMatchesSelector))&&ce(function(e){d.disconnectedMatch=c.call(e,"*"),c.call(e,"[s!='']:x"),s.push("!=",F)}),v=v.length&&new RegExp(v.join("|")),s=s.length&&new RegExp(s.join("|")),t=K.test(a.compareDocumentPosition),y=t||K.test(a.contains)?function(e,t){var n=9===e.nodeType?e.documentElement:e,r=t&&t.parentNode;return e===r||!(!r||1!==r.nodeType||!(n.contains?n.contains(r):e.compareDocumentPosition&&16&e.compareDocumentPosition(r)))}:function(e,t){if(t)while(t=t.parentNode)if(t===e)return!0;return!1},j=t?function(e,t){if(e===t)return l=!0,0;var n=!e.compareDocumentPosition-!t.compareDocumentPosition;return n||(1&(n=(e.ownerDocument||e)==(t.ownerDocument||t)?e.compareDocumentPosition(t):1)||!d.sortDetached&&t.compareDocumentPosition(e)===n?e==C||e.ownerDocument==p&&y(p,e)?-1:t==C||t.ownerDocument==p&&y(p,t)?1:u?P(u,e)-P(u,t):0:4&n?-1:1)}:function(e,t){if(e===t)return l=!0,0;var n,r=0,i=e.parentNode,o=t.parentNode,a=[e],s=[t];if(!i||!o)return e==C?-1:t==C?1:i?-1:o?1:u?P(u,e)-P(u,t):0;if(i===o)return pe(e,t);n=e;while(n=n.parentNode)a.unshift(n);n=t;while(n=n.parentNode)s.unshift(n);while(a[r]===s[r])r++;return r?pe(a[r],s[r]):a[r]==p?-1:s[r]==p?1:0}),C},se.matches=function(e,t){return se(e,null,null,t)},se.matchesSelector=function(e,t){if(T(e),d.matchesSelector&&E&&!N[t+" "]&&(!s||!s.test(t))&&(!v||!v.test(t)))try{var n=c.call(e,t);if(n||d.disconnectedMatch||e.document&&11!==e.document.nodeType)return n}catch(e){N(t,!0)}return 0":{dir:"parentNode",first:!0}," ":{dir:"parentNode"},"+":{dir:"previousSibling",first:!0},"~":{dir:"previousSibling"}},preFilter:{ATTR:function(e){return 
e[1]=e[1].replace(te,ne),e[3]=(e[3]||e[4]||e[5]||"").replace(te,ne),"~="===e[2]&&(e[3]=" "+e[3]+" "),e.slice(0,4)},CHILD:function(e){return e[1]=e[1].toLowerCase(),"nth"===e[1].slice(0,3)?(e[3]||se.error(e[0]),e[4]=+(e[4]?e[5]+(e[6]||1):2*("even"===e[3]||"odd"===e[3])),e[5]=+(e[7]+e[8]||"odd"===e[3])):e[3]&&se.error(e[0]),e},PSEUDO:function(e){var t,n=!e[6]&&e[2];return G.CHILD.test(e[0])?null:(e[3]?e[2]=e[4]||e[5]||"":n&&X.test(n)&&(t=h(n,!0))&&(t=n.indexOf(")",n.length-t)-n.length)&&(e[0]=e[0].slice(0,t),e[2]=n.slice(0,t)),e.slice(0,3))}},filter:{TAG:function(e){var t=e.replace(te,ne).toLowerCase();return"*"===e?function(){return!0}:function(e){return e.nodeName&&e.nodeName.toLowerCase()===t}},CLASS:function(e){var t=m[e+" "];return t||(t=new RegExp("(^|"+M+")"+e+"("+M+"|$)"))&&m(e,function(e){return t.test("string"==typeof e.className&&e.className||"undefined"!=typeof e.getAttribute&&e.getAttribute("class")||"")})},ATTR:function(n,r,i){return function(e){var t=se.attr(e,n);return null==t?"!="===r:!r||(t+="","="===r?t===i:"!="===r?t!==i:"^="===r?i&&0===t.indexOf(i):"*="===r?i&&-1:\x20\t\r\n\f]*)[\x20\t\r\n\f]*\/?>(?:<\/\1>|)$/i;function j(e,n,r){return m(n)?S.grep(e,function(e,t){return!!n.call(e,t,e)!==r}):n.nodeType?S.grep(e,function(e){return e===n!==r}):"string"!=typeof n?S.grep(e,function(e){return-1)[^>]*|#([\w-]+))$/;(S.fn.init=function(e,t,n){var r,i;if(!e)return this;if(n=n||D,"string"==typeof e){if(!(r="<"===e[0]&&">"===e[e.length-1]&&3<=e.length?[null,e,null]:q.exec(e))||!r[1]&&t)return!t||t.jquery?(t||n).find(e):this.constructor(t).find(e);if(r[1]){if(t=t instanceof S?t[0]:t,S.merge(this,S.parseHTML(r[1],t&&t.nodeType?t.ownerDocument||t:E,!0)),N.test(r[1])&&S.isPlainObject(t))for(r in t)m(this[r])?this[r](t[r]):this.attr(r,t[r]);return this}return(i=E.getElementById(r[2]))&&(this[0]=i,this.length=1),this}return e.nodeType?(this[0]=e,this.length=1,this):m(e)?void 0!==n.ready?n.ready(e):e(S):S.makeArray(e,this)}).prototype=S.fn,D=S(E);var L=/^(?:parents|prev(?:Until|All))/,H={children:!0,contents:!0,next:!0,prev:!0};function O(e,t){while((e=e[t])&&1!==e.nodeType);return e}S.fn.extend({has:function(e){var t=S(e,this),n=t.length;return this.filter(function(){for(var e=0;e\x20\t\r\n\f]*)/i,he=/^$|^module$|\/(?:java|ecma)script/i;ce=E.createDocumentFragment().appendChild(E.createElement("div")),(fe=E.createElement("input")).setAttribute("type","radio"),fe.setAttribute("checked","checked"),fe.setAttribute("name","t"),ce.appendChild(fe),y.checkClone=ce.cloneNode(!0).cloneNode(!0).lastChild.checked,ce.innerHTML="",y.noCloneChecked=!!ce.cloneNode(!0).lastChild.defaultValue,ce.innerHTML="",y.option=!!ce.lastChild;var ge={thead:[1,"","
"],col:[2,"","
"],tr:[2,"","
"],td:[3,"","
"],_default:[0,"",""]};function ve(e,t){var n;return n="undefined"!=typeof e.getElementsByTagName?e.getElementsByTagName(t||"*"):"undefined"!=typeof e.querySelectorAll?e.querySelectorAll(t||"*"):[],void 0===t||t&&A(e,t)?S.merge([e],n):n}function ye(e,t){for(var n=0,r=e.length;n",""]);var me=/<|&#?\w+;/;function xe(e,t,n,r,i){for(var o,a,s,u,l,c,f=t.createDocumentFragment(),p=[],d=0,h=e.length;d\s*$/g;function je(e,t){return A(e,"table")&&A(11!==t.nodeType?t:t.firstChild,"tr")&&S(e).children("tbody")[0]||e}function De(e){return e.type=(null!==e.getAttribute("type"))+"/"+e.type,e}function qe(e){return"true/"===(e.type||"").slice(0,5)?e.type=e.type.slice(5):e.removeAttribute("type"),e}function Le(e,t){var n,r,i,o,a,s;if(1===t.nodeType){if(Y.hasData(e)&&(s=Y.get(e).events))for(i in Y.remove(t,"handle events"),s)for(n=0,r=s[i].length;n").attr(n.scriptAttrs||{}).prop({charset:n.scriptCharset,src:n.url}).on("load error",i=function(e){r.remove(),i=null,e&&t("error"===e.type?404:200,e.type)}),E.head.appendChild(r[0])},abort:function(){i&&i()}}});var _t,zt=[],Ut=/(=)\?(?=&|$)|\?\?/;S.ajaxSetup({jsonp:"callback",jsonpCallback:function(){var e=zt.pop()||S.expando+"_"+wt.guid++;return this[e]=!0,e}}),S.ajaxPrefilter("json jsonp",function(e,t,n){var r,i,o,a=!1!==e.jsonp&&(Ut.test(e.url)?"url":"string"==typeof e.data&&0===(e.contentType||"").indexOf("application/x-www-form-urlencoded")&&Ut.test(e.data)&&"data");if(a||"jsonp"===e.dataTypes[0])return r=e.jsonpCallback=m(e.jsonpCallback)?e.jsonpCallback():e.jsonpCallback,a?e[a]=e[a].replace(Ut,"$1"+r):!1!==e.jsonp&&(e.url+=(Tt.test(e.url)?"&":"?")+e.jsonp+"="+r),e.converters["script json"]=function(){return o||S.error(r+" was not called"),o[0]},e.dataTypes[0]="json",i=C[r],C[r]=function(){o=arguments},n.always(function(){void 0===i?S(C).removeProp(r):C[r]=i,e[r]&&(e.jsonpCallback=t.jsonpCallback,zt.push(r)),o&&m(i)&&i(o[0]),o=i=void 0}),"script"}),y.createHTMLDocument=((_t=E.implementation.createHTMLDocument("").body).innerHTML="
",2===_t.childNodes.length),S.parseHTML=function(e,t,n){return"string"!=typeof e?[]:("boolean"==typeof t&&(n=t,t=!1),t||(y.createHTMLDocument?((r=(t=E.implementation.createHTMLDocument("")).createElement("base")).href=E.location.href,t.head.appendChild(r)):t=E),o=!n&&[],(i=N.exec(e))?[t.createElement(i[1])]:(i=xe([e],t,o),o&&o.length&&S(o).remove(),S.merge([],i.childNodes)));var r,i,o},S.fn.load=function(e,t,n){var r,i,o,a=this,s=e.indexOf(" ");return-1").append(S.parseHTML(e)).find(r):e)}).always(n&&function(e,t){a.each(function(){n.apply(this,o||[e.responseText,t,e])})}),this},S.expr.pseudos.animated=function(t){return S.grep(S.timers,function(e){return t===e.elem}).length},S.offset={setOffset:function(e,t,n){var r,i,o,a,s,u,l=S.css(e,"position"),c=S(e),f={};"static"===l&&(e.style.position="relative"),s=c.offset(),o=S.css(e,"top"),u=S.css(e,"left"),("absolute"===l||"fixed"===l)&&-1<(o+u).indexOf("auto")?(a=(r=c.position()).top,i=r.left):(a=parseFloat(o)||0,i=parseFloat(u)||0),m(t)&&(t=t.call(e,n,S.extend({},s))),null!=t.top&&(f.top=t.top-s.top+a),null!=t.left&&(f.left=t.left-s.left+i),"using"in t?t.using.call(e,f):c.css(f)}},S.fn.extend({offset:function(t){if(arguments.length)return void 0===t?this:this.each(function(e){S.offset.setOffset(this,t,e)});var e,n,r=this[0];return r?r.getClientRects().length?(e=r.getBoundingClientRect(),n=r.ownerDocument.defaultView,{top:e.top+n.pageYOffset,left:e.left+n.pageXOffset}):{top:0,left:0}:void 0},position:function(){if(this[0]){var e,t,n,r=this[0],i={top:0,left:0};if("fixed"===S.css(r,"position"))t=r.getBoundingClientRect();else{t=this.offset(),n=r.ownerDocument,e=r.offsetParent||n.documentElement;while(e&&(e===n.body||e===n.documentElement)&&"static"===S.css(e,"position"))e=e.parentNode;e&&e!==r&&1===e.nodeType&&((i=S(e).offset()).top+=S.css(e,"borderTopWidth",!0),i.left+=S.css(e,"borderLeftWidth",!0))}return{top:t.top-i.top-S.css(r,"marginTop",!0),left:t.left-i.left-S.css(r,"marginLeft",!0)}}},offsetParent:function(){return this.map(function(){var e=this.offsetParent;while(e&&"static"===S.css(e,"position"))e=e.offsetParent;return e||re})}}),S.each({scrollLeft:"pageXOffset",scrollTop:"pageYOffset"},function(t,i){var o="pageYOffset"===i;S.fn[t]=function(e){return $(this,function(e,t,n){var r;if(x(e)?r=e:9===e.nodeType&&(r=e.defaultView),void 0===n)return r?r[i]:e[t];r?r.scrollTo(o?r.pageXOffset:n,o?n:r.pageYOffset):e[t]=n},t,e,arguments.length)}}),S.each(["top","left"],function(e,n){S.cssHooks[n]=Fe(y.pixelPosition,function(e,t){if(t)return t=We(e,n),Pe.test(t)?S(e).position()[n]+"px":t})}),S.each({Height:"height",Width:"width"},function(a,s){S.each({padding:"inner"+a,content:s,"":"outer"+a},function(r,o){S.fn[o]=function(e,t){var n=arguments.length&&(r||"boolean"!=typeof e),i=r||(!0===e||!0===t?"margin":"border");return $(this,function(e,t,n){var r;return x(e)?0===o.indexOf("outer")?e["inner"+a]:e.document.documentElement["client"+a]:9===e.nodeType?(r=e.documentElement,Math.max(e.body["scroll"+a],r["scroll"+a],e.body["offset"+a],r["offset"+a],r["client"+a])):void 0===n?S.css(e,t,i):S.style(e,t,n,i)},s,n?e:void 0,n)}})}),S.each(["ajaxStart","ajaxStop","ajaxComplete","ajaxError","ajaxSuccess","ajaxSend"],function(e,t){S.fn[t]=function(e){return this.on(t,e)}}),S.fn.extend({bind:function(e,t,n){return this.on(e,null,t,n)},unbind:function(e,t){return this.off(e,null,t)},delegate:function(e,t,n,r){return this.on(t,e,n,r)},undelegate:function(e,t,n){return 1===arguments.length?this.off(e,"**"):this.off(t,e||"**",n)},hover:function(e,t){return 
this.mouseenter(e).mouseleave(t||e)}}),S.each("blur focus focusin focusout resize scroll click dblclick mousedown mouseup mousemove mouseover mouseout mouseenter mouseleave change select submit keydown keypress keyup contextmenu".split(" "),function(e,n){S.fn[n]=function(e,t){return 0",d.insertBefore(c.lastChild,d.firstChild)}function d(){var a=y.elements;return"string"==typeof a?a.split(" "):a}function e(a,b){var c=y.elements;"string"!=typeof c&&(c=c.join(" ")),"string"!=typeof a&&(a=a.join(" ")),y.elements=c+" "+a,j(b)}function f(a){var b=x[a[v]];return b||(b={},w++,a[v]=w,x[w]=b),b}function g(a,c,d){if(c||(c=b),q)return c.createElement(a);d||(d=f(c));var e;return e=d.cache[a]?d.cache[a].cloneNode():u.test(a)?(d.cache[a]=d.createElem(a)).cloneNode():d.createElem(a),!e.canHaveChildren||t.test(a)||e.tagUrn?e:d.frag.appendChild(e)}function h(a,c){if(a||(a=b),q)return a.createDocumentFragment();c=c||f(a);for(var e=c.frag.cloneNode(),g=0,h=d(),i=h.length;i>g;g++)e.createElement(h[g]);return e}function i(a,b){b.cache||(b.cache={},b.createElem=a.createElement,b.createFrag=a.createDocumentFragment,b.frag=b.createFrag()),a.createElement=function(c){return y.shivMethods?g(c,a,b):b.createElem(c)},a.createDocumentFragment=Function("h,f","return function(){var n=f.cloneNode(),c=n.createElement;h.shivMethods&&("+d().join().replace(/[\w\-:]+/g,function(a){return b.createElem(a),b.frag.createElement(a),'c("'+a+'")'})+");return n}")(y,b.frag)}function j(a){a||(a=b);var d=f(a);return!y.shivCSS||p||d.hasCSS||(d.hasCSS=!!c(a,"article,aside,dialog,figcaption,figure,footer,header,hgroup,main,nav,section{display:block}mark{background:#FF0;color:#000}template{display:none}")),q||i(a,d),a}function k(a){for(var b,c=a.getElementsByTagName("*"),e=c.length,f=RegExp("^(?:"+d().join("|")+")$","i"),g=[];e--;)b=c[e],f.test(b.nodeName)&&g.push(b.applyElement(l(b)));return g}function l(a){for(var b,c=a.attributes,d=c.length,e=a.ownerDocument.createElement(A+":"+a.nodeName);d--;)b=c[d],b.specified&&e.setAttribute(b.nodeName,b.nodeValue);return e.style.cssText=a.style.cssText,e}function m(a){for(var b,c=a.split("{"),e=c.length,f=RegExp("(^|[\\s,>+~])("+d().join("|")+")(?=[[\\s,>+~#.:]|$)","gi"),g="$1"+A+"\\:$2";e--;)b=c[e]=c[e].split("}"),b[b.length-1]=b[b.length-1].replace(f,g),c[e]=b.join("}");return c.join("{")}function n(a){for(var b=a.length;b--;)a[b].removeNode()}function o(a){function b(){clearTimeout(g._removeSheetTimer),d&&d.removeNode(!0),d=null}var d,e,g=f(a),h=a.namespaces,i=a.parentWindow;return!B||a.printShived?a:("undefined"==typeof h[A]&&h.add(A),i.attachEvent("onbeforeprint",function(){b();for(var f,g,h,i=a.styleSheets,j=[],l=i.length,n=Array(l);l--;)n[l]=i[l];for(;h=n.pop();)if(!h.disabled&&z.test(h.media)){try{f=h.imports,g=f.length}catch(o){g=0}for(l=0;g>l;l++)n.push(f[l]);try{j.push(h.cssText)}catch(o){}}j=m(j.reverse().join("")),e=k(a),d=c(a,j)}),i.attachEvent("onafterprint",function(){n(e),clearTimeout(g._removeSheetTimer),g._removeSheetTimer=setTimeout(b,500)}),a.printShived=!0,a)}var p,q,r="3.7.3",s=a.html5||{},t=/^<|^(?:button|map|select|textarea|object|iframe|option|optgroup)$/i,u=/^(?:a|b|code|div|fieldset|h1|h2|h3|h4|h5|h6|i|label|li|ol|p|q|span|strong|style|table|tbody|td|th|tr|ul)$/i,v="_html5shiv",w=0,x={};!function(){try{var a=b.createElement("a");a.innerHTML="",p="hidden"in a,q=1==a.childNodes.length||function(){b.createElement("a");var a=b.createDocumentFragment();return"undefined"==typeof a.cloneNode||"undefined"==typeof a.createDocumentFragment||"undefined"==typeof 
a.createElement}()}catch(c){p=!0,q=!0}}();var y={elements:s.elements||"abbr article aside audio bdi canvas data datalist details dialog figcaption figure footer header hgroup main mark meter nav output picture progress section summary template time video",version:r,shivCSS:s.shivCSS!==!1,supportsUnknownElements:q,shivMethods:s.shivMethods!==!1,type:"default",shivDocument:j,createElement:g,createDocumentFragment:h,addElements:e};a.html5=y,j(b);var z=/^$|\b(?:all|print)\b/,A="html5shiv",B=!q&&function(){var c=b.documentElement;return!("undefined"==typeof b.namespaces||"undefined"==typeof b.parentWindow||"undefined"==typeof c.applyElement||"undefined"==typeof c.removeNode||"undefined"==typeof a.attachEvent)}();y.type+=" print",y.shivPrint=o,o(b),"object"==typeof module&&module.exports&&(module.exports=y)}("undefined"!=typeof window?window:this,document); \ No newline at end of file diff --git a/_static/js/html5shiv.min.js b/_static/js/html5shiv.min.js new file mode 100644 index 000000000..cd1c674f5 --- /dev/null +++ b/_static/js/html5shiv.min.js @@ -0,0 +1,4 @@ +/** +* @preserve HTML5 Shiv 3.7.3 | @afarkas @jdalton @jon_neal @rem | MIT/GPL2 Licensed +*/ +!function(a,b){function c(a,b){var c=a.createElement("p"),d=a.getElementsByTagName("head")[0]||a.documentElement;return c.innerHTML="x",d.insertBefore(c.lastChild,d.firstChild)}function d(){var a=t.elements;return"string"==typeof a?a.split(" "):a}function e(a,b){var c=t.elements;"string"!=typeof c&&(c=c.join(" ")),"string"!=typeof a&&(a=a.join(" ")),t.elements=c+" "+a,j(b)}function f(a){var b=s[a[q]];return b||(b={},r++,a[q]=r,s[r]=b),b}function g(a,c,d){if(c||(c=b),l)return c.createElement(a);d||(d=f(c));var e;return e=d.cache[a]?d.cache[a].cloneNode():p.test(a)?(d.cache[a]=d.createElem(a)).cloneNode():d.createElem(a),!e.canHaveChildren||o.test(a)||e.tagUrn?e:d.frag.appendChild(e)}function h(a,c){if(a||(a=b),l)return a.createDocumentFragment();c=c||f(a);for(var e=c.frag.cloneNode(),g=0,h=d(),i=h.length;i>g;g++)e.createElement(h[g]);return e}function i(a,b){b.cache||(b.cache={},b.createElem=a.createElement,b.createFrag=a.createDocumentFragment,b.frag=b.createFrag()),a.createElement=function(c){return t.shivMethods?g(c,a,b):b.createElem(c)},a.createDocumentFragment=Function("h,f","return function(){var n=f.cloneNode(),c=n.createElement;h.shivMethods&&("+d().join().replace(/[\w\-:]+/g,function(a){return b.createElem(a),b.frag.createElement(a),'c("'+a+'")'})+");return n}")(t,b.frag)}function j(a){a||(a=b);var d=f(a);return!t.shivCSS||k||d.hasCSS||(d.hasCSS=!!c(a,"article,aside,dialog,figcaption,figure,footer,header,hgroup,main,nav,section{display:block}mark{background:#FF0;color:#000}template{display:none}")),l||i(a,d),a}var k,l,m="3.7.3-pre",n=a.html5||{},o=/^<|^(?:button|map|select|textarea|object|iframe|option|optgroup)$/i,p=/^(?:a|b|code|div|fieldset|h1|h2|h3|h4|h5|h6|i|label|li|ol|p|q|span|strong|style|table|tbody|td|th|tr|ul)$/i,q="_html5shiv",r=0,s={};!function(){try{var a=b.createElement("a");a.innerHTML="",k="hidden"in a,l=1==a.childNodes.length||function(){b.createElement("a");var a=b.createDocumentFragment();return"undefined"==typeof a.cloneNode||"undefined"==typeof a.createDocumentFragment||"undefined"==typeof a.createElement}()}catch(c){k=!0,l=!0}}();var t={elements:n.elements||"abbr article aside audio bdi canvas data datalist details dialog figcaption figure footer header hgroup main mark meter nav output picture progress section summary template time 
video",version:m,shivCSS:n.shivCSS!==!1,supportsUnknownElements:l,shivMethods:n.shivMethods!==!1,type:"default",shivDocument:j,createElement:g,createDocumentFragment:h,addElements:e};a.html5=t,j(b),"object"==typeof module&&module.exports&&(module.exports=t)}("undefined"!=typeof window?window:this,document); \ No newline at end of file diff --git a/_static/js/theme.js b/_static/js/theme.js new file mode 100644 index 000000000..1fddb6ee4 --- /dev/null +++ b/_static/js/theme.js @@ -0,0 +1 @@ +!function(n){var e={};function t(i){if(e[i])return e[i].exports;var o=e[i]={i:i,l:!1,exports:{}};return n[i].call(o.exports,o,o.exports,t),o.l=!0,o.exports}t.m=n,t.c=e,t.d=function(n,e,i){t.o(n,e)||Object.defineProperty(n,e,{enumerable:!0,get:i})},t.r=function(n){"undefined"!=typeof Symbol&&Symbol.toStringTag&&Object.defineProperty(n,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(n,"__esModule",{value:!0})},t.t=function(n,e){if(1&e&&(n=t(n)),8&e)return n;if(4&e&&"object"==typeof n&&n&&n.__esModule)return n;var i=Object.create(null);if(t.r(i),Object.defineProperty(i,"default",{enumerable:!0,value:n}),2&e&&"string"!=typeof n)for(var o in n)t.d(i,o,function(e){return n[e]}.bind(null,o));return i},t.n=function(n){var e=n&&n.__esModule?function(){return n.default}:function(){return n};return t.d(e,"a",e),e},t.o=function(n,e){return Object.prototype.hasOwnProperty.call(n,e)},t.p="",t(t.s=0)}([function(n,e,t){t(1),n.exports=t(3)},function(n,e,t){(function(){var e="undefined"!=typeof window?window.jQuery:t(2);n.exports.ThemeNav={navBar:null,win:null,winScroll:!1,winResize:!1,linkScroll:!1,winPosition:0,winHeight:null,docHeight:null,isRunning:!1,enable:function(n){var t=this;void 0===n&&(n=!0),t.isRunning||(t.isRunning=!0,e((function(e){t.init(e),t.reset(),t.win.on("hashchange",t.reset),n&&t.win.on("scroll",(function(){t.linkScroll||t.winScroll||(t.winScroll=!0,requestAnimationFrame((function(){t.onScroll()})))})),t.win.on("resize",(function(){t.winResize||(t.winResize=!0,requestAnimationFrame((function(){t.onResize()})))})),t.onResize()})))},enableSticky:function(){this.enable(!0)},init:function(n){n(document);var e=this;this.navBar=n("div.wy-side-scroll:first"),this.win=n(window),n(document).on("click","[data-toggle='wy-nav-top']",(function(){n("[data-toggle='wy-nav-shift']").toggleClass("shift"),n("[data-toggle='rst-versions']").toggleClass("shift")})).on("click",".wy-menu-vertical .current ul li a",(function(){var t=n(this);n("[data-toggle='wy-nav-shift']").removeClass("shift"),n("[data-toggle='rst-versions']").toggleClass("shift"),e.toggleCurrent(t),e.hashChange()})).on("click","[data-toggle='rst-current-version']",(function(){n("[data-toggle='rst-versions']").toggleClass("shift-up")})),n("table.docutils:not(.field-list,.footnote,.citation)").wrap("
"),n("table.docutils.footnote").wrap("
"),n("table.docutils.citation").wrap("
"),n(".wy-menu-vertical ul").not(".simple").siblings("a").each((function(){var t=n(this);expand=n(''),expand.on("click",(function(n){return e.toggleCurrent(t),n.stopPropagation(),!1})),t.prepend(expand)}))},reset:function(){var n=encodeURI(window.location.hash)||"#";try{var e=$(".wy-menu-vertical"),t=e.find('[href="'+n+'"]');if(0===t.length){var i=$('.document [id="'+n.substring(1)+'"]').closest("div.section");0===(t=e.find('[href="#'+i.attr("id")+'"]')).length&&(t=e.find('[href="#"]'))}if(t.length>0){$(".wy-menu-vertical .current").removeClass("current").attr("aria-expanded","false"),t.addClass("current").attr("aria-expanded","true"),t.closest("li.toctree-l1").parent().addClass("current").attr("aria-expanded","true");for(let n=1;n<=10;n++)t.closest("li.toctree-l"+n).addClass("current").attr("aria-expanded","true");t[0].scrollIntoView()}}catch(n){console.log("Error expanding nav for anchor",n)}},onScroll:function(){this.winScroll=!1;var n=this.win.scrollTop(),e=n+this.winHeight,t=this.navBar.scrollTop()+(n-this.winPosition);n<0||e>this.docHeight||(this.navBar.scrollTop(t),this.winPosition=n)},onResize:function(){this.winResize=!1,this.winHeight=this.win.height(),this.docHeight=$(document).height()},hashChange:function(){this.linkScroll=!0,this.win.one("hashchange",(function(){this.linkScroll=!1}))},toggleCurrent:function(n){var e=n.closest("li");e.siblings("li.current").removeClass("current").attr("aria-expanded","false"),e.siblings().find("li.current").removeClass("current").attr("aria-expanded","false");var t=e.find("> ul li");t.length&&(t.removeClass("current").attr("aria-expanded","false"),e.toggleClass("current").attr("aria-expanded",(function(n,e){return"true"==e?"false":"true"})))}},"undefined"!=typeof window&&(window.SphinxRtdTheme={Navigation:n.exports.ThemeNav,StickyNav:n.exports.ThemeNav}),function(){for(var n=0,e=["ms","moz","webkit","o"],t=0;t0 + var meq1 = "^(" + C + ")?" + V + C + "(" + V + ")?$"; // [C]VC[V] is m=1 + var mgr1 = "^(" + C + ")?" + V + C + V + C; // [C]VCVC... is m>1 + var s_v = "^(" + C + ")?" 
+ v; // vowel in stem + + this.stemWord = function (w) { + var stem; + var suffix; + var firstch; + var origword = w; + + if (w.length < 3) + return w; + + var re; + var re2; + var re3; + var re4; + + firstch = w.substr(0,1); + if (firstch == "y") + w = firstch.toUpperCase() + w.substr(1); + + // Step 1a + re = /^(.+?)(ss|i)es$/; + re2 = /^(.+?)([^s])s$/; + + if (re.test(w)) + w = w.replace(re,"$1$2"); + else if (re2.test(w)) + w = w.replace(re2,"$1$2"); + + // Step 1b + re = /^(.+?)eed$/; + re2 = /^(.+?)(ed|ing)$/; + if (re.test(w)) { + var fp = re.exec(w); + re = new RegExp(mgr0); + if (re.test(fp[1])) { + re = /.$/; + w = w.replace(re,""); + } + } + else if (re2.test(w)) { + var fp = re2.exec(w); + stem = fp[1]; + re2 = new RegExp(s_v); + if (re2.test(stem)) { + w = stem; + re2 = /(at|bl|iz)$/; + re3 = new RegExp("([^aeiouylsz])\\1$"); + re4 = new RegExp("^" + C + v + "[^aeiouwxy]$"); + if (re2.test(w)) + w = w + "e"; + else if (re3.test(w)) { + re = /.$/; + w = w.replace(re,""); + } + else if (re4.test(w)) + w = w + "e"; + } + } + + // Step 1c + re = /^(.+?)y$/; + if (re.test(w)) { + var fp = re.exec(w); + stem = fp[1]; + re = new RegExp(s_v); + if (re.test(stem)) + w = stem + "i"; + } + + // Step 2 + re = /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/; + if (re.test(w)) { + var fp = re.exec(w); + stem = fp[1]; + suffix = fp[2]; + re = new RegExp(mgr0); + if (re.test(stem)) + w = stem + step2list[suffix]; + } + + // Step 3 + re = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/; + if (re.test(w)) { + var fp = re.exec(w); + stem = fp[1]; + suffix = fp[2]; + re = new RegExp(mgr0); + if (re.test(stem)) + w = stem + step3list[suffix]; + } + + // Step 4 + re = /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/; + re2 = /^(.+?)(s|t)(ion)$/; + if (re.test(w)) { + var fp = re.exec(w); + stem = fp[1]; + re = new RegExp(mgr1); + if (re.test(stem)) + w = stem; + } + else if (re2.test(w)) { + var fp = re2.exec(w); + stem = fp[1] + fp[2]; + re2 = new RegExp(mgr1); + if (re2.test(stem)) + w = stem; + } + + // Step 5 + re = /^(.+?)e$/; + if (re.test(w)) { + var fp = re.exec(w); + stem = fp[1]; + re = new RegExp(mgr1); + re2 = new RegExp(meq1); + re3 = new RegExp("^" + C + v + "[^aeiouwxy]$"); + if (re.test(stem) || (re2.test(stem) && !(re3.test(stem)))) + w = stem; + } + re = /ll$/; + re2 = new RegExp(mgr1); + if (re.test(w) && re2.test(w)) { + re = /.$/; + w = w.replace(re,""); + } + + // and turn initial Y back to y + if (firstch == "y") + w = firstch.toLowerCase() + w.substr(1); + return w; + } +} + diff --git a/_static/minus.png b/_static/minus.png new file mode 100644 index 000000000..d96755fda Binary files /dev/null and b/_static/minus.png differ diff --git a/_static/plus.png b/_static/plus.png new file mode 100644 index 000000000..7107cec93 Binary files /dev/null and b/_static/plus.png differ diff --git a/_static/pygments.css b/_static/pygments.css new file mode 100644 index 000000000..84ab3030a --- /dev/null +++ b/_static/pygments.css @@ -0,0 +1,75 @@ +pre { line-height: 125%; } +td.linenos .normal { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } +span.linenos { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } +td.linenos .special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } +span.linenos.special { color: #000000; background-color: #ffffc0; 
padding-left: 5px; padding-right: 5px; } +.highlight .hll { background-color: #ffffcc } +.highlight { background: #f8f8f8; } +.highlight .c { color: #3D7B7B; font-style: italic } /* Comment */ +.highlight .err { border: 1px solid #FF0000 } /* Error */ +.highlight .k { color: #008000; font-weight: bold } /* Keyword */ +.highlight .o { color: #666666 } /* Operator */ +.highlight .ch { color: #3D7B7B; font-style: italic } /* Comment.Hashbang */ +.highlight .cm { color: #3D7B7B; font-style: italic } /* Comment.Multiline */ +.highlight .cp { color: #9C6500 } /* Comment.Preproc */ +.highlight .cpf { color: #3D7B7B; font-style: italic } /* Comment.PreprocFile */ +.highlight .c1 { color: #3D7B7B; font-style: italic } /* Comment.Single */ +.highlight .cs { color: #3D7B7B; font-style: italic } /* Comment.Special */ +.highlight .gd { color: #A00000 } /* Generic.Deleted */ +.highlight .ge { font-style: italic } /* Generic.Emph */ +.highlight .ges { font-weight: bold; font-style: italic } /* Generic.EmphStrong */ +.highlight .gr { color: #E40000 } /* Generic.Error */ +.highlight .gh { color: #000080; font-weight: bold } /* Generic.Heading */ +.highlight .gi { color: #008400 } /* Generic.Inserted */ +.highlight .go { color: #717171 } /* Generic.Output */ +.highlight .gp { color: #000080; font-weight: bold } /* Generic.Prompt */ +.highlight .gs { font-weight: bold } /* Generic.Strong */ +.highlight .gu { color: #800080; font-weight: bold } /* Generic.Subheading */ +.highlight .gt { color: #0044DD } /* Generic.Traceback */ +.highlight .kc { color: #008000; font-weight: bold } /* Keyword.Constant */ +.highlight .kd { color: #008000; font-weight: bold } /* Keyword.Declaration */ +.highlight .kn { color: #008000; font-weight: bold } /* Keyword.Namespace */ +.highlight .kp { color: #008000 } /* Keyword.Pseudo */ +.highlight .kr { color: #008000; font-weight: bold } /* Keyword.Reserved */ +.highlight .kt { color: #B00040 } /* Keyword.Type */ +.highlight .m { color: #666666 } /* Literal.Number */ +.highlight .s { color: #BA2121 } /* Literal.String */ +.highlight .na { color: #687822 } /* Name.Attribute */ +.highlight .nb { color: #008000 } /* Name.Builtin */ +.highlight .nc { color: #0000FF; font-weight: bold } /* Name.Class */ +.highlight .no { color: #880000 } /* Name.Constant */ +.highlight .nd { color: #AA22FF } /* Name.Decorator */ +.highlight .ni { color: #717171; font-weight: bold } /* Name.Entity */ +.highlight .ne { color: #CB3F38; font-weight: bold } /* Name.Exception */ +.highlight .nf { color: #0000FF } /* Name.Function */ +.highlight .nl { color: #767600 } /* Name.Label */ +.highlight .nn { color: #0000FF; font-weight: bold } /* Name.Namespace */ +.highlight .nt { color: #008000; font-weight: bold } /* Name.Tag */ +.highlight .nv { color: #19177C } /* Name.Variable */ +.highlight .ow { color: #AA22FF; font-weight: bold } /* Operator.Word */ +.highlight .w { color: #bbbbbb } /* Text.Whitespace */ +.highlight .mb { color: #666666 } /* Literal.Number.Bin */ +.highlight .mf { color: #666666 } /* Literal.Number.Float */ +.highlight .mh { color: #666666 } /* Literal.Number.Hex */ +.highlight .mi { color: #666666 } /* Literal.Number.Integer */ +.highlight .mo { color: #666666 } /* Literal.Number.Oct */ +.highlight .sa { color: #BA2121 } /* Literal.String.Affix */ +.highlight .sb { color: #BA2121 } /* Literal.String.Backtick */ +.highlight .sc { color: #BA2121 } /* Literal.String.Char */ +.highlight .dl { color: #BA2121 } /* Literal.String.Delimiter */ +.highlight .sd { color: #BA2121; font-style: italic } 
/* Literal.String.Doc */ +.highlight .s2 { color: #BA2121 } /* Literal.String.Double */ +.highlight .se { color: #AA5D1F; font-weight: bold } /* Literal.String.Escape */ +.highlight .sh { color: #BA2121 } /* Literal.String.Heredoc */ +.highlight .si { color: #A45A77; font-weight: bold } /* Literal.String.Interpol */ +.highlight .sx { color: #008000 } /* Literal.String.Other */ +.highlight .sr { color: #A45A77 } /* Literal.String.Regex */ +.highlight .s1 { color: #BA2121 } /* Literal.String.Single */ +.highlight .ss { color: #19177C } /* Literal.String.Symbol */ +.highlight .bp { color: #008000 } /* Name.Builtin.Pseudo */ +.highlight .fm { color: #0000FF } /* Name.Function.Magic */ +.highlight .vc { color: #19177C } /* Name.Variable.Class */ +.highlight .vg { color: #19177C } /* Name.Variable.Global */ +.highlight .vi { color: #19177C } /* Name.Variable.Instance */ +.highlight .vm { color: #19177C } /* Name.Variable.Magic */ +.highlight .il { color: #666666 } /* Literal.Number.Integer.Long */ \ No newline at end of file diff --git a/_static/searchtools.js b/_static/searchtools.js new file mode 100644 index 000000000..97d56a74d --- /dev/null +++ b/_static/searchtools.js @@ -0,0 +1,566 @@ +/* + * searchtools.js + * ~~~~~~~~~~~~~~~~ + * + * Sphinx JavaScript utilities for the full-text search. + * + * :copyright: Copyright 2007-2023 by the Sphinx team, see AUTHORS. + * :license: BSD, see LICENSE for details. + * + */ +"use strict"; + +/** + * Simple result scoring code. + */ +if (typeof Scorer === "undefined") { + var Scorer = { + // Implement the following function to further tweak the score for each result + // The function takes a result array [docname, title, anchor, descr, score, filename] + // and returns the new score. + /* + score: result => { + const [docname, title, anchor, descr, score, filename] = result + return score + }, + */ + + // query matches the full name of an object + objNameMatch: 11, + // or matches in the last dotted part of the object name + objPartialMatch: 6, + // Additive scores depending on the priority of the object + objPrio: { + 0: 15, // used to be importantResults + 1: 5, // used to be objectResults + 2: -5, // used to be unimportantResults + }, + // Used when the priority is not in the mapping. 
+ objPrioDefault: 0, + + // query found in title + title: 15, + partialTitle: 7, + // query found in terms + term: 5, + partialTerm: 2, + }; +} + +const _removeChildren = (element) => { + while (element && element.lastChild) element.removeChild(element.lastChild); +}; + +/** + * See https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions#escaping + */ +const _escapeRegExp = (string) => + string.replace(/[.*+\-?^${}()|[\]\\]/g, "\\$&"); // $& means the whole matched string + +const _displayItem = (item, searchTerms) => { + const docBuilder = DOCUMENTATION_OPTIONS.BUILDER; + const docUrlRoot = DOCUMENTATION_OPTIONS.URL_ROOT; + const docFileSuffix = DOCUMENTATION_OPTIONS.FILE_SUFFIX; + const docLinkSuffix = DOCUMENTATION_OPTIONS.LINK_SUFFIX; + const showSearchSummary = DOCUMENTATION_OPTIONS.SHOW_SEARCH_SUMMARY; + + const [docName, title, anchor, descr, score, _filename] = item; + + let listItem = document.createElement("li"); + let requestUrl; + let linkUrl; + if (docBuilder === "dirhtml") { + // dirhtml builder + let dirname = docName + "/"; + if (dirname.match(/\/index\/$/)) + dirname = dirname.substring(0, dirname.length - 6); + else if (dirname === "index/") dirname = ""; + requestUrl = docUrlRoot + dirname; + linkUrl = requestUrl; + } else { + // normal html builders + requestUrl = docUrlRoot + docName + docFileSuffix; + linkUrl = docName + docLinkSuffix; + } + let linkEl = listItem.appendChild(document.createElement("a")); + linkEl.href = linkUrl + anchor; + linkEl.dataset.score = score; + linkEl.innerHTML = title; + if (descr) + listItem.appendChild(document.createElement("span")).innerHTML = + " (" + descr + ")"; + else if (showSearchSummary) + fetch(requestUrl) + .then((responseData) => responseData.text()) + .then((data) => { + if (data) + listItem.appendChild( + Search.makeSearchSummary(data, searchTerms) + ); + }); + Search.output.appendChild(listItem); +}; +const _finishSearch = (resultCount) => { + Search.stopPulse(); + Search.title.innerText = _("Search Results"); + if (!resultCount) + Search.status.innerText = Documentation.gettext( + "Your search did not match any documents. Please make sure that all words are spelled correctly and that you've selected enough categories." + ); + else + Search.status.innerText = _( + `Search finished, found ${resultCount} page(s) matching the search query.` + ); +}; +const _displayNextItem = ( + results, + resultCount, + searchTerms +) => { + // results left, load the summary and display it + // this is intended to be dynamic (don't sub resultsCount) + if (results.length) { + _displayItem(results.pop(), searchTerms); + setTimeout( + () => _displayNextItem(results, resultCount, searchTerms), + 5 + ); + } + // search finished, update title and status message + else _finishSearch(resultCount); +}; + +/** + * Default splitQuery function. Can be overridden in ``sphinx.search`` with a + * custom function per language. + * + * The regular expression works by splitting the string on consecutive characters + * that are not Unicode letters, numbers, underscores, or emoji characters. + * This is the same as ``\W+`` in Python, preserving the surrogate pair area. 
+ */ +if (typeof splitQuery === "undefined") { + var splitQuery = (query) => query + .split(/[^\p{Letter}\p{Number}_\p{Emoji_Presentation}]+/gu) + .filter(term => term) // remove remaining empty strings +} + +/** + * Search Module + */ +const Search = { + _index: null, + _queued_query: null, + _pulse_status: -1, + + htmlToText: (htmlString) => { + const htmlElement = new DOMParser().parseFromString(htmlString, 'text/html'); + htmlElement.querySelectorAll(".headerlink").forEach((el) => { el.remove() }); + const docContent = htmlElement.querySelector('[role="main"]'); + if (docContent !== undefined) return docContent.textContent; + console.warn( + "Content block not found. Sphinx search tries to obtain it via '[role=main]'. Could you check your theme or template." + ); + return ""; + }, + + init: () => { + const query = new URLSearchParams(window.location.search).get("q"); + document + .querySelectorAll('input[name="q"]') + .forEach((el) => (el.value = query)); + if (query) Search.performSearch(query); + }, + + loadIndex: (url) => + (document.body.appendChild(document.createElement("script")).src = url), + + setIndex: (index) => { + Search._index = index; + if (Search._queued_query !== null) { + const query = Search._queued_query; + Search._queued_query = null; + Search.query(query); + } + }, + + hasIndex: () => Search._index !== null, + + deferQuery: (query) => (Search._queued_query = query), + + stopPulse: () => (Search._pulse_status = -1), + + startPulse: () => { + if (Search._pulse_status >= 0) return; + + const pulse = () => { + Search._pulse_status = (Search._pulse_status + 1) % 4; + Search.dots.innerText = ".".repeat(Search._pulse_status); + if (Search._pulse_status >= 0) window.setTimeout(pulse, 500); + }; + pulse(); + }, + + /** + * perform a search for something (or wait until index is loaded) + */ + performSearch: (query) => { + // create the required interface elements + const searchText = document.createElement("h2"); + searchText.textContent = _("Searching"); + const searchSummary = document.createElement("p"); + searchSummary.classList.add("search-summary"); + searchSummary.innerText = ""; + const searchList = document.createElement("ul"); + searchList.classList.add("search"); + + const out = document.getElementById("search-results"); + Search.title = out.appendChild(searchText); + Search.dots = Search.title.appendChild(document.createElement("span")); + Search.status = out.appendChild(searchSummary); + Search.output = out.appendChild(searchList); + + const searchProgress = document.getElementById("search-progress"); + // Some themes don't use the search progress node + if (searchProgress) { + searchProgress.innerText = _("Preparing search..."); + } + Search.startPulse(); + + // index already loaded, the browser was quick! 
+ if (Search.hasIndex()) Search.query(query); + else Search.deferQuery(query); + }, + + /** + * execute search (requires search index to be loaded) + */ + query: (query) => { + const filenames = Search._index.filenames; + const docNames = Search._index.docnames; + const titles = Search._index.titles; + const allTitles = Search._index.alltitles; + const indexEntries = Search._index.indexentries; + + // stem the search terms and add them to the correct list + const stemmer = new Stemmer(); + const searchTerms = new Set(); + const excludedTerms = new Set(); + const highlightTerms = new Set(); + const objectTerms = new Set(splitQuery(query.toLowerCase().trim())); + splitQuery(query.trim()).forEach((queryTerm) => { + const queryTermLower = queryTerm.toLowerCase(); + + // maybe skip this "word" + // stopwords array is from language_data.js + if ( + stopwords.indexOf(queryTermLower) !== -1 || + queryTerm.match(/^\d+$/) + ) + return; + + // stem the word + let word = stemmer.stemWord(queryTermLower); + // select the correct list + if (word[0] === "-") excludedTerms.add(word.substr(1)); + else { + searchTerms.add(word); + highlightTerms.add(queryTermLower); + } + }); + + if (SPHINX_HIGHLIGHT_ENABLED) { // set in sphinx_highlight.js + localStorage.setItem("sphinx_highlight_terms", [...highlightTerms].join(" ")) + } + + // console.debug("SEARCH: searching for:"); + // console.info("required: ", [...searchTerms]); + // console.info("excluded: ", [...excludedTerms]); + + // array of [docname, title, anchor, descr, score, filename] + let results = []; + _removeChildren(document.getElementById("search-progress")); + + const queryLower = query.toLowerCase(); + for (const [title, foundTitles] of Object.entries(allTitles)) { + if (title.toLowerCase().includes(queryLower) && (queryLower.length >= title.length/2)) { + for (const [file, id] of foundTitles) { + let score = Math.round(100 * queryLower.length / title.length) + results.push([ + docNames[file], + titles[file] !== title ? `${titles[file]} > ${title}` : title, + id !== null ? "#" + id : "", + null, + score, + filenames[file], + ]); + } + } + } + + // search for explicit entries in index directives + for (const [entry, foundEntries] of Object.entries(indexEntries)) { + if (entry.includes(queryLower) && (queryLower.length >= entry.length/2)) { + for (const [file, id] of foundEntries) { + let score = Math.round(100 * queryLower.length / entry.length) + results.push([ + docNames[file], + titles[file], + id ? "#" + id : "", + null, + score, + filenames[file], + ]); + } + } + } + + // lookup as object + objectTerms.forEach((term) => + results.push(...Search.performObjectSearch(term, objectTerms)) + ); + + // lookup as search terms in fulltext + results.push(...Search.performTermsSearch(searchTerms, excludedTerms)); + + // let the scorer override scores with a custom scoring function + if (Scorer.score) results.forEach((item) => (item[4] = Scorer.score(item))); + + // now sort the results by score (in opposite order of appearance, since the + // display function below uses pop() to retrieve items) and then + // alphabetically + results.sort((a, b) => { + const leftScore = a[4]; + const rightScore = b[4]; + if (leftScore === rightScore) { + // same score: sort alphabetically + const leftTitle = a[1].toLowerCase(); + const rightTitle = b[1].toLowerCase(); + if (leftTitle === rightTitle) return 0; + return leftTitle > rightTitle ? -1 : 1; // inverted is intentional + } + return leftScore > rightScore ? 
1 : -1; + }); + + // remove duplicate search results + // note the reversing of results, so that in the case of duplicates, the highest-scoring entry is kept + let seen = new Set(); + results = results.reverse().reduce((acc, result) => { + let resultStr = result.slice(0, 4).concat([result[5]]).map(v => String(v)).join(','); + if (!seen.has(resultStr)) { + acc.push(result); + seen.add(resultStr); + } + return acc; + }, []); + + results = results.reverse(); + + // for debugging + //Search.lastresults = results.slice(); // a copy + // console.info("search results:", Search.lastresults); + + // print the results + _displayNextItem(results, results.length, searchTerms); + }, + + /** + * search for object names + */ + performObjectSearch: (object, objectTerms) => { + const filenames = Search._index.filenames; + const docNames = Search._index.docnames; + const objects = Search._index.objects; + const objNames = Search._index.objnames; + const titles = Search._index.titles; + + const results = []; + + const objectSearchCallback = (prefix, match) => { + const name = match[4] + const fullname = (prefix ? prefix + "." : "") + name; + const fullnameLower = fullname.toLowerCase(); + if (fullnameLower.indexOf(object) < 0) return; + + let score = 0; + const parts = fullnameLower.split("."); + + // check for different match types: exact matches of full name or + // "last name" (i.e. last dotted part) + if (fullnameLower === object || parts.slice(-1)[0] === object) + score += Scorer.objNameMatch; + else if (parts.slice(-1)[0].indexOf(object) > -1) + score += Scorer.objPartialMatch; // matches in last name + + const objName = objNames[match[1]][2]; + const title = titles[match[0]]; + + // If more than one term searched for, we require other words to be + // found in the name/title/description + const otherTerms = new Set(objectTerms); + otherTerms.delete(object); + if (otherTerms.size > 0) { + const haystack = `${prefix} ${name} ${objName} ${title}`.toLowerCase(); + if ( + [...otherTerms].some((otherTerm) => haystack.indexOf(otherTerm) < 0) + ) + return; + } + + let anchor = match[3]; + if (anchor === "") anchor = fullname; + else if (anchor === "-") anchor = objNames[match[1]][1] + "-" + fullname; + + const descr = objName + _(", in ") + title; + + // add custom score for some objects according to scorer + if (Scorer.objPrio.hasOwnProperty(match[2])) + score += Scorer.objPrio[match[2]]; + else score += Scorer.objPrioDefault; + + results.push([ + docNames[match[0]], + fullname, + "#" + anchor, + descr, + score, + filenames[match[0]], + ]); + }; + Object.keys(objects).forEach((prefix) => + objects[prefix].forEach((array) => + objectSearchCallback(prefix, array) + ) + ); + return results; + }, + + /** + * search for full-text terms in the index + */ + performTermsSearch: (searchTerms, excludedTerms) => { + // prepare search + const terms = Search._index.terms; + const titleTerms = Search._index.titleterms; + const filenames = Search._index.filenames; + const docNames = Search._index.docnames; + const titles = Search._index.titles; + + const scoreMap = new Map(); + const fileMap = new Map(); + + // perform the search on the required terms + searchTerms.forEach((word) => { + const files = []; + const arr = [ + { files: terms[word], score: Scorer.term }, + { files: titleTerms[word], score: Scorer.title }, + ]; + // add support for partial matches + if (word.length > 2) { + const escapedWord = _escapeRegExp(word); + Object.keys(terms).forEach((term) => { + if (term.match(escapedWord) && !terms[word]) + arr.push({ 
files: terms[term], score: Scorer.partialTerm }); + }); + Object.keys(titleTerms).forEach((term) => { + if (term.match(escapedWord) && !titleTerms[word]) + arr.push({ files: titleTerms[word], score: Scorer.partialTitle }); + }); + } + + // no match but word was a required one + if (arr.every((record) => record.files === undefined)) return; + + // found search word in contents + arr.forEach((record) => { + if (record.files === undefined) return; + + let recordFiles = record.files; + if (recordFiles.length === undefined) recordFiles = [recordFiles]; + files.push(...recordFiles); + + // set score for the word in each file + recordFiles.forEach((file) => { + if (!scoreMap.has(file)) scoreMap.set(file, {}); + scoreMap.get(file)[word] = record.score; + }); + }); + + // create the mapping + files.forEach((file) => { + if (fileMap.has(file) && fileMap.get(file).indexOf(word) === -1) + fileMap.get(file).push(word); + else fileMap.set(file, [word]); + }); + }); + + // now check if the files don't contain excluded terms + const results = []; + for (const [file, wordList] of fileMap) { + // check if all requirements are matched + + // as search terms with length < 3 are discarded + const filteredTermCount = [...searchTerms].filter( + (term) => term.length > 2 + ).length; + if ( + wordList.length !== searchTerms.size && + wordList.length !== filteredTermCount + ) + continue; + + // ensure that none of the excluded terms is in the search result + if ( + [...excludedTerms].some( + (term) => + terms[term] === file || + titleTerms[term] === file || + (terms[term] || []).includes(file) || + (titleTerms[term] || []).includes(file) + ) + ) + break; + + // select one (max) score for the file. + const score = Math.max(...wordList.map((w) => scoreMap.get(file)[w])); + // add result to the result list + results.push([ + docNames[file], + titles[file], + "", + null, + score, + filenames[file], + ]); + } + return results; + }, + + /** + * helper function to return a node containing the + * search summary for a given text. keywords is a list + * of stemmed words. + */ + makeSearchSummary: (htmlText, keywords) => { + const text = Search.htmlToText(htmlText); + if (text === "") return null; + + const textLower = text.toLowerCase(); + const actualStartPosition = [...keywords] + .map((k) => textLower.indexOf(k.toLowerCase())) + .filter((i) => i > -1) + .slice(-1)[0]; + const startWithContext = Math.max(actualStartPosition - 120, 0); + + const top = startWithContext === 0 ? "" : "..."; + const tail = startWithContext + 240 < text.length ? "..." : ""; + + let summary = document.createElement("p"); + summary.classList.add("context"); + summary.textContent = top + text.substr(startWithContext, 240).trim() + tail; + + return summary; + }, +}; + +_ready(Search.init); diff --git a/_static/sphinx_highlight.js b/_static/sphinx_highlight.js new file mode 100644 index 000000000..aae669d7e --- /dev/null +++ b/_static/sphinx_highlight.js @@ -0,0 +1,144 @@ +/* Highlighting utilities for Sphinx HTML documentation. */ +"use strict"; + +const SPHINX_HIGHLIGHT_ENABLED = true + +/** + * highlight a given string on a node by wrapping it in + * span elements with the given class name. 
+ */ +const _highlight = (node, addItems, text, className) => { + if (node.nodeType === Node.TEXT_NODE) { + const val = node.nodeValue; + const parent = node.parentNode; + const pos = val.toLowerCase().indexOf(text); + if ( + pos >= 0 && + !parent.classList.contains(className) && + !parent.classList.contains("nohighlight") + ) { + let span; + + const closestNode = parent.closest("body, svg, foreignObject"); + const isInSVG = closestNode && closestNode.matches("svg"); + if (isInSVG) { + span = document.createElementNS("http://www.w3.org/2000/svg", "tspan"); + } else { + span = document.createElement("span"); + span.classList.add(className); + } + + span.appendChild(document.createTextNode(val.substr(pos, text.length))); + parent.insertBefore( + span, + parent.insertBefore( + document.createTextNode(val.substr(pos + text.length)), + node.nextSibling + ) + ); + node.nodeValue = val.substr(0, pos); + + if (isInSVG) { + const rect = document.createElementNS( + "http://www.w3.org/2000/svg", + "rect" + ); + const bbox = parent.getBBox(); + rect.x.baseVal.value = bbox.x; + rect.y.baseVal.value = bbox.y; + rect.width.baseVal.value = bbox.width; + rect.height.baseVal.value = bbox.height; + rect.setAttribute("class", className); + addItems.push({ parent: parent, target: rect }); + } + } + } else if (node.matches && !node.matches("button, select, textarea")) { + node.childNodes.forEach((el) => _highlight(el, addItems, text, className)); + } +}; +const _highlightText = (thisNode, text, className) => { + let addItems = []; + _highlight(thisNode, addItems, text, className); + addItems.forEach((obj) => + obj.parent.insertAdjacentElement("beforebegin", obj.target) + ); +}; + +/** + * Small JavaScript module for the documentation. + */ +const SphinxHighlight = { + + /** + * highlight the search words provided in localstorage in the text + */ + highlightSearchWords: () => { + if (!SPHINX_HIGHLIGHT_ENABLED) return; // bail if no highlight + + // get and clear terms from localstorage + const url = new URL(window.location); + const highlight = + localStorage.getItem("sphinx_highlight_terms") + || url.searchParams.get("highlight") + || ""; + localStorage.removeItem("sphinx_highlight_terms") + url.searchParams.delete("highlight"); + window.history.replaceState({}, "", url); + + // get individual terms from highlight string + const terms = highlight.toLowerCase().split(/\s+/).filter(x => x); + if (terms.length === 0) return; // nothing to do + + // There should never be more than one element matching "div.body" + const divBody = document.querySelectorAll("div.body"); + const body = divBody.length ? 
divBody[0] : document.querySelector("body"); + window.setTimeout(() => { + terms.forEach((term) => _highlightText(body, term, "highlighted")); + }, 10); + + const searchBox = document.getElementById("searchbox"); + if (searchBox === null) return; + searchBox.appendChild( + document + .createRange() + .createContextualFragment( + '" + ) + ); + }, + + /** + * helper function to hide the search marks again + */ + hideSearchWords: () => { + document + .querySelectorAll("#searchbox .highlight-link") + .forEach((el) => el.remove()); + document + .querySelectorAll("span.highlighted") + .forEach((el) => el.classList.remove("highlighted")); + localStorage.removeItem("sphinx_highlight_terms") + }, + + initEscapeListener: () => { + // only install a listener if it is really needed + if (!DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS) return; + + document.addEventListener("keydown", (event) => { + // bail for input elements + if (BLACKLISTED_KEY_CONTROL_ELEMENTS.has(document.activeElement.tagName)) return; + // bail with special keys + if (event.shiftKey || event.altKey || event.ctrlKey || event.metaKey) return; + if (DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS && (event.key === "Escape")) { + SphinxHighlight.hideSearchWords(); + event.preventDefault(); + } + }); + }, +}; + +_ready(SphinxHighlight.highlightSearchWords); +_ready(SphinxHighlight.initEscapeListener); diff --git a/_static/theme_overrides.css b/_static/theme_overrides.css new file mode 100644 index 000000000..783a8944f --- /dev/null +++ b/_static/theme_overrides.css @@ -0,0 +1,3 @@ +.wy-nav-content { + max-width: none; +} diff --git a/api_reference/assessment/index.html b/api_reference/assessment/index.html new file mode 100644 index 000000000..52575210c --- /dev/null +++ b/api_reference/assessment/index.html @@ -0,0 +1,158 @@ + + + + + + + gnomad.assessment — gnomad master documentation + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/api_reference/assessment/summary_stats.html b/api_reference/assessment/summary_stats.html new file mode 100644 index 000000000..ff1e97a66 --- /dev/null +++ b/api_reference/assessment/summary_stats.html @@ -0,0 +1,491 @@ + + + + + + + gnomad.assessment.summary_stats — gnomad master documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

gnomad.assessment.summary_stats

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

gnomad.assessment.summary_stats.freq_bin_expr(...)

Return frequency string annotations based on input AC or AF.

gnomad.assessment.summary_stats.get_summary_counts_dict(...)

Return dictionary containing counts of multiple variant categories.

gnomad.assessment.summary_stats.get_summary_ac_dict(...)

Return dictionary containing total allele counts for variant categories.

gnomad.assessment.summary_stats.get_summary_counts(ht)

Generate a struct with summary counts across variant categories.

gnomad.assessment.summary_stats.get_an_criteria(mt)

Generate criteria to filter samples based on allele number (AN).

gnomad.assessment.summary_stats.get_tx_expression_expr(...)

Pull appropriate transcript expression annotation struct given a specific locus and alleles (provided in key_expr).

gnomad.assessment.summary_stats.default_generate_gene_lof_matrix(mt, ...)

Generate loss-of-function gene matrix.

gnomad.assessment.summary_stats.get_het_hom_summary_dict(...)

Generate dictionary containing summary counts.

gnomad.assessment.summary_stats.default_generate_gene_lof_summary(mt)

Generate summary counts for loss-of-function (LoF), missense, and synonymous variants.

+
+
+gnomad.assessment.summary_stats.freq_bin_expr(freq_expr, index=0)[source]
+

Return frequency string annotations based on input AC or AF.

+
+

Note

+
    +
  • Default index is 0 because function assumes freq_expr was calculated with annotate_freq.

  • +
  • Frequency index 0 from annotate_freq is frequency for all pops calculated on adj genotypes only.

  • +
+
+
+
Parameters:
+
    +
  • freq_expr (ArrayExpression) – Array of structs containing frequency information.

  • +
  • index (int) – Which index of freq_expr to use for annotation. Default is 0.

  • +
+
+
Return type:
+

StringExpression

+
+
Returns:
+

StringExpression containing bin name based on input AC or AF.

+
+
+
+ +
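For example, a minimal sketch of binning a release Table by frequency (assumes ht is a Hail Table whose freq annotation was built with annotate_freq, so index 0 is the adj frequency across all pops; the table path is hypothetical):

    import hail as hl
    from gnomad.assessment.summary_stats import freq_bin_expr

    ht = hl.read_table("gs://my-bucket/release.ht")  # hypothetical path
    ht = ht.annotate(freq_bin=freq_bin_expr(ht.freq))
    # Count how many variants fall into each bin, e.g. "Singleton", "0.1% - 1%"
    print(ht.aggregate(hl.agg.counter(ht.freq_bin)))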
+
+gnomad.assessment.summary_stats.get_summary_counts_dict(locus_expr, allele_expr, lof_expr, no_lof_flags_expr, most_severe_csq_expr, prefix_str='')[source]
+

Return dictionary containing counts of multiple variant categories.

+
+
Categories are:
    +
  • Number of variants

  • +
  • Number of indels

  • +
  • Number of SNVs

  • +
  • Number of LoF variants

  • +
  • Number of LoF variants that pass LOFTEE

  • +
  • Number of LoF variants that pass LOFTEE without any flags

  • +
  • Number of LoF variants annotated as ‘other splice’ (OS) by LOFTEE

  • +
  • Number of LoF variants that fail LOFTEE

  • +
  • Number of missense variants

  • +
  • Number of synonymous variants

  • +
  • Number of autosomal variants

  • +
  • Number of allosomal variants

  • +
+
+
+
+

Warning

+

Assumes allele_expr contains only two alleles (multi-allelics have been split).

+
+
+
Parameters:
+
    +
  • locus_expr (LocusExpression) – LocusExpression.

  • +
  • allele_expr (ArrayExpression) – ArrayExpression containing alleles.

  • +
  • lof_expr (StringExpression) – StringExpression containing LOFTEE annotation.

  • +
  • no_lof_flags_expr (BooleanExpression) – BooleanExpression indicating whether LoF variant has any flags.

  • +
  • most_severe_csq_expr (StringExpression) – StringExpression containing most severe consequence annotation.

  • +
  • prefix_str (str) – Desired prefix string for category names. Default is empty str.

  • +
+
+
Return type:
+

Dict[str, Int64Expression]

+
+
Returns:
+

Dict of categories and counts per category.

+
+
+
+ +
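As a sketch, the returned aggregators can be evaluated with a single ht.aggregate call; the lof, no_lof_flags, and most_severe_csq field names below are hypothetical annotations on a split, VEP-summarized Table:

    import hail as hl
    from gnomad.assessment.summary_stats import get_summary_counts_dict

    counts = ht.aggregate(
        hl.struct(
            **get_summary_counts_dict(
                ht.locus,
                ht.alleles,
                ht.lof,              # LOFTEE annotation (hypothetical field name)
                ht.no_lof_flags,     # hypothetical field name
                ht.most_severe_csq,  # hypothetical field name
                prefix_str="adj_",
            )
        )
    )
    print(counts.adj_num_variants, counts.adj_pass_loftee)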
+
+gnomad.assessment.summary_stats.get_summary_ac_dict(ac_expr, lof_expr, no_lof_flags_expr, most_severe_csq_expr)[source]
+

Return dictionary containing total allele counts for variant categories.

+
+
Categories are:
    +
  • All variants

  • +
  • LoF variants

  • +
  • LoF variants that pass LOFTEE

  • +
  • LoF variants that pass LOFTEE without any flags

  • +
  • LoF variants that are annotated as ‘other splice’ (OS) by LOFTEE

  • +
  • LoF variants that fail LOFTEE

  • +
  • Missense variants

  • +
  • Synonymous variants

  • +
+
+
+
+

Warning

+

Assumes allele_expr contains only two alleles (multi-allelics have been split).

+
+
+
Parameters:
+
+
+
Return type:
+

Dict[str, Int64Expression]

+
+
Returns:
+

Dict of variant categories and their total allele counts.

+
+
+
+ +
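A similar sketch for allele count totals, again assuming hypothetical lof, no_lof_flags, and most_severe_csq annotations and a freq array produced by annotate_freq:

    import hail as hl
    from gnomad.assessment.summary_stats import get_summary_ac_dict

    ac_totals = ht.aggregate(
        hl.struct(
            **get_summary_ac_dict(
                ht.freq[0].AC, ht.lof, ht.no_lof_flags, ht.most_severe_csq
            )
        )
    )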
+
+gnomad.assessment.summary_stats.get_summary_counts(ht, freq_field='freq', filter_field='filters', filter_decoy=False, canonical_only=True, mane_select_only=False, index=0)[source]
+

Generate a struct with summary counts across variant categories.

+
+
Summary counts:
    +
  • Number of variants

  • +
  • Number of indels

  • +
  • Number of SNVs

  • +
  • Number of LoF variants

  • +
  • Number of LoF variants that pass LOFTEE (including with LoF flags)

  • +
  • Number of LoF variants that pass LOFTEE without LoF flags

  • +
  • Number of OS (other splice) variants annotated by LOFTEE

  • +
  • Number of LoF variants that fail LOFTEE filters

  • +
+
+
+

Also annotates Table’s globals with total variant counts.

+
+
Before calculating summary counts, function:
    +
  • Filters out low confidence regions

  • +
  • Uses the most severe consequence

  • +
  • Filters to canonical transcripts (if canonical_only is True) or MANE Select +transcripts (if mane_select_only is True)

  • +
+
+
Assumes that:
    +
  • Input HT is annotated with VEP.

  • +
  • Multiallelic variants have been split and/or input HT contains bi-allelic variants only.

  • +
  • freq_expr was calculated with annotate_freq.

  • +
  • (Frequency index 0 from annotate_freq is frequency for all pops calculated on adj genotypes only.)

  • +
+
+
+
+
Parameters:
+
    +
  • ht (Table) – Input Table.

  • +
  • freq_field (str) – Name of field in HT containing frequency annotation (array of structs). Default is “freq”.

  • +
  • filter_field (str) – Name of field in HT containing variant filter information. Default is “filters”.

  • +
  • canonical_only (bool) – Whether to filter to canonical transcripts. Default is True.

  • +
  • mane_select_only (bool) – Whether to filter to MANE Select transcripts. Default is False.

  • +
  • filter_decoy (bool) – Whether to filter decoy regions. Default is False.

  • +
  • index (int) – Which index of freq_expr to use for annotation. Default is 0.

  • +
+
+
Return type:
+

Table

+
+
Returns:
+

Table grouped by frequency bin and aggregated across summary count categories.

+
+
+
+ +
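A minimal usage sketch, assuming ht is a split, VEP-annotated release Table with freq and filters fields:

    from gnomad.assessment.summary_stats import get_summary_counts

    summary_ht = get_summary_counts(ht, filter_decoy=False, canonical_only=True)
    summary_ht.show()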
+
+gnomad.assessment.summary_stats.get_an_criteria(mt, samples_by_sex=None, meta_root='meta', sex_field='sex_imputation.sex_karyotype', xy_str='XY', xx_str='XX', freq_field='freq', freq_index=0, an_proportion_cutoff=0.8)[source]
+

Generate criteria to filter samples based on allele number (AN).

+

Uses allele number as proxy for call rate.

+
+
Parameters:
+
    +
  • mt (MatrixTable) – Input MatrixTable.

  • +
  • samples_by_sex (Optional[Dict[str, int]]) – Optional Dictionary containing number of samples (value) for each sample sex (key).

  • +
  • meta_root (str) – Name of field in MatrixTable containing sample metadata information. Default is ‘meta’.

  • +
  • sex_field (str) – Name of field in MatrixTable containing sample sex assignment. Default is ‘sex_imputation.sex_karyotype’.

  • +
  • xy_str (str) – String marking whether a sample has XY sex. Default is ‘XY’.

  • +
  • xx_str (str) – String marking whether a sample has XX sex. Default is ‘XX’.

  • +
  • freq_field (str) – Name of field in MT that contains frequency information. Default is ‘freq’.

  • +
  • freq_index (int) – Which index of frequency struct to use. Default is 0.

  • +
  • an_proportion_cutoff (float) – Desired allele number proportion cutoff. Default is 0.8.

  • +
+
+
Return type:
+

BooleanExpression

+
+
+
+ +
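For example, the returned BooleanExpression can be used directly in filter_rows; the sample counts here are hypothetical:

    from gnomad.assessment.summary_stats import get_an_criteria

    an_filter = get_an_criteria(mt, samples_by_sex={"XX": 500, "XY": 480})
    mt = mt.filter_rows(an_filter)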
+
+gnomad.assessment.summary_stats.get_tx_expression_expr(key_expr, tx_ht, csq_expr, gene_field='ensg', csq_field='csq', tx_struct='tx_annotation')[source]
+

Pull appropriate transcript expression annotation struct given a specific locus and alleles (provided in key_expr).

+

Assumes that key_expr contains a locus and alleles. +Assumes that multi-allelic variants have been split in both tx_ht and key_expr.

+
+
Parameters:
+
    +
  • key_expr (StructExpression) – StructExpression containing locus and alleles to search in tx_ht.

  • +
  • tx_ht (Table) – Input Table containing transcript expression information.

  • +
  • csq_expr (StructExpression) – Input StructExpression that contains VEP consequence information.

  • +
  • gene_field (str) – Field in csq_expr that contains gene ID.

  • +
  • csq_field (str) – Field in csq_expr that contains most_severe_consequence annotation.

  • +
  • tx_struct (str) – StructExpression that contains transcript expression information.

  • +
+
+
Return type:
+

Float64Expression

+
+
Returns:
+

StructExpression that contains transcript expression information for given gene ID in csq_expr.

+
+
+
+ +
+
+gnomad.assessment.summary_stats.default_generate_gene_lof_matrix(mt, tx_ht, high_expression_cutoff=0.9, low_expression_cutoff=0.1, filter_field='filters', freq_field='freq', freq_index=0, additional_csq_set={'missense_variant', 'synonymous_variant'}, all_transcripts=False, filter_an=False, filter_to_rare=False, pre_loftee=False, lof_csq_set={'frameshift_variant', 'splice_acceptor_variant', 'splice_donor_variant', 'stop_gained'}, remove_ultra_common=False)[source]
+

Generate loss-of-function gene matrix.

+

Used to generate summary metrics on LoF variants.

+
+
Parameters:
+
    +
  • mt (MatrixTable) – Input MatrixTable.

  • +
  • tx_ht (Optional[Table]) – Optional Table containing expression levels per transcript.

  • +
  • high_expression_cutoff (float) – Minimum mean proportion expressed cutoff for a transcript to be considered highly expressed. Default is 0.9.

  • +
  • low_expression_cutoff (float) – Upper mean proportion expressed cutoff for a transcript to be considered lowly expressed. Default is 0.1.

  • +
  • filter_field (str) – Name of field in MT that contains variant filters. Default is ‘filters’.

  • +
  • freq_field (str) – Name of field in MT that contains frequency information. Default is ‘freq’.

  • +
  • freq_index (int) – Which index of frequency struct to use. Default is 0.

  • +
  • additional_csq_set (Set[str]) – Set of additional consequences to keep. Default is {‘missense_variant’, ‘synonymous_variant’}.

  • +
  • all_transcripts (bool) – Whether to use all transcripts instead of just the transcript with most severe consequence. Default is False.

  • +
  • filter_an (bool) – Whether to filter using allele number as proxy for call rate. Default is False.

  • +
  • filter_to_rare (bool) – Whether to filter to rare (AF < 5%) variants. Default is False.

  • +
  • pre_loftee (bool) – Whether LoF consequences have been annotated with LOFTEE. Default is False.

  • +
  • lof_csq_set (Set[str]) – Set of LoF consequence strings. Default is {“splice_acceptor_variant”, “splice_donor_variant”, “stop_gained”, “frameshift_variant”}.

  • +
  • remove_ultra_common (bool) – Whether to remove ultra common (AF > 95%) variants. Default is False.

  • +
+
+
Return type:
+

MatrixTable

+
+
+
+ +
+
+gnomad.assessment.summary_stats.get_het_hom_summary_dict(csq_set, most_severe_csq_expr, defined_sites_expr, num_homs_expr, num_hets_expr, pop_expr)[source]
+

Generate dictionary containing summary counts.

+
+
Summary counts are:
    +
  • Number of sites with defined genotype calls

  • +
  • Number of samples with heterozygous calls

  • +
  • Number of samples with homozygous calls

  • +
+
+
+

Function has option to generate counts by population.

+
+
Parameters:
+
    +
  • csq_set (Set[str]) – Set containing transcript consequence string(s).

  • +
  • most_severe_csq_expr (StringExpression) – StringExpression containing most severe consequence.

  • +
  • defined_sites_expr (Int64Expression) – Int64Expression containing number of sites with defined genotype calls.

  • +
  • num_homs_expr (Int64Expression) – Int64Expression containing number of samples with homozygous genotype calls.

  • +
  • num_hets_expr (Int64Expression) – Int64Expression containing number of samples with heterozygous genotype calls.

  • +
  • pop_expr (StringExpression) – StringExpression containing sample population labels.

  • +
+
+
Return type:
+

Dict[str, Int64Expression]

+
+
Returns:
+

Dictionary of summary annotation names and their values.

+
+
+
+ +
+
+gnomad.assessment.summary_stats.default_generate_gene_lof_summary(mt, collapse_indels=False, tx=False, lof_csq_set={'frameshift_variant', 'splice_acceptor_variant', 'splice_donor_variant', 'stop_gained'}, meta_root='meta', pop_field='pop', filter_loftee=False)[source]
+

Generate summary counts for loss-of-function (LoF), missense, and synonymous variants.

+

Also calculates p, the proportion of haplotypes carrying a putative LoF (pLoF) variant, and the observed/expected (OE) ratio of samples with homozygous pLoF variant calls.

+
+
Summary counts are (all per gene):
    +
  • Number of samples with no pLoF variants.

  • +
  • Number of samples with heterozygous pLoF variants.

  • +
  • Number of samples with homozygous pLoF variants.

  • +
  • Total number of sites with genotype calls.

  • +
  • All of the above stats grouped by population.

  • +
+
+
+

Assumes MT was created using default_generate_gene_lof_matrix.

+
+

Note

+

Assumes LoF variants in MT were filtered (LOFTEE pass and no LoF flag only). +If LoF variants have not been filtered and filter_loftee is True, +expects MT has the row annotation vep.

+
+
+
Parameters:
+
    +
  • mt (MatrixTable) – Input MatrixTable.

  • +
  • collapse_indels (bool) – Whether to collapse indels. Default is False.

  • +
  • tx (bool) – Whether input MT has transcript expression data. Default is False.

  • +
  • lof_csq_set (Set[str]) – Set containing LoF transcript consequence strings. Default is LOF_CSQ_SET.

  • +
  • meta_root (str) – String indicating top level name for sample metadata. Default is ‘meta’.

  • +
  • pop_field (str) – String indicating field with sample population assignment information. Default is ‘pop’.

  • +
  • filter_loftee (bool) – Filters to LOFTEE pass variants (and no LoF flags) only. Default is False.

  • +
+
+
Return type:
+

Table

+
+
Returns:
+

Table with het/hom summary counts.

+
+
+
+ +
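A sketch of the matrix and summary functions chained together (assumes mt is a split, VEP-annotated MatrixTable; passing tx_ht=None skips transcript expression annotation):

    from gnomad.assessment.summary_stats import (
        default_generate_gene_lof_matrix,
        default_generate_gene_lof_summary,
    )

    lof_mt = default_generate_gene_lof_matrix(mt, tx_ht=None, filter_to_rare=True)
    lof_summary_ht = default_generate_gene_lof_summary(lof_mt, collapse_indels=True)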
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/api_reference/assessment/validity_checks.html b/api_reference/assessment/validity_checks.html new file mode 100644 index 000000000..0bf254bcd --- /dev/null +++ b/api_reference/assessment/validity_checks.html @@ -0,0 +1,711 @@ + + + + + + + gnomad.assessment.validity_checks — gnomad master documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

gnomad.assessment.validity_checks

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

gnomad.assessment.validity_checks.generic_field_check(ht, ...)

Check generic logical condition cond_expr involving annotations in a Hail Table when n_fail is absent and print the results to stdout.

gnomad.assessment.validity_checks.make_filters_expr_dict(ht)

Make Hail expressions to measure % variants filtered under varying conditions of interest.

gnomad.assessment.validity_checks.make_group_sum_expr_dict(t, ...)

Compute the sum of call stats annotations for a specified group of annotations, compare to the annotated version, and display the result in stdout.

gnomad.assessment.validity_checks.compare_row_counts(...)

Check if the row counts in two Tables are the same.

gnomad.assessment.validity_checks.summarize_variant_filters(t)

Summarize variants filtered under various conditions in input MatrixTable or Table.

gnomad.assessment.validity_checks.generic_field_check_loop(ht, ...)

Loop through all conditional checks for a given hail Table.

gnomad.assessment.validity_checks.compare_subset_freqs(t, ...)

Perform validity checks on frequency data in input Table.

gnomad.assessment.validity_checks.sum_group_callstats(t)

Compute the sum of annotations for a specified group of annotations, and compare to the annotated version.

gnomad.assessment.validity_checks.summarize_variants(t)

Get summary of variants in a MatrixTable or Table.

gnomad.assessment.validity_checks.check_raw_and_adj_callstats(t, ...)

Perform validity checks on raw and adj data in input Table/MatrixTable.

gnomad.assessment.validity_checks.check_sex_chr_metrics(t, ...)

Perform validity checks for annotations on the sex chromosomes.

gnomad.assessment.validity_checks.compute_missingness(t, ...)

Check amount of missingness in all row annotations.

gnomad.assessment.validity_checks.vcf_field_check(t, ...)

Check that all VCF fields and descriptions are present in input Table and VCF header dictionary.

gnomad.assessment.validity_checks.check_global_and_row_annot_lengths(t, ...)

Check that the lengths of row annotations match the lengths of associated global annotations.

gnomad.assessment.validity_checks.pprint_global_anns(t)

Pretty print global annotations.

gnomad.assessment.validity_checks.validate_release_t(t)

Perform a battery of validity checks on a specified group of subsets in a MatrixTable containing variant annotations.

gnomad.assessment.validity_checks.count_vep_annotated_variants_per_interval(...)

Calculate the count of VEP annotated variants in vep_ht per interval defined by interval_ht.

+
+
+gnomad.assessment.validity_checks.generic_field_check(ht, check_description, display_fields, cond_expr=None, verbose=False, show_percent_sites=False, n_fail=None, ht_count=None)[source]
+

Check generic logical condition cond_expr involving annotations in a Hail Table when n_fail is absent and print the results to stdout.

+

Displays the number of rows (and percent of rows, if show_percent_sites is True) in the Table that fail the check described by check_description, either previously computed as n_fail or counted by filtering on cond_expr. If the number of rows that match cond_expr or n_fail is 0, then the Table passes that check; otherwise, it fails.

+
+

Note

+

cond_expr and check_description are opposites and should never be the same. E.g., if cond_expr filters for instances where the raw AC is less than adj AC, then it is checking sites that fail to be the desired condition (check_description) of having a raw AC greater than or equal to the adj AC.

+
+
+
Parameters:
+
    +
  • ht (Table) – Table containing annotations to be checked.

  • +
  • check_description (str) – String describing the condition being checked; is displayed in stdout summary message.

  • +
  • display_fields (StructExpression) – StructExpression containing annotations to be displayed in case of failure (for troubleshooting purposes); these fields are also displayed if verbose is True.

  • +
  • cond_expr (BooleanExpression) – Optional logical expression referring to annotations in ht to be checked.

  • +
  • verbose (bool) – If True, show top values of annotations being checked, including checks that pass; if False, show only top values of annotations that fail checks.

  • +
  • show_percent_sites (bool) – Show percentage of sites that fail checks. Default is False.

  • +
  • n_fail (Optional[int]) – Optional number of sites that fail the conditional checks (previously computed). If not supplied, cond_expr is used to filter the Table and obtain the count of sites that fail the checks.

  • +
  • ht_count (Optional[int]) – Optional number of sites within hail Table (previously computed). If not supplied, a count of sites in the Table is performed.

  • +
+
+
Return type:
+

None

+
+
Returns:
+

None

+
+
+
+ +
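A sketch of a single raw-vs-adj check; the AC-raw/AC-adj info field names are assumptions about the release Table’s naming convention:

    import hail as hl
    from gnomad.assessment.validity_checks import generic_field_check

    generic_field_check(
        ht,
        check_description="AC-raw >= AC-adj",
        display_fields=hl.struct(
            AC_raw=ht.info["AC-raw"], AC_adj=ht.info["AC-adj"]
        ),
        cond_expr=ht.info["AC-raw"] < ht.info["AC-adj"],  # sites failing the check
        show_percent_sites=True,
    )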
+
+gnomad.assessment.validity_checks.make_filters_expr_dict(ht, extra_filter_checks=None, variant_filter_field='RF')[source]
+

Make Hail expressions to measure % variants filtered under varying conditions of interest.

+
+
Checks for:
    +
  • Total number of variants

  • +
  • +
    Fraction of variants removed due to:
      +
    • Any filter

    • +
    • Inbreeding coefficient filter in combination with any other filter

    • +
    • AC0 filter in combination with any other filter

    • +
    • variant_filter_field filtering in combination with any other filter

    • +
    • Only inbreeding coefficient filter

    • +
    • Only AC0 filter

    • +
    • Only filtering defined by variant_filter_field

    • +
    +
    +
    +
  • +
+
+
+
+
Parameters:
+
    +
  • ht (Table) – Table containing ‘filter’ annotation to be examined.

  • +
  • extra_filter_checks (Optional[Dict[str, Expression]]) – Optional dictionary containing filter condition name (key) and extra filter expressions (value) to be examined.

  • +
  • variant_filter_field (str) – String of variant filtration used in the filters annotation on ht (e.g. RF, VQSR, AS_VQSR). Default is “RF”.

  • +
+
+
Return type:
+

Dict[str, Expression]

+
+
Returns:
+

Dictionary containing Hail aggregation expressions to examine filter flags.

+
+
+
+ +
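The returned dictionary holds aggregation expressions, so it can be evaluated in a single pass over the Table, e.g. (a sketch):

    import hail as hl
    from gnomad.assessment.validity_checks import make_filters_expr_dict

    filter_metrics = ht.aggregate(
        hl.struct(**make_filters_expr_dict(ht, variant_filter_field="AS_VQSR"))
    )
    print(filter_metrics)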
+
+gnomad.assessment.validity_checks.make_group_sum_expr_dict(t, subset, label_groups, sort_order=['subset', 'downsampling', 'popmax', 'grpmax', 'pop', 'gen_anc', 'subpop', 'sex', 'group'], delimiter='-', metric_first_field=True, metrics=['AC', 'AN', 'nhomalt'])[source]
+

Compute the sum of call stats annotations for a specified group of annotations, compare to the annotated version, and display the result in stdout.

+

For example, if subset1 consists of pop1, pop2, and pop3, check that t.info.AC-subset1 == sum(t.info.AC-subset1-pop1, t.info.AC-subset1-pop2, t.info.AC-subset1-pop3).

+
+
Parameters:
+
    +
  • t (Union[MatrixTable, Table]) – Input MatrixTable or Table containing call stats annotations to be summed.

  • +
  • subset (str) – String indicating sample subset.

  • +
  • label_groups (Dict[str, List[str]]) – Dictionary containing an entry for each label group, where key is the name of the grouping, e.g. “sex” or “pop”, and value is a list of all possible values for that grouping (e.g. [“XY”, “XX”] or [“afr”, “nfe”, “amr”]).

  • +
  • sort_order (List[str]) – List containing order to sort label group combinations. Default is SORT_ORDER.

  • +
  • delimiter (str) – String to use as delimiter when making group label combinations. Default is “-“.

  • +
  • metric_first_field (bool) – If True, metric precedes subset in the Table’s fields, e.g. AC-hgdp. If False, subset precedes metric, hgdp-AC. Default is True.

  • +
  • metrics (List[str]) – List of metrics to sum and compare to annotated versions. Default is [“AC”, “AN”, “nhomalt”].

  • +
+
+
Return type:
+

Dict[str, Dict[str, Union[Int64Expression, StructExpression]]]

+
+
Returns:
+

Dictionary of sample sum field check expressions and display fields.

+
+
+
+ +
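A sketch pairing this with generic_field_check_loop so the Table is aggregated only once; the subset and label group values are hypothetical:

    from gnomad.assessment.validity_checks import (
        generic_field_check_loop,
        make_group_sum_expr_dict,
    )

    field_check_expr = make_group_sum_expr_dict(
        ht,
        subset="hgdp",
        label_groups={"group": ["adj"], "pop": ["afr", "nfe", "eas"]},
    )
    generic_field_check_loop(ht, field_check_expr, verbose=False)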
+
+gnomad.assessment.validity_checks.compare_row_counts(ht1, ht2)[source]
+

Check if the row counts in two Tables are the same.

+
+
Parameters:
+
    +
  • ht1 (Table) – First Table to be checked.

  • +
  • ht2 (Table) – Second Table to be checked.

  • +
+
+
Return type:
+

bool

+
+
Returns:
+

Whether the row counts are the same.

+
+
+
+ +
+
+gnomad.assessment.validity_checks.summarize_variant_filters(t, variant_filter_field='RF', problematic_regions=['lcr', 'segdup', 'nonpar'], single_filter_count=False, site_gt_check_expr=None, extra_filter_checks=None, n_rows=50, n_cols=140)[source]
+

Summarize variants filtered under various conditions in input MatrixTable or Table.

+
+
Summarize counts for:
    +
  • Total number of variants

  • +
  • +
    Fraction of variants removed due to:
      +
    • Any filter

    • +
    • Inbreeding coefficient filter in combination with any other filter

    • +
    • AC0 filter in combination with any other filter

    • +
    • variant_filter_field filtering in combination with any other filter

    • +
    • Only inbreeding coefficient filter

    • +
    • Only AC0 filter

    • +
    • Only variant_filter_field filtering

    • +
    +
    +
    +
  • +
+
+
+
+
Parameters:
+
    +
  • t (Union[MatrixTable, Table]) – Input MatrixTable or Table to be checked.

  • +
  • variant_filter_field (str) – String of variant filtration used in the filters annotation on ht (e.g. RF, VQSR, AS_VQSR). Default is “RF”.

  • +
  • problematic_regions (List[str]) – List of regions considered problematic to run filter check in. Default is [“lcr”, “segdup”, “nonpar”].

  • +
  • single_filter_count (bool) – If True, explode the Table’s filter column and give a supplemental total count of each filter. Default is False.

  • +
  • site_gt_check_expr (Dict[str, BooleanExpression]) – Optional dictionary of strings and boolean expressions typically used to log how many monoallelic or 100% heterozygous sites are in the Table.

  • +
  • extra_filter_checks (Optional[Dict[str, Expression]]) – Optional dictionary containing filter condition name (key) and extra filter expressions (value) to be examined.

  • +
  • n_rows (int) – Number of rows to display only when showing percentages of filtered variants grouped by multiple conditions. Default is 50.

  • +
  • n_cols (int) – Number of columns to display only when showing percentages of filtered variants grouped by multiple conditions. Default is 140.

  • +
+
+
Return type:
+

None

+
+
Returns:
+

None

+
+
+
+ +
+
+gnomad.assessment.validity_checks.generic_field_check_loop(ht, field_check_expr, verbose, show_percent_sites=False, ht_count=None)[source]
+

Loop through all conditional checks for a given hail Table.

+

This loop allows aggregation across the hail Table once, as opposed to aggregating during every conditional check.

+
+
Parameters:
+
    +
  • ht (Table) – Table containing annotations to be checked.

  • +
  • field_check_expr (Dict[str, Dict[str, Any]]) – Dictionary whose keys are conditions being checked and values are the expressions for filtering to condition.

  • +
  • verbose (bool) – If True, show top values of annotations being checked, including checks that pass; if False, show only top values of annotations that fail checks.

  • +
  • show_percent_sites (bool) – Show percentage of sites that fail checks. Default is False.

  • +
  • ht_count (int) – Previously computed sum of sites within hail Table. Default is None.

  • +
+
+
Return type:
+

None

+
+
Returns:
+

None

+
+
+
+ +
+
+gnomad.assessment.validity_checks.compare_subset_freqs(t, subsets, verbose, show_percent_sites=True, delimiter='-', metric_first_field=True, metrics=['AC', 'AN', 'nhomalt'])[source]
+

Perform validity checks on frequency data in input Table.

+
+
Check:
    +
  • +
    Number of sites where callset frequency is equal to a subset frequency (raw and adj)
      +
    • e.g. t.info.AC-adj != t.info.AC-subset1-adj

    • +
    +
    +
    +
  • +
  • Total number of sites where the raw allele count annotation is defined

  • +
+
+
+
+
Parameters:
+
    +
  • t (Union[MatrixTable, Table]) – Input MatrixTable or Table.

  • +
  • subsets (List[str]) – List of sample subsets.

  • +
  • verbose (bool) – If True, show top values of annotations being checked, including checks that pass; if False, show only top values of annotations that fail checks.

  • +
  • show_percent_sites (bool) – If True, show the percentage and count of overall sites that fail; if False, only show the number of sites that fail.

  • +
  • delimiter (str) – String to use as delimiter when making group label combinations. Default is “-“.

  • +
  • metric_first_field (bool) – If True, metric precedes subset, e.g. AC-non_v2-XY. If False, subset precedes metric, non_v2-AC-XY. Default is True.

  • +
  • metrics (List[str]) – List of metrics to compare between subset and entire callset. Default is [“AC”, “AN”, “nhomalt”].

  • +
+
+
Return type:
+

None

+
+
Returns:
+

None

+
+
+
+ +
+
+gnomad.assessment.validity_checks.sum_group_callstats(t, sexes=['XX', 'XY'], subsets=[''], pops=['afr', 'amr', 'asj', 'eas', 'fin', 'mid', 'nfe', 'remaining', 'sas'], groups=['adj'], additional_subsets_and_pops=None, verbose=False, sort_order=['subset', 'downsampling', 'popmax', 'grpmax', 'pop', 'gen_anc', 'subpop', 'sex', 'group'], delimiter='-', metric_first_field=True, metrics=['AC', 'AN', 'nhomalt'])[source]
+

Compute the sum of annotations for a specified group of annotations, and compare to the annotated version.

+

Displays results from checking the sum of the specified annotations in stdout. +Also checks that annotations for all expected sample populations are present.

+
+
Parameters:
+
    +
  • t (Union[MatrixTable, Table]) – Input Table.

  • +
  • sexes (List[str]) – List of sexes in table.

  • +
  • subsets (List[str]) – List of sample subsets that contain pops passed in pops parameter. An empty string, e.g. “”, should be passed to test entire callset. Default is [“”].

  • +
  • pops (List[str]) – List of pops contained within the subsets. Default is POPS[CURRENT_MAJOR_RELEASE][“exomes”].

  • +
  • groups (List[str]) – List of callstat groups, e.g. “adj” and “raw” contained within the callset. gnomAD does not store the raw callstats for the pop or sex groupings of any subset. Default is [“adj”]

  • +
  • additional_subsets_and_pops (Dict[str, List[str]]) – Dict with subset (keys) and list of the subset’s specific populations (values). Default is None.

  • +
  • verbose (bool) – If True, show top values of annotations being checked, including checks that pass; if False, show only top values of annotations that fail checks. Default is False.

  • +
  • sort_order (List[str]) – List containing order to sort label group combinations. Default is SORT_ORDER.

  • +
  • delimiter (str) – String to use as delimiter when making group label combinations. Default is “-“.

  • +
  • metric_first_field (bool) – If True, metric precedes label group, e.g. AC-afr-male. If False, label group precedes metric, afr-male-AC. Default is True.

  • +
  • metrics (List[str]) – List of metrics to sum and compare to annotated versions. Default is [“AC”, “AN”, “nhomalt”].

  • +
+
+
Return type:
+

None

+
+
Returns:
+

None

+
+
+
+ +
+
+gnomad.assessment.validity_checks.summarize_variants(t)[source]
+

Get summary of variants in a MatrixTable or Table.

+

Print the number of variants to stdout and check that each chromosome has variant calls.

+
+
Parameters:
+

t (Union[MatrixTable, Table]) – Input MatrixTable or Table to be checked.

+
+
Return type:
+

Struct

+
+
Returns:
+

Struct of variant summary

+
+
+
+ +
+
+gnomad.assessment.validity_checks.check_raw_and_adj_callstats(t, subsets, verbose, delimiter='-', metric_first_field=True)[source]
+

Perform validity checks on raw and adj data in input Table/MatrixTable.

+
+
Check that:
    +
  • Raw AC and AF are not 0

  • +
  • AC and AF are not negative

  • +
  • Raw values for AC, AN, nhomalt in each sample subset are greater than or equal to their corresponding adj values

  • +
+
+
+

Raw and adj call stat annotations must be in an info struct annotation on the Table/MatrixTable, e.g. t.info.AC-raw.

+
+
Parameters:
+
    +
  • t (Union[MatrixTable, Table]) – Input MatrixTable or Table to check.

  • +
  • subsets (List[str]) – List of sample subsets.

  • +
  • verbose (bool) – If True, show top values of annotations being checked, including checks that pass; if False, show only top values of annotations that fail checks.

  • +
  • delimiter (str) – String to use as delimiter when making group label combinations. Default is “-“.

  • +
  • metric_first_field (bool) – If True, metric precedes label group, e.g. AC-afr-male. If False, label group precedes metric, afr-male-AC. Default is True.

  • +
+
+
Return type:
+

None

+
+
Returns:
+

None

+
+
+
+ +
+
+gnomad.assessment.validity_checks.check_sex_chr_metrics(t, info_metrics, contigs, verbose, delimiter='-')[source]
+

Perform validity checks for annotations on the sex chromosomes.

+
+
Check:
    +
  • That metrics for chrY variants in XX samples are NA and not 0

  • +
  • That nhomalt counts are equal to XX nhomalt counts for all non-PAR chrX variants

  • +
+
+
+
+
Parameters:
+
    +
  • t (Union[MatrixTable, Table]) – Input MatrixTable or Table.

  • +
  • info_metrics (List[str]) – List of metrics in info struct of input Table.

  • +
  • contigs (List[str]) – List of contigs present in input Table.

  • +
  • verbose (bool) – If True, show top values of annotations being checked, including checks that pass; if False, show only top values of annotations that fail checks.

  • +
  • delimiter (str) – String to use as the delimiter in XX metrics. Default is “-“.

  • +
+
+
Return type:
+

None

+
+
Returns:
+

None

+
+
+
+ +
+
+gnomad.assessment.validity_checks.compute_missingness(t, info_metrics, non_info_metrics, n_sites, missingness_threshold)[source]
+

Check amount of missingness in all row annotations.

+

Print a metric to stdout if the percentage of missing values for that metric exceeds the missingness_threshold.

+
+
Parameters:
+
    +
  • t (Union[MatrixTable, Table]) – Input MatrixTable or Table.

  • +
  • info_metrics (List[str]) – List of metrics in info struct of input Table.

  • +
  • non_info_metrics (List[str]) – List of row annotations minus info struct from input Table.

  • +
  • n_sites (int) – Number of sites in input Table.

  • +
  • missingness_threshold (float) – Upper cutoff for allowed amount of missingness.

  • +
+
+
Return type:
+

None

+
+
Returns:
+

None

+
+
+
+ +
+
+gnomad.assessment.validity_checks.vcf_field_check(t, header_dict, row_annotations=None, entry_annotations=None, hists=['gq_hist_alt', 'gq_hist_all', 'dp_hist_alt', 'dp_hist_all', 'ab_hist_alt'])[source]
+

Check that all VCF fields and descriptions are present in input Table and VCF header dictionary.

+
+
Parameters:
+
    +
  • t (Union[MatrixTable, Table]) – Input MatrixTable or Table to be exported to VCF.

  • +
  • header_dict (Dict[str, Dict[str, Dict[str, str]]]) – VCF header dictionary.

  • +
  • row_annotations (List[str]) – List of row annotations in MatrixTable or Table.

  • +
  • entry_annotations (List[str]) – List of entry annotations to use if running this check on a MatrixTable.

  • +
  • hists (List[str]) – List of variant histogram annotations. Default is HISTS.

  • +
+
+
Return type:
+

bool

+
+
Returns:
+

Boolean with whether all expected fields and descriptions are present.

+
+
+
+ +
+
+gnomad.assessment.validity_checks.check_global_and_row_annot_lengths(t, row_to_globals_check, check_all_rows=False)[source]
+

Check that the lengths of row annotations match the lengths of associated global annotations.

+
+
Parameters:
+
    +
  • t (Union[MatrixTable, Table]) – Input MatrixTable or Table.

  • +
  • row_to_globals_check (Dict[str, List[str]]) – Dictionary with row annotation (key) and list of associated global annotations (value) to compare.

  • +
  • check_all_rows (bool) – If True, check all rows in t; if False, check only the first row. Default is False.

  • +
+
+
Return type:
+

None

+
+
Returns:
+

None

+
+
+
+ +
+
+gnomad.assessment.validity_checks.pprint_global_anns(t)[source]
+

Pretty print global annotations.

+
+
Parameters:
+

t (Union[MatrixTable, Table]) – Input MatrixTable or Table.

+
+
Return type:
+

None

+
+
+
+ +
+
+gnomad.assessment.validity_checks.validate_release_t(t, subsets=[''], pops=['afr', 'amr', 'asj', 'eas', 'fin', 'mid', 'nfe', 'remaining', 'sas'], missingness_threshold=0.5, site_gt_check_expr=None, verbose=False, show_percent_sites=True, delimiter='-', metric_first_field=True, sum_metrics=['AC', 'AN', 'nhomalt'], sexes=['XX', 'XY'], groups=['adj'], sample_sum_sets_and_pops=None, sort_order=['subset', 'downsampling', 'popmax', 'grpmax', 'pop', 'gen_anc', 'subpop', 'sex', 'group'], variant_filter_field='RF', problematic_regions=['lcr', 'segdup', 'nonpar'], single_filter_count=False, summarize_variants_check=True, filters_check=True, raw_adj_check=True, subset_freq_check=True, samples_sum_check=True, sex_chr_check=True, missingness_check=True, pprint_globals=False, row_to_globals_check=None, check_all_rows_in_row_to_global_check=False)[source]
+

Perform a battery of validity checks on a specified group of subsets in a MatrixTable containing variant annotations.

+

Includes:
  • Summaries of % filter status for different partitions of variants
  • Histogram outlier bin checks
  • Checks on AC, AN, and AF annotations
  • Checks that subgroup annotation values add up to the supergroup annotation values
  • Checks on sex-chromosome annotations
  • Summaries of % missingness in variant annotations

+

All annotations must be within an info struct, e.g. t.info.AC-raw.

+
+
Parameters:
+
    +
  • t (Union[MatrixTable, Table]) – Input MatrixTable or Table containing variant annotations to check.

  • +
  • subsets (List[str]) – List of subsets to be checked.

  • +
  • pops (List[str]) – List of pops within main callset. Default is POPS[CURRENT_MAJOR_RELEASE][“exomes”].

  • +
  • missingness_threshold (float) – Upper cutoff for allowed amount of missingness. Default is 0.5.

  • +
  • site_gt_check_expr (Dict[str, BooleanExpression]) – Optional boolean expression or dictionary of strings and boolean expressions typically used to log how many monoallelic or 100% heterozygous sites are in the Table.

  • +
  • verbose (bool) – If True, display top values of relevant annotations being checked, regardless of whether check conditions are violated; if False, display only top values of relevant annotations if check conditions are violated.

  • +
  • show_percent_sites (bool) – Show percentage of sites that fail checks. Default is True.

  • +
  • delimiter (str) – String to use as delimiter when making group label combinations. Default is “-“.

  • +
  • metric_first_field (bool) – If True, metric precedes label group, e.g. AC-afr-male. If False, label group precedes metric, afr-male-AC. Default is True.

  • +
  • sum_metrics (List[str]) – List of metrics to sum and compare to annotationed versions and between subsets and entire callset. Default is [“AC”, “AN”, “nhomalt”].

  • +
  • sexes (List[str]) – List of sexes in table. Default is SEXES.

  • +
  • groups (List[str]) – List of callstat groups, e.g. “adj” and “raw” contained within the callset. gnomAD does not store the raw callstats for the pop or sex groupings of any subset. Default is [“adj”]

  • +
  • sample_sum_sets_and_pops (Dict[str, List[str]]) – Dict with subset (keys) and populations within subset (values) for sample sum check.

  • +
  • sort_order (List[str]) – List containing order to sort label group combinations. Default is SORT_ORDER.

  • +
  • variant_filter_field (str) – String of variant filtration used in the filters annotation on ht (e.g. RF, VQSR, AS_VQSR). Default is “RF”.

  • +
  • problematic_regions (List[str]) – List of regions considered problematic to run filter check in. Default is [“lcr”, “segdup”, “nonpar”].

  • +
  • single_filter_count (bool) – If True, explode the Table’s filter column and give a supplemental total count of each filter. Default is False.

  • +
  • summarize_variants_check (bool) – When true, runs the summarize_variants method. Default is True.

  • +
  • filters_check (bool) – When True, runs the summarize_variant_filters method. Default is True.

  • +
  • raw_adj_check (bool) – When True, runs the check_raw_and_adj_callstats method. Default is True.

  • +
  • subset_freq_check (bool) – When True, runs the compare_subset_freqs method. Default is True.

  • +
  • samples_sum_check (bool) – When True, runs the sum_group_callstats method. Default is True.

  • +
  • sex_chr_check (bool) – When True, runs the check_sex_chr_metrics method. Default is True.

  • +
  • missingness_check (bool) – When True, runs the compute_missingness method. Default is True.

  • +
  • pprint_globals (bool) – When True, pretty print the globals of the input Table. Default is False.

  • +
  • row_to_globals_check (Optional[Dict[str, List[str]]]) – Optional dictionary of globals (keys) and rows (values) to be checked. When passed, function checks that the lengths of the global and row annotations are equal.

  • +
  • check_all_rows_in_row_to_global_check (bool) – If True, check all rows in t in row_to_globals_check; if False, check only the first row. Default is False.

  • +
+
+
Return type:
+

None

+
+
Returns:
+

None (stdout display of results from the battery of validity checks).

+
+
+
+ +
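A minimal invocation sketch; the subset and population lists are hypothetical and should match the callset being validated:

    from gnomad.assessment.validity_checks import validate_release_t

    validate_release_t(
        ht,
        subsets=["non_ukb"],
        pops=["afr", "amr", "eas", "nfe", "sas"],
        variant_filter_field="AS_VQSR",
        missingness_threshold=0.5,
    )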
+
+gnomad.assessment.validity_checks.count_vep_annotated_variants_per_interval(vep_ht, interval_ht)[source]
+

Calculate the count of VEP annotated variants in vep_ht per interval defined by interval_ht.

+
+

Note

+
    +
  • vep_ht must contain the ‘vep.transcript_consequences’ array field, which +contains a ‘biotype’ field to determine whether a variant is in a +“protein-coding” gene.

  • +
  • interval_ht should be indexed by ‘locus’ and contain a ‘gene_stable_ID’ +field. For example, an interval Table containing the intervals of +protein-coding genes of a specific Ensembl release.

  • +
+
+
+
The returned Table will have the following fields added:
    +
  • n_total_variants: The number of total variants in the interval.

  • +
  • n_pcg_variants: The number of variants in the interval that are annotated as +“protein-coding”.

  • +
+
+
+
+
Parameters:
+
    +
  • vep_ht (Table) – VEP-annotated Table.

  • +
  • interval_ht (Table) – Interval Table.

  • +
+
+
Return type:
+

Table

+
+
Returns:
+

Interval Table with annotations for the counts of total variants and +variants annotated as “protein-coding” in biotype.

+
+
+
+ +
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/api_reference/index.html b/api_reference/index.html new file mode 100644 index 000000000..329654c3b --- /dev/null +++ b/api_reference/index.html @@ -0,0 +1,169 @@ + + + + + + + gnomad — gnomad master documentation + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/api_reference/resources/config.html b/api_reference/resources/config.html new file mode 100644 index 000000000..d00696920 --- /dev/null +++ b/api_reference/resources/config.html @@ -0,0 +1,189 @@ + + + + + + + gnomad.resources.config — gnomad master documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

gnomad.resources.config

+

Configuration for loading resources.

+ + + + + + + + + +

gnomad.resources.config.GnomadPublicResourceSource(value)

Sources for public gnomAD resources.

gnomad.resources.config.get_default_public_resource_source()

Get the default source for public gnomAD resources.

+

Configuration for loading resources.

+
+
+class gnomad.resources.config.GnomadPublicResourceSource(value, names=None, *, module=None, qualname=None, type=None, start=1, boundary=None)[source]
+

Sources for public gnomAD resources.

+
+
+GNOMAD = 'gnomAD'
+
+ +
+
+GOOGLE_CLOUD_PUBLIC_DATASETS = 'Google Cloud Public Datasets'
+
+ +
+
+REGISTRY_OF_OPEN_DATA_ON_AWS = 'Registry of Open Data on AWS'
+
+ +
+
+AZURE_OPEN_DATASETS = 'Azure Open Datasets'
+
+ +
+ +
+
+gnomad.resources.config.get_default_public_resource_source()[source]
+

Get the default source for public gnomAD resources.

+

The default source is determined by…

+
    +
  • If the GNOMAD_DEFAULT_PUBLIC_RESOURCE_SOURCE environment variable is set, use the source configured there.

  • +
  • Otherwise, if Hail determines that it is running in a cloud provider’s Spark environment, use the source from that cloud provider. For example, use Azure Open Datasets if running on an Azure HDInsight cluster.

  • +
  • Otherwise, use Google Cloud Public Datasets.

  • +
+
+
Return type:
+

Union[GnomadPublicResourceSource, str]

+
+
Returns:
+

Default resource source

+
+
+
+ +
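For example, the source can be pinned via the documented environment variable; a sketch, noting that the variable should be set before gnomAD resources are loaded:

    import os

    # Pin the resource source via GNOMAD_DEFAULT_PUBLIC_RESOURCE_SOURCE
    os.environ["GNOMAD_DEFAULT_PUBLIC_RESOURCE_SOURCE"] = "Registry of Open Data on AWS"

    from gnomad.resources.config import get_default_public_resource_source

    # Expected to resolve to the AWS source configured above
    print(get_default_public_resource_source())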
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/api_reference/resources/grch37/gnomad.html b/api_reference/resources/grch37/gnomad.html new file mode 100644 index 000000000..f4f61f86d --- /dev/null +++ b/api_reference/resources/grch37/gnomad.html @@ -0,0 +1,240 @@ + + + + + + + gnomad.resources.grch37.gnomad — gnomad master documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

gnomad.resources.grch37.gnomad

+ + + + + + + + + + + + + + + + + + +

gnomad.resources.grch37.gnomad.public_release(...)

Retrieve publicly released versioned table resource.

gnomad.resources.grch37.gnomad.coverage(...)

Retrieve gnomAD's coverage table by data_type.

gnomad.resources.grch37.gnomad.liftover(...)

Get the 38 liftover of gnomad v2.1.1.

gnomad.resources.grch37.gnomad.public_pca_loadings([...])

Return the TableResource containing sites and loadings from population PCA.

gnomad.resources.grch37.gnomad.release_vcf_path(...)

Publicly released VCF.

+
+
+gnomad.resources.grch37.gnomad.public_release(data_type)[source]
+

Retrieve publicly released versioned table resource.

+
+
Parameters:
+

data_type (str) – One of “exomes” or “genomes”

+
+
Return type:
+

VersionedTableResource

+
+
Returns:
+

Release Table

+
+
+
+ +
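A usage sketch (requires an initialized Hail context with access to the public gnomAD buckets; the .ht() loading call assumes the standard versioned resource interface):

    import hail as hl
    from gnomad.resources.grch37.gnomad import public_release

    hl.init()
    exomes = public_release("exomes")  # VersionedTableResource
    ht = exomes.ht()                   # Table for the default (latest) version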
+
+gnomad.resources.grch37.gnomad.coverage(data_type)[source]
+

Retrieve gnomAD’s coverage table by data_type.

+
+
Parameters:
+

data_type (str) – One of “exomes” or “genomes”

+
+
Return type:
+

VersionedTableResource

+
+
Returns:
+

Coverage Table

+
+
+
+ +
+
+gnomad.resources.grch37.gnomad.liftover(data_type)[source]
+

Get the 38 liftover of gnomad v2.1.1.

+
+
Parameters:
+

data_type (str) – One of “exomes” or “genomes”

+
+
Return type:
+

VersionedTableResource

+
+
Returns:
+

Release Table

+
+
+
+ +
+
+gnomad.resources.grch37.gnomad.public_pca_loadings(subpop='')[source]
+

Return the TableResource containing sites and loadings from population PCA.

+
+
Parameters:
+

subpop (str) – Can be empty (“”) -> global, “eas” or “nfe”

+
+
Return type:
+

GnomadPublicTableResource

+
+
Returns:
+

gnomAD public PCA loadings TableResource

+
+
+
+ +
+
+gnomad.resources.grch37.gnomad.release_vcf_path(data_type, version, contig)[source]
+

Publicly released VCF. Provide a specific contig, e.g. “20”, to retrieve a contig-specific VCF.

+
+
Parameters:
+
    +
  • data_type (str) – One of “exomes” or “genomes”

  • +
  • version (str) – One of the release versions of gnomAD on GRCh37

  • +
  • contig (str) – Single contig “1” to “Y”

  • +
+
+
Return type:
+

str

+
+
Returns:
+

Path to VCF

+
+
+
+ +
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/api_reference/resources/grch37/gnomad_ld.html b/api_reference/resources/grch37/gnomad_ld.html new file mode 100644 index 000000000..b7e275862 --- /dev/null +++ b/api_reference/resources/grch37/gnomad_ld.html @@ -0,0 +1,187 @@ + + + + + + + gnomad.resources.grch37.gnomad_ld — gnomad master documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

gnomad.resources.grch37.gnomad_ld

+ + + + + + + + + + + + +

gnomad.resources.grch37.gnomad_ld.ld_matrix(pop)

Get resource for the LD matrix for the given population.

gnomad.resources.grch37.gnomad_ld.ld_index(pop)

Get resource for the LD indices for the given population.

gnomad.resources.grch37.gnomad_ld.ld_scores(pop)

Get resource for the LD scores for the given population.

+
+
+gnomad.resources.grch37.gnomad_ld.ld_matrix(pop)[source]
+

Get resource for the LD matrix for the given population.

+
+
Parameters:
+

pop (str) –

+
+
Return type:
+

GnomadPublicBlockMatrixResource

+
+
+
+ +
+
+gnomad.resources.grch37.gnomad_ld.ld_index(pop)[source]
+

Get resource for the LD indices for the given population.

+
+
Parameters:
+

pop (str) –

+
+
Return type:
+

GnomadPublicTableResource

+
+
+
+ +
+
+gnomad.resources.grch37.gnomad_ld.ld_scores(pop)[source]
+

Get resource for the LD scores for the given population.

+
+
Parameters:
+

pop (str) –

+
+
Return type:
+

GnomadPublicTableResource

+
+
+
+ +
+ + +
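A sketch of loading the LD resources for one population; pop codes such as “nfe” are gnomAD population labels, and the .bm()/.ht() loading calls assume the standard resource interface:

    from gnomad.resources.grch37.gnomad_ld import ld_index, ld_matrix, ld_scores

    pop = "nfe"
    ld_bm = ld_matrix(pop).bm()   # Hail BlockMatrix of pairwise LD
    ld_idx = ld_index(pop).ht()   # maps variants to BlockMatrix indices
    scores = ld_scores(pop).ht()  # per-variant LD scores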
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/api_reference/resources/grch37/index.html b/api_reference/resources/grch37/index.html new file mode 100644 index 000000000..671fa05c5 --- /dev/null +++ b/api_reference/resources/grch37/index.html @@ -0,0 +1,153 @@ + + + + + + + gnomad.resources.grch37 — gnomad master documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+ + +
+
+
+
+ + + + \ No newline at end of file diff --git a/api_reference/resources/grch37/reference_data.html b/api_reference/resources/grch37/reference_data.html new file mode 100644 index 000000000..18163fc78 --- /dev/null +++ b/api_reference/resources/grch37/reference_data.html @@ -0,0 +1,162 @@ + + + + + + + gnomad.resources.grch37.reference_data — gnomad master documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

gnomad.resources.grch37.reference_data

+ + + + + + +

gnomad.resources.grch37.reference_data.get_truth_ht()

Return a table with annotations from the latest version of the corresponding truth data.

+
+
+gnomad.resources.grch37.reference_data.get_truth_ht()[source]
+

Return a table with annotations from the latest version of the corresponding truth data.

+
+
The following annotations are included:
    +
  • hapmap

  • +
  • kgp_omni (1000 Genomes intersection Omni 2.5M array)

  • +
  • kgp_phase_1_hc (high confidence sites in 1000 Genomes)

  • +
  • mills (Mills & Devine indels)

  • +
+
+
+
+
Return type:
+

Table

+
+
Returns:
+

A table with the latest version of popular truth data annotations

+
+
+
+ +
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/api_reference/resources/grch38/gnomad.html b/api_reference/resources/grch38/gnomad.html new file mode 100644 index 000000000..48a6d1417 --- /dev/null +++ b/api_reference/resources/grch38/gnomad.html @@ -0,0 +1,391 @@ + + + + + + + gnomad.resources.grch38.gnomad — gnomad master documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

gnomad.resources.grch38.gnomad

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

gnomad.resources.grch38.gnomad.SUBSETS

Order to sort subgroupings during VCF export by version.

gnomad.resources.grch38.gnomad.GROUPS

Group names used to generate labels for high quality genotypes and all raw genotypes.

gnomad.resources.grch38.gnomad.SEXES

Sample sexes used in VCF export.

gnomad.resources.grch38.gnomad.POPS

Global ancestry groups in gnomAD by version.

gnomad.resources.grch38.gnomad.COHORTS_WITH_POP_STORED_AS_SUBPOP

Subsets in gnomAD v3.1 that are broken down by their known subpops instead of global pops in the frequency struct.

gnomad.resources.grch38.gnomad.TGP_POPS

1000 Genomes Project (1KG/TGP) subpops.

gnomad.resources.grch38.gnomad.HGDP_POPS

Human Genome Diversity Project (HGDP) subpops.

gnomad.resources.grch38.gnomad.TGP_POP_NAMES

1000 Genomes Project (1KG/TGP) pop label map.

gnomad.resources.grch38.gnomad.POPS_TO_REMOVE_FOR_POPMAX

Populations that are removed before popmax calculations.

gnomad.resources.grch38.gnomad.DOWNSAMPLINGS

List of the downsampling numbers to use for frequency calculations by version.

gnomad.resources.grch38.gnomad.public_release(...)

Retrieve publicly released versioned table resource.

gnomad.resources.grch38.gnomad.coverage(...)

Retrieve gnomAD's coverage table by data_type.

gnomad.resources.grch38.gnomad.all_sites_an(...)

Retrieve gnomAD's all sites allele number table by data_type.

gnomad.resources.grch38.gnomad.coverage_tsv_path(...)

Retrieve gnomAD's coverage table by data_type.

gnomad.resources.grch38.gnomad.release_vcf_path(...)

Publicly released VCF.

gnomad.resources.grch38.gnomad.add_grpMaxFAF95_v4(ht)

Add a grpMaxFAF95 struct with 'popmax' and 'popmax_population'.

gnomad.resources.grch38.gnomad.gnomad_gks(...)

Perform gnomad GKS annotations on a range of variants at once.

gnomad.resources.grch38.gnomad.SUBSETS = {'v3': ['non_v2', 'non_topmed', 'non_cancer', 'controls_and_biobanks', 'non_neuro', 'tgp', 'hgdp'], 'v4': ['non_ukb']}

Order to sort subgroupings during VCF export by version.

Ensures that INFO labels in VCF are in desired order (e.g., tgp_raw_AC_esn_XX).

gnomad.resources.grch38.gnomad.GROUPS = ['adj', 'raw']

Group names used to generate labels for high quality genotypes and all raw genotypes.

Used in VCF export.

gnomad.resources.grch38.gnomad.SEXES = ['XX', 'XY']

Sample sexes used in VCF export.

Used to stratify frequency annotations (AC, AN, AF) for each sex.

gnomad.resources.grch38.gnomad.POPS = {'v3': {'genomes': ['afr', 'ami', 'amr', 'asj', 'eas', 'fin', 'nfe', 'oth', 'sas', 'mid']}, 'v4': {'exomes': ['afr', 'amr', 'asj', 'eas', 'fin', 'mid', 'nfe', 'remaining', 'sas'], 'genomes': ['afr', 'ami', 'amr', 'asj', 'eas', 'fin', 'mid', 'nfe', 'remaining', 'sas']}}

Global ancestry groups in gnomAD by version.

gnomad.resources.grch38.gnomad.COHORTS_WITH_POP_STORED_AS_SUBPOP = ['tgp', 'hgdp']

Subsets in gnomAD v3.1 that are broken down by their known subpops instead of global pops in the frequency struct.

gnomad.resources.grch38.gnomad.TGP_POPS = ['esn', 'pur', 'pjl', 'clm', 'jpt', 'chb', 'stu', 'itu', 'tsi', 'mxl', 'ceu', 'msl', 'yri', 'beb', 'fin', 'khv', 'cdx', 'lwk', 'acb', 'asw', 'ibs', 'gbr', 'pel', 'gih', 'chs', 'gwd']

1000 Genomes Project (1KG/TGP) subpops.

gnomad.resources.grch38.gnomad.HGDP_POPS = ['japanese', 'papuanhighlands', 'papuansepik', 'adygei', 'orcadian', 'biaka', 'yakut', 'han', 'northernhan', 'uygur', 'miao', 'mongolian', 'balochi', 'bedouin', 'russian', 'daur', 'pima', 'hezhen', 'sindhi', 'yi', 'oroqen', 'san', 'tuscan', 'tu', 'palestinian', 'tujia', 'druze', 'pathan', 'basque', 'makrani', 'bergamoitalian', 'naxi', 'karitiana', 'sardinian', 'mbuti', 'mozabite', 'yoruba', 'lahu', 'dai', 'cambodian', 'bougainville', 'french', 'brahui', 'hazara', 'bantusouthafrica', 'surui', 'mandenka', 'kalash', 'xibo', 'colombian', 'bantukenya', 'she', 'burusho', 'maya']

Human Genome Diversity Project (HGDP) subpops.

gnomad.resources.grch38.gnomad.TGP_POP_NAMES = {'acb': 'African Caribbean', 'asw': 'African-American', 'beb': 'Bengali', 'cdx': 'Chinese Dai', 'ceu': 'Utah Residents (European Ancestry)', 'chb': 'Han Chinese', 'chs': 'Southern Han Chinese', 'clm': 'Colombian', 'esn': 'Esan', 'fin': 'Finnish', 'gbr': 'British', 'gih': 'Gujarati', 'gwd': 'Gambian', 'ibs': 'Iberian', 'itu': 'Indian Telugu', 'jpt': 'Japanese', 'khv': 'Kinh', 'lwk': 'Luhya', 'msl': 'Mende', 'mxl': 'Mexican-American', 'pel': 'Peruvian', 'pjl': 'Punjabi', 'pur': 'Puerto Rican', 'stu': 'Sri Lankan Tamil', 'tsi': 'Toscani', 'yri': 'Yoruba'}

1000 Genomes Project (1KG/TGP) pop label map.

gnomad.resources.grch38.gnomad.POPS_TO_REMOVE_FOR_POPMAX = {'v3': {'ami', 'asj', 'fin', 'mid', 'oth', 'remaining'}, 'v4': {'ami', 'asj', 'fin', 'oth', 'remaining'}}

Populations that are removed before popmax calculations.

gnomad.resources.grch38.gnomad.DOWNSAMPLINGS = {'v3': [10, 20, 50, 100, 200, 500, 1000, 2000, 5000, 10000, 15000, 20000, 25000, 30000, 40000, 50000, 60000, 70000, 75000, 80000, 85000, 90000, 95000, 100000, 110000, 120000], 'v4': [10, 100, 500, 1000, 2000, 5000, 10000, 20000, 50000, 100000, 200000, 500000]}

List of the downsampling numbers to use for frequency calculations by version.
gnomad.resources.grch38.gnomad.public_release(data_type)[source]

Retrieve publicly released versioned table resource.

Parameters:
  data_type (str) – One of “exomes” or “genomes”

Return type: VersionedTableResource

Returns: Release Table
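A minimal usage sketch (assuming Hail is initialized and the public gnomAD buckets are readable from your environment); the returned VersionedTableResource proxies its default version, so it can be read directly:

    from gnomad.resources.grch38.gnomad import public_release

    exomes_resource = public_release("exomes")  # VersionedTableResource
    ht = exomes_resource.ht()  # Hail Table for the default release version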
gnomad.resources.grch38.gnomad.coverage(data_type)[source]

Retrieve gnomAD’s coverage table by data_type.

Parameters:
  data_type (str) – One of “exomes” or “genomes”

Return type: VersionedTableResource

Returns: Coverage Table

gnomad.resources.grch38.gnomad.all_sites_an(data_type)[source]

Retrieve gnomAD’s all sites allele number table by data_type.

Parameters:
  data_type (str) – One of “exomes” or “genomes”

Return type: VersionedTableResource

Returns: All sites allele number VersionedTableResource

gnomad.resources.grch38.gnomad.coverage_tsv_path(data_type, version=None)[source]

Retrieve the path to gnomAD’s coverage TSV by data_type.

Parameters:
  • data_type (str) – One of “exomes” or “genomes”
  • version (Optional[str])

Return type: str

Returns: Path to the coverage TSV

gnomad.resources.grch38.gnomad.release_vcf_path(data_type, version, contig)[source]

Publicly released VCF. Provide a specific contig, i.e. “chr20”, to retrieve the contig-specific VCF.

Parameters:
  • data_type (str) – One of “exomes” or “genomes”
  • version (str) – One of the release versions of gnomAD on GRCh38
  • contig (str) – Single contig “chr1” to “chrY”

Return type: str

Returns: Path to VCF

gnomad.resources.grch38.gnomad.add_grpMaxFAF95_v4(ht)[source]

Add a grpMaxFAF95 struct with ‘popmax’ and ‘popmax_population’.

Also includes a jointGrpMaxFAF95 annotation using the v4 fafmax and joint_fafmax structures.

Parameters:
  ht (Table) – Input hail table.

Return type: Table

Returns: Annotated hail table.
gnomad.resources.grch38.gnomad.gnomad_gks(locus_interval, version, data_type='genomes', by_ancestry_group=False, by_sex=False, vrs_only=False, custom_ht=None, skip_checkpoint=False, skip_coverage=False, custom_coverage_ht=None)[source]

Perform gnomAD GKS annotations on a range of variants at once.

Parameters:
  • locus_interval (IntervalExpression) – Hail IntervalExpression of locus<reference_genome>, e.g. hl.locus_interval('chr1', 6424776, 6461367, reference_genome="GRCh38")
  • version (str) – String of version of gnomAD release to use.
  • data_type (str) – String of either “exomes” or “genomes” for the type of reads that are desired.
  • by_ancestry_group (bool) – Boolean to pass to obtain frequency information for each cohort.
  • by_sex (bool) – Boolean to pass to return frequency information for each cohort split by chromosomal sex.
  • vrs_only (bool) – Boolean to pass for only VRS info to be returned (will not include allele frequency information).
  • custom_ht (Table) – Table to use instead of what the public_release() method would return for the version.
  • skip_checkpoint (bool) – Bool to pass to skip checkpointing selected fields (checkpointing may be desirable for large datasets by reducing data copies across the cluster).
  • skip_coverage (bool) – Bool to pass to skip adding coverage statistics.
  • custom_coverage_ht (Table) – Custom table to use for coverage statistics instead of the release coverage table.

Return type: list

Returns: List of dictionaries containing VRS information (and freq info split by ancestry groups and sex if desired) for each specified variant.
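As a sketch of a call (the interval endpoints and the version string are illustrative only; use a version string valid for your release):

    import hail as hl
    from gnomad.resources.grch38.gnomad import gnomad_gks

    interval = hl.locus_interval(
        "chr1", 6424776, 6461367, reference_genome="GRCh38"
    )
    records = gnomad_gks(interval, version="4.0", data_type="genomes", by_sex=True)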
diff --git a/api_reference/resources/grch38/index.html b/api_reference/resources/grch38/index.html

diff --git a/api_reference/resources/grch38/reference_data.html b/api_reference/resources/grch38/reference_data.html

gnomad.resources.grch38.reference_data

gnomad.resources.grch38.reference_data.get_truth_ht()

Return a table with annotations from the latest version of the corresponding truth data.

gnomad.resources.grch38.reference_data.get_truth_ht()[source]

Return a table with annotations from the latest version of the corresponding truth data.

The following annotations are included:
  • hapmap
  • kgp_omni (1000 Genomes intersection Omni 2.5M array)
  • kgp_phase_1_hc (high confidence sites in 1000 Genomes)
  • mills (Mills & Devine indels)

Return type: Table

Returns: A table with the latest version of popular truth data annotations
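A short sketch of typical use (assuming Hail is initialized):

    from gnomad.resources.grch38.reference_data import get_truth_ht

    truth_ht = get_truth_ht()
    truth_ht.describe()  # shows the hapmap, kgp_omni, kgp_phase_1_hc, and mills annotations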
diff --git a/api_reference/resources/import_resources.html b/api_reference/resources/import_resources.html

gnomad.resources.import_resources

gnomad.resources.import_resources.get_module_importable_resources(module)

Take a module that was imported and generate a dict of all resources in this module that can be imported (i.e. with a path and import_func).

gnomad.resources.import_resources.get_resources_descriptions(...)

Return a string listing all resources in the input dict along with the path from which they are imported and the path at which they are stored.

gnomad.resources.import_resources.main(args)

Import selected resources.

gnomad.resources.import_resources.get_module_importable_resources(module, prefix=None)[source]

Take a module that was imported and generate a dict of all resources in this module that can be imported (i.e. with a path and import_func).

The dict produced is as follows:
  • keys: {prefix}.{resource_name}.{version} (with prefix only present if prefix is set, and version only present for versioned resources)
  • values: ({resource_name}[ version {version}], resource) with resource_name set to the variable name in the module and the version present for versioned resources.

The following example will generate a dict with all the resources in gnomad.resources.grch37 that can be imported:

    from gnomad.resources.import_resources import get_module_importable_resources
    import gnomad.resources.grch37 as grch37

    grch37_resources = get_module_importable_resources(grch37, prefix='grch37')

Parameters:
  • module – Input module
  • prefix (Optional[str]) – Prefix to prepend to the resource keys (see key format above).

Return type: Dict[str, Tuple[str, BaseResource]]
gnomad.resources.import_resources.get_resources_descriptions(resources, width=100)[source]

Return a string listing all resources in the input dict along with the path from which they are imported and the path at which they are stored.

Parameters:
  • resources (Dict[str, Tuple[str, BaseResource]]) – A dict returned from get_module_importable_resources
  • width (Optional[int]) – Maximum width of lines in the returned string

Return type: str
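For instance, continuing the example above (a sketch; the exact output layout is whatever the library produces):

    from gnomad.resources.import_resources import get_resources_descriptions

    print(get_resources_descriptions(grch37_resources, width=80))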
gnomad.resources.import_resources.main(args)[source]

Import selected resources.

diff --git a/api_reference/resources/index.html b/api_reference/resources/index.html

diff --git a/api_reference/resources/resource_utils.html b/api_reference/resources/resource_utils.html

gnomad.resources.resource_utils

gnomad.resources.resource_utils.GNOMAD_PUBLIC_BUCKETS

Public buckets used to stage gnomAD data.

gnomad.resources.resource_utils.BaseResource([...])

Generic abstract resource class.

gnomad.resources.resource_utils.TableResource([...])

A Hail Table resource.

gnomad.resources.resource_utils.MatrixTableResource([...])

A Hail MatrixTable resource.

gnomad.resources.resource_utils.VariantDatasetResource([...])

A Hail VariantDataset resource.

gnomad.resources.resource_utils.PedigreeResource([...])

A pedigree resource.

gnomad.resources.resource_utils.BlockMatrixResource([...])

A Hail BlockMatrix resource.

gnomad.resources.resource_utils.ExpressionResource([...])

A Hail Expression resource.

gnomad.resources.resource_utils.BaseVersionedResource(...)

Class for a versioned resource.

gnomad.resources.resource_utils.VersionedTableResource(...)

Versioned Table resource.

gnomad.resources.resource_utils.VersionedMatrixTableResource(...)

Versioned MatrixTable resource.

gnomad.resources.resource_utils.VersionedVariantDatasetResource(...)

Versioned VariantDataset resource.

gnomad.resources.resource_utils.VersionedPedigreeResource(...)

Versioned Pedigree resource.

gnomad.resources.resource_utils.VersionedBlockMatrixResource(...)

Versioned BlockMatrix resource.

gnomad.resources.resource_utils.ResourceNotAvailable

Exception raised if a resource is not available from the selected source.

gnomad.resources.resource_utils.GnomadPublicResource([...])

Base class for the gnomAD project's public resources.

gnomad.resources.resource_utils.GnomadPublicTableResource([...])

Resource class for a public Hail Table published by the gnomAD project.

gnomad.resources.resource_utils.GnomadPublicMatrixTableResource([...])

Resource class for a public Hail MatrixTable published by the gnomAD project.

gnomad.resources.resource_utils.GnomadPublicPedigreeResource([...])

Resource class for a public pedigree published by the gnomAD project.

gnomad.resources.resource_utils.GnomadPublicBlockMatrixResource([...])

Resource class for a public Hail BlockMatrix published by the gnomAD project.

gnomad.resources.resource_utils.DataException

gnomad.resources.resource_utils.import_sites_vcf(...)

Import site-level data from a VCF into a Hail Table.

gnomad.resources.resource_utils.import_gencode(...)

Import GENCODE annotations GTF file as a Hail Table.

gnomad.resources.resource_utils.GNOMAD_PUBLIC_BUCKETS = ('gnomad-public', 'gnomad-public-requester-pays')

Public buckets used to stage gnomAD data.

gnomad-public is a legacy bucket and contains one readme text file.

The gnomAD Production Team writes output data to gnomad-public-requester-pays, and all data in this bucket syncs to the public bucket gcp-public-data--gnomad.
class gnomad.resources.resource_utils.BaseResource(path=None, import_args=None, import_func=None)[source]

Generic abstract resource class.

Parameters:
  • path (Optional[str]) – The resource path
  • import_args (Optional[Dict[str, Any]]) – Any sources that are required for the import and need to be kept track of (e.g. .vcf path for an imported VCF)
  • import_func (Optional[Callable]) – A function used to import the resource. import_func will be passed the import_args dictionary as kwargs.

expected_file_extensions: List[str] = []

Expected file extensions for this resource type. If path doesn’t end with one of these, a warning is logged.

property path

abstract import_resource(overwrite=True, **kwargs)[source]

Abstract method to import the resource using its import_func and write it to its path.

Parameters:
  • overwrite (bool) – If True, overwrite an existing file at the destination.
  • kwargs – Any other parameters to be passed to the underlying hail write function (acceptable parameters depend on specific resource types)

Return type: None
class gnomad.resources.resource_utils.TableResource(path=None, import_args=None, import_func=None)[source]

A Hail Table resource.

Parameters:
  • path (Optional[str]) – The Table path (typically ending in .ht)
  • import_args (Optional[Dict[str, Any]]) – Any sources that are required for the import and need to be kept track of and/or passed to the import_func (e.g. .vcf path for an imported VCF)
  • import_func (Optional[Callable]) – A function used to import the Table. import_func will be passed the import_args dictionary as kwargs.

expected_file_extensions: List[str] = ['.ht']

Expected file extensions for this resource type. If path doesn’t end with one of these, a warning is logged.

ht(force_import=False, read_args=None)[source]

Read and return the Hail Table resource.

Parameters:
  • force_import (bool) – If True, force the import of the resource even if it already exists.
  • read_args (Optional[Dict[str, Any]]) – Any additional arguments to pass to hl.read_table.

Return type: Table

Returns: Hail Table resource

import_resource(overwrite=True, **kwargs)[source]

Import the TableResource using its import_func and write it to its path.

Parameters:
  • overwrite (bool) – If True, overwrite an existing file at the destination.
  • kwargs – Any other parameters to be passed to hl.Table.write

Return type: None

Returns: Nothing
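A hedged sketch of defining and importing a TableResource (the bucket paths and VCF source are hypothetical):

    import hail as hl
    from gnomad.resources.resource_utils import TableResource

    my_sites = TableResource(
        path="gs://my-bucket/my_sites.ht",  # hypothetical output path
        import_args={"path": "gs://my-bucket/my_sites.vcf.bgz"},  # hypothetical source
        import_func=lambda path: hl.import_vcf(path, reference_genome="GRCh38").rows(),
    )
    my_sites.import_resource(overwrite=True)  # runs import_func, writes the Table to `path`
    ht = my_sites.ht()  # read it back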
class gnomad.resources.resource_utils.MatrixTableResource(path=None, import_args=None, import_func=None)[source]

A Hail MatrixTable resource.

Parameters:
  • path (Optional[str]) – The MatrixTable path (typically ending in .mt)
  • import_args (Optional[Dict[str, Any]]) – Any sources that are required for the import and need to be kept track of and/or passed to the import_func (e.g. .vcf path for an imported VCF)
  • import_func (Optional[Callable]) – A function used to import the MatrixTable. import_func will be passed the import_args dictionary as kwargs.

expected_file_extensions: List[str] = ['.mt']

Expected file extensions for this resource type. If path doesn’t end with one of these, a warning is logged.

mt(force_import=False, read_args=None)[source]

Read and return the Hail MatrixTable resource.

Parameters:
  • force_import (bool) – If True, force the import of the resource even if it already exists.
  • read_args (Optional[Dict[str, Any]]) – Any additional arguments to pass to hl.read_matrix_table.

Return type: MatrixTable

Returns: Hail MatrixTable resource

import_resource(overwrite=True, **kwargs)[source]

Import the MatrixTable resource using its import_func and write it to its path.

Parameters:
  • overwrite (bool) – If set, existing file(s) will be overwritten
  • kwargs – Any other parameters to be passed to hl.MatrixTable.write

Return type: None

Returns: Nothing

class gnomad.resources.resource_utils.VariantDatasetResource(path=None, import_args=None, import_func=None)[source]

A Hail VariantDataset resource.

Parameters:
  • path (Optional[str]) – The VariantDataset path (typically ending in .vds)
  • import_args (Optional[Dict[str, Any]]) – Any sources that are required for the import and need to be kept track of and/or passed to the import_func (e.g. .vcf path for an imported VCF)
  • import_func (Optional[Callable]) – A function used to import the VariantDataset. import_func will be passed the import_args dictionary as kwargs.

expected_file_extensions: List[str] = ['.vds']

Expected file extensions for this resource type. If path doesn’t end with one of these, a warning is logged.

vds(force_import=False, read_args=None)[source]

Read and return the Hail VariantDataset resource.

Parameters:
  • force_import (bool) – If True, force the import of the resource even if it already exists.
  • read_args (Optional[Dict[str, Any]]) – Any additional arguments to pass to hl.vds.read_vds.

Return type: VariantDataset

Returns: Hail VariantDataset resource

import_resource(overwrite=True, **kwargs)[source]

Import the VariantDataset resource using its import_func and write it to its path.

Parameters:
  • • overwrite (bool) – If set, existing file(s) will be overwritten
  • kwargs – Any other parameters to be passed to hl.vds.VariantDataset.write

Return type: None

Returns: Nothing
class gnomad.resources.resource_utils.PedigreeResource(path=None, import_args=None, import_func=None, quant_pheno=False, delimiter='\\s+', missing='NA')[source]

A pedigree resource.

Parameters:
  • path (Optional[str]) – The Pedigree path (typically ending in .fam or .ped)
  • import_args (Optional[Dict[str, Any]]) – Any sources that are required for the import and need to be kept track of and/or passed to the import_func (e.g. .vcf path for an imported VCF)
  • import_func (Optional[Callable[..., Pedigree]]) – A function used to import the Pedigree. import_func will be passed the import_args dictionary as kwargs.
  • quant_pheno (bool) – If True, phenotype is interpreted as quantitative.
  • delimiter (str) – Field delimiter regex.
  • missing (str) – The string used to denote missing values. For case-control, 0, -9, and non-numeric are also treated as missing.

expected_file_extensions: List[str] = ['.fam', '.ped']

Expected file extensions for this resource type. If path doesn’t end with one of these, a warning is logged.

ht()[source]

Read the pedigree into a family HT using hl.import_fam().

Return type: Table

Returns: Family table

pedigree()[source]

Read the pedigree into an hl.Pedigree using hl.Pedigree.read().

Parameters:
  delimiter – Delimiter used in the ped file

Return type: Pedigree

Returns: pedigree

import_resource(overwrite=True, **kwargs)[source]

Import the Pedigree resource using its import_func and write it to its path.

Parameters:
  • overwrite (bool) – If set, existing file(s) will be overwritten. IMPORTANT: Currently there is no implementation of this method when overwrite is set to False
  • kwargs – Any other parameters to be passed to hl.Pedigree.write

Return type: None

Returns: Nothing

class gnomad.resources.resource_utils.BlockMatrixResource(path=None, import_args=None, import_func=None)[source]

A Hail BlockMatrix resource.

Parameters:
  • path (Optional[str]) – The BlockMatrix path (typically ending in .bm)
  • import_args (Optional[Dict[str, Any]]) – Any sources that are required for the import and need to be kept track of and/or passed to the import_func.
  • import_func (Optional[Callable]) – A function used to import the BlockMatrix. import_func will be passed the import_args dictionary as kwargs.

expected_file_extensions: List[str] = ['.bm']

Expected file extensions for this resource type. If path doesn’t end with one of these, a warning is logged.

bm(read_args=None)[source]

Read and return the Hail BlockMatrix resource.

Parameters:
  read_args (Optional[Dict[str, Any]]) – Any additional arguments to pass to BlockMatrix.read.

Return type: BlockMatrix

Returns: Hail BlockMatrix resource

import_resource(overwrite=True, **kwargs)[source]

Import the BlockMatrixResource using its import_func and write it to its path.

Parameters:
  • overwrite (bool) – If True, overwrite an existing file at the destination.
  • kwargs – Any additional parameters to be passed to BlockMatrix.write

Return type: None

Returns: Nothing

class gnomad.resources.resource_utils.ExpressionResource(path=None, import_args=None, import_func=None)[source]

A Hail Expression resource.

Parameters:
  • path (Optional[str]) – The Expression path (typically ending in .he).
  • import_args (Optional[Dict[str, Any]]) – Any sources that are required for the import and need to be kept track of and/or passed to the import_func (e.g. .vcf path for an imported VCF).
  • import_func (Optional[Callable]) – A function used to import the Expression. import_func will be passed the import_args dictionary as kwargs.

expected_file_extensions: List[str] = ['.he']

Expected file extensions for this resource type. If path doesn’t end with one of these, a warning is logged.

he(force_import=False, read_args=None)[source]

Read and return the Hail Expression resource.

Parameters:
  • force_import (bool) – If True, force the import of the resource even if it already exists.
  • read_args (Optional[Dict[str, Any]]) – Any additional arguments to pass to hl.experimental.read_expression.

Return type: Expression

Returns: Hail Expression resource.

import_resource(overwrite=True, **kwargs)[source]

Import the Expression resource using its import_func and write it to its path.

Parameters:
  • overwrite (bool) – If set, existing file(s) will be overwritten.
  • kwargs – Any other parameters to be passed to hl.experimental.write_expression.

Return type: None

Returns: Nothing.

class gnomad.resources.resource_utils.BaseVersionedResource(default_version, versions)[source]

Class for a versioned resource.

The attributes and methods of the versioned resource are those of the default version of the resource. In addition, all versions of the resource are stored in the versions attribute.

Parameters:
  • default_version (str) – The default version of this resource (must be in the versions dict)
  • versions (Dict[str, BaseResource]) – A dict of version name -> resource.

resource_class

alias of BaseResource

default_version

versions
class gnomad.resources.resource_utils.VersionedTableResource(default_version, versions)[source]

Versioned Table resource.

The attributes (path, import_args and import_func) of the versioned resource are those of the default version of the resource. In addition, all versions of the resource are stored in the versions attribute.

Parameters:
  • default_version (str) – The default version of this Table resource (must be in the versions dict)
  • versions (Dict[str, TableResource]) – A dict of version name -> TableResource.

resource_class

alias of TableResource
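A hedged sketch of constructing and reading a versioned resource (the version names and paths are hypothetical):

    from gnomad.resources.resource_utils import TableResource, VersionedTableResource

    my_versioned = VersionedTableResource(
        default_version="2.0",
        versions={
            "1.0": TableResource(path="gs://my-bucket/v1.0/data.ht"),  # hypothetical
            "2.0": TableResource(path="gs://my-bucket/v2.0/data.ht"),  # hypothetical
        },
    )
    ht = my_versioned.ht()                      # reads the default version (2.0)
    old_ht = my_versioned.versions["1.0"].ht()  # reads a specific version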
class gnomad.resources.resource_utils.VersionedMatrixTableResource(default_version, versions)[source]

Versioned MatrixTable resource.

The attributes (path, import_args and import_func) of the versioned resource are those of the default version of the resource. In addition, all versions of the resource are stored in the versions attribute.

Parameters:
  • default_version (str) – The default version of this MatrixTable resource (must be in the versions dict)
  • versions (Dict[str, MatrixTableResource]) – A dict of version name -> MatrixTableResource.

resource_class

alias of MatrixTableResource

class gnomad.resources.resource_utils.VersionedVariantDatasetResource(default_version, versions)[source]

Versioned VariantDataset resource.

The attributes (path, import_args and import_func) of the versioned resource are those of the default version of the resource. In addition, all versions of the resource are stored in the versions attribute.

Parameters:
  • default_version (str) – The default version of this VariantDataset resource (must be in the versions dict)
  • versions (Dict[str, VariantDatasetResource]) – A dict of version name -> VariantDatasetResource.

resource_class

alias of VariantDatasetResource

class gnomad.resources.resource_utils.VersionedPedigreeResource(default_version, versions)[source]

Versioned Pedigree resource.

The attributes (path, import_args and import_func) of the versioned resource are those of the default version of the resource. In addition, all versions of the resource are stored in the versions attribute.

Parameters:
  • default_version (str) – The default version of this Pedigree resource (must be in the versions dict)
  • versions (Dict[str, PedigreeResource]) – A dict of version name -> PedigreeResource.

resource_class

alias of PedigreeResource

class gnomad.resources.resource_utils.VersionedBlockMatrixResource(default_version, versions)[source]

Versioned BlockMatrix resource.

The attributes (path, import_args and import_func) of the versioned resource are those of the default version of the resource. In addition, all versions of the resource are stored in the versions attribute.

Parameters:
  • default_version (str) – The default version of this BlockMatrix resource (must be in the versions dict)
  • versions (Dict[str, BlockMatrixResource]) – A dict of version name -> BlockMatrixResource.

resource_class

alias of BlockMatrixResource

exception gnomad.resources.resource_utils.ResourceNotAvailable[source]

Exception raised if a resource is not available from the selected source.
class gnomad.resources.resource_utils.GnomadPublicResource(path=None, import_args=None, import_func=None)[source]

Base class for the gnomAD project’s public resources.

Parameters:
  • path (Optional[str])
  • import_args (Optional[Dict[str, Any]])
  • import_func (Optional[Callable])

is_resource_available()[source]

Check if this resource is available from the selected source.

Return type: bool

Returns: True if the resource is available.

class gnomad.resources.resource_utils.GnomadPublicTableResource(path=None, import_args=None, import_func=None)[source]

Resource class for a public Hail Table published by the gnomAD project.

Parameters:
  • path (Optional[str])
  • import_args (Optional[Dict[str, Any]])
  • import_func (Optional[Callable])

ht(force_import=False, read_args=None)

Read and return the Hail Table resource.

Parameters:
  • force_import (bool) – If True, force the import of the resource even if it already exists.
  • read_args (Optional[Dict[str, Any]]) – Any additional arguments to pass to hl.read_table.

Return type: Table

Returns: Hail Table resource

class gnomad.resources.resource_utils.GnomadPublicMatrixTableResource(path=None, import_args=None, import_func=None)[source]

Resource class for a public Hail MatrixTable published by the gnomAD project.

Parameters:
  • path (Optional[str])
  • import_args (Optional[Dict[str, Any]])
  • import_func (Optional[Callable])

mt(force_import=False, read_args=None)

Read and return the Hail MatrixTable resource.

Parameters:
  • force_import (bool) – If True, force the import of the resource even if it already exists.
  • read_args (Optional[Dict[str, Any]]) – Any additional arguments to pass to hl.read_matrix_table.

Return type: MatrixTable

Returns: Hail MatrixTable resource
class gnomad.resources.resource_utils.GnomadPublicPedigreeResource(path=None, import_args=None, import_func=None, quant_pheno=False, delimiter='\\s+', missing='NA')[source]

Resource class for a public pedigree published by the gnomAD project.

Parameters:
  • path (Optional[str])
  • import_args (Optional[Dict[str, Any]])
  • import_func (Optional[Callable[..., Pedigree]])
  • quant_pheno (bool)
  • delimiter (str)
  • missing (str)

ht()

Read the pedigree into a family HT using hl.import_fam().

Return type: Table

Returns: Family table

pedigree()

Read the pedigree into an hl.Pedigree using hl.Pedigree.read().

Parameters:
  delimiter – Delimiter used in the ped file

Return type: Pedigree

Returns: pedigree

class gnomad.resources.resource_utils.GnomadPublicBlockMatrixResource(path=None, import_args=None, import_func=None)[source]

Resource class for a public Hail BlockMatrix published by the gnomAD project.

Parameters:
  • path (Optional[str])
  • import_args (Optional[Dict[str, Any]])
  • import_func (Optional[Callable])

bm(read_args=None)

Read and return the Hail BlockMatrix resource.

Parameters:
  read_args (Optional[Dict[str, Any]]) – Any additional arguments to pass to BlockMatrix.read.

Return type: BlockMatrix

Returns: Hail BlockMatrix resource

exception gnomad.resources.resource_utils.DataException[source]

gnomad.resources.resource_utils.import_sites_vcf(**kwargs)[source]

Import site-level data from a VCF into a Hail Table.

Return type: Table

gnomad.resources.resource_utils.import_gencode(gtf_path, **kwargs)[source]

Import GENCODE annotations GTF file as a Hail Table.

Parameters:
  gtf_path (str) – Path to GENCODE GTF file.

Return type: Table

Returns: Table with GENCODE annotation information.
diff --git a/api_reference/sample_qc/ancestry.html b/api_reference/sample_qc/ancestry.html

gnomad.sample_qc.ancestry

gnomad.sample_qc.ancestry.pc_project(mt, ...)

Project samples in mt on pre-computed PCs.

gnomad.sample_qc.ancestry.apply_onnx_classification_model(...)

Apply an ONNX classification model fit to a pandas dataframe data_pd.

gnomad.sample_qc.ancestry.apply_sklearn_classification_model(...)

Apply an sklearn classification model fit to a pandas dataframe data_pd.

gnomad.sample_qc.ancestry.convert_sklearn_rf_to_onnx(fit)

Convert a sklearn random forest model to ONNX.

gnomad.sample_qc.ancestry.assign_population_pcs(...)

Use a random forest model to assign population labels based on the results of PCA.

gnomad.sample_qc.ancestry.run_pca_with_relateds(qc_mt)

Run PCA excluding the given related or additional samples, and project those samples in the PC space to return scores for all samples.

gnomad.sample_qc.ancestry.pc_project(mt, loadings_ht, loading_location='loadings', af_location='pca_af')[source]

Project samples in mt on pre-computed PCs.

Parameters:
  • mt (MatrixTable) – MT containing the samples to project
  • loadings_ht (Table) – HT containing the PCA loadings and allele frequencies used for the PCA
  • loading_location (str) – Location of expression for loadings in loadings_ht
  • af_location (str) – Location of expression for allele frequency in loadings_ht

Return type: Table

Returns: Table with scores calculated from loadings in column scores
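A hedged sketch (the input MatrixTable and loadings Table are assumed to exist, with loadings under ‘loadings’ and allele frequencies under ‘pca_af’):

    from gnomad.sample_qc.ancestry import pc_project

    # `new_samples_mt` and `loadings_ht` are assumed inputs
    projected_ht = pc_project(new_samples_mt, loadings_ht)
    # projected_ht.scores holds the per-sample PC scores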
gnomad.sample_qc.ancestry.apply_onnx_classification_model(data_pd, fit)[source]

Apply an ONNX classification model fit to a pandas dataframe data_pd.

Parameters:
  • data_pd (DataFrame) – Pandas dataframe containing the data to be classified.
  • fit (ModelProto) – ONNX model to be applied.

Return type: Tuple[ndarray, DataFrame]

Returns: Tuple of classification and probabilities.

gnomad.sample_qc.ancestry.apply_sklearn_classification_model(data_pd, fit)[source]

Apply an sklearn classification model fit to a pandas dataframe data_pd.

Parameters:
  • data_pd (DataFrame) – Pandas dataframe containing the data to be classified.
  • fit (Any) – Sklearn model to be applied.

Return type: Tuple[ndarray, DataFrame]

Returns: Tuple of classification and probabilities.

gnomad.sample_qc.ancestry.convert_sklearn_rf_to_onnx(fit, target_opset=None)[source]

Convert a sklearn random forest model to ONNX.

Parameters:
  • fit (Any) – Sklearn random forest model to be converted.
  • target_opset (Optional[int]) – An optional target ONNX opset version to convert the model to.

Return type: ModelProto

Returns: ONNX model.
gnomad.sample_qc.ancestry.assign_population_pcs(pop_pca_scores, pc_cols, known_col='known_pop', fit=None, seed=42, prop_train=0.8, n_estimators=100, min_prob=0.9, output_col='pop', missing_label='oth', pc_expr='scores', convert_model_func=None, apply_model_func=<function apply_sklearn_classification_model>)[source]

Use a random forest model to assign population labels based on the results of PCA.

Default values for model and assignment parameters are those used in gnomAD.

As input, this function can either take:
  • A Hail Table (typically the output of hwe_normalized_pca). In this case:
      • pc_cols should be one of:
          • A list of integers where each element is one of the PCs to use.
          • A list of strings where each element is one of the PCs to use.
          • An ArrayExpression of Floats where each element is one of the PCs to use.
      • A Hail Table will be returned as output.
  • A Pandas DataFrame. In this case:
      • Each PC should be in a separate column and pc_cols is the list of all the columns containing the PCs to use.
      • A pandas DataFrame is returned as output.

Note

If you have a Pandas Dataframe and have all PCs as an array in a single column, the expand_pd_array_col function can be used to expand this column into multiple PC columns.

Parameters:
  • pop_pca_scores (Union[Table, DataFrame]) – Input Hail Table or Pandas Dataframe.
  • pc_cols (Union[ArrayExpression, List[int], List[str]]) – List of which PCs to use/columns storing the PCs to use. Values provided should be 1-based and should be a list of integers when passing in a Hail Table (i.e. [1, 2, 4, 5]) or a list of strings when passing in a Pandas Dataframe (i.e. [“PC1”, “PC2”, “PC4”, “PC5”]). When passing a HT this can also be an ArrayExpression containing all the PCs to use.
  • known_col (str) – Column storing the known population labels.
  • fit (Any) – Fit from a previously trained random forest model (i.e., the output from a previous RandomForestClassifier() call).
  • seed (int) – Random seed.
  • prop_train (float) – Proportion of known data used for training.
  • n_estimators (int) – Number of trees to use in the RF model.
  • min_prob (float) – Minimum probability of belonging to a given population for the population to be set (otherwise set to None).
  • output_col (str) – Output column storing the assigned population.
  • missing_label (str) – Label for samples for which the assignment probability is smaller than min_prob.
  • pc_expr (Union[ArrayExpression, str]) – Column storing the list of PCs. Only used if pc_cols is a List of integers. Default is scores.
  • convert_model_func (Optional[Callable[[Any], Any]]) – Optional function to convert the model to ONNX format. Default is no conversion.
  • apply_model_func (Callable[[DataFrame, Any], Any]) – Function to apply the model to the data. Default is apply_sklearn_classification_model, which will apply a sklearn classification model to the data. This default will work if no fit is set, or the supplied fit is a sklearn classification model.

Return type: Tuple[Union[Table, DataFrame], Any]

Returns: Hail Table or Pandas Dataframe (depending on input) containing sample IDs and imputed population labels, trained random forest model.
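A sketch of the Hail Table path (the input Table and its column names are assumptions; PCs are 1-based):

    from gnomad.sample_qc.ancestry import assign_population_pcs

    pop_ht, rf_model = assign_population_pcs(
        pop_pca_scores_ht,           # assumed Table with `scores` and `known_pop`
        pc_cols=list(range(1, 11)),  # use PCs 1-10
        known_col="known_pop",
        min_prob=0.9,
    )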
gnomad.sample_qc.ancestry.run_pca_with_relateds(qc_mt, related_samples_to_drop=None, additional_samples_to_drop=None, n_pcs=10, autosomes_only=True)[source]

Run PCA excluding the given related or additional samples, and project those samples in the PC space to return scores for all samples.

The related_samples_to_drop and additional_samples_to_drop Tables have to be keyed by the sample ID and all samples present in these tables will be excluded from the PCA.

The loadings Table returned also contains a pca_af annotation which is the allele frequency used for PCA. This is useful to project other samples in the PC space.

Parameters:
  • qc_mt (MatrixTable) – Input QC MT
  • related_samples_to_drop (Optional[Table]) – Optional table of related samples to drop when generating the PCs, these samples will be projected in the PC space
  • additional_samples_to_drop (Optional[Table]) – Optional table of additional samples to drop when generating the PCs, these samples will be projected in the PC space
  • n_pcs (int) – Number of PCs to compute
  • autosomes_only (bool) – Whether to run the analysis on autosomes only

Return type: Tuple[List[float], Table, Table]

Returns: eigenvalues, scores and loadings
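A minimal sketch (assumes a QC MatrixTable and a relatedness Table keyed by sample ID):

    from gnomad.sample_qc.ancestry import run_pca_with_relateds

    eigenvalues, scores_ht, loadings_ht = run_pca_with_relateds(
        qc_mt, related_samples_to_drop=related_ht, n_pcs=10
    )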
diff --git a/api_reference/sample_qc/filtering.html b/api_reference/sample_qc/filtering.html

gnomad.sample_qc.filtering

gnomad.sample_qc.filtering.compute_qc_metrics_residuals(ht, ...)

Compute QC metrics residuals after regressing out PCs (and optionally PC^2).

gnomad.sample_qc.filtering.compute_stratified_metrics_filter(ht, ...)

Compute median, MAD, and upper and lower thresholds for each metric used in outlier filtering.

gnomad.sample_qc.filtering.compute_stratified_sample_qc(...)

Run hl.sample_qc on different strata and then also merge the results into a single expression.

gnomad.sample_qc.filtering.merge_sample_qc_expr(...)

Create an expression that merges results from non-overlapping strata of hail.sample_qc.

gnomad.sample_qc.filtering.determine_nearest_neighbors(ht, ...)

Determine the nearest neighbors of each sample with information in scores_expr.

gnomad.sample_qc.filtering.compute_qc_metrics_residuals(ht, pc_scores, qc_metrics, use_pc_square=True, n_pcs=None, regression_sample_inclusion_expr=<BooleanExpression of type bool>, strata=None)[source]

Compute QC metrics residuals after regressing out PCs (and optionally PC^2).

Note

The regression_sample_inclusion_expr can be used to select a subset of the samples to include in the regression calculation. Residuals are always computed for all samples.

Parameters:
  • ht (Table) – Input sample QC metrics HT.
  • pc_scores (ArrayNumericExpression) – The expression in the input HT that stores the PC scores.
  • qc_metrics (Dict[str, NumericExpression]) – A dictionary with the name of each QC metric to compute residuals for and their corresponding expression in the input HT.
  • use_pc_square (bool) – Whether to use PC^2 in the regression or not.
  • n_pcs (Optional[int]) – Number of PCs to use. If not set, then all PCs in pc_scores are used.
  • regression_sample_inclusion_expr (BooleanExpression) – An optional expression to select samples to include in the regression calculation.
  • strata (Optional[Dict[str, Expression]]) – Optional dictionary used for stratification. Keys are strata names and values are filtering expressions. These expressions should refer to data with discrete types!

Return type: Table

Returns: Table with QC metrics residuals.
gnomad.sample_qc.filtering.compute_stratified_metrics_filter(ht, qc_metrics, strata=None, lower_threshold=4.0, upper_threshold=4.0, metric_threshold=None, filter_name='qc_metrics_filters', comparison_sample_expr=None)[source]

Compute median, MAD, and upper and lower thresholds for each metric used in outlier filtering.

Parameters:
  • ht (Table) – HT containing relevant sample QC metric annotations.
  • qc_metrics (Dict[str, NumericExpression]) – List of metrics (name and expr) for which to compute the critical values for filtering outliers.
  • strata (Optional[Dict[str, Expression]]) – Dictionary of annotations used for stratification. These metrics should be discrete types!
  • lower_threshold (float) – Lower MAD threshold.
  • upper_threshold (float) – Upper MAD threshold.
  • metric_threshold (Optional[Dict[str, Tuple[float, float]]]) – Can be used to specify different (lower, upper) thresholds for one or more metrics.
  • filter_name (str) – Name of resulting filters annotation.
  • comparison_sample_expr (Union[BooleanExpression, CollectionExpression, None]) – Optional BooleanExpression or CollectionExpression of sample IDs to use for computation of the metric median, MAD, and upper and lower thresholds to use for each sample. For instance, this works well with the output of determine_nearest_neighbors or a boolean expression defining releasable samples.

Return type: Table

Returns: Table grouped by strata, with upper and lower threshold values computed for each sample QC metric.
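A hedged sketch of stratified outlier flagging (the input Table, its sample_qc struct, and the platform annotation are assumptions):

    from gnomad.sample_qc.filtering import compute_stratified_metrics_filter

    filter_ht = compute_stratified_metrics_filter(
        sample_qc_ht,
        qc_metrics={
            "n_snp": sample_qc_ht.sample_qc.n_snp,
            "r_ti_tv": sample_qc_ht.sample_qc.r_ti_tv,
        },
        strata={"platform": sample_qc_ht.platform},  # 4 MADs by default
    )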
gnomad.sample_qc.filtering.compute_stratified_sample_qc(mtds, strata, tmp_ht_prefix, gt_col=None)[source]

Run hl.sample_qc on different strata and then also merge the results into a single expression.

Note

Strata should be non-overlapping, e.g. SNV vs indels or bi-allelic vs multi-allelic

Parameters:
  • mtds (Union[MatrixTable, VariantDataset]) – Input MatrixTable or VariantDataset
  • strata (Dict[str, BooleanExpression]) – Strata names and filtering expressions
  • tmp_ht_prefix (Optional[str]) – Optional path prefix to write the intermediate strata results to (recommended for larger datasets)
  • gt_col (Optional[str]) – Name of entry field storing the genotype. Default: ‘GT’

Return type: Table

Returns: Sample QC table, including strat-specific numbers
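For example, a sketch splitting bi-allelic from multi-allelic sites (the input MatrixTable and temporary-path prefix are hypothetical):

    import hail as hl
    from gnomad.sample_qc.filtering import compute_stratified_sample_qc

    sample_qc_ht = compute_stratified_sample_qc(
        mt,  # assumed input MatrixTable
        strata={
            "bi_allelic": hl.len(mt.alleles) == 2,
            "multi_allelic": hl.len(mt.alleles) > 2,
        },
        tmp_ht_prefix="gs://my-bucket/tmp/sample_qc",  # hypothetical prefix
    )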
gnomad.sample_qc.filtering.merge_sample_qc_expr(sample_qc_exprs)[source]

Create an expression that merges results from non-overlapping strata of hail.sample_qc.

E.g.:
  • Compute autosomes and sex chromosomes metrics separately, then merge results
  • Compute bi-allelic and multi-allelic metrics separately, then merge results

Note regarding the merging of dp_stats and gq_stats: Because n is needed to aggregate stdev, n_called is used for this purpose. This should work very well on a standard GATK VCF and it essentially assumes that:
  • samples that are called have DP and GQ fields
  • samples that are not called do not have DP and GQ fields

Even if these assumptions are broken for some genotypes, it shouldn’t matter too much.

Parameters:
  sample_qc_exprs (List[StructExpression]) – List of sample QC struct expressions for each stratification

Return type: StructExpression

Returns: Combined sample QC results
gnomad.sample_qc.filtering.determine_nearest_neighbors(ht, scores_expr, strata=None, n_pcs=None, n_neighbors=50, n_jobs=-1, add_neighbor_distances=False, distance_metric='euclidean', use_approximation=False, n_trees=10)[source]

Determine the nearest neighbors of each sample with information in scores_expr.

Note

If strata is provided, the nearest neighbors for each sample are limited to the other samples with the same strata values. If n_neighbors is greater than the number of samples in a stratification grouping, all samples within the stratification are returned and a warning is raised indicating that samples within that stratification group have fewer than the expected n_neighbors.

The following annotations are in the returned Table:
  • nearest_neighbors
  • nearest_neighbor_dists (if add_neighbor_distances is True)

Parameters:
  • ht (Table) – Input Table.
  • scores_expr (ArrayNumericExpression) – Expression in the input HT that stores the PC scores.
  • strata (Optional[Dict[str, Expression]]) – Optional dictionary used for stratification. Keys are strata names and values are filtering expressions. These expressions should refer to data with discrete types!
  • n_pcs (Optional[int]) – Number of PCs to use. If not set, then all PCs in scores_expr are used.
  • n_neighbors (int) – Number of nearest neighbors to identify for each sample. Default is 50.
  • n_jobs (int) – Number of threads to use when finding the nearest neighbors. Default is -1, which uses the number of CPUs on the head node minus 1.
  • add_neighbor_distances (bool) – Whether to return an annotation for the nearest neighbor distances.
  • distance_metric (str) – Distance metric to use. Default is euclidean. Options using scikit-learn are: “euclidean”, “cityblock”, “cosine”, “haversine”, “l1”, “l2”, and “manhattan”. Options using Annoy: “angular”, “euclidean”, “manhattan”, “hamming”, and “dot”.
  • use_approximation (bool) – Whether to use the package Annoy to determine approximate nearest neighbors instead of using scikit-learn’s NearestNeighbors. This method is faster, but only needed for very large datasets, for instance > 500,000 samples.
  • n_trees (int) – Number of trees to use in the annoy approximation approach. n_trees is provided during build time and affects the build time and the index size. A larger value will give more accurate results, but larger indexes. Default is 10.

Return type: Table

Returns: Table with an annotation for the nearest neighbors and optionally their distances.
diff --git a/api_reference/sample_qc/index.html b/api_reference/sample_qc/index.html

diff --git a/api_reference/sample_qc/pipeline.html b/api_reference/sample_qc/pipeline.html

gnomad.sample_qc.pipeline

gnomad.sample_qc.pipeline.filter_rows_for_qc(mt)

Annotate rows with sites_callrate, site_inbreeding_coeff and af, then apply thresholds.

gnomad.sample_qc.pipeline.get_qc_mt(mt[, ...])

Create a QC-ready MT.

gnomad.sample_qc.pipeline.infer_sex_karyotype(...)

Create a Table with X_karyotype, Y_karyotype, and sex_karyotype.

gnomad.sample_qc.pipeline.annotate_sex(mtds)

Impute sample sex based on X-chromosome heterozygosity and sex chromosome ploidy.

gnomad.sample_qc.pipeline.filter_rows_for_qc(mt, min_af=0.001, min_callrate=0.99, min_inbreeding_coeff_threshold=-0.8, min_hardy_weinberg_threshold=1e-08, apply_hard_filters=True, bi_allelic_only=True, snv_only=True)[source]

Annotate rows with sites_callrate, site_inbreeding_coeff and af, then apply thresholds.

AF and callrate thresholds are taken from gnomAD QC; inbreeding coeff, MQ, FS and QD filters are taken from GATK best practices.

Note

This function expects the typical info annotation of type struct with fields MQ, FS and QD if applying hard filters.

Parameters:
  • mt (MatrixTable) – Input MT
  • min_af (Optional[float]) – Minimum site AF to keep. Not applied if set to None.
  • min_callrate (Optional[float]) – Minimum site call rate to keep. Not applied if set to None.
  • min_inbreeding_coeff_threshold (Optional[float]) – Minimum site inbreeding coefficient to keep. Not applied if set to None.
  • min_hardy_weinberg_threshold (Optional[float]) – Minimum site HW test p-value to keep. Not applied if set to None.
  • apply_hard_filters (bool) – Whether to apply standard GATK default site hard filters: QD >= 2, FS <= 60 and MQ >= 30.
  • bi_allelic_only (bool) – Whether to only keep bi-allelic sites or include multi-allelic sites too.
  • snv_only (bool) – Whether to only keep SNVs or include other variant types.

Return type: MatrixTable

Returns: annotated and filtered table
gnomad.sample_qc.pipeline.get_qc_mt(mt, bi_allelic_only=True, snv_only=True, adj_only=True, min_af=0.001, min_callrate=0.99, min_inbreeding_coeff_threshold=-0.8, min_hardy_weinberg_threshold=1e-08, apply_hard_filters=True, ld_r2=0.1, filter_lcr=True, filter_decoy=True, filter_segdup=True, filter_exome_low_coverage_regions=False, high_conf_regions=None, checkpoint_path=None, n_partitions=None, block_size=None)[source]

Create a QC-ready MT.

Has options to filter to the following:
  • Variants outside known problematic regions
  • Bi-allelic sites only
  • SNVs only
  • Variants passing hard thresholds
  • Variants passing the set call rate and MAF thresholds
  • Genotypes passing gnomAD ADJ criteria (GQ>=20, DP>=10, AB>0.2 for hets)

In addition, the MT will be LD-pruned if ld_r2 is set.

Parameters:
  • mt (MatrixTable) – Input MT.
  • bi_allelic_only (bool) – Whether to only keep bi-allelic sites or include multi-allelic sites too.
  • snv_only (bool) – Whether to only keep SNVs or include other variant types.
  • adj_only (bool) – If set, only ADJ genotypes are kept. This filter is applied before the call rate and AF calculation.
  • min_af (Optional[float]) – Minimum allele frequency to keep. Not applied if set to None.
  • min_callrate (Optional[float]) – Minimum call rate to keep. Not applied if set to None.
  • min_inbreeding_coeff_threshold (Optional[float]) – Minimum site inbreeding coefficient to keep. Not applied if set to None.
  • min_hardy_weinberg_threshold (Optional[float]) – Minimum site HW test p-value to keep. Not applied if set to None.
  • apply_hard_filters (bool) – Whether to apply standard GATK default site hard filters: QD >= 2, FS <= 60 and MQ >= 30.
  • ld_r2 (Optional[float]) – Minimum r2 to keep when LD-pruning (set to None for no LD pruning).
  • filter_lcr (bool) – Filter LCR regions.
  • filter_decoy (bool) – Filter decoy regions.
  • filter_segdup (bool) – Filter segmental duplication regions.
  • filter_exome_low_coverage_regions (bool) – If set, only high coverage exome regions (computed from gnomAD) are kept.
  • high_conf_regions (Optional[List[str]]) – If given, the data will be filtered to only include variants in those regions.
  • checkpoint_path (Optional[str]) – If given, the QC MT will be checkpointed to the specified path before running LD pruning. If not specified, persist will be used instead.
  • n_partitions (Optional[int]) – If given, the QC MT will be repartitioned to the specified number of partitions before running LD pruning. checkpoint_path must also be specified as the MT will first be written to the checkpoint_path before being reread with the new number of partitions.
  • block_size (Optional[int]) – If given, set the block size to this value when LD pruning.

Return type: MatrixTable

Returns: Filtered MT.
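A minimal sketch with the gnomAD defaults spelled out (the input MatrixTable is assumed to carry GT and an info struct with MQ, FS and QD):

    from gnomad.sample_qc.pipeline import get_qc_mt

    qc_mt = get_qc_mt(
        mt,  # assumed input MatrixTable
        min_af=0.001,
        min_callrate=0.99,
        ld_r2=0.1,  # set to None to skip LD pruning
    )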
gnomad.sample_qc.pipeline.infer_sex_karyotype(ploidy_ht, f_stat_cutoff=0.5, use_gaussian_mixture_model=False, normal_ploidy_cutoff=5, aneuploidy_cutoff=6, chr_x_frac_hom_alt_expr=None, normal_chr_x_hom_alt_cutoff=5)[source]

Create a Table with X_karyotype, Y_karyotype, and sex_karyotype.

This function uses get_ploidy_cutoffs to determine X and Y ploidy cutoffs and then get_sex_expr to get karyotype annotations from those cutoffs.

By default f_stat_cutoff will be used to roughly split samples into ‘XX’ and ‘XY’ for use in get_ploidy_cutoffs. If use_gaussian_mixture_model is True a gaussian mixture model will be used to split samples into ‘XX’ and ‘XY’ instead of f-stat.

Parameters:
  • ploidy_ht (Table) – Input Table with chromosome X and chromosome Y ploidy values and optionally f-stat.
  • f_stat_cutoff (float) – f-stat to roughly divide ‘XX’ from ‘XY’ samples. Assumes XX samples are below cutoff and XY are above cutoff. Default is 0.5.
  • use_gaussian_mixture_model (bool) – Use gaussian mixture model to split samples into ‘XX’ and ‘XY’ instead of f-stat.
  • normal_ploidy_cutoff (int) – Number of standard deviations to use when determining sex chromosome ploidy cutoffs for XX, XY karyotypes.
  • aneuploidy_cutoff (int) – Number of standard deviations to use when determining sex chromosome ploidy cutoffs for aneuploidies.
  • chr_x_frac_hom_alt_expr (Optional[NumericExpression]) – Fraction of homozygous alternate genotypes (hom-alt/(hom-alt + het)) on chromosome X.
  • normal_chr_x_hom_alt_cutoff (int) – Number of standard deviations to use when determining cutoffs for the fraction of homozygous alternate genotypes (hom-alt/(hom-alt + het)) on chromosome X for XX and XY karyotypes. Only used if chr_x_frac_hom_alt_expr is supplied.

Return type: Table

Returns: Table of samples with imputed sex karyotype.
+gnomad.sample_qc.pipeline.annotate_sex(mtds, is_sparse=True, excluded_intervals=None, included_intervals=None, normalization_contig='chr20', sites_ht=None, aaf_expr=None, gt_expr='GT', f_stat_cutoff=0.5, aaf_threshold=0.001, variants_only_x_ploidy=False, variants_only_y_ploidy=False, variants_filter_lcr=True, variants_filter_segdup=True, variants_filter_decoy=False, variants_snv_only=False, coverage_mt=None, compute_x_frac_variants_hom_alt=False, compute_fstat=True, infer_karyotype=True, use_gaussian_mixture_model=False)[source]

Impute sample sex based on X-chromosome heterozygosity and sex chromosome ploidy.

Return Table with the following fields:

  • s (str): Sample

  • {normalization_contig}_mean_dp (float32): Sample’s mean coverage over the specified normalization_contig.

  • chrX_mean_dp (float32): Sample’s mean coverage over chromosome X.

  • chrY_mean_dp (float32): Sample’s mean coverage over chromosome Y.

  • chrX_ploidy (float32): Sample’s imputed ploidy over chromosome X.

  • chrY_ploidy (float32): Sample’s imputed ploidy over chromosome Y.

If compute_fstat:

  • f_stat (float64): Sample f-stat. Calculated using hl.impute_sex.

  • n_called (int64): Number of variants with a genotype call. Calculated using hl.impute_sex.

  • expected_homs (float64): Expected number of homozygotes. Calculated using hl.impute_sex.

  • observed_homs (int64): Observed number of homozygotes. Calculated using hl.impute_sex.

If infer_karyotype:

  • X_karyotype (str): Sample’s chromosome X karyotype.

  • Y_karyotype (str): Sample’s chromosome Y karyotype.

  • sex_karyotype (str): Sample’s sex karyotype.

Note

In order to infer sex karyotype (infer_karyotype=True), one of compute_fstat or use_gaussian_mixture_model must be set to True.

Parameters:

  • mtds (Union[MatrixTable, VariantDataset]) – Input MatrixTable or VariantDataset.

  • is_sparse (bool) – Whether input MatrixTable is in sparse data format. Default is True.

  • excluded_intervals (Optional[Table]) – Optional table of intervals to exclude from the computation. This option is currently not implemented for imputing sex chromosome ploidy on a VDS.

  • included_intervals (Optional[Table]) – Optional table of intervals to use in the computation. REQUIRED for exomes.

  • normalization_contig (str) – Which chromosome to use to normalize sex chromosome coverage. Used in determining sex chromosome ploidies. Default is “chr20”.

  • sites_ht (Optional[Table]) – Optional Table of sites and alternate allele frequencies for filtering the input MatrixTable prior to imputing sex.

  • aaf_expr (Optional[str]) – Optional. Name of field in input MatrixTable with alternate allele frequency.

  • gt_expr (str) – Name of entry field storing the genotype. Default is ‘GT’.

  • f_stat_cutoff (float) – f-stat to roughly divide ‘XX’ from ‘XY’ samples. Assumes XX samples are below cutoff and XY samples are above cutoff. Default is 0.5.

  • aaf_threshold (float) – Minimum alternate allele frequency to be used in f-stat calculations. Default is 0.001.

  • variants_only_x_ploidy (bool) – Whether to use depth of only variant data for the x ploidy estimation.

  • variants_only_y_ploidy (bool) – Whether to use depth of only variant data for the y ploidy estimation.

  • variants_filter_lcr (bool) – Whether to filter out variants in LCR regions for variants-only ploidy estimation and fraction of homozygous alternate variants on chromosome X. Default is True.

  • variants_filter_segdup (bool) – Whether to filter out variants in segdup regions for variants-only ploidy estimation and fraction of homozygous alternate variants on chromosome X. Default is True.

  • variants_filter_decoy (bool) – Whether to filter out variants in decoy regions for variants-only ploidy estimation and fraction of homozygous alternate variants on chromosome X. Default is False. Note: this option doesn’t exist for GRCh38.

  • variants_snv_only (bool) – Whether to filter to only single nucleotide variants for variants-only ploidy estimation and fraction of homozygous alternate variants on chromosome X. Default is False.

  • coverage_mt (Optional[MatrixTable]) – Optional precomputed coverage MatrixTable to use in reference-based VDS ploidy estimation.

  • compute_x_frac_variants_hom_alt (bool) – Whether to return an annotation for the fraction of homozygous alternate variants on chromosome X. Default is False.

  • compute_fstat (bool) – Whether to compute f-stat. Default is True.

  • infer_karyotype (bool) – Whether to infer sex karyotypes. Default is True.

  • use_gaussian_mixture_model (bool) – Whether to use gaussian mixture model to split samples into ‘XX’ and ‘XY’ instead of f-stat. Default is False.

Return type:

Table

Returns:

Table of samples and their imputed sex karyotypes.
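A minimal usage sketch (paths are hypothetical; assumes an exome VariantDataset, so included_intervals is supplied):

    import hail as hl

    from gnomad.sample_qc.pipeline import annotate_sex

    # Hypothetical inputs: an exome VDS and its capture intervals.
    vds = hl.vds.read_vds("gs://my-bucket/exomes.vds")
    capture_intervals_ht = hl.read_table("gs://my-bucket/capture_intervals.ht")

    sex_ht = annotate_sex(
        vds,
        included_intervals=capture_intervals_ht,  # REQUIRED for exomes
        normalization_contig="chr20",
        compute_fstat=True,
        infer_karyotype=True,
    )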
\ No newline at end of file
diff --git a/api_reference/sample_qc/platform.html b/api_reference/sample_qc/platform.html
new file mode 100644
index 000000000..45a6cb6ea
--- /dev/null
+++ b/api_reference/sample_qc/platform.html
@@ -0,0 +1,221 @@

gnomad.sample_qc.platform


gnomad.sample_qc.platform.compute_callrate_mt(mt, ...)

Compute a sample/interval MT with each entry containing the call rate for that sample/interval.

gnomad.sample_qc.platform.run_platform_pca(...)

Run PCA on a sample/interval MT with each entry containing the call rate.

gnomad.sample_qc.platform.assign_platform_from_pcs(...)

Assign platforms using HDBSCAN on the results of call rate PCA.

+gnomad.sample_qc.platform.compute_callrate_mt(mt, intervals_ht, bi_allelic_only=True, autosomes_only=True, match=True)[source]

Compute a sample/interval MT with each entry containing the call rate for that sample/interval.

This can be used as input for imputing exome sequencing platforms.

Note

The input interval HT should have a key of type Interval. The resulting table will have a key of the same type as the intervals_ht table and contain an interval_info field containing all non-key fields of the intervals_ht.

Parameters:

  • mt (MatrixTable) – Input MT

  • intervals_ht (Table) – Table containing the intervals. This table has to be keyed by a field of type Interval.

  • bi_allelic_only (bool) – If set, only bi-allelic sites are used for the computation

  • autosomes_only (bool) – If set, only autosomal intervals are used.

  • match (bool) – If set, returns all intervals in intervals_ht that overlap the locus in the input MT.

Return type:

MatrixTable

Returns:

Callrate MT
+gnomad.sample_qc.platform.run_platform_pca(callrate_mt, binarization_threshold=0.25, n_pcs=10)[source]

Run PCA on a sample/interval MT with each entry containing the call rate.

When binarization_threshold is set, the callrate is transformed to a 0/1 value based on the threshold. E.g. with the default threshold of 0.25, all entries with a callrate < 0.25 are considered as 0s, others as 1s.

Parameters:

  • callrate_mt (MatrixTable) – Input callrate MT

  • binarization_threshold (Optional[float]) – Binarization threshold. None if no threshold is desired

  • n_pcs (int) – Number of PCs to compute

Return type:

Tuple[List[float], Table, Table]

Returns:

eigenvalues, scores_ht, loadings_ht
+gnomad.sample_qc.platform.assign_platform_from_pcs(platform_pca_scores_ht, pc_scores_ann='scores', hdbscan_min_cluster_size=None, hdbscan_min_samples=None)[source]

Assign platforms using HDBSCAN on the results of call rate PCA.

Parameters:

  • platform_pca_scores_ht (Table) – Input table with the PCA score for each sample

  • pc_scores_ann (str) – Field containing the scores

  • hdbscan_min_cluster_size (Optional[int]) – HDBSCAN min_cluster_size parameter. If not specified the smallest of 500 and 0.1*n_samples will be used.

  • hdbscan_min_samples (Optional[int]) – HDBSCAN min_samples parameter

Return type:

Table

Returns:

A Table with a qc_platform annotation containing the platform based on HDBSCAN clustering
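Taken together, the three functions above form a small platform-imputation pipeline; a sketch under hypothetical paths and an assumed hdbscan_min_cluster_size:

    import hail as hl

    from gnomad.sample_qc.platform import (
        assign_platform_from_pcs,
        compute_callrate_mt,
        run_platform_pca,
    )

    # Hypothetical inputs: an exome MT and an interval-keyed capture-interval Table.
    mt = hl.read_matrix_table("gs://my-bucket/exomes.mt")
    intervals_ht = hl.read_table("gs://my-bucket/capture_intervals.ht")

    callrate_mt = compute_callrate_mt(mt, intervals_ht)
    eigenvalues, scores_ht, loadings_ht = run_platform_pca(callrate_mt)
    platform_ht = assign_platform_from_pcs(scores_ht, hdbscan_min_cluster_size=100)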

\ No newline at end of file
diff --git a/api_reference/sample_qc/relatedness.html b/api_reference/sample_qc/relatedness.html
new file mode 100644
index 000000000..9960f2dbe
--- /dev/null
+++ b/api_reference/sample_qc/relatedness.html
@@ -0,0 +1,625 @@

gnomad.sample_qc.relatedness


gnomad.sample_qc.relatedness.UNRELATED

String representation for a pair of unrelated individuals in this module.

gnomad.sample_qc.relatedness.SECOND_DEGREE_RELATIVES

String representation for a pair of 2nd degree relatives in this module.

gnomad.sample_qc.relatedness.PARENT_CHILD

String representation for a parent-child pair in this module.

gnomad.sample_qc.relatedness.SIBLINGS

String representation for a sibling pair in this module.

gnomad.sample_qc.relatedness.DUPLICATE_OR_TWINS

String representation for a pair of samples who are identical (either MZ twins or duplicates) in this module.

gnomad.sample_qc.relatedness.AMBIGUOUS_RELATIONSHIP

String representation for a pair of samples whose relationship is ambiguous.

gnomad.sample_qc.relatedness.get_duplicated_samples(...)

Extract the list of duplicate samples using a Table output from pc_relate.

gnomad.sample_qc.relatedness.get_duplicated_samples_ht(...)

Create a HT with duplicated samples sets.

gnomad.sample_qc.relatedness.explode_duplicate_samples_ht(dups_ht)

Explode the result of get_duplicated_samples_ht, so that each line contains a single sample.

gnomad.sample_qc.relatedness.get_relationship_expr(...)

Return an expression indicating the relationship between a pair of samples given their kin coefficient and IBD0, IBD1, IBD2 values.

gnomad.sample_qc.relatedness.get_slope_int_relationship_expr(...)

Return an expression indicating the relationship between a pair of samples given slope and intercept cutoffs.

gnomad.sample_qc.relatedness.infer_families(...)

Generate a pedigree containing trios inferred from the relationship_ht.

gnomad.sample_qc.relatedness.create_fake_pedigree(n, ...)

Generate a pedigree made of trios created by sampling 3 random samples in the sample list.

gnomad.sample_qc.relatedness.compute_related_samples_to_drop(...)

Compute a Table with the list of samples to drop (and their global rank) to get the maximal independent set of unrelated samples.

gnomad.sample_qc.relatedness.filter_mt_to_trios(mt, ...)

Filter a MatrixTable to a set of trios in fam_ht and annotate with adj.

gnomad.sample_qc.relatedness.generate_trio_stats_expr(trio_mt)

Generate a row-wise expression containing trio transmission stats.

gnomad.sample_qc.relatedness.generate_sib_stats_expr(mt, ...)

Generate a row-wise expression containing the number of alternate alleles in common between sibling pairs.

+
+
+gnomad.sample_qc.relatedness.UNRELATED = 'unrelated'
+

String representation for a pair of unrelated individuals in this module. Typically >2nd degree relatives, but the threshold is user-dependent.

+
+ +
+
+gnomad.sample_qc.relatedness.SECOND_DEGREE_RELATIVES = 'second degree relatives'
+

String representation for a pair of 2nd degree relatives in this module.

+
+ +
+
+gnomad.sample_qc.relatedness.PARENT_CHILD = 'parent-child'
+

String representation for a parent-child pair in this module.

+
+ +
+
+gnomad.sample_qc.relatedness.SIBLINGS = 'siblings'
+

String representation for a sibling pair in this module.

+
+ +
+
+gnomad.sample_qc.relatedness.DUPLICATE_OR_TWINS = 'duplicate/twins'
+

String representation for a pair of samples who are identical (either MZ twins or duplicates) in this module.

+
+ +
+
+gnomad.sample_qc.relatedness.AMBIGUOUS_RELATIONSHIP = 'ambiguous'
+

String representation for a pair of samples whose relationship is ambiguous. This is used in the case of a pair of samples whose kinship/IBD values do not correspond to any biological relationship between two individuals.

+
+ +
+
+gnomad.sample_qc.relatedness.get_duplicated_samples(relationship_ht, i_col='i', j_col='j', rel_col='relationship')[source]
+

Extract the list of duplicate samples using a Table output from pc_relate.

+
+
Parameters:
+
    +
  • relationship_ht (Table) – Table with relationships between pairs of samples

  • +
  • i_col (str) – Column containing the 1st sample

  • +
  • j_col (str) – Column containing the 2nd sample

  • +
  • rel_col (str) – Column containing the sample pair relationship annotated with get_relationship_expr

  • +
+
+
Return type:
+

List[Set[str]]

+
+
Returns:
+

List of sets of samples that are duplicates

+
+
+
+ +
+
+gnomad.sample_qc.relatedness.get_duplicated_samples_ht(duplicated_samples, samples_rankings_ht, rank_ann='rank')[source]
+

Create a HT with duplicated samples sets.

+

Each row is indexed by the sample that is kept and also contains the set of duplicate samples that should be filtered.

+

samples_rankings_ht is a HT containing a global rank for each of the samples (smaller is better).

+
+
Parameters:
+
    +
  • duplicated_samples (List[Set[str]]) – List of sets of duplicated samples

  • +
  • samples_rankings_ht (Table) – HT with global rank for each sample

  • +
  • rank_ann (str) – Annotation in samples_rankings_ht containing each sample’s global rank (smaller is better).

  • +
+
+
Returns:
+

HT with duplicate sample sets, including which to keep/filter

+
+
+
+ +
+
+gnomad.sample_qc.relatedness.explode_duplicate_samples_ht(dups_ht)[source]
+

Explode the result of get_duplicated_samples_ht, so that each line contains a single sample.

+

An additional annotation is added: dup_filtered, indicating which of the duplicated samples was kept. Requires a field filtered whose type should be the same as the input duplicated samples Table key.

+
+
Parameters:
+

dups_ht (Table) – Input HT

+
+
Return type:
+

Table

+
+
Returns:
+

Flattened HT

+
+
+
+ +
+
+gnomad.sample_qc.relatedness.get_relationship_expr(kin_expr, ibd0_expr, ibd1_expr, ibd2_expr, first_degree_kin_thresholds=(0.19, 0.4), second_degree_min_kin=0.1, ibd0_0_max=0.025, ibd0_25_thresholds=(0.1, 0.425), ibd1_0_thresholds=(-0.15, 0.1), ibd1_50_thresholds=(0.275, 0.75), ibd1_100_min=0.75, ibd2_0_max=0.125, ibd2_25_thresholds=(0.1, 0.5), ibd2_100_thresholds=(0.75, 1.25))[source]
+

Return an expression indicating the relationship between a pair of samples given their kin coefficient and IBD0, IBD1, IBD2 values.

+

The kinship coefficient values in the defaults are in line with those output from hail.methods.pc_relate (https://hail.is/docs/0.2/methods/genetics.html?highlight=pc_relate#hail.methods.pc_relate).

+
+
Parameters:
+
    +
  • kin_expr (NumericExpression) – Kin coefficient expression

  • +
  • ibd0_expr (NumericExpression) – IBD0 expression

  • +
  • ibd1_expr (NumericExpression) – IBD1 expression

  • +
  • ibd2_expr (NumericExpression) – IBD2 expression

  • +
  • first_degree_kin_thresholds (Tuple[float, float]) – (min, max) kinship threshold for 1st degree relatives

  • +
  • second_degree_min_kin (float) – min kinship threshold for 2nd degree relatives

  • +
  • ibd0_0_max (float) – max IBD0 threshold for 0 IBD0 sharing

  • +
  • ibd0_25_thresholds (Tuple[float, float]) – (min, max) thresholds for 0.25 IBD0 sharing

  • +
  • ibd1_0_thresholds (Tuple[float, float]) – (min, max) thresholds for 0 IBD1 sharing. Note that the min is there because pc_relate can output large negative values in some corner cases.

  • +
  • ibd1_50_thresholds (Tuple[float, float]) – (min, max) thresholds for 0.5 IBD1 sharing

  • +
  • ibd1_100_min (float) – min IBD1 threshold for 1.0 IBD1 sharing

  • +
  • ibd2_0_max (float) – max IBD2 threshold for 0 IBD2 sharing

  • +
  • ibd2_25_thresholds (Tuple[float, float]) – (min, max) thresholds for 0.25 IBD2 sharing

  • +
  • ibd2_100_thresholds (Tuple[float, float]) – (min, max) thresholds for 1.00 IBD2 sharing. Note that the min is there because pc_relate can output much larger IBD2 values in some corner cases.

  • +
+
+
Return type:
+

StringExpression

+
+
Returns:
+

The relationship annotation using the constants defined in this module.
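A minimal annotation sketch (the path is hypothetical; assumes a pc_relate output Table with kin, ibd0, ibd1, and ibd2 fields):

    import hail as hl

    from gnomad.sample_qc.relatedness import get_relationship_expr

    # Hypothetical pc_relate output keyed by the sample pair (i, j).
    relatedness_ht = hl.read_table("gs://my-bucket/pc_relate.ht")

    relatedness_ht = relatedness_ht.annotate(
        relationship=get_relationship_expr(
            kin_expr=relatedness_ht.kin,
            ibd0_expr=relatedness_ht.ibd0,
            ibd1_expr=relatedness_ht.ibd1,
            ibd2_expr=relatedness_ht.ibd2,
        )
    )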

+
+
+
+ +
+
+gnomad.sample_qc.relatedness.get_slope_int_relationship_expr(kin_expr, y_expr, parent_child_max_y, second_degree_sibling_lower_cutoff_slope, second_degree_sibling_lower_cutoff_intercept, second_degree_upper_sibling_lower_cutoff_slope, second_degree_upper_sibling_lower_cutoff_intercept, duplicate_twin_min_kin=0.42, second_degree_min_kin=0.1, duplicate_twin_ibd1_min=-0.15, duplicate_twin_ibd1_max=0.1, ibd1_expr=None)[source]
+

Return an expression indicating the relationship between a pair of samples given slope and intercept cutoffs.

+

The kinship coefficient (kin_expr) and an additional metric (y_expr) are used to define the relationship between a pair of samples. For this function the slope and intercepts should refer to cutoff lines where the x-axis, or independent variable, is the kinship coefficient and the y-axis, or dependent variable, is the metric defined by y_expr. Typically, the y-axis metric is IBS0, IBS0/IBS2, or IBD0.

+
+

Note

+

No defaults are provided for the slope and intercept cutoffs because they are highly dependent on the dataset and the metric used in y_expr.

+
+
+
The relationship expression is determined as follows:

  • If kin_expr < second_degree_min_kin -> UNRELATED

  • If kin_expr > duplicate_twin_min_kin:

    • If y_expr < parent_child_max_y:

      • If ibd1_expr is defined:

        • If duplicate_twin_ibd1_min <= ibd1_expr <= duplicate_twin_ibd1_max -> DUPLICATE_OR_TWINS

        • Else -> AMBIGUOUS_RELATIONSHIP

      • Else -> DUPLICATE_OR_TWINS

  • If y_expr < parent_child_max_y -> PARENT_CHILD

  • If pair is over the second_degree_sibling_lower_cutoff line:

    • If pair is over the second_degree_upper_sibling_lower_cutoff line -> SIBLINGS

    • Else -> SECOND_DEGREE_RELATIVES

  • If none of the above conditions are met -> AMBIGUOUS_RELATIONSHIP
+
+
+
Parameters:
+
    +
  • kin_expr (NumericExpression) – Kin coefficient expression. Used as the x-axis, or independent variable, for the slope and intercept cutoffs.

  • y_expr (NumericExpression) – Expression for the metric to use as the y-axis, or dependent variable, for the slope and intercept cutoffs. This is typically an expression for IBS0, IBS0/IBS2, or IBD0.

  • parent_child_max_y (float) – Maximum value of the metric defined by y_expr for a parent-child pair.

  • second_degree_sibling_lower_cutoff_slope (float) – Slope of the line to use as a lower cutoff for second degree relatives and siblings from parent-child pairs.

  • second_degree_sibling_lower_cutoff_intercept (float) – Intercept of the line to use as a lower cutoff for second degree relatives and siblings from parent-child pairs.

  • second_degree_upper_sibling_lower_cutoff_slope (float) – Slope of the line to use as an upper cutoff for second degree relatives and a lower cutoff for siblings.

  • second_degree_upper_sibling_lower_cutoff_intercept (float) – Intercept of the line to use as an upper cutoff for second degree relatives and a lower cutoff for siblings.

  • duplicate_twin_min_kin (float) – Minimum kinship for duplicate or twin pairs. Default is 0.42.

  • second_degree_min_kin (float) – Minimum kinship threshold for 2nd degree relatives. Default is 0.1. Bycroft et al. (2018) calculates a theoretical kinship of 0.08838835 for a second degree relationship cutoff, but this cutoff should be determined by evaluation of the kinship distribution.

  • ibd1_expr (Optional[NumericExpression]) – Optional IBD1 expression. If this expression is provided, duplicate_twin_ibd1_min and duplicate_twin_ibd1_max will be used as an additional cutoff for duplicate or twin pairs.

  • duplicate_twin_ibd1_min (float) – Minimum IBD1 cutoff for duplicate or twin pairs. Note: the min is because pc_relate can output large negative values in some corner cases.

  • duplicate_twin_ibd1_max (float) – Maximum IBD1 cutoff for duplicate or twin pairs.

  • +
+
+
Returns:
+

The relationship annotation using the constants defined in this module.

+
+
+
+ +
+
+gnomad.sample_qc.relatedness.infer_families(relationship_ht, sex, duplicate_samples_ht, i_col='i', j_col='j', relationship_col='relationship')[source]
+

Generate a pedigree containing trios inferred from the relationship_ht.

+

This function takes a hail Table with a row for each pair of related individuals i, j in the data (it’s OK to have unrelated samples too).

The relationship_col should be a column specifying the relationship between each two samples as defined in this module’s constants.

This function returns a pedigree containing trios inferred from the data. Family ID can be the same for multiple trios if one or more members of the trios are related (e.g. sibs, multi-generational family). Trios are ordered by family ID.

+
+

Note

+

This function only returns complete trios defined as: one child, one father and one mother (sex is required for both parents).

+
+
+
Parameters:
+
    +
  • relationship_ht (Table) – Input relationship table

  • +
  • sex (Union[Table, Dict[str, bool]]) – A Table or dict giving the sex for each sample (TRUE=female, FALSE=male). If a Table is given, it should have a field is_female.

  • +
  • duplicate_samples_ht (Table) – All duplicated samples TO REMOVE (if not provided, this function won’t work as it assumes that each child has exactly two parents).

  • i_col (str) – Column containing the 1st sample of the pair in the relationship table

  • j_col (str) – Column containing the 2nd sample of the pair in the relationship table

  • relationship_col (str) – Column containing the relationship for the sample pair as defined in this module constants.
  • +
+
+
Return type:
+

Pedigree

+
+
Returns:
+

Pedigree of complete trios
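A sketch of the duplicate handling plus family inference flow (all paths are hypothetical; relationship_ht is assumed to carry the relationship annotation from get_relationship_expr):

    import hail as hl

    from gnomad.sample_qc.relatedness import (
        get_duplicated_samples,
        get_duplicated_samples_ht,
        infer_families,
    )

    relationship_ht = hl.read_table("gs://my-bucket/relationship.ht")
    rank_ht = hl.read_table("gs://my-bucket/sample_rank.ht")  # per-sample global rank
    sex_ht = hl.read_table("gs://my-bucket/sex.ht")  # has an is_female field

    dups = get_duplicated_samples(relationship_ht)
    dups_ht = get_duplicated_samples_ht(dups, rank_ht)
    ped = infer_families(relationship_ht, sex_ht, dups_ht)
    ped.write("gs://my-bucket/inferred_trios.fam")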

+
+
+
+ +
+
+gnomad.sample_qc.relatedness.create_fake_pedigree(n, sample_list, exclude_real_probands=False, max_tries=10, real_pedigree=None, sample_list_stratification=None)[source]
+

Generate a pedigree made of trios created by sampling 3 random samples in the sample list.

+
    +
  • If real_pedigree is given, the resulting fake trios will not include any proband/parents combination that exists in the real trios.

  • +
  • Each sample can be used only once as a proband in the resulting trios.

  • +
  • Sex of probands in fake trios is random.

  • +
+
+
Parameters:
+
    +
  • n (int) – Number of fake trios desired in the pedigree.

  • +
  • sample_list (List[str]) – List of samples.

  • +
  • exclude_real_probands (bool) – If set, then fake trios probands cannot be in the +real trios probands.

  • +
  • max_tries (int) – Maximum number of sampling to try before bailing out (preventing +infinite loop if n is too large w.r.t. the number of samples).

  • +
  • real_pedigree (Optional[Pedigree]) – Optional pedigree to exclude children from.

  • +
  • sample_list_stratification (Optional[Dict[str, str]]) – Optional dictionary with samples as keys and +a value that should be used to stratify samples in sample_list into groups +that the trio should be picked from. This ensures that each fake trio will +contain samples from only the same stratification. For example, if all samples +within a fake trio should be chosen from the same platform, this can be a +dictionary of sample: platform.

  • +
+
+
Return type:
+

Pedigree

+
+
Returns:
+

Fake pedigree.
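A minimal sketch (sample IDs are hypothetical; ped is assumed to be a real Pedigree, e.g. the output of infer_families above):

    from gnomad.sample_qc.relatedness import create_fake_pedigree

    samples = [f"S{i}" for i in range(1000)]  # hypothetical sample IDs
    fake_ped = create_fake_pedigree(n=10, sample_list=samples, real_pedigree=ped)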

+
+
+
+ +
+gnomad.sample_qc.relatedness.compute_related_samples_to_drop(relatedness_ht, kin_threshold, rank_ht, filtered_samples=None, min_related_hard_filter=None, keep_samples=None, keep_samples_when_related=False)[source]

Compute a Table with the list of samples to drop (and their global rank) to get the maximal independent set of unrelated samples.

+
+

Note

+
    +
  • relatedness_ht should be keyed by exactly two fields of the same type, identifying the pair of samples for each row.

  • +
  • rank_ht should be keyed by a single key of the same type as a single sample identifier in relatedness_ht.

  • +
+
+
+
Parameters:
+
    +
  • relatedness_ht (Table) – relatedness HT, as produced by e.g. pc-relate

  • +
  • kin_threshold (float) – Kinship threshold to consider two samples as related

  • +
  • rank_ht (Table) – Table with a global rank for each sample (smaller is preferred)

  • +
  • filtered_samples (Optional[SetExpression]) – An optional set of samples to exclude (e.g. these samples were hard-filtered) These samples will then appear in the resulting samples to drop.

  • +
  • min_related_hard_filter (Optional[int]) – If provided, any sample that is related to more samples than this parameter will be filtered prior to computing the maximal independent set and appear in the results.

  • +
  • keep_samples (Optional[SetExpression]) – An optional set of samples that must be kept. An error is raised (when keep_samples_when_related is False) if any two samples in the list are among the related pairs.

  • +
  • keep_samples_when_related (bool) – Don’t raise an error if keep_samples contains related samples, and keep related samples. Default is False.

  • +
+
+
Return type:
+

Table

+
+
Returns:
+

A Table with the list of the samples to drop along with their rank.
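A minimal sketch (paths and the kinship threshold are hypothetical):

    import hail as hl

    from gnomad.sample_qc.relatedness import compute_related_samples_to_drop

    relatedness_ht = hl.read_table("gs://my-bucket/pc_relate.ht")  # keyed by the pair (i, j)
    rank_ht = hl.read_table("gs://my-bucket/sample_rank.ht")  # keyed by sample, with a rank field

    samples_to_drop_ht = compute_related_samples_to_drop(
        relatedness_ht,
        kin_threshold=0.1,
        rank_ht=rank_ht,
    )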

+
+
+
+ +
+
+gnomad.sample_qc.relatedness.filter_mt_to_trios(mt, fam_ht)[source]
+

Filter a MatrixTable to a set of trios in fam_ht and annotate with adj.

+
+
Parameters:
+
    +
  • mt (MatrixTable) – A Matrix Table to filter to only trios

  • +
  • fam_ht (Table) – A Table of trios to filter to, loaded using hl.import_fam

  • +
+
+
Return type:
+

MatrixTable

+
+
Returns:
+

A MT filtered to trios and adj annotated

+
+
+
+ +
+
+gnomad.sample_qc.relatedness.generate_trio_stats_expr(trio_mt, transmitted_strata={'raw': True}, de_novo_strata={'raw': True}, ac_strata={'raw': True}, proband_is_female_expr=None)[source]
+

Generate a row-wise expression containing trio transmission stats.

+
+
The expression will generate the following counts:
    +
  • Number of alleles in het parents transmitted to the proband

  • +
  • Number of alleles in het parents not transmitted to the proband

  • +
  • Number of de novo mutations

  • +
  • Parent allele count

  • +
  • Proband allele count

  • +
+
+
+

Transmission and de novo mutation metrics and allele counts can be stratified using additional filters. transmitted_strata, de_novo_strata, and ac_strata all expect a dictionary of filtering expressions keyed by their desired suffix to append for labeling. The default will perform counts using all genotypes and append ‘raw’ to the label.

+
+

Note

+

Expects that mt is dense; if dealing with a sparse MT, hl.experimental.densify must be run first.

+
+
+
Parameters:
+
    +
  • trio_mt (MatrixTable) – A standard trio MT (with the format as produced by hail.methods.trio_matrix)

  • +
  • transmitted_strata (Dict[str, BooleanExpression]) – Strata for the transmission counts

  • +
  • de_novo_strata (Dict[str, BooleanExpression]) – Strata for the de novo counts

  • +
  • ac_strata (Dict[str, BooleanExpression]) – Strata for the parent and child allele counts

  • +
  • proband_is_female_expr (Optional[BooleanExpression]) – An optional expression giving the sex of the proband. If not given, DNMs are only computed for autosomes.

  • +
+
+
Return type:
+

StructExpression

+
+
Returns:
+

An expression with the counts
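A sketch stratifying transmission counts by raw and adj (assumes mt is dense and adj-annotated, and that ped is a Pedigree of complete trios):

    import hail as hl

    from gnomad.sample_qc.relatedness import generate_trio_stats_expr

    trio_mt = hl.trio_matrix(mt, pedigree=ped, complete_trios=True)
    trio_stats_ht = trio_mt.annotate_rows(
        trio_stats=generate_trio_stats_expr(
            trio_mt,
            transmitted_strata={"raw": True, "adj": trio_mt.proband_entry.adj},
            de_novo_strata={"raw": True},
            ac_strata={"raw": True},
        )
    ).rows()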

+
+
+
+ +
+
+gnomad.sample_qc.relatedness.generate_sib_stats_expr(mt, sib_ht, i_col='i', j_col='j', strata={'raw': True}, is_female=None)[source]
+

Generate a row-wise expression containing the number of alternate alleles in common between sibling pairs.

+

The sibling sharing counts can be stratified using additional filters using strata.

+
+

Note

+

This function expects that the mt has either been split or filtered to only bi-allelics. If a sample has multiple sibling pairs, only one pair will be counted.

+
+
+
Parameters:
+
    +
  • mt (MatrixTable) – Input matrix table

  • +
  • sib_ht (Table) – Table defining sibling pairs with one sample in a col (i_col) and the second in another col (j_col)

  • +
  • i_col (str) – Column containing the 1st sample of the pair in the relationship table

  • +
  • j_col (str) – Column containing the 2nd sample of the pair in the relationship table

  • +
  • strata (Dict[str, BooleanExpression]) – Dict with additional strata to use when computing shared sibling variant counts

  • +
  • is_female (Optional[BooleanExpression]) – An optional column in mt giving the sample sex. If not given, counts are only computed for autosomes.

  • +
+
+
Return type:
+

StructExpression

+
+
Returns:
+

An expression with the sibling shared variant counts

\ No newline at end of file
diff --git a/api_reference/sample_qc/sex.html b/api_reference/sample_qc/sex.html
new file mode 100644
index 000000000..c1d9983db
--- /dev/null
+++ b/api_reference/sample_qc/sex.html
@@ -0,0 +1,352 @@

gnomad.sample_qc.sex


gnomad.sample_qc.sex.adjusted_sex_ploidy_expr(...)

Create an entry expression to convert XY to haploid on non-PAR X/Y and XX to missing on Y.

gnomad.sample_qc.sex.adjust_sex_ploidy(mt, ...)

Convert males to haploid on non-PAR X/Y and set females to missing on Y.

gnomad.sample_qc.sex.gaussian_mixture_model_karyotype_assignment(sex_ht)

Annotate the input Table with an X karyotype, Y karyotype, and sex karyotype based on a gaussian mixture model.

gnomad.sample_qc.sex.get_ploidy_cutoffs(ht)

Get chromosome X and Y ploidy cutoffs for XY and XX samples.

gnomad.sample_qc.sex.get_chr_x_hom_alt_cutoffs(ht, ...)

Get cutoffs for the fraction homozygous alternate genotypes on chromosome X in 'XY' and 'XX' samples.

gnomad.sample_qc.sex.get_sex_expr(...[, ...])

Create a struct with X_karyotype, Y_karyotype, and sex_karyotype.

+
+
+gnomad.sample_qc.sex.adjusted_sex_ploidy_expr(locus_expr, gt_expr, karyotype_expr, xy_karyotype_str='XY', xx_karyotype_str='XX')[source]
+

Create an entry expression to convert XY to haploid on non-PAR X/Y and XX to missing on Y.

+
+
Parameters:
+
    +
  • locus_expr (LocusExpression) – Locus expression.

  • +
  • gt_expr (CallExpression) – Genotype expression.

  • +
  • karyotype_expr (StringExpression) – Sex karyotype expression.

  • +
  • xy_karyotype_str (str) – String representing XY karyotype. Default is “XY”.

  • +
  • xx_karyotype_str (str) – String representing XX karyotype. Default is “XX”.

  • +
+
+
Return type:
+

CallExpression

+
+
Returns:
+

Genotype adjusted for sex ploidy.
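A sketch applying the expression (assumes mt carries a per-sample sex_karyotype column annotation):

    from gnomad.sample_qc.sex import adjusted_sex_ploidy_expr

    mt = mt.annotate_entries(
        GT=adjusted_sex_ploidy_expr(mt.locus, mt.GT, mt.sex_karyotype)
    )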

+
+
+
+ +
+
+gnomad.sample_qc.sex.adjust_sex_ploidy(mt, sex_expr, male_str='male', female_str='female')[source]
+

Convert males to haploid on non-PAR X/Y and set females to missing on Y.

+
+
Parameters:
+
    +
  • mt (MatrixTable) – Input MatrixTable

  • +
  • sex_expr (StringExpression) – Expression pointing to sex in MT (if not male_str or female_str, no change)

  • +
  • male_str (str) – String for males (default ‘male’)

  • +
  • female_str (str) – String for females (default ‘female’)

  • +
+
+
Return type:
+

MatrixTable

+
+
Returns:
+

MatrixTable with fixed ploidy for sex chromosomes

+
+
+
+ +
+
+gnomad.sample_qc.sex.gaussian_mixture_model_karyotype_assignment(sex_ht, chrx_ploidy_expr='chrX_ploidy', chry_ploidy_expr='chrY_ploidy', karyotype_output_prefix='gmm')[source]
+

Annotate the input Table with an X karyotype, Y karyotype, and sex karyotype based on a gaussian mixture model.

+

This function uses two component Gaussian mixture models on chrx_ploidy_expr and chry_ploidy_expr to assign an X karyotype and a Y karyotype which are then combined into the sex karyotype.

+
+
The following annotations are added:

  • {karyotype_output_prefix}_x_karyotype

  • {karyotype_output_prefix}_y_karyotype

  • {karyotype_output_prefix}_karyotype = {karyotype_output_prefix}_x_karyotype + {karyotype_output_prefix}_y_karyotype
+
+
+
+

Note

+

This uses a two component Gaussian mixture model so all samples are given one of the following sex karyotypes: X, XX, XY, YY. It’s recommended that this annotation is only used to split samples into XX and XY groups that can then be used in get_ploidy_cutoffs to determine XX and XY ploidy means and stdevs.

+
+
+
Parameters:
+
    +
  • sex_ht (Table) – Input Table with chromosome X and chromosome Y ploidy values.

  • +
  • chrx_ploidy_expr (Union[NumericExpression, str]) – Expression pointing to chromosome X ploidy in sex_ht. Default is ‘chrX_ploidy’.

  • +
  • chry_ploidy_expr (Union[NumericExpression, str]) – Expression pointing to chromosome Y ploidy in sex_ht. Default is ‘chrY_ploidy’.

  • +
  • karyotype_output_prefix (str) – String to use as the prefix for the Gaussian mixture model karyotype output. Default is ‘gmm’.

  • +
+
+
Return type:
+

Table

+
+
Returns:
+

Input Table with Gaussian mixture model karyotype annotations added.

+
+
+
+ +
+
+gnomad.sample_qc.sex.get_ploidy_cutoffs(ht, f_stat_cutoff=None, normal_ploidy_cutoff=5, aneuploidy_cutoff=6, group_by_expr=None)[source]
+

Get chromosome X and Y ploidy cutoffs for XY and XX samples.

+
+

Note

+

This assumes the input hail Table has the fields chrX_ploidy and chrY_ploidy, and f_stat if f_stat_cutoff is set.

+
+

Return a tuple of sex chromosome ploidy cutoffs: ((x_ploidy_cutoffs), (y_ploidy_cutoffs)). x_ploidy_cutoffs: (upper cutoff for single X, (lower cutoff for double X, upper cutoff for double X), lower cutoff for triple X). y_ploidy_cutoffs: ((lower cutoff for single Y, upper cutoff for single Y), lower cutoff for double Y).

+

Uses the normal_ploidy_cutoff parameter to determine the ploidy cutoffs for XX and XY karyotypes. Uses the aneuploidy_cutoff parameter to determine the cutoffs for sex aneuploidies.

+
+

Note

+

f_stat_cutoff or group_by_expr must be supplied. If f_stat_cutoff is supplied then f-stat is used to split the samples into roughly ‘XX’ and ‘XY’. If group_by_expr is supplied instead, then it must include an annotation grouping samples by ‘XX’ and ‘XY’. These are both only used to divide samples into XX and XY to determine means and standard deviations for these categories and are not used in the final karyotype annotation.

+
+
+
Parameters:
+
    +
  • ht (Table) – Table with f_stat and sex chromosome ploidies

  • f_stat_cutoff (float) – f-stat to roughly divide ‘XX’ from ‘XY’ samples. Assumes XX samples are below cutoff and XY are above cutoff.

  • normal_ploidy_cutoff (int) – Number of standard deviations to use when determining sex chromosome ploidy cutoffs for XX, XY karyotypes.

  • aneuploidy_cutoff (int) – Number of standard deviations to use when determining sex chromosome ploidy cutoffs for aneuploidies.

  • group_by_expr (StringExpression) – Expression grouping samples into ‘XX’ and ‘XY’. Can be used instead of f_stat_cutoff.

  • +
+
+
Return type:
+

Tuple[Tuple[float, Tuple[float, float], float], Tuple[Tuple[float, float], float]]

+
+
Returns:
+

Tuple of ploidy cutoff tuples: ((x_ploidy_cutoffs), (y_ploidy_cutoffs))
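A sketch chaining the gaussian mixture model assignment into cutoff estimation and final karyotyping (ploidy_ht is a hypothetical Table with chrX_ploidy and chrY_ploidy fields):

    from gnomad.sample_qc.sex import (
        gaussian_mixture_model_karyotype_assignment,
        get_ploidy_cutoffs,
        get_sex_expr,
    )

    gmm_ht = gaussian_mixture_model_karyotype_assignment(ploidy_ht)
    x_cutoffs, y_cutoffs = get_ploidy_cutoffs(gmm_ht, group_by_expr=gmm_ht.gmm_karyotype)
    sex_ht = gmm_ht.annotate(
        sex=get_sex_expr(gmm_ht.chrX_ploidy, gmm_ht.chrY_ploidy, x_cutoffs, y_cutoffs)
    )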

+
+
+
+ +
+
+gnomad.sample_qc.sex.get_chr_x_hom_alt_cutoffs(ht, chr_x_frac_hom_alt_expr, f_stat_cutoff=None, group_by_expr=None, cutoff_stdev=5)[source]
+

Get cutoffs for the fraction homozygous alternate genotypes on chromosome X in ‘XY’ and ‘XX’ samples.

+
+

Note

+

This assumes the input hail Table has the field ‘f_stat’ if f_stat_cutoff is set.

+
+

Return a tuple of cutoffs for the fraction of homozygous alternate genotypes (hom-alt/(hom-alt + het)) on chromosome X: ((lower cutoff for more than one X, upper cutoff for more than one X), lower cutoff for single X).

+

Uses the cutoff_stdev parameter to determine the fraction of homozygous alternate genotypes (hom-alt/(hom-alt + het)) on chromosome X cutoffs for ‘XX’ and ‘XY’ karyotypes.

+
+

Note

+

f_stat_cutoff or group_by_expr must be supplied. If f_stat_cutoff is supplied then f-stat is used to split the samples into roughly ‘XX’ and ‘XY’. If group_by_expr is supplied instead, then it must include an annotation grouping samples by ‘XX’ and ‘XY’. These are both only used to divide samples into XX and XY to determine means and standard deviations for these categories and are not used in the final karyotype annotation.

+
+
+
Parameters:
+
    +
  • ht (Table) – Table with f_stat and fraction of homozygous alternate genotypes on chromosome X.

  • +
  • chr_x_frac_hom_alt_expr (NumericExpression) – Fraction of homozygous alternate genotypes (hom-alt/(hom-alt + het)) on chromosome X.

  • +
  • f_stat_cutoff (float) – f-stat to roughly divide ‘XX’ from ‘XY’ samples. Assumes XX samples are below cutoff and XY are above cutoff.

  • +
  • group_by_expr (StringExpression) – Expression grouping samples into ‘XX’ and ‘XY’. Can be used instead of f_stat_cutoff.

  • +
  • cutoff_stdev (int) – Number of standard deviations to use when determining sex chromosome ploidy cutoffs for XX, XY karyotypes.

  • +
+
+
Return type:
+

Tuple[Tuple[float, float], float]

+
+
Returns:
+

Tuple of cutoffs: ((lower cutoff for more than one X, upper cutoff for more than one X), lower cutoff for single X).

+
+
+
+ +
+
+gnomad.sample_qc.sex.get_sex_expr(chr_x_ploidy, chr_y_ploidy, x_ploidy_cutoffs, y_ploidy_cutoffs, chr_x_frac_hom_alt_expr=None, chr_x_frac_hom_alt_cutoffs=None)[source]
+

Create a struct with X_karyotype, Y_karyotype, and sex_karyotype.

+

Note that X0 is currently returned as ‘X’.

+
+
Parameters:
+
    +
  • chr_x_ploidy (NumericExpression) – Chromosome X ploidy (or relative ploidy).

  • chr_y_ploidy (NumericExpression) – Chromosome Y ploidy (or relative ploidy).

  • x_ploidy_cutoffs (Tuple[float, Tuple[float, float], float]) – Tuple of X chromosome ploidy cutoffs: (upper cutoff for single X, (lower cutoff for double X, upper cutoff for double X), lower cutoff for triple X).

  • y_ploidy_cutoffs (Tuple[Tuple[float, float], float]) – Tuple of Y chromosome ploidy cutoffs: ((lower cutoff for single Y, upper cutoff for single Y), lower cutoff for double Y).

  • chr_x_frac_hom_alt_expr (Optional[NumericExpression]) – Fraction of homozygous alternate genotypes (hom-alt/(hom-alt + het)) on chromosome X.

  • chr_x_frac_hom_alt_cutoffs (Optional[Tuple[Tuple[float, float], float]]) – Tuple of cutoffs for the fraction of homozygous alternate genotypes (hom-alt/(hom-alt + het)) on chromosome X: ((lower cutoff for more than one X, upper cutoff for more than one X), lower cutoff for single X).

  • +
+
+
Return type:
+

StructExpression

+
+
Returns:
+

Struct containing X_karyotype, Y_karyotype, and sex_karyotype.

\ No newline at end of file
diff --git a/api_reference/utils/annotations.html b/api_reference/utils/annotations.html
new file mode 100644
index 000000000..06a230307
--- /dev/null
+++ b/api_reference/utils/annotations.html
@@ -0,0 +1,1571 @@

gnomad.utils.annotations


gnomad.utils.annotations.pop_max_expr(freq, ...)

Create an expression containing the frequency information about the population that has the highest AF in freq_meta.

gnomad.utils.annotations.project_max_expr(...)

Create an expression that computes allele frequency information by project for the n_projects with the largest AF at this row.

gnomad.utils.annotations.faf_expr(freq, ...)

Calculate the filtering allele frequency (FAF) for each threshold specified in faf_thresholds.

gnomad.utils.annotations.gen_anc_faf_max_expr(...)

Retrieve the maximum FAF and corresponding genetic ancestry for each of the thresholds in faf.

gnomad.utils.annotations.qual_hist_expr([...])

Return a struct expression with genotype quality histograms based on the arguments given (dp, gq, ad, ab).

gnomad.utils.annotations.age_hists_expr(...)

Return a StructExpression with the age histograms for hets and homs.

gnomad.utils.annotations.get_lowqual_expr(...)

Compute lowqual threshold expression for either split or unsplit alleles based on QUALapprox or AS_QUALapprox.

gnomad.utils.annotations.get_annotations_hists(ht, ...)

Create histograms for variant metrics in ht.info.

gnomad.utils.annotations.create_frequency_bins_expr(AC, AF)

Create bins for frequencies in preparation for aggregating QUAL by frequency bin.

gnomad.utils.annotations.annotate_and_index_source_mt_for_sex_ploidy(...)

Prepare relevant ploidy annotations for downstream calculations on a matrix table.

gnomad.utils.annotations.get_is_haploid_expr([...])

Determine if a genotype or locus and karyotype combination is haploid.

gnomad.utils.annotations.get_gq_dp_adj_expr(...)

Get adj annotation using only GQ and DP.

gnomad.utils.annotations.get_het_ab_adj_expr(...)

Get adj het AB annotation.

gnomad.utils.annotations.get_adj_expr(...[, ...])

Get adj genotype annotation.

gnomad.utils.annotations.annotate_adj(mt[, ...])

Annotate genotypes with adj criteria (assumes diploid).

gnomad.utils.annotations.add_variant_type(...)

Get Struct of variant_type and n_alt_alleles from ArrayExpression of Strings.

gnomad.utils.annotations.annotate_allele_info(ht)

Return bi-allelic sites Table with an 'allele_info' annotation.

gnomad.utils.annotations.annotation_type_is_numeric(t)

Given an annotation type, return whether it is a numerical type or not.

gnomad.utils.annotations.annotation_type_in_vcf_info(t)

Given an annotation type, returns whether that type can be natively exported to a VCF INFO field.

gnomad.utils.annotations.bi_allelic_site_inbreeding_expr([...])

Return the site inbreeding coefficient as an expression to be computed on a MatrixTable.

gnomad.utils.annotations.fs_from_sb(sb[, ...])

Compute FS (Fisher strand balance) annotation from the SB (strand balance table) field.

gnomad.utils.annotations.sor_from_sb(sb)

Compute SOR (Symmetric Odds Ratio test) annotation from the SB (strand balance table) field.

gnomad.utils.annotations.pab_max_expr(...[, ...])

Compute the maximum p-value of the binomial test for the alternate allele balance (PAB) for each allele.

gnomad.utils.annotations.bi_allelic_expr(t)

Return a boolean expression selecting bi-allelic sites only, accounting for whether the input MT/HT was split.

gnomad.utils.annotations.unphase_call_expr(...)

Generate unphased version of a call expression (which can be phased or not).

gnomad.utils.annotations.region_flag_expr(t)

Create a region_flag struct that contains flags for problematic regions (i.e., LCR, decoy, segdup, and nonpar regions).

gnomad.utils.annotations.missing_callstats_expr()

Create a missing callstats struct for insertion into frequency annotation arrays when data is missing.

gnomad.utils.annotations.set_female_y_metrics_to_na_expr(t)

Set Y-variant frequency callstats for female-specific metrics to missing structs.

gnomad.utils.annotations.hemi_expr(locus, ...)

Return whether genotypes are hemizygous.

gnomad.utils.annotations.merge_freq_arrays(...)

Merge a list of frequency arrays based on the supplied operation.

gnomad.utils.annotations.merge_histograms(hists)

Merge a list of histogram annotations.

gnomad.utils.annotations.annotate_freq(mt[, ...])

Annotate mt with stratified allele frequencies.

gnomad.utils.annotations.annotate_downsamplings(t, ...)

Annotate MatrixTable or Table with downsampling groups.

gnomad.utils.annotations.build_freq_stratification_list([...])

Build a list of stratification groupings to be used in frequency calculations based on supplied parameters.

gnomad.utils.annotations.generate_freq_group_membership_array(ht, ...)

Generate a Table with a 'group_membership' array for each sample indicating whether the sample belongs to specific stratification groups.

gnomad.utils.annotations.compute_freq_by_strata(mt)

Compute call statistics and, when passed, entry aggregation function(s) by strata.

gnomad.utils.annotations.agg_by_strata(mt, ...)

Get row expression for annotations of each entry aggregation function(s) by strata.

gnomad.utils.annotations.update_structured_annotations(ht, ...)

Update highly structured annotations on a Table.

gnomad.utils.annotations.add_gks_vrs(...)

Generate a dictionary containing VRS information from a given locus and struct of VRS information.

gnomad.utils.annotations.add_gks_va(input_struct)

Generate Python dictionary containing GKS VA annotations.

+
+
+gnomad.utils.annotations.pop_max_expr(freq, freq_meta, pops_to_exclude=None, pop_label='pop')[source]
+

Create an expression containing the frequency information about the population that has the highest AF in freq_meta.

+

Populations specified in pops_to_exclude are excluded and only frequencies from adj populations are considered.

+

This resulting struct contains the following fields:

+
+
    +
  • AC: int32

  • AF: float64

  • AN: int32

  • homozygote_count: int32

  • pop: str
+
+
+
Parameters:
+
    +
  • freq (ArrayExpression) – ArrayExpression of Structs with fields [‘AC’, ‘AF’, ‘AN’, ‘homozygote_count’]

  • +
  • freq_meta (ArrayExpression) – ArrayExpression of meta dictionaries corresponding to freq (as returned by annotate_freq)

  • +
  • pops_to_exclude (Optional[Set[str]]) – Set of populations to skip for popmax calculation

  • +
  • pop_label (str) – Label of the population field in the meta dictionary

  • +
+
+
Return type:
+

StructExpression

+
+
Returns:
+

Popmax struct
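A minimal sketch (assumes a sites Table with freq and freq_meta as produced by annotate_freq; the excluded populations are illustrative):

    from gnomad.utils.annotations import pop_max_expr

    ht = ht.annotate(
        popmax=pop_max_expr(ht.freq, ht.freq_meta, pops_to_exclude={"asj", "fin", "oth"})
    )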

+
+
+
+ +
+
+gnomad.utils.annotations.project_max_expr(project_expr, gt_expr, alleles_expr, n_projects=5)[source]
+

Create an expression that computes allele frequency information by project for the n_projects with the largest AF at this row.

+

Will return an array with one element per non-reference allele.

+

Each of these elements is itself an array of structs with the following fields:

+
+
    +
  • AC: int32

  • AF: float64

  • AN: int32

  • homozygote_count: int32

  • project: str
+
+
+

Note

+

Only projects with AF > 0 are returned. In case of ties, the project ordering is not guaranteed, and at most n_projects are returned.

+
+
+
Parameters:
+
    +
  • project_expr (StringExpression) – column expression containing the project

  • +
  • gt_expr (CallExpression) – entry expression containing the genotype

  • +
  • alleles_expr (ArrayExpression) – row expression containing the alleles

  • +
  • n_projects (int) – Maximum number of projects to return for each row

  • +
+
+
Return type:
+

ArrayExpression

+
+
Returns:
+

projectmax expression

+
+
+
+ +
+
+gnomad.utils.annotations.faf_expr(freq, freq_meta, locus, pops_to_exclude=None, faf_thresholds=[0.95, 0.99], pop_label='pop')[source]
+

Calculate the filtering allele frequency (FAF) for each threshold specified in faf_thresholds.

+

See http://cardiodb.org/allelefrequencyapp/ for more information.

+

The FAF is computed for each of the following population stratification if found in freq_meta:

+
+
    +
  • All samples, with adj criteria

  • +
  • For each population, with adj criteria

  • +
  • For all sex/population on the non-PAR regions of sex chromosomes (will be missing on autosomes and PAR regions of sex chromosomes)

  • +
+
+

Each FAF entry is a struct with one entry per threshold specified in faf_thresholds, of type float64.

+

This returns a tuple with two expressions:

+
+
  1. An array of FAF expressions as described above

  2. An array of dicts containing the metadata for each of the array elements, in the same format as that produced by annotate_freq.
+
+
+
Parameters:
+
    +
  • freq (ArrayExpression) – ArrayExpression of call stats structs (typically generated by hl.agg.call_stats)

  • +
  • freq_meta (ArrayExpression) – ArrayExpression of meta dictionaries corresponding to freq (typically generated using annotate_freq)

  • +
  • locus (LocusExpression) – locus

  • +
  • pops_to_exclude (Optional[Set[str]]) – Set of populations to exclude from faf calculation (typically bottlenecked or consanguineous populations)

  • +
  • faf_thresholds (List[float]) – List of FAF thresholds to compute

  • +
  • pop_label (str) – Label of the population field in the meta dictionary

  • +
+
+
Return type:
+

Tuple[ArrayExpression, List[Dict[str, str]]]

+
+
Returns:
+

(FAF expression, FAF metadata)
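A minimal sketch (same assumed freq/freq_meta annotations as above):

    from gnomad.utils.annotations import faf_expr

    faf, faf_meta = faf_expr(
        ht.freq, ht.freq_meta, ht.locus, pops_to_exclude={"asj", "fin", "oth"}
    )
    ht = ht.annotate(faf=faf)
    ht = ht.annotate_globals(faf_meta=faf_meta)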

+
+
+
+ +
+
+gnomad.utils.annotations.gen_anc_faf_max_expr(faf, faf_meta, pop_label='pop')[source]
+

Retrieve the maximum FAF and corresponding genetic ancestry for each of the thresholds in faf.

+

This resulting struct contains the following fields:

+
+
    +
  • faf95_max: float64

  • +
  • faf95_max_gen_anc: str

  • +
  • faf99_max: float64

  • +
  • faf99_max_gen_anc: str

  • +
+
+
+
Parameters:
+
    +
  • faf (ArrayExpression) – ArrayExpression of Structs of FAF thresholds previously computed. When faf_expr is used, contains fields ‘faf95’ and ‘faf99’.

  • faf_meta (ArrayExpression) – ArrayExpression of meta dictionaries corresponding to faf (as returned by faf_expr)

  • +
  • pop_label (str) – Label of the population field in the meta dictionary

  • +
+
+
Return type:
+

StructExpression

+
+
Returns:
+

Genetic ancestry group struct for FAF max

+
+
+
+ +
+
+gnomad.utils.annotations.qual_hist_expr(gt_expr=None, gq_expr=None, dp_expr=None, ad_expr=None, adj_expr=None, ab_expr=None, split_adj_and_raw=False)[source]
+

Return a struct expression with genotype quality histograms based on the arguments given (dp, gq, ad, ab).

+
+

Note

+
    +
  • If gt_expr is provided, will return histograms for non-reference samples only as well as all samples.

  • +
  • gt_expr is required for the allele-balance histogram, as it is only computed on het samples.

  • +
  • If ab_expr is provided, the allele-balance histogram is computed using this expression instead of the ad_expr.

  • +
  • If adj_expr is provided, additional histograms are computed using only adj samples.

  • +
+
+
+
Parameters:
+
    +
  • gt_expr (Optional[CallExpression]) – Entry expression containing genotype.

  • +
  • gq_expr (Optional[NumericExpression]) – Entry expression containing genotype quality.

  • +
  • dp_expr (Optional[NumericExpression]) – Entry expression containing depth.

  • +
  • ad_expr (Optional[ArrayNumericExpression]) – Entry expression containing allelic depth (bi-allelic here).

  • +
  • adj_expr (Optional[BooleanExpression]) – Entry expression containing adj (high quality) genotype status.

  • +
  • ab_expr (Optional[NumericExpression]) – Entry expression containing allele balance (bi-allelic here).

  • +
  • split_adj_and_raw (bool) – Whether to split the adj and raw histograms into separate fields in the returned struct expr.

  • +
+
+
Return type:
+

StructExpression

+
+
Returns:
+

Genotype quality histograms expression.

+
+
+
+ +
+
+gnomad.utils.annotations.age_hists_expr(adj_expr, gt_expr, age_expr, lowest_boundary=30, highest_boundary=80, n_bins=10)[source]
+

Return a StructExpression with the age histograms for hets and homs.

+
+
Parameters:
+
    +
  • adj_expr (BooleanExpression) – Entry expression containing whether a genotype is high quality (adj) or not

  • +
  • gt_expr (CallExpression) – Entry expression containing the genotype

  • +
  • age_expr (NumericExpression) – Col expression containing the sample’s age

  • +
  • lowest_boundary (int) – Lowest bin boundary (any younger sample will be binned in n_smaller)

  • +
  • highest_boundary (int) – Highest bin boundary (any older sample will be binned in n_larger)

  • +
  • n_bins (int) – Total number of bins

  • +
+
+
Return type:
+

StructExpression

+
+
Returns:
+

A struct with age_hist_het and age_hist_hom

+
+
+
+ +
+
+gnomad.utils.annotations.get_lowqual_expr(alleles, qual_approx_expr, snv_phred_threshold=30, snv_phred_het_prior=30, indel_phred_threshold=30, indel_phred_het_prior=39)[source]
+

Compute lowqual threshold expression for either split or unsplit alleles based on QUALapprox or AS_QUALapprox.

+
+

Note

+

When running this lowqual annotation using QUALapprox, it differs from the GATK LowQual filter. This is because GATK computes this annotation at the site level, which uses the least stringent prior for mixed sites. When run using AS_QUALapprox, this implementation can thus be more stringent for certain alleles at mixed sites.

+
+
+
Parameters:
+
    +
  • alleles (ArrayExpression) – Array of alleles

  • +
  • qual_approx_expr (Union[ArrayNumericExpression, NumericExpression]) – QUALapprox or AS_QUALapprox

  • +
  • snv_phred_threshold (int) – Phred-scaled SNV “emission” threshold (similar to GATK emission threshold)

  • +
  • snv_phred_het_prior (int) – Phred-scaled SNV heterozygosity prior (30 = 1/1000 bases, GATK default)

  • +
  • indel_phred_threshold (int) – Phred-scaled indel “emission” threshold (similar to GATK emission threshold)

  • +
  • indel_phred_het_prior (int) – Phred-scaled indel heterozygosity prior (39 = 1/8,000 bases, GATK default)

  • +
+
+
Return type:
+

Union[BooleanExpression, ArrayExpression]

+
+
Returns:
+

lowqual expression (BooleanExpression if qual_approx_expr is Numeric, Array[BooleanExpression] if qual_approx_expr is ArrayNumeric)

+
+
+
+ +
+
+gnomad.utils.annotations.get_annotations_hists(ht, annotations_hists, log10_annotations=['DP'])[source]
+

Create histograms for variant metrics in ht.info.

+

Used when creating site quality distribution json files.

+
+
Parameters:
+
    +
  • ht (Table) – Table with variant metrics

  • +
  • annotations_hists (Dict[str, Tuple]) – Dictionary of metrics names and their histogram values (start, end, bins)

  • +
  • log10_annotations (List[str]) – List of metrics to log scale

  • +
+
+
Returns:
+

Dictionary of metrics and their histograms

+
+
Return type:
+

Dict[str, hl.expr.StructExpression]

+
+
+
+ +
+
+gnomad.utils.annotations.create_frequency_bins_expr(AC, AF)[source]
+

Create bins for frequencies in preparation for aggregating QUAL by frequency bin.

+
+
Bins:

  • singleton

  • doubleton

  • 0.00005

  • 0.0001

  • 0.0002

  • 0.0005

  • 0.001

  • 0.002

  • 0.005

  • 0.01

  • 0.02

  • 0.05

  • 0.1

  • 0.2

  • 0.5

  • 1
+
+

NOTE: Frequencies should be frequencies from raw data. Used when creating site quality distribution json files.

+
+
Parameters:
+
    +
  • AC (NumericExpression) – Field in input that contains the allele count information

  • +
  • AF (NumericExpression) – Field in input that contains the allele frequency information

  • +
+
+
Returns:
+

Expression containing bin name

+
+
Return type:
+

hl.expr.StringExpression

+
+
+
+ +
+
+gnomad.utils.annotations.annotate_and_index_source_mt_for_sex_ploidy(locus_expr, karyotype_expr, xy_karyotype_str='XY', xx_karyotype_str='XX')[source]
+

Prepare relevant ploidy annotations for downstream calculations on a matrix table.

+

This method is used as an optimization for the get_is_haploid_expr and adjusted_sex_ploidy_expr methods.

This method annotates the locus_expr source matrix table with the following fields:

+
+
    +
  • xy: Boolean indicating if the sample is XY.

  • xx: Boolean indicating if the sample is XX.

  • in_non_par: Boolean indicating if the locus is in a non-PAR region.

  • x_nonpar: Boolean indicating if the locus is in a non-PAR region of the X chromosome.

  • y_par: Boolean indicating if the locus is in a PAR region of the Y chromosome.

  • y_nonpar: Boolean indicating if the locus is in a non-PAR region of the Y chromosome.

  • +
+
+
+
Parameters:
+
    +
  • locus_expr (LocusExpression) – Locus expression.

  • +
  • karyotype_expr (StringExpression) – Karyotype expression.

  • +
  • xy_karyotype_str (str) – String representing XY karyotype. Default is “XY”.

  • +
  • xx_karyotype_str (str) – String representing XX karyotype. Default is “XX”.

  • +
+
+
Return type:
+

Tuple[StructExpression, StructExpression]

+
+
Returns:
+

Tuple of index expressions for columns and rows.

+
+
+
+ +
gnomad.utils.annotations.get_is_haploid_expr(gt_expr=None, locus_expr=None, karyotype_expr=None, xy_karyotype_str='XY', xx_karyotype_str='XX')[source]

Determine if a genotype or locus and karyotype combination is haploid.

Note
One of gt_expr or locus_expr and karyotype_expr is required.

Parameters:
  • gt_expr (Optional[CallExpression]) – Optional genotype expression.
  • locus_expr (Optional[LocusExpression]) – Optional locus expression.
  • karyotype_expr (Optional[StringExpression]) – Optional sex karyotype expression.
  • xy_karyotype_str (str) – String representing XY karyotype. Default is “XY”.
  • xx_karyotype_str (str) – String representing XX karyotype. Default is “XX”.

Return type:
  BooleanExpression

Returns:
  Boolean expression indicating if the genotype is haploid.

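A sketch of both calling modes; the dataset path and the meta.sex_karyotype column field are assumptions:

    import hail as hl
    from gnomad.utils.annotations import get_is_haploid_expr

    mt = hl.read_matrix_table("gs://my-bucket/dataset.mt")  # hypothetical path

    # From the genotype call itself:
    mt = mt.annotate_entries(is_haploid=get_is_haploid_expr(gt_expr=mt.GT))

    # Or, when GT is unavailable, from locus and sex karyotype:
    mt = mt.annotate_entries(
        is_haploid=get_is_haploid_expr(
            locus_expr=mt.locus, karyotype_expr=mt.meta.sex_karyotype
        )
    )
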
gnomad.utils.annotations.get_gq_dp_adj_expr(gq_expr, dp_expr, gt_expr=None, locus_expr=None, karyotype_expr=None, adj_gq=20, adj_dp=10, haploid_adj_dp=5)[source]

Get adj annotation using only GQ and DP.

Default thresholds correspond to gnomAD values.

Note
This function can be used to annotate adj taking into account only GQ and DP. It is useful for cases where the GT field is not available, such as in the reference data of a VariantDataset.

Note
One of gt_expr or locus_expr and karyotype_expr is required.

Parameters:
  • gq_expr (Union[Int32Expression, Int64Expression]) – GQ expression.
  • dp_expr (Union[Int32Expression, Int64Expression]) – DP expression.
  • gt_expr (Optional[CallExpression]) – Optional genotype expression.
  • locus_expr (Optional[LocusExpression]) – Optional locus expression.
  • karyotype_expr (Optional[StringExpression]) – Optional sex karyotype expression.
  • adj_gq (int) – GQ threshold for adj. Default is 20.
  • adj_dp (int) – DP threshold for adj. Default is 10.
  • haploid_adj_dp (int) – Haploid DP threshold for adj. Default is 5.

Return type:
  BooleanExpression

Returns:
  Boolean expression indicating adj filter.

gnomad.utils.annotations.get_het_ab_adj_expr(gt_expr, dp_expr, ad_expr, adj_ab=0.2)[source]

Get adj het AB annotation.

Parameters:
  • gt_expr (CallExpression) – Genotype expression.
  • dp_expr (Union[Int32Expression, Int64Expression]) – DP expression.
  • ad_expr (ArrayNumericExpression) – AD expression.
  • adj_ab (float) – AB threshold for adj. Default is 0.2.

Return type:
  BooleanExpression

Returns:
  Boolean expression indicating adj het AB filter.

gnomad.utils.annotations.get_adj_expr(gt_expr, gq_expr, dp_expr, ad_expr, adj_gq=20, adj_dp=10, adj_ab=0.2, haploid_adj_dp=5)[source]

Get adj genotype annotation.

Defaults correspond to gnomAD values.

Parameters:
  • gt_expr (CallExpression) – Genotype expression.
  • gq_expr (Union[Int32Expression, Int64Expression]) – GQ expression.
  • dp_expr (Union[Int32Expression, Int64Expression]) – DP expression.
  • ad_expr (ArrayNumericExpression) – AD expression.
  • adj_gq (int) – GQ threshold for adj. Default is 20.
  • adj_dp (int) – DP threshold for adj. Default is 10.
  • adj_ab (float) – AB threshold for adj. Default is 0.2.
  • haploid_adj_dp (int) – Haploid DP threshold for adj. Default is 5.

Return type:
  BooleanExpression

gnomad.utils.annotations.annotate_adj(mt, adj_gq=20, adj_dp=10, adj_ab=0.2, haploid_adj_dp=5)[source]

Annotate genotypes with adj criteria (assumes diploid).

Defaults correspond to gnomAD values.

Parameters:
  • mt (MatrixTable) – Input MatrixTable.
  • adj_gq (int) – GQ threshold for adj. Default is 20.
  • adj_dp (int) – DP threshold for adj. Default is 10.
  • adj_ab (float) – AB threshold for adj. Default is 0.2.
  • haploid_adj_dp (int) – Haploid DP threshold for adj. Default is 5.

Return type:
  MatrixTable

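A short sketch of adding the adj flag; the dataset path is a placeholder and GT, GQ, DP, and AD entry fields are assumed:

    import hail as hl
    from gnomad.utils.annotations import annotate_adj, get_adj_expr

    mt = hl.read_matrix_table("gs://my-bucket/dataset.mt")  # hypothetical path

    # Add the `adj` entry flag using gnomAD defaults (GQ >= 20, DP >= 10, het AB >= 0.2).
    mt = annotate_adj(mt)

    # Equivalent expression form, without modifying the MatrixTable schema first:
    adj_expr = get_adj_expr(mt.GT, mt.GQ, mt.DP, mt.AD)
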
gnomad.utils.annotations.add_variant_type(alt_alleles)[source]

Get Struct of variant_type and n_alt_alleles from ArrayExpression of Strings.

Parameters:
  alt_alleles (ArrayExpression) – Alternate alleles.

Return type:
  StructExpression

gnomad.utils.annotations.annotate_allele_info(ht)[source]

Return bi-allelic sites Table with an ‘allele_info’ annotation.

Note
This function requires that the input ht is unsplit and returns a split ht.

‘allele_info’ is a struct with the following information:
  • variant_type: Variant type (snv, indel, multi-snv, multi-indel, or mixed).
  • n_alt_alleles: Total number of alternate alleles observed at variant locus.
  • has_star: True if the variant contains a star allele.
  • allele_type: Allele type (snv, insertion, deletion, or mixed).
  • was_mixed: True if the variant was mixed (i.e. contained both SNVs and indels).
  • nonsplit_alleles: Array of alleles before splitting.

Parameters:
  ht (Table) – Unsplit input Table.

Return type:
  Table

Returns:
  Split Table with allele data annotation added.

gnomad.utils.annotations.annotation_type_is_numeric(t)[source]

Given an annotation type, return whether it is a numerical type or not.

Parameters:
  t (Any) – Type to test

Return type:
  bool

Returns:
  If the input type is numeric

gnomad.utils.annotations.annotation_type_in_vcf_info(t)[source]

Given an annotation type, return whether that type can be natively exported to a VCF INFO field.

Note
Types that aren’t natively exportable to VCF will be converted to String on export.

Parameters:
  t (Any) – Type to test

Return type:
  bool

Returns:
  If the input type can be exported to VCF

gnomad.utils.annotations.bi_allelic_site_inbreeding_expr(call=None, callstats_expr=None)[source]

Return the site inbreeding coefficient as an expression to be computed on a MatrixTable.

This is implemented based on the GATK InbreedingCoeff metric: https://software.broadinstitute.org/gatk/documentation/article.php?id=8032

Note
The computation is run based on the counts of alternate alleles and thus should only be run on bi-allelic sites.

Parameters:
  • call (Optional[CallExpression]) – Expression giving the calls in the MT
  • callstats_expr (Optional[StructExpression]) – StructExpression containing only alternate allele AC, AN, and homozygote_count as integers. If passed, used to create expression in place of GT calls.

Return type:
  Float32Expression

Returns:
  Site inbreeding coefficient expression

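A sketch of both calling modes, assuming a MatrixTable with GT entries; the path and the row-level callstats field are hypothetical:

    import hail as hl
    from gnomad.utils.annotations import bi_allelic_expr, bi_allelic_site_inbreeding_expr

    mt = hl.read_matrix_table("gs://my-bucket/dataset.mt")  # hypothetical path
    mt = mt.filter_rows(bi_allelic_expr(mt))  # metric is only valid at bi-allelic sites

    # From genotype calls:
    mt = mt.annotate_rows(inbreeding_coeff=bi_allelic_site_inbreeding_expr(mt.GT))

    # Or from precomputed callstats (alternate allele AC, AN, homozygote_count as ints):
    # bi_allelic_site_inbreeding_expr(callstats_expr=mt.row_callstats)  # hypothetical field
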
gnomad.utils.annotations.fs_from_sb(sb, normalize=True, min_cell_count=200, min_count=4, min_p_value=1e-320)[source]

Compute FS (Fisher strand balance) annotation from the SB (strand balance table) field.

FS is the phred-scaled value of the double-sided Fisher exact test on strand balance.

Using default values will have the same behavior as the GATK implementation, that is:
- If sum(counts) > 2 * min_cell_count (default to GATK value of 200), they are normalized
- If sum(counts) < min_count (default to GATK value of 4), returns missing
- Any p-value < min_p_value (default to GATK value of 1e-320) is truncated to that value

In addition to the default GATK behavior, setting normalize to False will perform a chi-squared test for large counts (> min_cell_count) instead of normalizing the cell values.

Note
This function can either take
- an array of length four containing the forward and reverse strands’ counts of ref and alt alleles: [ref fwd, ref rev, alt fwd, alt rev]
- a two dimensional array with arrays of length two, containing the counts: [[ref fwd, ref rev], [alt fwd, alt rev]]

GATK code here: https://github.com/broadinstitute/gatk/blob/master/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/FisherStrand.java

Parameters:
  • sb (Union[ArrayNumericExpression, ArrayExpression]) – Count of ref/alt reads on each strand
  • normalize (bool) – Whether to normalize counts if sum(counts) > min_cell_count (normalize=True), or use a chi-squared test instead of a FET (normalize=False)
  • min_cell_count (int) – Maximum count for performing a FET
  • min_count (int) – Minimum total count to output FS (otherwise missing is output)
  • min_p_value (float) – Minimum p-value; any smaller p-value is truncated to this value

Return type:
  Int64Expression

Returns:
  FS value

gnomad.utils.annotations.sor_from_sb(sb)[source]

Compute SOR (Symmetric Odds Ratio test) annotation from the SB (strand balance table) field.

Note
This function can either take
- an array of length four containing the forward and reverse strands’ counts of ref and alt alleles: [ref fwd, ref rev, alt fwd, alt rev]
- a two dimensional array with arrays of length two, containing the counts: [[ref fwd, ref rev], [alt fwd, alt rev]]

GATK code here: https://github.com/broadinstitute/gatk/blob/master/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/StrandOddsRatio.java

Parameters:
  sb (Union[ArrayNumericExpression, ArrayExpression]) – Count of ref/alt reads on each strand

Return type:
  Float64Expression

Returns:
  SOR value

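A sketch computing both strand-bias statistics from a flat, length-four SB array; the Table path and info.SB field are assumptions:

    import hail as hl
    from gnomad.utils.annotations import fs_from_sb, sor_from_sb

    ht = hl.read_table("gs://my-bucket/sites.ht")  # hypothetical path

    # SB ordered as [ref fwd, ref rev, alt fwd, alt rev].
    ht = ht.annotate(
        FS=fs_from_sb(ht.info.SB),
        SOR=sor_from_sb(ht.info.SB),
    )
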
gnomad.utils.annotations.pab_max_expr(gt_expr, ad_expr, la_expr=None, n_alleles_expr=None)[source]

Compute the maximum p-value of the binomial test for the alternate allele balance (PAB) for each allele.

Note
This function can take a gt_expr and ad_expr that use local or global alleles. If they use local alleles, la_expr and n_alleles_expr should be provided to transform gt_expr and ad_expr to global alleles.

Parameters:
  • gt_expr (CallExpression) – Genotype call expression.
  • ad_expr (ArrayExpression) – Allele depth expression.
  • la_expr (Optional[ArrayExpression]) – Allele local index expression. When provided, gt_expr and ad_expr are transformed from using local alleles to global alleles using la_expr.
  • n_alleles_expr (Optional[Int32Expression]) – Number of alleles expression. Required when ‘la_expr’ is provided.

Return type:
  ArrayExpression

Returns:
  Array expression of maximum p-values.

gnomad.utils.annotations.bi_allelic_expr(t)[source]

Return a boolean expression selecting bi-allelic sites only, accounting for whether the input MT/HT was split.

Parameters:
  t (Union[Table, MatrixTable]) – Input HT/MT

Return type:
  BooleanExpression

Returns:
  Boolean expression selecting only bi-allelic sites

gnomad.utils.annotations.unphase_call_expr(call_expr)[source]

Generate an unphased version of a call expression (which can be phased or not).

Parameters:
  call_expr (CallExpression) – Input call expression

Return type:
  CallExpression

Returns:
  Unphased call expression

gnomad.utils.annotations.region_flag_expr(t, non_par=True, prob_regions=None)[source]

Create a region_flag struct that contains flags for problematic regions (i.e., LCR, decoy, segdup, and nonpar regions).

Note
No hg38 resources for decoy or self chain are available yet.

Parameters:
  • t (Union[Table, MatrixTable]) – Input Table/MatrixTable
  • non_par (bool) – If True, flag loci that occur within pseudoautosomal regions on sex chromosomes
  • prob_regions (Dict[str, Table]) – If supplied, flag loci that occur within regions defined in Hail Table(s)

Return type:
  StructExpression

Returns:
  region_flag struct row annotation

gnomad.utils.annotations.missing_callstats_expr()[source]

Create a missing callstats struct for insertion into frequency annotation arrays when data is missing.

Return type:
  StructExpression

Returns:
  Hail Struct with missing values for each callstats element

gnomad.utils.annotations.set_female_y_metrics_to_na_expr(t, freq_expr='freq', freq_meta_expr='freq_meta', freq_index_dict_expr='freq_index_dict')[source]

Set Y-variant frequency callstats for female-specific metrics to missing structs.

Parameters:
  • t (Union[Table, MatrixTable]) – Table or MatrixTable for which to adjust female metrics.
  • freq_expr (Union[ArrayExpression, str]) – Array expression or string annotation name for the frequency array. Default is “freq”.
  • freq_meta_expr (Union[ArrayExpression, str]) – Array expression or string annotation name for the frequency metadata. Default is “freq_meta”.
  • freq_index_dict_expr (Union[DictExpression, str]) – Dict expression or string annotation name for the frequency metadata index dictionary. Default is “freq_index_dict”.

Return type:
  ArrayExpression

Returns:
  Hail array expression to set female Y-variant metrics to missing values.

gnomad.utils.annotations.hemi_expr(locus, sex_expr, gt, male_str='XY')[source]

Return whether genotypes are hemizygous.

Return missing expression if locus is not in chrX/chrY non-PAR regions.

Parameters:
  • locus (LocusExpression) – Input locus.
  • sex_expr (StringExpression) – Input StringExpression indicating whether sample is XX or XY.
  • gt (CallExpression) – Input genotype.
  • male_str (str) – String indicating whether sample is XY. Default is “XY”.

Return type:
  BooleanExpression

Returns:
  BooleanExpression indicating whether genotypes are hemizygous.

gnomad.utils.annotations.merge_freq_arrays(farrays, fmeta, operation='sum', set_negatives_to_zero=False, count_arrays=None)[source]

Merge a list of frequency arrays based on the supplied operation.

Warning
Arrays must be on the same Table.

Note
Arrays do not have to contain the same groupings or order of groupings, but the array indices for a freq array in farrays must be the same as its associated frequency metadata index in fmeta, i.e., if farrays = [freq1, freq2] then fmeta must equal [fmeta1, fmeta2] where fmeta1 contains the metadata information for freq1.

If operation is set to “sum”, groups in the merged array will be the union of groupings found within the arrays’ metadata, and all arrays will be summed by grouping. If operation is set to “diff”, the merged array will contain groups only found in the first array of fmeta. Any array containing any of these groups will have their values subtracted from the values of the first array.

Parameters:
  • farrays (List[ArrayExpression]) – List of frequency arrays to merge. First entry in the list is the primary array to which other arrays will be added or subtracted. All arrays must be on the same Table.
  • fmeta (List[List[Dict[str, str]]]) – List of frequency metadata for arrays being merged.
  • operation (str) – Merge operation to perform. Options are “sum” and “diff”. If “diff” is passed, the first freq array in the list will have the other arrays subtracted from it.
  • set_negatives_to_zero (bool) – If True, set negative array values to 0 for AC, AN, AF, and homozygote_count. If False, raise a ValueError. Default is False.
  • count_arrays (Optional[Dict[str, List[ArrayExpression]]]) – Dictionary of Lists of arrays containing counts to merge using the passed operation. Must use the same group indexing as fmeta. Keys are the descriptor names, values are Lists of arrays to merge. Default is None.

Return type:
  Union[Tuple[ArrayExpression, List[Dict[str, int]]], Tuple[ArrayExpression, List[Dict[str, int]], Dict[str, List[ArrayExpression]]]]

Returns:
  Tuple of merged frequency array, frequency metadata list, and, if count_arrays is not None, a dictionary of merged count arrays.

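A sketch of summing two frequency arrays on one Table; the annotation names and metadata lists are hypothetical:

    from gnomad.utils.annotations import merge_freq_arrays

    # `ht` carries two freq arrays; fmeta_exomes/fmeta_genomes are Python lists of
    # metadata dicts matching each array's indexing.
    freq, freq_meta = merge_freq_arrays(
        farrays=[ht.freq_exomes, ht.freq_genomes],
        fmeta=[fmeta_exomes, fmeta_genomes],
        operation="sum",
    )
    ht = ht.annotate(freq=freq)
    ht = ht.annotate_globals(freq_meta=freq_meta)
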
gnomad.utils.annotations.merge_histograms(hists)[source]

Merge a list of histogram annotations.

This function merges a list of histogram annotations by summing the arrays in an element-wise fashion. It keeps one ‘bin_edge’ annotation but merges the ‘bin_freq’, ‘n_smaller’, and ‘n_larger’ annotations by summing them.

Note
Bin edges are assumed to be the same for all histograms.

Parameters:
  hists (List[StructExpression]) – List of histogram structs to merge.

Return type:
  Expression

Returns:
  Merged histogram struct.

gnomad.utils.annotations.annotate_freq(mt, sex_expr=None, pop_expr=None, subpop_expr=None, additional_strata_expr=None, downsamplings=None, downsampling_expr=None, ds_pop_counts=None, entry_agg_funcs=None, annotate_mt=True)[source]

Annotate mt with stratified allele frequencies.

The output MatrixTable will include:
  • row annotation freq containing the stratified allele frequencies
  • global annotation freq_meta with metadata
  • global annotation freq_meta_sample_count with sample count information

Note
Currently this only supports bi-allelic sites.

The input mt needs to have the following entry fields:
  • GT: a CallExpression containing the genotype
  • adj: a BooleanExpression indicating whether the genotype is of high quality or not.

All expression arguments need to be expressions on the input mt.

freq row annotation

The freq row annotation is an Array of Structs, with each Struct containing the following fields:
  • AC: int32
  • AF: float64
  • AN: int32
  • homozygote_count: int32

Each element of the array corresponds to a stratification of the data, and the metadata about these annotations is stored in the globals.

Global freq_meta metadata annotation

The global annotation freq_meta is added to the input mt. It is a list of dicts. Each element of the list contains metadata on a frequency stratification, and the index in the list corresponds to the index of that frequency stratification in the freq row annotation.

Global freq_meta_sample_count annotation

The global annotation freq_meta_sample_count is added to the input mt. This is a sample count per sample grouping defined in the freq_meta global annotation.

The additional_strata_expr parameter

If the additional_strata_expr parameter is used, frequencies will be computed for each of the strata dictionaries across all values. For example, if additional_strata_expr is set to [{‘platform’: mt.platform}, {‘platform’: mt.platform, ‘pop’: mt.pop}, {‘age_bin’: mt.age_bin}], then frequencies will be computed for each of the values of mt.platform, each of the combined values of mt.platform and mt.pop, and each of the values of mt.age_bin.

The downsamplings parameter

If the downsamplings parameter is used without the downsampling_expr, frequencies will be computed for all samples and by population (if pop_expr is specified) by downsampling the number of samples without replacement to each of the numbers specified in the downsamplings array, provided that there are enough samples in the dataset. In addition, if pop_expr is specified, a downsampling to each of the exact number of samples present in each population is added. Note that samples are randomly sampled only once, meaning that the lower downsamplings are subsets of the higher ones. If the downsampling_expr parameter is used with the downsamplings parameter, the downsamplings parameter informs the function which downsampling groups were already created and are to be used in the frequency calculation.

The downsampling_expr and ds_pop_counts parameters

If the downsampling_expr parameter is used, downsamplings must also be set, and frequencies will be computed for all samples and by population (if pop_expr is specified) using the downsampling indices to each of the numbers specified in the downsamplings array. The function expects a ‘global_idx’, and if pop_expr is used, a ‘pop_idx’, within the downsampling_expr to be used to determine if a sample belongs within a certain downsampling group, i.e. the index is less than the group size. The function annotate_downsamplings can be used to create the downsampling_expr, downsamplings, and ds_pop_counts expressions.

The entry_agg_funcs parameter

If the entry_agg_funcs parameter is used, the output MatrixTable will also contain the annotations specified in the entry_agg_funcs parameter. The keys of the dict are the names of the annotations and the values are tuples of functions. The first function is used to transform the mt entries in some way, and the second function is used to aggregate the output from the first function. For example, if entry_agg_funcs is set to {‘adj_samples’: (get_adj_expr, hl.agg.sum)}, then the output MatrixTable will contain an annotation adj_samples which is an array of the number of adj samples per stratum in each row.

Parameters:
  • mt (MatrixTable) – Input MatrixTable
  • sex_expr (Optional[StringExpression]) – When specified, frequencies are stratified by sex. If pop_expr is also specified, then a pop/sex stratification is added.
  • pop_expr (Optional[StringExpression]) – When specified, frequencies are stratified by population. If sex_expr is also specified, then a pop/sex stratification is added.
  • subpop_expr (Optional[StringExpression]) – When specified, frequencies are stratified by sub-continental population. Note that pop_expr is required as well when using this option.
  • additional_strata_expr (Union[List[Dict[str, StringExpression]], Dict[str, StringExpression], None]) – When specified, frequencies are stratified by the given additional strata. This can e.g. be used to stratify by platform, platform-pop, platform-pop-sex.
  • downsamplings (Optional[List[int]]) – When specified, frequencies are computed by downsampling the data to the number of samples given in the list. Note that if pop_expr is specified, downsamplings by population is also computed.
  • downsampling_expr (Optional[StructExpression]) – When specified, frequencies are computed using the downsampling indices in the provided StructExpression. Note that if pop_idx is specified within the struct, downsamplings by population is also computed.
  • ds_pop_counts (Optional[Dict[str, int]]) – When specified, frequencies are computed by downsampling the data to the number of samples per pop in the dict. The key is the population and the value is the number of samples.
  • entry_agg_funcs (Optional[Dict[str, Tuple[Callable, Callable]]]) – When specified, additional annotations are added to the output Table/MatrixTable. The keys of the dict are the names of the annotations and the values are tuples of functions. The first function is used to transform the mt entries in some way, and the second function is used to aggregate the output from the first function.
  • annotate_mt (bool) – Whether to return the full MatrixTable with annotations added instead of only a Table with freq and other annotations. Default is True.

Return type:
  Union[Table, MatrixTable]

Returns:
  MatrixTable or Table with freq annotation.

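A minimal sketch of a stratified frequency run; the dataset path and the meta column fields are assumptions, and GT and adj entry fields must already exist:

    import hail as hl
    from gnomad.utils.annotations import annotate_freq

    mt = hl.read_matrix_table("gs://my-bucket/dataset.mt")  # hypothetical path

    mt = annotate_freq(
        mt,
        sex_expr=mt.meta.sex_karyotype,
        pop_expr=mt.meta.pop,
    )

    # freq[0] is the adj frequency over all samples; its metadata lives in the globals.
    rows_ht = mt.rows()
    rows_ht.select(AC=rows_ht.freq[0].AC, AF=rows_ht.freq[0].AF).show()
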
gnomad.utils.annotations.annotate_downsamplings(t, downsamplings, pop_expr=None)[source]

Annotate MatrixTable or Table with downsampling groups.

Parameters:
  • t (Union[MatrixTable, Table]) – Input MatrixTable or Table.
  • downsamplings (List[int]) – List of downsampling sizes.
  • pop_expr (Optional[StringExpression]) – Optional expression for population group. When provided, population sample sizes are added as values to downsamplings.

Return type:
  Union[MatrixTable, Table]

Returns:
  MatrixTable or Table with downsampling annotations.

gnomad.utils.annotations.build_freq_stratification_list(sex_expr=None, pop_expr=None, subpop_expr=None, additional_strata_expr=None, downsampling_expr=None)[source]

Build a list of stratification groupings to be used in frequency calculations based on supplied parameters.

Note
This function is primarily used through annotate_freq but can be used independently if desired. The returned list of stratifications can be passed to generate_freq_group_membership_array.

Parameters:
  • sex_expr (Optional[StringExpression]) – When specified, the returned list contains a stratification for sex. If pop_expr is also specified, then the returned list also contains a pop/sex stratification.
  • pop_expr (Optional[StringExpression]) – When specified, the returned list contains a stratification for population. If sex_expr is also specified, then the returned list also contains a pop/sex stratification.
  • subpop_expr (Optional[StringExpression]) – When specified, the returned list contains a stratification for sub-continental population. Note that pop_expr is required as well when using this option.
  • additional_strata_expr (Union[List[Dict[str, StringExpression]], Dict[str, StringExpression], None]) – When specified, the returned list contains a stratification for each of the additional strata. This can e.g. be used to stratify by platform, platform-pop, platform-pop-sex.
  • downsampling_expr (Optional[StructExpression]) – When specified, the returned list contains a stratification for downsampling. If pop_expr is also specified, then the returned list also contains a downsampling/pop stratification.

Return type:
  List[Dict[str, StringExpression]]

Returns:
  List of dictionaries specifying stratification groups where the keys of each dictionary are strings and the values are corresponding expressions that define the values to stratify frequency calculations by.

gnomad.utils.annotations.generate_freq_group_membership_array(ht, strata_expr, downsamplings=None, ds_pop_counts=None, remove_zero_sample_groups=False, no_raw_group=False)[source]

Generate a Table with a ‘group_membership’ array for each sample indicating whether the sample belongs to specific stratification groups.

Note
This function is primarily used through annotate_freq but can be used independently if desired. Please see the annotate_freq function for more complete documentation.

The following global annotations are added to the returned Table:
  • freq_meta: Each element of the list contains metadata on a stratification group.
  • freq_meta_sample_count: sample count per grouping defined in freq_meta.
  • If downsamplings or ds_pop_counts are specified, they are also added as global annotations on the returned Table.

Each sample is annotated with a ‘group_membership’ array indicating whether the sample belongs to specific stratification groups. All possible value combinations are determined for each stratification grouping in the strata_expr list.

Parameters:
  • ht (Table) – Input Table that contains Expressions specified by strata_expr.
  • strata_expr (List[Dict[str, StringExpression]]) – List of dictionaries specifying stratification groups where the keys of each dictionary are strings and the values are corresponding expressions that define the values to stratify frequency calculations by.
  • downsamplings (Optional[List[int]]) – List of downsampling values to include in the stratifications.
  • ds_pop_counts (Optional[Dict[str, int]]) – Dictionary of population counts for each downsampling value.
  • remove_zero_sample_groups (bool) – Whether to remove groups with a sample count of 0. Default is False.
  • no_raw_group (bool) – Whether to remove the raw group from the ‘group_membership’ annotation and the ‘freq_meta’ and ‘freq_meta_sample_count’ global annotations. Default is False.

Return type:
  Table

Returns:
  Table with the ‘group_membership’ array annotation.

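A sketch tying this to build_freq_stratification_list; the column fields pop and sex_karyotype are assumptions:

    from gnomad.utils.annotations import (
        build_freq_stratification_list,
        generate_freq_group_membership_array,
    )

    cols_ht = mt.cols()  # one row per sample
    strata = build_freq_stratification_list(
        sex_expr=cols_ht.sex_karyotype,
        pop_expr=cols_ht.pop,
    )
    group_membership_ht = generate_freq_group_membership_array(cols_ht, strata)
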
gnomad.utils.annotations.compute_freq_by_strata(mt, entry_agg_funcs=None, select_fields=None, group_membership_includes_raw_group=True)[source]

Compute call statistics and, when passed, entry aggregation function(s) by strata.

The computed call statistics are AC, AF, AN, and homozygote_count. The entry aggregation functions are applied to the MatrixTable entries and aggregated. The MatrixTable must contain a ‘group_membership’ annotation (like the one added by generate_freq_group_membership_array) that is a list of bools to aggregate the columns by.

Note
This function is primarily used through annotate_freq but can be used independently if desired. Please see the annotate_freq function for more complete documentation.

Parameters:
  • mt (MatrixTable) – Input MatrixTable.
  • entry_agg_funcs (Optional[Dict[str, Tuple[Callable, Callable]]]) – Optional dict of entry aggregation functions. When specified, additional annotations are added to the output Table/MatrixTable. The keys of the dict are the names of the annotations and the values are tuples of functions. The first function is used to transform the mt entries in some way, and the second function is used to aggregate the output from the first function.
  • select_fields (Optional[List[str]]) – Optional list of row fields from mt to keep on the output Table.
  • group_membership_includes_raw_group (bool) – Whether the ‘group_membership’ annotation includes an entry for the ‘raw’ group, representing all samples. If False, the ‘raw’ group is inserted as the second element in all added annotations using the same ‘group_membership’, resulting in array lengths of ‘group_membership’+1. If True, the second element of each added annotation is still the ‘raw’ group, but the group membership is determined by the values in the second element of ‘group_membership’, and the output annotations will be the same length as ‘group_membership’. Default is True.

Return type:
  Table

Returns:
  Table or MatrixTable with allele frequencies by strata.

gnomad.utils.annotations.agg_by_strata(mt, entry_agg_funcs, select_fields=None, group_membership_ht=None, entry_agg_group_membership=None)[source]

Get row expression for annotations of each entry aggregation function(s) by strata.

The entry aggregation functions are applied to the MatrixTable entries and aggregated. If no group_membership_ht (like the one returned by generate_freq_group_membership_array) is supplied, mt must contain a ‘group_membership’ annotation that is a list of bools to aggregate the columns by.

Parameters:
  • mt (MatrixTable) – Input MatrixTable.
  • entry_agg_funcs (Dict[str, Tuple[Callable, Callable]]) – Dict of entry aggregation functions where the keys of the dict are the names of the annotations and the values are tuples of functions. The first function is used to transform the mt entries in some way, and the second function is used to aggregate the output from the first function.
  • select_fields (Optional[List[str]]) – Optional list of row fields from mt to keep on the output Table.
  • group_membership_ht (Optional[Table]) – Optional Table containing group membership annotations to stratify the aggregations by. If not provided, the ‘group_membership’ annotation is expected to be present on mt.
  • entry_agg_group_membership (Optional[Dict[str, List[dict]]]) – Optional dict indicating the subset of group strata in ‘freq_meta’ to run the entry aggregation functions on. The keys of the dict can be any of the keys in entry_agg_funcs and the values are lists of dicts. Each dict in the list contains the strata in ‘freq_meta’ to use for the corresponding entry aggregation function. If provided, ‘freq_meta’ must be present in group_membership_ht or mt and represent the same strata as those in ‘group_membership’. If not provided, all entries of the ‘group_membership’ annotation will have the entry aggregation functions applied to them.

Return type:
  Table

Returns:
  Table with annotations of stratified aggregations.

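A sketch of both entry points, assuming mt already carries the ‘group_membership’ column annotation and an adj entry field; the annotation name n_adj is hypothetical:

    import hail as hl
    from gnomad.utils.annotations import agg_by_strata, compute_freq_by_strata

    # Call statistics (AC, AF, AN, homozygote_count) per stratification group:
    freq_ht = compute_freq_by_strata(mt)

    # Arbitrary stratified entry aggregation, e.g. counting adj genotypes per group:
    agg_ht = agg_by_strata(
        mt,
        entry_agg_funcs={"n_adj": (lambda e: e.adj, hl.agg.count_where)},
    )
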
gnomad.utils.annotations.update_structured_annotations(ht, annotation_update_exprs, annotation_update_label=None)[source]

Update highly structured annotations on a Table.

This function recursively updates annotations defined by annotation_update_exprs, and if annotation_update_label is supplied, it checks if the sample annotations are different from the input and adds a flag to the Table indicating which annotations have been updated for each sample.

Parameters:
  • ht (Table) – Input Table with structured annotations to update.
  • annotation_update_exprs (Dict[str, Expression]) – Dictionary of annotations to update, structured as they are structured on the input ht.
  • annotation_update_label (Optional[str]) – Optional string of the label to use for an annotation indicating which annotations have been updated. Default is None, so no annotation is added.

Return type:
  Table

Returns:
  Table with updated annotations and optionally a flag indicating which annotations were changed.

gnomad.utils.annotations.add_gks_vrs(input_locus, input_vrs)[source]

Generate a dictionary containing VRS information from a given locus and struct of VRS information.

Dict will have GA4GH GKS VRS structure.

Parameters:
  • input_locus (locus) – Locus field from a struct (locus of result of running .collect() on a Hail table).
  • input_vrs (struct) – VRS struct (such as from a ht.info.vrs field).

Return type:
  dict

Returns:
  Python dictionary conforming to GA4GH GKS VRS structure.

gnomad.utils.annotations.add_gks_va(input_struct, label_name='gnomAD', label_version='3.1.2', ancestry_groups=None, ancestry_groups_dict=None, by_sex=False, freq_index_dict=None)[source]

Generate Python dictionary containing GKS VA annotations.

Populate the dictionary with frequency information conforming to the GKS VA frequency schema. If ancestry_groups or by_sex is provided, also include subcohort schemas for each cohort. If input_struct has mean_depth, it is added to ancillaryResults. This annotation is added under the gks_va_freq_dict field of the table. The focusAllele field is not populated, and must be filled in by the caller.

Parameters:
  • input_struct (struct) – Hail struct for a desired variant (such as result of running .collect()[0] on a Table).
  • label_name (str) – Label name to use within the returned dictionary. Example: “gnomAD”.
  • label_version (str) – String listing the version of the table being used. Example: “3.1.2”.
  • ancestry_groups (list) – List of strings of shortened names of cohorts to return results for. Example: [‘afr’, ‘fin’, ‘nfe’]. Default is None.
  • ancestry_groups_dict (dict) – Dict mapping shortened genetic ancestry group names to full names. Example: {‘afr’: ‘African/African American’}. Default is None.
  • by_sex (bool) – Boolean to include breakdown of cohorts by inferred sex (XX and XY) as well. Default is False.
  • freq_index_dict (dict) – Dict mapping groups to their index for freq info in ht.freq_index_dict[0]. Default is None.

Return type:
  dict

Returns:
  Dictionary containing GKS VA frequency information (split by ancestry groups and sex if desired) for the specified variant.

gnomad.utils.constraint

Script containing generic constraint functions that may be used in the constraint pipeline.

  • gnomad.utils.constraint.COVERAGE_CUTOFF – Minimum median exome coverage differentiating high coverage sites from low coverage sites.
  • gnomad.utils.constraint.annotate_with_mu(ht, ...) – Annotate SNP mutation rate for the input Table.
  • gnomad.utils.constraint.count_variants_by_group(ht) – Count number of observed or possible variants by context, ref, alt, and optionally methylation_level.
  • gnomad.utils.constraint.get_downsampling_freq_indices(...) – Get indices of dictionaries in meta dictionaries that only have the “downsampling” key with specified genetic_ancestry_label and “variant_quality” values.
  • gnomad.utils.constraint.downsampling_counts_expr(...) – Return an aggregation expression to compute an array of counts of all downsamplings found in freq_expr where specified criteria is met.
  • gnomad.utils.constraint.annotate_mutation_type(t) – Annotate mutation types.
  • gnomad.utils.constraint.trimer_from_heptamer(t) – Trim heptamer context to create trimer context.
  • gnomad.utils.constraint.collapse_strand(t) – Return the deduplicated context by collapsing DNA strands.
  • gnomad.utils.constraint.build_models(coverage_ht) – Build coverage and plateau models.
  • gnomad.utils.constraint.build_plateau_models(...) – Build plateau models to calibrate mutation rate to compute predicted proportion observed value.
  • gnomad.utils.constraint.build_coverage_model(...) – Build coverage model.
  • gnomad.utils.constraint.get_all_pop_lengths(ht, ...) – Get the minimum length of observed variant counts array for each population downsampling.
  • gnomad.utils.constraint.get_constraint_grouping_expr(...) – Collect annotations used for constraint groupings.
  • gnomad.utils.constraint.annotate_exploded_vep_for_constraint_groupings(ht) – Annotate Table with annotations used for constraint groupings.
  • gnomad.utils.constraint.compute_expected_variants(ht, ...) – Apply plateau models for all sites and for a population (if specified) to compute predicted proportion observed ratio and expected variant counts.
  • gnomad.utils.constraint.oe_aggregation_expr(ht, ...) – Get aggregation expressions to compute the observed:expected ratio for rows defined by filter_expr.
  • gnomad.utils.constraint.compute_pli(ht, ...) – Compute the pLI score using the observed and expected variant counts.
  • gnomad.utils.constraint.oe_confidence_interval(...) – Determine the confidence interval around the observed:expected ratio.
  • gnomad.utils.constraint.calculate_raw_z_score(...) – Compute the signed raw z-score using observed and expected variant counts.
  • gnomad.utils.constraint.get_constraint_flags(...) – Determine the constraint flags that define why constraint will not be calculated.
  • gnomad.utils.constraint.calculate_raw_z_score_sd(...) – Calculate the standard deviation of the raw z-score.
  • gnomad.utils.constraint.add_gencode_transcript_annotations(ht, ...) – Add GENCODE annotations to Table based on transcript id.


gnomad.utils.constraint.COVERAGE_CUTOFF = 30

Minimum median exome coverage differentiating high coverage sites from low coverage sites.

Low coverage sites require an extra calibration when computing the proportion of expected variation.

gnomad.utils.constraint.annotate_with_mu(ht, mutation_ht, mu_annotation='mu_snp')[source]

Annotate SNP mutation rate for the input Table.

Note
Function expects that ht includes mutation_ht’s key fields. Note that these annotations don’t need to be the keys of ht.

Parameters:
  • ht (Table) – Input Table to annotate.
  • mutation_ht (Table) – Mutation rate Table.
  • mu_annotation (str) – The name of mutation rate annotation in mutation_ht. Default is ‘mu_snp’.

Return type:
  Table

Returns:
  Table with mutational rate annotation added.

gnomad.utils.constraint.count_variants_by_group(ht, freq_expr=None, freq_meta_expr=None, count_singletons=False, count_downsamplings=(), downsamplings=None, additional_grouping=(), partition_hint=100, omit_methylation=False, use_table_group_by=False, singleton_expr=None, max_af=None)[source]

Count number of observed or possible variants by context, ref, alt, and optionally methylation_level.

Performs variant count aggregations based on specified criteria (count_singletons, count_downsamplings, and max_af), and grouped by: ‘context’, ‘ref’, ‘alt’, ‘methylation_level’ (optional), and all annotations provided in additional_grouping.

If variant allele frequency information is required based on other parameter selections (described in detail below) and freq_expr is not supplied, freq_expr defaults to ht.freq if it exists.

freq_expr should be an ArrayExpression of Structs with ‘AC’ and ‘AF’ annotations. This is the same format as the freq annotation that is created using annotate_freq().

Variant allele frequency information is needed when:
  • max_af is not None – freq_expr[0].AF is used to filter to only variants with a maximum allele frequency of max_af prior to counting variants. In the standard freq ArrayExpression annotated by annotate_freq(), this first element corresponds to the allele frequency information for high quality genotypes (adj).
  • count_singletons is True and singleton_expr is None – If singleton counts are requested and no expression is specified to determine whether a variant is a singleton, singleton_expr defaults to freq_expr[0].AC == 1. In the standard freq ArrayExpression annotated by annotate_freq(), this corresponds to an allele count of only 1 in the callset after filtering to high quality genotypes.
  • count_downsamplings is not empty – When downsampling counts are requested, freq_expr needs to contain frequency information for downsamplings within each population requested. In addition to needing freq_expr, this also requires the use of freq_meta_expr. If freq_meta_expr is None, it defaults to ht.freq_meta if it exists. Similar to freq_expr, freq_meta_expr is expected to have the same format as the freq_meta global annotation that is created using annotate_freq(). freq_meta_expr is used to determine the index of allele frequency information within freq_expr for each population requested and its downsamplings.

This function will return a Table with annotations used for grouping (‘context’, ‘ref’, ‘alt’, ‘methylation_level’ (optional), additional_grouping) and a ‘variant_count’ annotation.

Note
The following annotations should be present in ht:
  • ref – the reference allele
  • alt – the alternate base
  • context – trinucleotide genomic context
  • methylation_level – methylation level (optional if omit_methylation==True)
  • freq – allele frequency information (AC, AN, AF, homozygote count; not required if freq_expr is given)
  • freq_meta – an ordered list containing the frequency aggregation group for each element of the freq array row annotation (not required if freq_meta_expr is given)

Parameters:
  • ht (Table) – Input Hail Table.
  • freq_expr (Optional[ArrayExpression]) – ArrayExpression of Structs with ‘AC’ and ‘AF’ annotations. If freq_expr is None and any of count_downsamplings, max_af, and count_singletons is specified, freq_expr defaults to ht.freq.
  • freq_meta_expr (Optional[ArrayExpression]) – ArrayExpression of meta dictionaries corresponding to freq_expr. If count_downsamplings is specified and freq_meta_expr is None, freq_meta_expr defaults to ht.freq_meta.
  • count_singletons (bool) – Whether to count singletons (defined by singleton_expr). Default is False.
  • count_downsamplings (Tuple[str]) – Tuple of populations to use for downsampling counts. Default is ().
  • downsamplings (Optional[List[int]]) – Optional List of integers specifying what downsampling indices to obtain. Default is None, which will return all downsampling counts.
  • additional_grouping (Tuple[str]) – Additional features to group by, e.g. ‘exome_coverage’. Default is ().
  • partition_hint (int) – Target number of partitions for aggregation. Default is 100.
  • omit_methylation (bool) – Whether to omit ‘methylation_level’ from the grouping when counting variants. Default is False.
  • use_table_group_by (bool) – Whether to group ht before aggregating the variant counts. If use_table_group_by is False, the function will return a hl.StructExpression. Default is False.
  • singleton_expr (Optional[BooleanExpression]) – Expression for defining a singleton. When count_singletons is True and singleton_expr is None, the singleton expression defaults to freq_expr[0].AC == 1. Default is None.
  • max_af (Optional[float]) – Maximum variant allele frequency to keep. By default, no cutoff is applied.

Return type:
  Union[Table, Any]

Returns:
  Table including a ‘variant_count’ annotation and, if requested, singleton_count and downsampling counts.

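A sketch of a typical call, assuming ht carries the context/ref/alt/methylation_level and freq annotations described above:

    from gnomad.utils.constraint import count_variants_by_group

    counts_ht = count_variants_by_group(
        ht,
        count_singletons=True,
        additional_grouping=("exome_coverage",),
        max_af=0.001,
        use_table_group_by=True,
    )
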
gnomad.utils.constraint.get_downsampling_freq_indices(freq_meta_expr, pop='global', variant_quality='adj', genetic_ancestry_label=None, subset=None, downsamplings=None)[source]

Get indices of dictionaries in meta dictionaries that only have the “downsampling” key with specified genetic_ancestry_label and “variant_quality” values.

Parameters:
  • freq_meta_expr (ArrayExpression) – ArrayExpression containing the set of groupings for each element of the freq_expr array (e.g., [{‘group’: ‘adj’}, {‘group’: ‘adj’, ‘pop’: ‘nfe’}, {‘downsampling’: ‘5000’, ‘group’: ‘adj’, ‘pop’: ‘global’}]).
  • pop (str) – Population to use for filtering by the genetic_ancestry_label key in freq_meta_expr. Default is ‘global’.
  • variant_quality (str) – Variant quality to use for filtering by the ‘group’ key in freq_meta_expr. Default is ‘adj’.
  • genetic_ancestry_label (Optional[str]) – Label defining the genetic ancestry groups. If None, “gen_anc” or “pop” is used (in that order of preference) if present. Default is None.
  • subset (Optional[str]) – Subset to use for filtering by the ‘subset’ key in freq_meta_expr. Default is None, which will return all downsampling indices without a ‘subset’ key in freq_meta_expr.
  • downsamplings (Optional[List[int]]) – Optional List of integers specifying what downsampling indices to obtain. Default is None, which will return all downsampling indices.

Return type:
  ArrayExpression

Returns:
  ArrayExpression of indices of dictionaries in freq_meta_expr that only have the “downsampling” key with specified genetic_ancestry_label and “variant_quality” values.

gnomad.utils.constraint.downsampling_counts_expr(freq_expr, freq_meta_expr, pop='global', variant_quality='adj', singleton=False, max_af=None, genetic_ancestry_label=None, subset=None, downsamplings=None)[source]

Return an aggregation expression to compute an array of counts of all downsamplings found in freq_expr where specified criteria is met.

The frequency metadata (freq_meta_expr) should be in a similar format to the freq_meta annotation added by annotate_freq(). Each downsampling should have ‘group’, genetic_ancestry_label, and ‘downsampling’ keys. Included downsamplings are those where ‘group’ == variant_quality and genetic_ancestry_label == pop.

Parameters:
  • freq_expr (ArrayExpression) – ArrayExpression of Structs with ‘AC’ and ‘AF’ annotations.
  • freq_meta_expr (ArrayExpression) – ArrayExpression containing the set of groupings for each element of the freq_expr array (e.g., [{‘group’: ‘adj’}, {‘group’: ‘adj’, ‘pop’: ‘nfe’}, {‘downsampling’: ‘5000’, ‘group’: ‘adj’, ‘pop’: ‘global’}]).
  • pop (str) – Population to use for filtering by the genetic_ancestry_label key in freq_meta_expr. Default is ‘global’.
  • variant_quality (str) – Variant quality to use for filtering by the ‘group’ key in freq_meta_expr. Default is ‘adj’.
  • singleton (bool) – Whether to filter to only singletons before counting (AC == 1). Default is False.
  • max_af (Optional[float]) – Maximum variant allele frequency to keep. By default no allele frequency cutoff is applied.
  • genetic_ancestry_label (Optional[str]) – Label defining the genetic ancestry groups. If None, “gen_anc” or “pop” is used (in that order of preference) if present. Default is None.
  • subset (Optional[str]) – Subset to use for filtering by the ‘subset’ key in freq_meta_expr. Default is None, which will return all downsampling counts without a ‘subset’ key in freq_meta_expr. If specified, only downsamplings with the specified subset will be included.
  • downsamplings (Optional[List[int]]) – Optional List of integers specifying what downsampling indices to obtain. Default is None, which will return all downsampling counts.

Return type:
  ArrayExpression

Returns:
  Aggregation Expression for an array of the variant counts in downsamplings for specified population.

gnomad.utils.constraint.annotate_mutation_type(t, context_length=None, num_scan_context_length=100)[source]

Annotate mutation types.

The following annotations are added to the output Table:
  • cpg
  • transition
  • mutation_type – one of “CpG”, “non-CpG transition”, or “transversion”
  • mutation_type_model

Note
This function uses the term ‘mutation_type’ because ‘variant_type’ is already used in this repo to indicate a variant’s multiallelic and SNP/indel status.

Parameters:
  • t (Union[MatrixTable, Table]) – Input Table or MatrixTable.
  • context_length (Optional[int]) – Length of the ‘context’ annotation in ‘t’. If this is not specified, the value will be determined by examining the first num_scan_context_length values of the ‘context’ annotation. Default is None.
  • num_scan_context_length (Optional[int]) – Number of values in the ‘context’ annotation to use for determining context_length if it is not specified. If set to None, all values in ‘context’ will be used. Default is 100.

Return type:
  Union[MatrixTable, Table]

Returns:
  Table with mutation type annotations added.

gnomad.utils.constraint.trimer_from_heptamer(t)[source]

Trim heptamer context to create trimer context.

Parameters:
  t (Union[MatrixTable, Table]) – Input MatrixTable or Table with context annotation.

Return type:
  Union[MatrixTable, Table]

Returns:
  MatrixTable or Table with trimer context annotated.

gnomad.utils.constraint.collapse_strand(t)[source]

Return the deduplicated context by collapsing DNA strands.

Function returns the reverse complement for ‘ref’, ‘alt’, and ‘context’ if the reference allele is either ‘G’ or ‘T’.

The following annotations are added to the output Table:
  • was_flipped – whether the ‘ref’, ‘alt’, and ‘context’ were flipped (reverse complement taken)

Parameters:
  t (Union[Table, MatrixTable]) – Input Table or MatrixTable with ‘ref’, ‘alt’, and ‘context’ annotations.

Return type:
  Union[Table, MatrixTable]

Returns:
  Table with deduplicated context annotation (ref, alt, context, was_flipped).

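These context helpers are typically chained; a sketch assuming ht carries a heptamer ‘context’ annotation plus ‘ref’ and ‘alt’:

    from gnomad.utils.constraint import (
        annotate_mutation_type,
        collapse_strand,
        trimer_from_heptamer,
    )

    ht = trimer_from_heptamer(ht)    # heptamer context -> trimer context
    ht = collapse_strand(ht)         # reverse complement when ref is 'G' or 'T'
    ht = annotate_mutation_type(ht)  # adds cpg / transition / mutation_type
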
gnomad.utils.constraint.build_models(coverage_ht, weighted=False, pops=(), keys=('context', 'ref', 'alt', 'methylation_level', 'mu_snp'), high_cov_definition=30, upper_cov_cutoff=None, skip_coverage_model=False)[source]

Build coverage and plateau models.

This function builds models (plateau_models) using linear regression to calibrate mutation rate estimates against the proportion observed of each substitution, context, and methylation level in coverage_ht.

Two plateau models are fit, one for CpG transitions, and one for the remainder of sites (transversions and non-CpG transitions).

The plateau models only consider high coverage sites, or sites above a median coverage of high_cov_definition and median coverage below upper_cov_cutoff.

Plateau model: adjusts proportion of expected variation based on location in the genome and CpG status.
The x and y of the plateau models:
- x: mu_snp – mutation rate
- y: proportion observed (‘observed_variants’ or ‘observed_{pop}’ / ‘possible_variants’)

This function also builds models (coverage models) to calibrate the proportion of expected variation at low coverage sites (sites below high_cov_definition).

The coverage models are built by creating a scaling factor across all high coverage sites, applying this ratio to the low coverage sites, and running a linear regression.

Coverage model: corrects proportion of expected variation at low coverage sites. Low coverage sites are defined as sites with median coverage < high_cov_definition.

The x and y of the coverage model:
- x: log10 groupings of exome coverage at low coverage sites
- y: sum(‘observed_variants’) / (high_coverage_scale_factor * sum(‘possible_variants’ * ‘mu_snp’)) at low coverage sites

where high_coverage_scale_factor = sum(‘observed_variants’) / sum(‘possible_variants’ * ‘mu_snp’) at high coverage sites.

Note
This function expects that the input Table (coverage_ht) was created using get_proportion_observed_by_coverage, which means that coverage_ht should contain only high quality synonymous variants below 0.1% frequency.

This function also expects that the following fields are present in coverage_ht:
- context – trinucleotide genomic context
- ref – the reference allele
- alt – the alternate allele
- methylation_level – methylation level
- cpg – whether the site is a CpG site
- exome_coverage – median exome coverage at integer values between 1-100
- observed_variants – the number of observed variants in the dataset for each variant. Note that the term “variant” here refers to a specific substitution, context, methylation level, and coverage combination
- downsampling_counts_{pop} (optional) – array of observed variant counts per population after downsampling. Used only when pops is specified.
- mu_snp – mutation rate
- possible_variants – the number of possible variants in the dataset for each variant

Parameters:
  • coverage_ht (Table) – Input coverage Table.
  • weighted (bool) – Whether to weight the plateau models (a linear regression model) by ‘possible_variants’. Default is False.
  • pops (Tuple[str]) – List of populations used to build plateau models. Default is ().
  • keys (Tuple[str]) – Annotations used to group observed and possible variant counts. Default is (“context”, “ref”, “alt”, “methylation_level”, “mu_snp”).
  • high_cov_definition (int) – Lower median coverage cutoff. Sites with coverage above this cutoff are considered well covered. Default is COVERAGE_CUTOFF.
  • upper_cov_cutoff (Optional[int]) – Upper median coverage cutoff. Sites with coverage above this cutoff are excluded from the high coverage Table. Default is None.
  • skip_coverage_model (bool) – Whether to skip generating the coverage model. If set to True, None is returned instead of the coverage model. Default is False.

Return type:
  Tuple[Optional[Tuple[float, float]], StructExpression]

Returns:
  Coverage model and plateau models.

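A sketch of building the models, assuming coverage_ht was produced by get_proportion_observed_by_coverage:

    from gnomad.utils.constraint import build_models

    coverage_model, plateau_models = build_models(
        coverage_ht,
        weighted=True,           # weight the plateau regression by possible_variants
        high_cov_definition=30,  # matches the COVERAGE_CUTOFF default
    )
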
gnomad.utils.constraint.build_plateau_models(cpg_expr, mu_snp_expr, observed_variants_expr, possible_variants_expr, pops_observed_variants_array_expr=[], weighted=False)[source]

Build plateau models to calibrate mutation rate to compute predicted proportion observed value.

The x and y of the plateau models:
- x: mu_snp_expr
- y: observed_variants_expr / possible_variants_expr, or pops_observed_variants_array_expr[index] / possible_variants_expr if pops is specified

Parameters:
  • cpg_expr (BooleanExpression) – BooleanExpression noting whether a site is a CpG site.
  • mu_snp_expr (Float64Expression) – Float64Expression of the mutation rate.
  • observed_variants_expr (Int64Expression) – Int64Expression of the observed variant counts.
  • possible_variants_expr (Int64Expression) – Int64Expression of the possible variant counts.
  • pops_observed_variants_array_expr (List[ArrayExpression]) – Nested ArrayExpression with all observed variant counts ArrayNumericExpressions for specified populations, e.g., [[1, 1, 1], [1, 1, 1]]. Default is None.
  • weighted (bool) – Whether to generalize the model to weighted least squares using ‘possible_variants’. Default is False.

Return type:
  Dict[str, Union[Dict[bool, ArrayExpression], ArrayExpression]]

Returns:
  A dictionary of intercepts and slopes of plateau models. The keys are ‘total’ (for all sites) and ‘pop’ (optional; for populations). The value for ‘total’ is a dictionary (e.g., <DictExpression of type dict<bool, array<float64>>>), and the value for ‘pop’ is a nested list of dictionaries (e.g., <ArrayExpression of type array<array<dict<bool, array<float64>>>>>). The key of the dictionary in the nested list is CpG status (BooleanExpression), and the value is an ArrayExpression containing intercept and slope values.

gnomad.utils.constraint.build_coverage_model(low_coverage_oe_expr, log_coverage_expr)[source]

Build coverage model.

This function uses linear regression to build a model of log10(coverage) to correct the proportion of expected variation at low coverage sites.

The x and y of the coverage model:
- x: log_coverage_expr
- y: low_coverage_oe_expr

Parameters:
  • low_coverage_oe_expr (Float64Expression) – The Float64Expression of observed:expected ratio for a given coverage level.
  • log_coverage_expr (Float64Expression) – The Float64Expression of log10 coverage.

Return type:
  StructExpression

Returns:
  StructExpression with intercept and slope of the model.

+
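Example use (a sketch under assumed field names; low_cov_ht is a hypothetical Table of observed:expected ratios per low coverage level):

import hail as hl
from gnomad.utils.constraint import build_coverage_model

coverage_model = build_coverage_model(
    low_coverage_oe_expr=low_cov_ht.observed_variants / low_cov_ht.expected_variants,
    log_coverage_expr=hl.log10(low_cov_ht.exome_coverage),
)
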
gnomad.utils.constraint.get_all_pop_lengths(ht, pops, obs_expr)[source]

Get the minimum length of the observed variant counts array for each population downsampling.

The observed variant counts for each population in pops are specified by annotations on the obs_expr expression.

The function also performs a check that arrays of variant counts within population downsamplings all have the same lengths.

Parameters:
  • ht (Table) – Input Table containing obs_expr.
  • pops (Tuple[str]) – Populations used to categorize observed variant counts in downsamplings.
  • obs_expr (StructExpression) – Expression for the population observed variant counts. Should be a struct containing an array for each pop in pops.

Return type:
  List[Tuple[str, str]]

Returns:
  The minimum array length for each population.

gnomad.utils.constraint.get_constraint_grouping_expr(vep_annotation_expr, coverage_expr=None, include_transcript_group=True, include_canonical_group=True, include_mane_select_group=False)[source]

Collect annotations used for constraint groupings.

Function collects the following annotations:
  • annotation - 'most_severe_consequence' annotation in vep_annotation_expr
  • modifier - classic lof annotation from 'lof' annotation in vep_annotation_expr, LOFTEE annotation from 'lof' annotation in vep_annotation_expr, PolyPhen annotation from 'polyphen_prediction' in vep_annotation_expr, or "None" if none of these is defined
  • gene - 'gene_symbol' annotation inside vep_annotation_expr
  • coverage - exome coverage if coverage_expr is specified
  • transcript - id from 'transcript_id' in vep_annotation_expr (added when include_transcript_group is True)
  • canonical from vep_annotation_expr (added when include_canonical_group is True)
  • mane_select from vep_annotation_expr (added when include_mane_select_group is True)

Note

This function expects that the following fields are present in vep_annotation_expr:
- lof
- polyphen_prediction
- most_severe_consequence
- gene_symbol
- transcript_id (if include_transcript_group is True)
- canonical (if include_canonical_group is True)
- mane_select (if include_mane_select_group is True)

Parameters:
  • vep_annotation_expr (StructExpression) – StructExpression of VEP annotation.
  • coverage_expr (Optional[Int32Expression]) – Optional Int32Expression of exome coverage. Default is None.
  • include_transcript_group (bool) – Whether to include the transcript annotation in the groupings. Default is True.
  • include_canonical_group (bool) – Whether to include the canonical annotation in the groupings. Default is True.
  • include_mane_select_group (bool) – Whether to include the mane_select annotation in the groupings. Default is False.

Return type:
  Dict[str, Union[StringExpression, Int32Expression, BooleanExpression]]

Returns:
  A dictionary with keys as annotation names and values as the actual annotations.

gnomad.utils.constraint.annotate_exploded_vep_for_constraint_groupings(ht, vep_annotation='transcript_consequences', include_canonical_group=True, include_mane_select_group=False)[source]

Annotate Table with annotations used for constraint groupings.

Function explodes the specified VEP annotation (vep_annotation) and adds the following annotations:
  • annotation - 'most_severe_consequence' annotation in vep_annotation
  • modifier - classic lof annotation from 'lof' annotation in vep_annotation, LOFTEE annotation from 'lof' annotation in vep_annotation, PolyPhen annotation from 'polyphen_prediction' in vep_annotation, or "None" if none of these is defined
  • gene - 'gene_symbol' annotation inside vep_annotation
  • coverage - exome coverage in ht
  • transcript - id from 'transcript_id' in vep_annotation (added when include_transcript_group is True)
  • canonical from vep_annotation (added when include_canonical_group is True)
  • mane_select from vep_annotation (added when include_mane_select_group is True)

Note

This function expects that the following annotations are present in ht:
- vep
- exome_coverage

Parameters:
  • ht (Table) – Input Table.
  • vep_annotation (str) – Name of annotation in 'vep' annotation (one of "transcript_consequences" and "worst_csq_by_gene") that will be used for obtaining constraint annotations. Default is "transcript_consequences".
  • include_canonical_group (bool) – Whether to include the 'canonical' annotation in the groupings. Default is True. Ignored unless vep_annotation is "transcript_consequences".
  • include_mane_select_group (bool) – Whether to include the 'mane_select' annotation in the groupings. Default is False. Ignored unless vep_annotation is "transcript_consequences".

Return type:
  Tuple[Union[Table, MatrixTable], Tuple[str]]

Returns:
  A tuple of the input Table or MatrixTable with grouping annotations added and the names of the added annotations.

gnomad.utils.constraint.compute_expected_variants(ht, plateau_models_expr, mu_expr, cov_corr_expr, possible_variants_expr, cpg_expr, pop=None)[source]

Apply plateau models for all sites and for a population (if specified) to compute the predicted proportion observed ratio and expected variant counts.

Parameters:
  • ht (Table) – Input Table.
  • plateau_models_expr (StructExpression) – Linear models (output of build_models(), with the values of the dictionary formatted as a StructExpression of intercept and slope) that calibrate mutation rate to proportion observed for high coverage exomes. It includes models for CpG sites, non-CpG sites, and each population if specified.
  • mu_expr (Float64Expression) – Float64Expression of mutation rate.
  • possible_variants_expr (Int64Expression) – Int64Expression of possible variant counts.
  • cov_corr_expr (Float64Expression) – Float64Expression of corrected coverage expression.
  • cpg_expr (BooleanExpression) – BooleanExpression noting whether a site is a CpG site.
  • pop (Optional[str]) – Optional population to use when applying plateau model. Default is None.

Return type:
  Dict[str, Union[Float64Expression, Int64Expression]]

Returns:
  A dictionary with the predicted proportion observed ratio and expected variant counts.

gnomad.utils.constraint.oe_aggregation_expr(ht, filter_expr, pops=(), exclude_mu_sum=False)[source]

Get aggregation expressions to compute the observed:expected ratio for rows defined by filter_expr.

Return a Struct containing aggregation expressions to sum the number of observed variants, possible variants, expected variants, and mutation rate (if exclude_mu_sum is not True) for rows defined by filter_expr. The Struct also includes an aggregation expression for the observed:expected ratio.

The following annotations are in the returned StructExpression:
  • obs - the sum of observed variants filtered to filter_expr.
  • mu - the sum of mutation rate of variants filtered to filter_expr.
  • possible - possible number of variants filtered to filter_expr.
  • exp - expected number of variants filtered to filter_expr.
  • oe - observed:expected ratio of variants filtered to filter_expr.

If pops is specified:
  • pop_exp - Struct with the expected number of variants per population (for all pop in pops) filtered to filter_expr.
  • pop_obs - Struct with the observed number of variants per population (for all pop in pops) filtered to filter_expr.

Note

The following annotations should be present in ht:
  • observed_variants
  • mu
  • possible_variants
  • expected_variants

If pops is specified, the following annotations should also be present:
  • expected_variants_{pop} for all pop in pops
  • downsampling_counts_{pop} for all pop in pops

Parameters:
  • ht (Table) – Input Table to create observed:expected ratio aggregation expressions for.
  • filter_expr (BooleanExpression) – Boolean expression used to filter ht before aggregation.
  • pops (Tuple[str]) – List of populations to compute constraint metrics for. Default is ().
  • exclude_mu_sum (bool) – Whether to exclude the mu sum aggregation expression from the returned struct. Default is False.

Return type:
  StructExpression

Returns:
  StructExpression with observed:expected ratio aggregation expressions.

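Example use (a minimal sketch; assumes ht carries the annotations listed in the note above, plus a hypothetical annotation field):

from gnomad.utils.constraint import oe_aggregation_expr

syn_oe = ht.aggregate(
    oe_aggregation_expr(ht, filter_expr=ht.annotation == "synonymous_variant")
)
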
gnomad.utils.constraint.compute_pli(ht, obs_expr, exp_expr, expected_values=None, min_diff_convergence=0.001)[source]

Compute the pLI score using the observed and expected variant counts.

Full details on pLI can be found in the ExAC paper: Lek, M., Karczewski, K., Minikel, E. et al. Analysis of protein-coding genetic variation in 60,706 humans. Nature 536, 285–291 (2016).

pLI is the probability of being loss-of-function intolerant, and this function computes that probability using the expectation-maximization (EM) algorithm.

We assume a 3 state model, where each gene fits into one of three categories with respect to loss-of-function variation sensitivity:

  • Null: where protein truncating variation is completely tolerated by natural selection.
  • Recessive (Rec): where heterozygous pLoFs are tolerated but homozygous pLoFs are not.
  • Haploinsufficient (LI): where heterozygous pLoFs are not tolerated.

The function requires the expected amount of loss-of-function depletion for each of these states. The default provided is based on the observed depletion of protein-truncating variation in the Blekhman autosomal recessive and ClinGen dosage sensitivity gene sets (Supplementary Information Table 12 of the above reference):

  • Null: 1.0, assuming tolerant genes have the expected amount of truncating variation.
  • Rec: 0.463, derived from the empirical mean observed/expected rate of truncating variation for recessive disease genes.
  • LI: 0.089, derived from the empirical mean observed/expected rate of truncating variation for severe haploinsufficient genes.

The output StructExpression will include the following annotations:

  • pLI: Probability of loss-of-function intolerance; probability that the transcript falls into the distribution of haploinsufficient genes.
  • pNull: Probability that the transcript falls into the distribution of unconstrained genes.
  • pRec: Probability that the transcript falls into the distribution of recessive genes.

Parameters:
  • ht (Table) – Input Table containing obs_expr and exp_expr.
  • obs_expr (Int64Expression) – Expression for the number of observed variants on each gene or transcript in ht.
  • exp_expr (Float64Expression) – Expression for the number of expected variants on each gene or transcript in ht.
  • expected_values (Optional[Dict[str, float]]) – Dictionary containing the expected values for 'Null', 'Rec', and 'LI' to use as starting values.
  • min_diff_convergence (float) – Minimum iteration change in LI to consider the EM model convergence criteria as met. Default is 0.001.

Return type:
  StructExpression

Returns:
  StructExpression of pLI scores.

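Example use (a minimal sketch; obs_lof and exp_lof are hypothetical per-transcript pLoF observed and expected count annotations):

from gnomad.utils.constraint import compute_pli

ht = ht.annotate(pli=compute_pli(ht, obs_expr=ht.obs_lof, exp_expr=ht.exp_lof))
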
gnomad.utils.constraint.oe_confidence_interval(obs_expr, exp_expr, alpha=0.05)[source]

Determine the confidence interval around the observed:expected ratio.

For a given pair of observed (obs_expr) and expected (exp_expr) values, the function computes the density of the Poisson distribution (performed using Hail's dpois module) with fixed k (x in dpois is set to the observed number of variants) over a range of lambda (lamb in dpois) values. The lambda values are given by the expected number of variants times a varying parameter ranging between 0 and 2 (the observed:expected ratio is typically between 0 and 1, so we want to extend the upper bound of the confidence interval to capture this). The cumulative density function of the Poisson distribution density is computed, and the value of the varying parameter is extracted at the points corresponding to alpha (defaults to 5%) and 1-alpha (defaults to 95%) to indicate the lower and upper bounds of the confidence interval.

The following annotations are in the output StructExpression:
  • lower - the lower bound of the confidence interval
  • upper - the upper bound of the confidence interval

Parameters:
  • obs_expr (Int64Expression) – Expression for the observed variant counts of pLoF, missense, or synonymous variants in ht.
  • exp_expr (Float64Expression) – Expression for the expected variant counts of pLoF, missense, or synonymous variants in ht.
  • alpha (float) – The significance level used to compute the confidence interval. Default is 0.05.

Return type:
  StructExpression

Returns:
  StructExpression for the confidence interval lower and upper bounds.

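To make the procedure concrete, here is a minimal NumPy/SciPy re-implementation of the idea described above (an illustrative sketch, not the Hail implementation):

import numpy as np
from scipy.stats import poisson

def oe_ci_sketch(obs: int, exp: float, alpha: float = 0.05):
    # Candidate o/e ratios between 0 and 2, as described above.
    ratios = np.linspace(1e-6, 2.0, 2000)
    # Poisson density with fixed k = obs and lambda = ratio * exp.
    dens = poisson.pmf(obs, ratios * exp)
    # Normalized cumulative density over the ratio grid.
    cdf = np.cumsum(dens) / dens.sum()
    lower = ratios[np.searchsorted(cdf, alpha)]
    upper = ratios[np.searchsorted(cdf, 1 - alpha)]
    return lower, upper

# e.g. oe_ci_sketch(10, 20.0) brackets the point estimate 10 / 20 = 0.5
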
gnomad.utils.constraint.calculate_raw_z_score(obs_expr, exp_expr)[source]

Compute the signed raw z-score using observed and expected variant counts.

The raw z-score is positive when the transcript had fewer variants than expected and negative when the transcript had more variants than expected.

Parameters:
  • obs_expr (Int64Expression) – Expression for the observed variant counts.
  • exp_expr (Float64Expression) – Expression for the expected variant counts.

Return type:
  StructExpression

Returns:
  StructExpression for the raw z-score.

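A common formulation of such a signed score (shown here for illustration only; the library's exact definition may differ) is the square root of the chi-squared statistic, signed by the direction of the deviation:

import hail as hl

# Positive when obs < exp (depletion), negative when obs > exp (excess).
raw_z = hl.sqrt((obs_expr - exp_expr) ** 2 / exp_expr) * hl.if_else(obs_expr > exp_expr, -1, 1)
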
gnomad.utils.constraint.get_constraint_flags(exp_expr, raw_z_expr, raw_z_lower_threshold=-5.0, raw_z_upper_threshold=5.0, flag_postfix='')[source]

Determine the constraint flags that define why constraint will not be calculated.

Flags which are added:
  • "no_exp_{flag_postfix}" - for genes that have missing or zero expected variants.
  • "outlier_{flag_postfix}" - for genes that are raw z-score outliers: (raw_z_expr < raw_z_lower_threshold) or (raw_z_expr > raw_z_upper_threshold).

Parameters:
  • exp_expr (Float64Expression) – Expression for the expected variant counts of pLoF, missense, or synonymous variants.
  • raw_z_expr (Float64Expression) – Expression for the signed raw z-score of pLoF, missense, or synonymous variants.
  • raw_z_lower_threshold (Optional[float]) – Lower threshold for the raw z-score. When raw_z_expr is less than this threshold, it is considered an 'outlier'. Default is -5.0.
  • raw_z_upper_threshold (Optional[float]) – Upper threshold for the raw z-score. When raw_z_expr is greater than this threshold, it is considered an 'outlier'. Default is 5.0.
  • flag_postfix (str) – Postfix to add to the end of the constraint flag names.

Return type:
  Dict[str, Expression]

Returns:
  Dictionary containing expressions for constraint flags.

gnomad.utils.constraint.calculate_raw_z_score_sd(raw_z_expr, flag_expr, mirror_neg_raw_z=True)[source]

Calculate the standard deviation of the raw z-score.

When mirror_neg_raw_z is True, all the negative raw z-scores (defined by raw_z_expr) are combined with those same z-scores multiplied by -1 (to create a mirrored distribution).

Parameters:
  • raw_z_expr (Float64Expression) – Expression for the raw z-score.
  • flag_expr (StringExpression) – Expression for the constraint flags. The z-score will not be calculated if flags are present.
  • mirror_neg_raw_z (bool) – Whether the standard deviation should be computed using a mirrored distribution of negative raw_z_expr.

Return type:
  Expression

Returns:
  StructExpression containing the standard deviation of the raw z-score and the z-score.

gnomad.utils.constraint.add_gencode_transcript_annotations(ht, gencode_ht, annotations=['level', 'transcript_type'])[source]

Add GENCODE annotations to Table based on transcript id.

Note

Added annotations by default are:
- level
- transcript_type

Computed annotations are:
- chromosome
- cds_length
- num_coding_exons

Parameters:
  • ht (Table) – Input Table.
  • gencode_ht (Table) – Table with GENCODE annotations.
  • annotations (List[str]) – List of GENCODE annotations to add. Default is ["level", "transcript_type"]. Added annotations also become keys for the group by when computing "cds_length" and "num_coding_exons".

Return type:
  Table

Returns:
  Table with transcript annotations from GENCODE added.

gnomad.utils.file_utils

gnomad.utils.file_utils.file_exists(fname)
    Check whether a file exists.

gnomad.utils.file_utils.check_file_exists_raise_error(fname)
    Check whether the file or all files in a list of files exist and optionally raise an exception.

gnomad.utils.file_utils.write_temp_gcs(t, ...)

gnomad.utils.file_utils.select_primitives_from_ht(ht)
    Select only primitive types (string, int, float, bool) from a Table.

gnomad.utils.file_utils.get_file_stats(url)
    Get size (as both int and str) and md5 for file at specified URL.

gnomad.utils.file_utils.read_list_data(...)
    Read a file input into a python list (each line will be an element).

gnomad.utils.file_utils.repartition_for_join(ht_path)
    Calculate new partition intervals using input Table.

gnomad.utils.file_utils.file_exists(fname)[source]

Check whether a file exists.

Supports either local or Google cloud (gs://) paths. If the file is a Hail file (.ht, .mt, .bm, .parquet, .he, and .vds extensions), it checks that _SUCCESS is present.

Parameters:
  fname (str) – File name.

Return type:
  bool

Returns:
  Whether the file exists.

gnomad.utils.file_utils.check_file_exists_raise_error(fname, error_if_exists=False, error_if_not_exists=False, error_if_exists_msg='The following files already exist: ', error_if_not_exists_msg='The following files do not exist: ')[source]

Check whether the file or all files in a list of files exist and optionally raise an exception.

This can be useful when writing out files at the end of a pipeline: first check whether the file already exists and therefore requires the file to be removed or overwrite to be specified, so the pipeline doesn't fail.

Parameters:
  • fname (Union[str, List[str]]) – File path, or list of file paths, to check the existence of.
  • error_if_exists (bool) – Whether to raise an exception if any of the files exist. Default is False.
  • error_if_not_exists (bool) – Whether to raise an exception if any of the files do not exist. Default is False.
  • error_if_exists_msg (str) – Error message to print if any of the files exist.
  • error_if_not_exists_msg (str) – Error message to print if any of the files do not exist.

Return type:
  bool

Returns:
  Boolean indicating whether fname or all files in fname exist.

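Example use (the output path is hypothetical):

from gnomad.utils.file_utils import check_file_exists_raise_error

# Fail fast if the pipeline output already exists and overwrite was not requested.
check_file_exists_raise_error("gs://my-bucket/output.ht", error_if_exists=True)
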
gnomad.utils.file_utils.write_temp_gcs(t, gcs_path, overwrite=False, temp_path=None)[source]

Parameters:
  • t (Union[MatrixTable, Table]) – Input MatrixTable or Table.
  • gcs_path (str) – Destination GCS path to write to.
  • overwrite (bool) – Whether to overwrite existing data at gcs_path.
  • temp_path (Optional[str]) – Optional temporary path to write to before the final write.

Return type:
  None

gnomad.utils.file_utils.select_primitives_from_ht(ht)[source]

Select only primitive types (string, int, float, bool) from a Table.

Particularly useful for exporting a Table.

Parameters:
  ht (Table) – Input Table.

Return type:
  Table

Returns:
  Table with only primitive types selected.

gnomad.utils.file_utils.get_file_stats(url, project_id=None)[source]

Get size (as both int and str) and md5 for file at specified URL.

Typically used to get stats on VCFs.

Parameters:
  • url (str) – Path to file of interest.
  • project_id (Optional[str]) – Google project ID. Specify if URL points to a requester-pays bucket.

Return type:
  Tuple[int, str, str]

Returns:
  Tuple of file size (as int), file size (as str), and md5.

gnomad.utils.file_utils.read_list_data(input_file_path)[source]

Read a file input into a python list (each line will be an element).

Supports Google storage paths and .gz compression.

Parameters:
  input_file_path (str) – File path.

Return type:
  List[str]

Returns:
  List of lines.

gnomad.utils.file_utils.repartition_for_join(ht_path, new_partition_percent=1.1)[source]

Calculate new partition intervals using input Table.

Reading in all Tables using the same partition intervals (via _intervals) before they are joined makes the joins much more efficient. For more information, see: https://discuss.hail.is/t/room-for-improvement-when-joining-multiple-hts/2278/8

Parameters:
  • ht_path (str) – Path to Table to use for interval partition calculation.
  • new_partition_percent (float) – Percent of initial dataset partitions to use. Value should be greater than 1 so that the input Table will have more partitions for the join. Default is 1.1.

Return type:
  List[IntervalExpression]

Returns:
  List of IntervalExpressions calculated over the new set of partitions (number of partitions in HT * desired percent increase).

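Example use (a sketch with hypothetical paths), reading both Tables with the same intervals before a join:

import hail as hl
from gnomad.utils.file_utils import repartition_for_join

intervals = repartition_for_join("gs://bucket/a.ht", new_partition_percent=1.1)
ht_a = hl.read_table("gs://bucket/a.ht", _intervals=intervals)
ht_b = hl.read_table("gs://bucket/b.ht", _intervals=intervals)
joined = ht_a.join(ht_b)
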
gnomad.utils.filtering

gnomad.utils.filtering.filter_to_adj(mt)
    Filter genotypes to adj criteria.

gnomad.utils.filtering.filter_by_frequency(t, ...)
    Filter MatrixTable or Table with gnomAD-format frequency data (assumed bi-allelic/split).

gnomad.utils.filtering.combine_functions(...)
    Combine a list of boolean functions to an Expression using the specified operator.

gnomad.utils.filtering.filter_low_conf_regions(mt)
    Filter low-confidence regions.

gnomad.utils.filtering.filter_to_autosomes(t)
    Filter the Table or MatrixTable to autosomes only.

gnomad.utils.filtering.add_filters_expr(filters)
    Create an expression to create or add filters.

gnomad.utils.filtering.subset_samples_and_variants(...)
    Subset the MatrixTable or VariantDataset to the provided list of samples and their variants.

gnomad.utils.filtering.filter_to_clinvar_pathogenic(t)
    Return a MatrixTable or Table that filters the clinvar data to pathogenic and likely pathogenic variants.

gnomad.utils.filtering.filter_to_gencode_cds(t)
    Filter a Table/MatrixTable to only Gencode CDS regions in protein coding transcripts.

gnomad.utils.filtering.remove_fields_from_constant(...)
    Remove fields from a list and display any field(s) missing from the original list.

gnomad.utils.filtering.filter_x_nonpar(t)
    Filter to loci that are in non-PAR regions on chromosome X.

gnomad.utils.filtering.filter_y_nonpar(t)
    Filter to loci that are in non-PAR regions on chromosome Y.

gnomad.utils.filtering.filter_by_numeric_expr_range(t, ...)
    Filter rows in the Table/MatrixTable based on the range of a numeric expression.

gnomad.utils.filtering.filter_for_mu(ht[, ...])
    Filter to non-coding annotations and remove GERP outliers.

gnomad.utils.filtering.split_vds_by_strata(...)
    Split a VDS into multiple VDSs based on strata_expr.

gnomad.utils.filtering.filter_arrays_by_meta(...)
    Filter both the metadata array expression and the metadata-indexed expressions by items_to_filter.

gnomad.utils.filtering.filter_to_adj(mt)[source]

Filter genotypes to adj criteria.

Parameters:
  mt (MatrixTable) – Input MatrixTable.

Return type:
  MatrixTable

gnomad.utils.filtering.filter_by_frequency(t, direction, frequency=None, allele_count=None, population=None, subpop=None, downsampling=None, keep=True, adj=True)[source]

Filter MatrixTable or Table with gnomAD-format frequency data (assumed bi-allelic/split).

gnomAD frequency data format expectation is: Array[Struct(Array[AC], Array[AF], AN, homozygote_count, meta)].

At least one of frequency or allele_count is required.

Subpop can be specified without a population if desired.

Parameters:
  • t (Union[MatrixTable, Table]) – Input MatrixTable or Table
  • direction (str) – One of "above", "below", and "equal" (how to apply the filter)
  • frequency (float) – Frequency to filter by (one of frequency or allele_count is required)
  • allele_count (int) – Allele count to filter by (one of frequency or allele_count is required)
  • population (str) – Population in which to filter frequency
  • subpop (str) – Sub-population in which to filter frequency
  • downsampling (int) – Downsampling in which to filter frequency
  • keep (bool) – Whether to keep rows passing this frequency (passed to filter_rows)
  • adj (bool) – Whether to use adj frequency

Return type:
  Union[MatrixTable, Table]

Returns:
  Filtered MatrixTable or Table

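Example use (a sketch; keeps variants with adj allele frequency below 0.1% in the nfe population):

from gnomad.utils.filtering import filter_by_frequency

rare_ht = filter_by_frequency(ht, direction="below", frequency=0.001, population="nfe")
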
gnomad.utils.filtering.combine_functions(func_list, x, operator_func=operator.iand)[source]

Combine a list of boolean functions to an Expression using the specified operator.

Note

The operator_func is applied cumulatively from left to right of the func_list.

Parameters:
  • func_list (List[Callable[[bool], bool]]) – A list of boolean functions that can be applied to x.
  • x (StructExpression) – Expression to be passed to each function in func_list.
  • operator_func (Callable[[bool, bool], bool]) – Operator function to combine the functions in func_list. Default is operator.iand.

Return type:
  bool

Returns:
  A boolean from the combined operations.

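Example use (a minimal sketch; the row fields AF and AN are hypothetical):

import operator
from gnomad.utils.filtering import combine_functions

checks = [
    lambda x: x.AF < 0.01,  # rare
    lambda x: x.AN > 1000,  # well called
]
ht = ht.filter(combine_functions(checks, ht.row, operator_func=operator.iand))
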
gnomad.utils.filtering.filter_low_conf_regions(mt, filter_lcr=True, filter_decoy=True, filter_segdup=True, filter_exome_low_coverage_regions=False, filter_telomeres_and_centromeres=False, high_conf_regions=None)[source]

Filter low-confidence regions.

Parameters:
  • mt (Union[MatrixTable, Table]) – MatrixTable or Table to filter
  • filter_lcr (bool) – Whether to filter LCR regions
  • filter_decoy (bool) – Whether to filter decoy regions
  • filter_segdup (bool) – Whether to filter segdup regions
  • filter_exome_low_coverage_regions (bool) – Whether to filter exome low confidence regions
  • filter_telomeres_and_centromeres (bool) – Whether to filter telomeres and centromeres
  • high_conf_regions (Optional[List[str]]) – Paths to set of high confidence regions to restrict to (union of regions)

Return type:
  Union[MatrixTable, Table]

Returns:
  MatrixTable or Table with low confidence regions removed

gnomad.utils.filtering.filter_to_autosomes(t)[source]

Filter the Table or MatrixTable to autosomes only.

This assumes that the input contains a field named locus of type Locus.

Parameters:
  t (Union[MatrixTable, Table]) – Input MT/HT

Return type:
  Union[MatrixTable, Table]

Returns:
  MT/HT filtered to autosomes

gnomad.utils.filtering.add_filters_expr(filters, current_filters=None)[source]

Create an expression to create or add filters.

For each entry in the filters dictionary, if the value evaluates to True, then the key is added as a filter name.

Current filters are kept if provided using current_filters.

Parameters:
  • filters (Dict[str, BooleanExpression]) – Dictionary mapping filter name to the condition under which the filter applies.
  • current_filters (Optional[SetExpression]) – Existing filters to keep.

Return type:
  SetExpression

Returns:
  An expression that can be used to annotate the filters

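Example use (a minimal sketch; the filter conditions and field names are hypothetical):

from gnomad.utils.filtering import add_filters_expr

ht = ht.annotate(
    filters=add_filters_expr(
        filters={
            "AC0": ht.freq[0].AC == 0,
            "low_call_rate": ht.call_rate < 0.9,
        }
    )
)
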
gnomad.utils.filtering.subset_samples_and_variants(mtds, sample_path, header=True, table_key='s', sparse=False, gt_expr='GT', remove_dead_alleles=False)[source]

Subset the MatrixTable or VariantDataset to the provided list of samples and their variants.

Parameters:
  • mtds (Union[MatrixTable, VariantDataset]) – Input MatrixTable or VariantDataset
  • sample_path (str) – Path to a file with list of samples
  • header (bool) – Whether file with samples has a header. Default is True
  • table_key (str) – Key to sample Table. Default is "s"
  • sparse (bool) – Whether the MatrixTable is sparse. Default is False
  • gt_expr (str) – Name of field in MatrixTable containing genotype expression. Default is "GT"
  • remove_dead_alleles (bool) – Remove alleles observed in no samples. This option is currently only relevant when mtds is a VariantDataset. Default is False

Return type:
  Union[MatrixTable, VariantDataset]

Returns:
  MatrixTable or VariantDataset subsetted to specified samples and their variants

gnomad.utils.filtering.filter_to_clinvar_pathogenic(t, clnrevstat_field='CLNREVSTAT', clnsig_field='CLNSIG', clnsigconf_field='CLNSIGCONF', remove_no_assertion=True, remove_conflicting=True)[source]

Return a MatrixTable or Table that filters the clinvar data to pathogenic and likely pathogenic variants.

Example use:

from gnomad.resources.grch38.reference_data import clinvar
clinvar_ht = clinvar.ht()
clinvar_ht = filter_to_clinvar_pathogenic(clinvar_ht)

Parameters:
  • t (Union[MatrixTable, Table]) – Input dataset that contains clinvar data; can be either a MatrixTable or Table.
  • clnrevstat_field (str) – The field string for the expression that contains the review status of the clinical significance of clinvar variants.
  • clnsig_field (str) – The field string for the expression that contains the clinical significance of the clinvar variant.
  • clnsigconf_field (str) – The field string for the expression that contains the conflicting clinical significance values for the variant. For variants with no conflicting significance, this field should be undefined.
  • remove_no_assertion (bool) – Flag for removing entries in which the clnrevstat (clinical significance) has no assertions (zero stars).
  • remove_conflicting (bool) – Flag for removing entries with conflicting clinical interpretations.

Return type:
  Union[MatrixTable, Table]

Returns:
  Filtered MatrixTable or Table

gnomad.utils.filtering.filter_to_gencode_cds(t, gencode_ht=None)[source]

Filter a Table/MatrixTable to only Gencode CDS regions in protein coding transcripts.

Example use:

from gnomad.resources.grch37.reference_data import gencode
gencode_ht = gencode.ht()
filtered_ht = filter_to_gencode_cds(t, gencode_ht=gencode_ht)

Note

If no Gencode Table is provided, the default version of the Gencode Table resource for the genome build of the input Table/MatrixTable will be used.

Warning

This Gencode CDS interval filter does not take into account the transcript_id; it filters to any locus that is found in a CDS interval for any protein coding transcript. Therefore, if downstream analyses require filtering to CDS intervals by transcript, an additional step must be taken. For example, when filtering VEP transcript consequences, there may be cases where a variant is retained with this filter but is considered outside the CDS intervals of the transcript per the VEP predicted consequence of the variant.

Parameters:
  • t (Union[MatrixTable, Table]) – Input Table/MatrixTable to filter.
  • gencode_ht (Optional[Table]) – Gencode Table to use for filtering the input Table/MatrixTable to CDS regions. Default is None, which will use the default version of the Gencode Table resource.

Return type:
  Table

Returns:
  Table/MatrixTable filtered to loci in Gencode CDS intervals.

gnomad.utils.filtering.remove_fields_from_constant(constant, fields_to_remove)[source]

Remove fields from a list and display any field(s) missing from the original list.

Parameters:
  • constant (List[str]) – List of fields
  • fields_to_remove (List[str]) – List of fields to remove from constant

Return type:
  List[str]

gnomad.utils.filtering.filter_x_nonpar(t)[source]

Filter to loci that are in non-PAR regions on chromosome X.

Parameters:
  t (Union[Table, MatrixTable]) – Input Table or MatrixTable.

Return type:
  Union[Table, MatrixTable]

Returns:
  Filtered Table or MatrixTable.

gnomad.utils.filtering.filter_y_nonpar(t)[source]

Filter to loci that are in non-PAR regions on chromosome Y.

Parameters:
  t (Union[Table, MatrixTable]) – Input Table or MatrixTable.

Return type:
  Union[Table, MatrixTable]

Returns:
  Filtered Table or MatrixTable.

gnomad.utils.filtering.filter_by_numeric_expr_range(t, filter_expr, filter_range, keep_between=True, inclusive=True)[source]

Filter rows in the Table/MatrixTable based on the range of a numeric expression.

Parameters:
  • t (Union[MatrixTable, Table]) – Input Table/MatrixTable.
  • filter_expr (NumericExpression) – NumericExpression to apply filter_range to.
  • filter_range (tuple) – Range of values to apply to filter_expr.
  • keep_between (bool) – Whether to keep the values between filter_range instead of keeping values outside filter_range. Default is True.
  • inclusive (bool) – Whether or not to include the filter_range values themselves. Default is True.

Return type:
  Union[MatrixTable, Table]

Returns:
  Table/MatrixTable filtered to rows with the specified criteria.

gnomad.utils.filtering.filter_for_mu(ht, gerp_lower_cutoff=-3.9885, gerp_upper_cutoff=2.6607)[source]

Filter to non-coding annotations and remove GERP outliers.

Note

Values for gerp_lower_cutoff and gerp_upper_cutoff default to -3.9885 and 2.6607, respectively. These values were precalculated on the GRCh37 context table and define the 5th and 95th percentiles.

Parameters:
  • ht (Table) – Input Table.
  • gerp_lower_cutoff (float) – Minimum GERP score for variant to be included. Default is -3.9885.
  • gerp_upper_cutoff (float) – Maximum GERP score for variant to be included. Default is 2.6607.

Return type:
  Table

Returns:
  Table filtered to intron or intergenic variants with GERP outliers removed.

gnomad.utils.filtering.split_vds_by_strata(vds, strata_expr)[source]

Split a VDS into multiple VDSs based on strata_expr.

Parameters:
  • vds (VariantDataset) – Input VDS.
  • strata_expr – Expression whose values define the strata to split by.

Return type:
  Dict[str, VariantDataset]

Returns:
  Dictionary where strata value is key and VDS is value.

gnomad.utils.filtering.filter_arrays_by_meta(meta_expr, meta_indexed_exprs, items_to_filter, keep=True, combine_operator='and', exact_match=False)[source]

Filter both the metadata array expression and the metadata-indexed expressions by items_to_filter.

The items_to_filter can be used to filter in the following ways based on meta_expr items:
- By a list of keys, e.g. ["sex", "downsampling"].
- By specific key: value pairs, e.g. to filter where 'pop' is 'han' or 'papuan', {"pop": ["han", "papuan"]}, or where 'pop' is 'afr' and/or 'sex' is 'XX', {"pop": ["afr"], "sex": ["XX"]}.

The items can be kept or removed from meta_indexed_exprs and meta_expr based on the value of keep. For example, if meta_indexed_exprs is {'freq': ht.freq, 'freq_meta_sample_count': ht.index_globals().freq_meta_sample_count} and meta_expr is ht.freq_meta, then if keep is True, the items specified by items_to_filter, such as 'pop' = 'han', will be kept and all other items will be removed from ht.freq, ht.freq_meta_sample_count, and ht.freq_meta. meta_indexed_exprs can also be a single array expression such as ht.freq.

The filtering can also be applied such that all criteria must be met (combine_operator = "and") by the meta_expr item in order to be filtered, or at least one of the specified criteria must be met (combine_operator = "or") by the meta_expr item in order to be filtered.

The exact_match parameter can be used to apply the keep parameter to only items specified in the items_to_filter parameter. For example, by default, if keep is True, combine_operator is "and", and items_to_filter is ["sex", "downsampling"], then all items in meta_expr with both "sex" and "downsampling" as keys will be kept. However, if exact_match is True, then the items in meta_expr will only be kept if "sex" and "downsampling" are the only keys in the meta dict.

Parameters:
  • meta_expr (ArrayExpression) – Metadata expression that contains the values of the elements in meta_indexed_exprs. The most often used expression is freq_meta to index into a 'freq' array.
  • meta_indexed_exprs (Union[Dict[str, ArrayExpression], ArrayExpression]) – Either a Dictionary where the keys are the expression name and the values are the expressions indexed by the meta_expr, such as a 'freq' array, or just a single expression indexed by the meta_expr.
  • items_to_filter (Union[Dict[str, List[str]], List[str]]) – Items to filter by, either a list or a dictionary.
  • keep (bool) – Whether to keep or remove the items specified by items_to_filter.
  • combine_operator (str) – Whether to use "and" or "or" to combine the items specified by items_to_filter.
  • exact_match (bool) – Whether to apply the keep parameter to only the items specified in the items_to_filter parameter or to all items in meta_expr. See the example above for more details. Default is False.

Return type:
  Tuple[ArrayExpression, Union[Dict[str, ArrayExpression], ArrayExpression]]

Returns:
  A Tuple of the filtered metadata expression and a dictionary of metadata-indexed expressions when meta_indexed_exprs is a Dictionary, or a single filtered array expression when meta_indexed_exprs is a single array expression.

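Example use (a sketch of the 'pop' filtering case described above; assumes gnomAD-style freq and freq_meta annotations):

from gnomad.utils.filtering import filter_arrays_by_meta

freq_meta, filtered_exprs = filter_arrays_by_meta(
    meta_expr=ht.freq_meta,
    meta_indexed_exprs={"freq": ht.freq},
    items_to_filter={"pop": ["han", "papuan"]},
    keep=True,
    combine_operator="and",
)
ht = ht.annotate(freq=filtered_exprs["freq"])
ht = ht.annotate_globals(freq_meta=freq_meta)
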
gnomad.utils.gen_stats

gnomad.utils.gen_stats.to_phred(linear_expr)
    Compute the phred-scaled value of the linear-scale input.

gnomad.utils.gen_stats.from_phred(...)
    Compute the linear-scale value of the phred-scaled input.

gnomad.utils.gen_stats.get_median_and_mad_expr(...)
    Compute the median and median absolute deviation (MAD) for the given expression.

gnomad.utils.gen_stats.merge_stats_counters_expr(stats)
    Merge multiple stats counters, assuming that they were computed on non-overlapping data.

gnomad.utils.gen_stats.to_phred(linear_expr)[source]

Compute the phred-scaled value of the linear-scale input.

Parameters:
  linear_expr (NumericExpression) – Linear-scale input.

Return type:
  Float64Expression

Returns:
  Phred-scaled value.

gnomad.utils.gen_stats.from_phred(phred_score_expr)[source]

Compute the linear-scale value of the phred-scaled input.

Parameters:
  phred_score_expr (NumericExpression) – Phred-scaled value.

Return type:
  Float64Expression

Returns:
  Linear-scale value of the phred-scaled input.

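The two functions are inverses under the standard phred definition (assumed here): phred = -10 * log10(x) and x = 10 ** (-phred / 10). Example use with hypothetical fields:

from gnomad.utils.gen_stats import to_phred, from_phred

ht = ht.annotate(
    gq_linear=from_phred(ht.GQ),        # e.g. 20 -> 0.01
    err_phred=to_phred(ht.error_prob),  # e.g. 0.001 -> 30.0
)
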
gnomad.utils.gen_stats.get_median_and_mad_expr(metric_expr, k=1.4826)[source]

Compute the median and median absolute deviation (MAD) for the given expression.

Note

The default value of k assumes normally distributed data.

Parameters:
  • metric_expr (ArrayNumericExpression) – Expression to compute median and MAD for.
  • k (float) – The scaling factor for the MAD calculation. Default assumes normally distributed data.

Return type:
  StructExpression

Returns:
  Struct with median and MAD.

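Example use (a sketch; qc_metric is a hypothetical numeric field, the returned struct's field names are assumed, and the MAD is assumed to be the scaled form k * median(|x - median(x)|)):

import hail as hl
from gnomad.utils.gen_stats import get_median_and_mad_expr

stats = ht.aggregate(get_median_and_mad_expr(hl.agg.collect(ht.qc_metric)))
cutoff = stats.median + 4 * stats.mad  # e.g. flag outliers above median + 4 MADs
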
gnomad.utils.gen_stats.merge_stats_counters_expr(stats)[source]

Merge multiple stats counters, assuming that they were computed on non-overlapping data.

Examples:
- Merge stats computed on indels and SNVs separately
- Merge stats computed on bi-allelic and multi-allelic variants separately
- Merge stats computed on autosomes and sex chromosomes separately

Parameters:
  stats (ArrayExpression) – An array of stats counters to merge.

Return type:
  StructExpression

Returns:
  Merged stats Struct.

gnomad.utils.intervals

gnomad.utils.intervals.sort_intervals(intervals)
    Sort an array of intervals by start contig, then start position, then end contig, then end position.

gnomad.utils.intervals.union_intervals(intervals)
    Generate a list with the union of all intervals in the input list by merging overlapping intervals.

gnomad.utils.intervals.interval_length(interval)
    Return the total number of bases in an Interval.

gnomad.utils.intervals.sort_intervals(intervals)[source]

Sort an array of intervals by start contig, then start position, then end contig, then end position.

Parameters:
  intervals (List[Interval]) – Intervals to sort.

Returns:
  Sorted interval list.

gnomad.utils.intervals.union_intervals(intervals, is_sorted=False)[source]

Generate a list with the union of all intervals in the input list by merging overlapping intervals.

Parameters:
  • intervals (List[Interval]) – Intervals to merge.
  • is_sorted (bool) – If set, assumes intervals are already sorted; otherwise they will be sorted.

Returns:
  List of merged intervals.

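Example use (a sketch; hl.eval materializes Python Interval values from interval expressions):

import hail as hl
from gnomad.utils.intervals import union_intervals

intervals = [
    hl.eval(hl.parse_locus_interval("1:100-200", reference_genome="GRCh37")),
    hl.eval(hl.parse_locus_interval("1:150-300", reference_genome="GRCh37")),
]
merged = union_intervals(intervals)  # expected: the single interval 1:100-300
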
gnomad.utils.intervals.interval_length(interval)[source]

Return the total number of bases in an Interval.

Parameters:
  interval (Interval) – Input interval.

Return type:
  int

Returns:
  Total length of the interval.


gnomad.utils.liftover

gnomad.utils.liftover.GRCH37_to_GRCH38_CHAIN
    Path to chain file required to lift data from GRCh37 to GRCh38.

gnomad.utils.liftover.GRCH38_TO_GRCH37_CHAIN
    Path to chain file required to lift data from GRCh38 to GRCh37.

gnomad.utils.liftover.get_liftover_genome(t)
    Infer reference genome build of input data and assume destination reference genome build.

gnomad.utils.liftover.liftover_expr(locus, ...)
    Generate struct liftover expression.

gnomad.utils.liftover.default_lift_data(t[, ...])
    Lift input Table or MatrixTable from one reference build to another.

gnomad.utils.liftover.liftover_using_gnomad_map(ht, ...)
    Liftover a gnomAD v2 table using already-established liftover file.

gnomad.utils.liftover.GRCH37_to_GRCH38_CHAIN = 'gs://hail-common/references/grch37_to_grch38.over.chain.gz'

Path to chain file required to lift data from GRCh37 to GRCh38.

gnomad.utils.liftover.GRCH38_TO_GRCH37_CHAIN = 'gs://hail-common/references/grch38_to_grch37.over.chain.gz'

Path to chain file required to lift data from GRCh38 to GRCh37.

gnomad.utils.liftover.get_liftover_genome(t)[source]

Infer reference genome build of input data and assume destination reference genome build.

Adds the liftover chain to the source reference genome and the sequence to the destination reference genome. Returns a tuple containing both reference genomes in preparation for liftover.

Parameters:
  t (Union[MatrixTable, Table]) – Input Table or MatrixTable.

Return type:
  Tuple[ReferenceGenome, ReferenceGenome]

Returns:
  Tuple of source reference genome (with liftover chain added) and destination reference genome (with sequence loaded).

gnomad.utils.liftover.liftover_expr(locus, alleles, destination_reference)[source]

Generate struct liftover expression.

Struct contains:
  • locus: Liftover coordinates
  • alleles: Liftover alleles
  • original_locus: Locus prior to liftover
  • original_alleles: Alleles prior to liftover
  • locus_fail_liftover: Whether the locus failed liftover
  • ref_allele_mismatch: Whether the allele at index 0 of alleles (lifted over reference allele) doesn't match the allele at that position in the destination reference

Parameters:
  • locus (LocusExpression) – Input locus to lift over.
  • alleles (ArrayExpression) – Input alleles to lift over.
  • destination_reference (ReferenceGenome) – Destination reference genome.

Return type:
  StructExpression

Returns:
  Struct containing expressions for lifted over locus/alleles as well as original locus/alleles.

gnomad.utils.liftover.default_lift_data(t, remove_failed_sites=True)[source]

Lift input Table or MatrixTable from one reference build to another.

Parameters:
  • t (Union[MatrixTable, Table]) – Input Table or MatrixTable.
  • remove_failed_sites (bool) – Whether to remove sites that fail liftover. Default is True.

Return type:
  Union[MatrixTable, Table]

Returns:
  Table or MatrixTable with liftover annotations.

gnomad.utils.liftover.liftover_using_gnomad_map(ht, data_type)[source]

Liftover a gnomAD v2 table using an already-established liftover file.

Note

This function shuffles!

Parameters:
  • ht (Table) – Input Hail Table.
  • data_type (str) – Which gnomAD data type to map across. One of "exomes" or "genomes".

Returns:
  Lifted over Table.


gnomad.utils.plotting

gnomad.utils.plotting.new_show(t[, n, ...])

gnomad.utils.plotting.plot_hail_hist(hist_data)
    Plot histogram from Hail hist aggregation.

gnomad.utils.plotting.plot_multi_hail_hist(...)
    Plot multiple histograms on the same plot.

gnomad.utils.plotting.plot_hail_hist_cumulative(...)
    Plot cumulative histogram from Hail hist aggregation.

gnomad.utils.plotting.plot_hail_hist_both(...)

gnomad.utils.plotting.set_font_size(p[, ...])

gnomad.utils.plotting.linear_and_log_tabs(...)

gnomad.utils.plotting.plot_hail_file_metadata(t_path)
    Take path to hail Table or MatrixTable (gs://bucket/path/hail.mt), output Grid or Tabs, respectively.

gnomad.utils.plotting.scale_file_sizes(...)

gnomad.utils.plotting.get_rows_data(rows_files)

gnomad.utils.plotting.pair_plot(data[, ...])
    Plot each column of data against each other and return a grid of plots.

gnomad.utils.plotting.new_show(t, n=10, width=140, truncate=40, types=True)[source]

gnomad.utils.plotting.plot_hail_hist(hist_data, title='Plot', log=False, fill_color='#033649', outlier_fill_color='#036564', line_color='#033649', hover_mode='mouse', hide_zeros=False)[source]

Plot histogram from Hail hist aggregation.

hist_data can (and should) come straight from ht.aggregate(hl.agg.hist(ht.data, start, end, bins))

Parameters:
  • hist_data (Struct) – Data to plot
  • title (str) – Plot title
  • log (bool) – Whether the y-axis should be log
  • fill_color (str) – Color to fill the histogram bars that fall within the hist boundaries
  • outlier_fill_color (str) – Color to fill the histogram bars that fall outside the hist boundaries
  • line_color (str) – Color of the lines around the histogram bars
  • hover_mode (str) – Hover mode; one of 'mouse' (default), 'vline' or 'hline'
  • hide_zeros (bool) – Remove hist bars with 0 count

Return type:
  figure

Returns:
  Histogram plot

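Example use (a sketch; DP is a hypothetical numeric row field, and bokeh is used for display):

import hail as hl
from bokeh.io import show
from gnomad.utils.plotting import plot_hail_hist

hist_data = ht.aggregate(hl.agg.hist(ht.DP, 0, 100, 50))
p = plot_hail_hist(hist_data, title="DP distribution")
show(p)
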
gnomad.utils.plotting.plot_multi_hail_hist(hist_data, title='Plot', log=False, fill_color=None, outlier_fill_color=None, line_color='#033649', hover_mode='mouse', hide_zeros=False, alpha=None)[source]

Plot multiple histograms on the same plot.

Each histogram can (and should) come straight from ht.aggregate(hl.agg.hist(ht.data, start, end, bins))

Example usage:

plot_multi_hail_hist(ht.aggregate(hl.agg.group_by(ht.pop, hl.agg.hist(ht.data, start, end, bins))))

Parameters:
  • hist_data (Dict[str, Struct]) – Data to plot
  • title (str) – Plot title
  • log (bool) – Whether the y-axis should be log
  • fill_color (Dict[str, str]) – Color to fill the histogram bars that fall within the hist boundaries
  • outlier_fill_color (Dict[str, str]) – Color to fill the histogram bars that fall outside the hist boundaries
  • line_color (str) – Color of the lines around the histogram bars
  • hover_mode (str) – Hover mode; one of 'mouse' (default), 'vline' or 'hline'
  • hide_zeros (bool) – Remove hist bars with 0 count
  • alpha (float) – Alpha value (if None, then 1.0/len(hist_data) is used)

Return type:
  figure

Returns:
  Histogram plot

gnomad.utils.plotting.plot_hail_hist_cumulative(hist_data, title='Plot', normalize=True, line_color='#036564', line_width=3, log=False, hover_mode='mouse')[source]

Plot cumulative histogram from Hail hist aggregation.

hist_data can (and should) come straight from ht.aggregate(hl.agg.hist(ht.data, start, end, bins))

Parameters:
  • hist_data (Struct) – Data to plot
  • title (str) – Plot title
  • normalize (bool) – Whether to normalize the data (0,1)
  • line_color (str) – Color of the line
  • line_width (int) – Width of the line
  • log (bool) – Whether the y-axis should be log
  • hover_mode (str) – Hover mode; one of 'mouse' (default), 'vline' or 'hline'

Return type:
  figure

Returns:
  Histogram plot

gnomad.utils.plotting.plot_hail_hist_both(hist_data, title, normalize=True, log=False)[source]

Parameters:
  • hist_data (Struct) – Data to plot
  • title (str) – Plot title
  • normalize (bool) – Whether to normalize the data (0,1)
  • log (bool) – Whether the y-axis should be log

gnomad.utils.plotting.set_font_size(p, font_size='12pt')[source]

Parameters:
  font_size (str) – Font size to apply to the plot. Default is '12pt'.

gnomad.utils.plotting.linear_and_log_tabs(plot_func, **kwargs)[source]

Parameters:
  plot_func (Callable) – Plotting function to call.

Return type:
  Tabs

gnomad.utils.plotting.plot_hail_file_metadata(t_path)[source]

Take path to hail Table or MatrixTable (gs://bucket/path/hail.mt), output Grid or Tabs, respectively.

If an unordered Table is provided, a Figure with file sizes is output instead. If the metadata file or rows directory is missing, returns None.

Parameters:
  t_path (str) – Path to the Hail Table or MatrixTable.

Return type:
  Union[Grid, Tabs, figure, None]

gnomad.utils.plotting.scale_file_sizes(file_sizes)[source]

gnomad.utils.plotting.get_rows_data(rows_files)[source]

gnomad.utils.plotting.pair_plot(data, label_col=None, colors=None, tools='save,pan,box_zoom,reset,wheel_zoom,box_select,lasso_select,help', tooltip_cols=None)[source]

Plot each column of data against each other and return a grid of plots.

The diagonal contains a histogram of each column, or a density plot if labels are provided. The lower diagonal contains scatter plots of each column against each other. The upper diagonal is empty.

All columns should be numerical, with the exception of the label_col if provided. A dict mapping labels to specific colors can be supplied via the colors argument.

Parameters:
  • data (DataFrame) – Dataframe to plot
  • label_col (str) – Column of the DataFrame containing the labels
  • colors (Union[List[str], Dict[str, str]]) – RGB hex colors. If a dict is provided, it should contain the mapping of label to colors.
  • tools (str) – Tools for the resulting plots
  • tooltip_cols (List[str]) – Additional columns that should be displayed in tooltip

Return type:
  Column

Returns:
  Grid of plots (column of rows)


gnomad.utils.reference_genome

gnomad.utils.reference_genome.get_reference_ht(ref)
    Create a reference Table with locus and alleles (containing only the reference allele by default) from the given reference genome.

gnomad.utils.reference_genome.add_reference_sequence(ref)
    Add the fasta sequence to a Hail reference genome.

gnomad.utils.reference_genome.get_reference_genome(locus)
    Return the reference genome associated with the input Locus expression.

gnomad.utils.reference_genome.get_reference_ht(ref, contigs=None, excluded_intervals=None, add_all_substitutions=False, filter_n=True)[source]

Create a reference Table with locus and alleles (containing only the reference allele by default) from the given reference genome.

Note

If the contigs argument is not provided, all contigs (including obscure ones) will be added to the table. This can be slow, as contigs are added one by one.

Parameters:
  • ref (ReferenceGenome) – Input reference genome
  • contigs (Optional[List[str]]) – An optional list of contigs that the Table should include
  • excluded_intervals (Optional[List[Interval]]) – An optional list of intervals to exclude
  • add_all_substitutions (bool) – If set, then all possible substitutions are added in the alleles array
  • filter_n (bool) – If set, bases where the reference is unknown (n) are filtered

Return type:
  Table

gnomad.utils.reference_genome.add_reference_sequence(ref)[source]

Add the fasta sequence to a Hail reference genome.

Only GRCh37 and GRCh38 references are supported.

Parameters:
  ref (ReferenceGenome) – Input reference genome.

Return type:
  ReferenceGenome

+gnomad.utils.reference_genome.get_reference_genome(locus, add_sequence=False)[source]
+

Return the reference genome associated with the input Locus expression.

+
+
Parameters:
+
+
+
Return type:
+

ReferenceGenome

+
+
Returns:
+

Reference genome

+
+
+
+ +
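A minimal usage sketch (the Table path is hypothetical):

import hail as hl
from gnomad.utils.reference_genome import get_reference_genome

ht = hl.read_table("gs://my-bucket/variants.ht")  # hypothetical input Table keyed by locus
rg = get_reference_genome(ht.locus)
print(rg.name)  # e.g. 'GRCh38'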
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/api_reference/utils/release.html b/api_reference/utils/release.html new file mode 100644 index 000000000..28c2ffbb3 --- /dev/null +++ b/api_reference/utils/release.html @@ -0,0 +1,232 @@ + + + + + + + gnomad.utils.release — gnomad master documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

gnomad.utils.release

+ + + + + + + + + + + + +

gnomad.utils.release.make_faf_index_dict(...)

Create a look-up Dictionary for entries contained in the filter allele frequency annotation array.

gnomad.utils.release.make_freq_index_dict(...)

Create a look-up Dictionary for entries contained in the frequency annotation array.

gnomad.utils.release.make_freq_index_dict_from_meta(...)

Create a dictionary for accessing frequency array.

+
+
+gnomad.utils.release.make_faf_index_dict(faf_meta, groups=['adj'], pops=['afr', 'amr', 'asj', 'eas', 'fin', 'mid', 'nfe', 'remaining', 'sas'], sexes=['XX', 'XY'], label_delimiter='_')[source]
+

Create a look-up Dictionary for entries contained in the filter allele frequency annotation array.

+
+
Parameters:
+
    +
  • faf_meta (List[Dict[str, str]]) – Global annotation containing the set of groupings for each element of the faf array +(e.g., [{‘group’: ‘adj’}, {‘group’: ‘adj’, ‘pop’: ‘nfe’}])

  • +
  • groups (List[str]) – List of sample groups [adj, raw]. Default is GROUPS

  • +
  • pops (List[str]) – List of sample global population names for gnomAD data type. Default is POPS[CURRENT_MAJOR_RELEASE][“exomes”].

  • +
  • sexes (List[str]) – List of sample sexes used in VCF export. Default is SEXES

  • +
  • label_delimiter (str) – String used as delimiter when making group label combinations

  • +
+
+
Return type:
+

Dict[str, int]

+
+
Returns:
+

Dictionary of faf annotation population groupings, where values are the corresponding 0-based indices for the +groupings in the faf_meta array

+
+
+
+ +
+
+gnomad.utils.release.make_freq_index_dict(freq_meta, groups=['adj', 'raw'], pops=['afr', 'amr', 'asj', 'eas', 'fin', 'mid', 'nfe', 'remaining', 'sas'], sexes=['XX', 'XY'], subsets=['non_ukb'], downsamplings=None, label_delimiter='_')[source]
+

Create a look-up Dictionary for entries contained in the frequency annotation array.

+
+
Parameters:
+
    +
  • freq_meta (List[Dict[str, str]]) – List containing the set of groupings for each element of the freq array +(e.g., [{‘group’: ‘adj’}, {‘group’: ‘adj’, ‘pop’: ‘nfe’}])

  • +
  • groups (List[str]) – List of sample groups [adj, raw]. Default is GROUPS

  • +
  • pops (List[str]) – List of sample global population names for gnomAD data type. Default is POPS[CURRENT_MAJOR_RELEASE][“exomes”].

  • +
  • sexes (List[str]) – List of sample sexes used in VCF export. Default is SEXES

  • +
  • subsets (List[str]) – List of sample subsets in dataset. Default is SUBSETS[CURRENT_MAJOR_RELEASE]

  • +
  • downsamplings (Optional[List[int]]) – List of downsampling cohort sizes present in global frequency array

  • +
  • label_delimiter (str) – String used as delimiter when making group label combinations

  • +
+
+
Return type:
+

Dict[str, int]

+
+
Returns:
+

Dictionary keyed by the grouping combinations found in the frequency array, where values are the corresponding +0-based indices for the groupings in the freq_meta array

+
+
+
+ +
+
+gnomad.utils.release.make_freq_index_dict_from_meta(freq_meta, label_delimiter='_', sort_order=['subset', 'downsampling', 'popmax', 'grpmax', 'pop', 'gen_anc', 'subpop', 'sex', 'group'])[source]
+

Create a dictionary for accessing frequency array.

+

The dictionary is keyed by the grouping combinations found in the frequency metadata +array, where values are the corresponding 0-based indices for the groupings in the +frequency array. For example, if the freq_meta entry [{‘pop’: ‘nfe’}, {‘sex’: ‘XX’}] +corresponds to the 5th entry in the frequency array, the returned dictionary entry +would be {‘nfe_XX’: 4}.

+
+
Parameters:
+
    +
  • freq_meta (List[Dict[str, str]]) – List of dictionaries containing frequency metadata.

  • +
  • label_delimiter (str) – Delimiter to use when joining frequency metadata labels.

  • +
  • sort_order (Optional[List[str]]) – List of frequency metadata labels to use when sorting the dictionary.

  • +
+
+
Return type:
+

Dict[str, int]

+
+
Returns:
+

Dictionary of frequency metadata.

+
+
+
+ +
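A minimal sketch of how the returned dictionary is typically used to index into a freq array. The toy freq_meta and the Table ht below are hypothetical; key labels follow sort_order, e.g. 'nfe_adj' for {'group': 'adj', 'pop': 'nfe'}:

from gnomad.utils.release import make_freq_index_dict_from_meta

freq_meta = [
    {"group": "adj"},
    {"group": "raw"},
    {"group": "adj", "pop": "nfe"},
]
index_dict = make_freq_index_dict_from_meta(freq_meta)
# Expected to look like {'adj': 0, 'raw': 1, 'nfe_adj': 2}, which can then be
# used to pull out a specific frequency entry, e.g.:
# nfe_ac = ht.freq[index_dict["nfe_adj"]].AC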
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/api_reference/utils/slack.html b/api_reference/utils/slack.html new file mode 100644 index 000000000..d8900dc61 --- /dev/null +++ b/api_reference/utils/slack.html @@ -0,0 +1,214 @@ + + + + + + + gnomad.utils.slack — gnomad master documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

gnomad.utils.slack

+ + + + + + + + + +

gnomad.utils.slack.SlackClient(token)

Slack API client.

gnomad.utils.slack.slack_notifications(token, to)

Send a Slack notification after some code runs.

+
+
+class gnomad.utils.slack.SlackClient(token)[source]
+

Slack API client.

+
+
Parameters:
+

token (str) – Slack API token

+
+
+
+
+send_file(to, file=None, content=None, filename='data.txt', filetype='text', comment=None)[source]
+

Send a file to Slack channel(s) and/or user(s).

+
+
Parameters:
+
    +
  • to (Union[str, Iterable[str]]) – Channel(s) (prefixed with ‘#’) and/or user(s) (prefixed with ‘@’) to send message to

  • +
  • file (Optional[str]) – Path of file to upload

  • +
  • content (Optional[str]) – File content to upload

  • +
  • filename (str) – Filename of file

  • +
  • filetype (str) – File type identifier

  • +
  • comment (Optional[str]) – Text for message sharing file

  • +
+
+
+
+ +
+
+send_message(to, message, icon_emoji=None)[source]
+

Send a message to Slack channel(s) and/or user(s).

+
+
Parameters:
+
    +
  • to (Union[str, Iterable[str]]) – Channel(s) (prefixed with ‘#’) and/or user(s) (prefixed with ‘@’) to send message to

  • +
  • message (str) – Message content (long messages will be converted to snippets)

  • +
  • icon_emoji (Optional[str]) – Emoji to use as icon for message

  • +
+
+
+
+ +
+ +
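A minimal usage sketch (the token, channel, and user names are hypothetical placeholders):

from gnomad.utils.slack import SlackClient

client = SlackClient("xoxb-...")  # hypothetical Slack API token
client.send_message("#pipeline-alerts", "Run finished")  # hypothetical channel
client.send_file("@username", content="chr20\t12345\tA\tT", filename="hits.tsv")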
+
+gnomad.utils.slack.slack_notifications(token, to)[source]
+

Send a Slack notification after some code runs.

+

If the wrapped code block raises an exception, the notification will include the exception and stack trace.

+

Example usage:

+
with slack_notifications(token, "@username"):
+    run_analysis()
+
+
+
+
Parameters:
+
    +
  • token (str) – Slack API token

  • +
  • to (Union[str, Iterable[str]]) – Channel(s) (prefixed with ‘#’) and/or user(s) (prefixed with ‘@’) to send notification to

  • +
+
+
+
+ +
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/api_reference/utils/sparse_mt.html b/api_reference/utils/sparse_mt.html new file mode 100644 index 000000000..2ff8abaae --- /dev/null +++ b/api_reference/utils/sparse_mt.html @@ -0,0 +1,613 @@ + + + + + + + gnomad.utils.sparse_mt — gnomad master documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

gnomad.utils.sparse_mt

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

gnomad.utils.sparse_mt.compute_last_ref_block_end(mt)

Compute the genomic position of the most upstream reference block overlapping each row on a sparse MT.

gnomad.utils.sparse_mt.densify_sites(mt, ...)

Create a dense version of the input sparse MT at the sites in sites_ht reading the minimal amount of data required.

gnomad.utils.sparse_mt.get_as_info_expr(mt)

Return an allele-specific annotation Struct containing typical VCF INFO fields from GVCF INFO fields stored in the MT entries.

gnomad.utils.sparse_mt.get_site_info_expr(mt)

Create a site-level annotation Struct aggregating typical VCF INFO fields from GVCF INFO fields stored in the MT entries.

gnomad.utils.sparse_mt.default_compute_info(mt)

Compute a HT with the typical GATK allele-specific (AS) info fields as well as ACs and lowqual fields.

gnomad.utils.sparse_mt.split_info_annotation(...)

Split multi-allelic allele-specific info fields.

gnomad.utils.sparse_mt.split_lowqual_annotation(...)

Split multi-allelic low QUAL annotation.

gnomad.utils.sparse_mt.impute_sex_ploidy(mt)

Impute sex ploidy from a sparse MatrixTable.

gnomad.utils.sparse_mt.densify_all_reference_sites(...)

Densify a VariantDataset or Sparse MatrixTable at all sites in a reference Table.

gnomad.utils.sparse_mt.compute_stats_per_ref_site(...)

Compute stats per site in a reference Table.

gnomad.utils.sparse_mt.compute_coverage_stats(...)

Compute coverage statistics for every base of the reference_ht provided.

gnomad.utils.sparse_mt.get_allele_number_agg_func([...])

Get a transformation and aggregation function for computing the allele number.

gnomad.utils.sparse_mt.compute_allele_number_per_ref_site(...)

Compute the allele number per reference site.

gnomad.utils.sparse_mt.filter_ref_blocks(t)

Filter ref blocks out of the Table or MatrixTable.

+
+
+gnomad.utils.sparse_mt.compute_last_ref_block_end(mt)[source]
+

Compute the genomic position of the most upstream reference block overlapping each row on a sparse MT.

+

Note that since reference blocks do not extend beyond contig boundaries, only the position is kept.

+

This function returns a Table with that annotation (last_END_position).

+
+
Parameters:
+

mt (MatrixTable) – Input MatrixTable

+
+
Return type:
+

Table

+
+
Returns:
+

Output Table with last_END_position annotation

+
+
+
+ +
+
+gnomad.utils.sparse_mt.densify_sites(mt, sites_ht, last_END_positions_ht, semi_join_rows=True)[source]
+

Create a dense version of the input sparse MT at the sites in sites_ht reading the minimal amount of data required.

+

Note that only rows that appear both in mt and sites_ht are returned.

+
+
Parameters:
+
    +
  • mt (MatrixTable) – Input sparse MT

  • +
  • sites_ht (Table) – Desired sites to densify

  • +
  • last_END_positions_ht (Table) – Table storing positions of the furthest ref block (END tag)

  • +
  • semi_join_rows (bool) – Whether to filter the MT rows based on semi-join (default, better if sites_ht is large) or based on filter_intervals (better if sites_ht only contains a few sites)

  • +
+
+
Return type:
+

MatrixTable

+
+
Returns:
+

Dense MT filtered to the sites in sites_ht

+
+
+
+ +
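A minimal sketch of the typical call pattern, pairing densify_sites with compute_last_ref_block_end (the input paths are hypothetical):

import hail as hl
from gnomad.utils.sparse_mt import compute_last_ref_block_end, densify_sites

mt = hl.read_matrix_table("gs://my-bucket/sparse.mt")  # hypothetical sparse MT
sites_ht = hl.read_table("gs://my-bucket/sites.ht")  # hypothetical sites to densify
last_end_ht = compute_last_ref_block_end(mt)
dense_mt = densify_sites(mt, sites_ht, last_end_ht)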
+
+gnomad.utils.sparse_mt.get_as_info_expr(mt, sum_agg_fields=['QUALapprox'], int32_sum_agg_fields=['VarDP'], median_agg_fields=['ReadPosRankSum', 'MQRankSum'], array_sum_agg_fields=['SB', 'RAW_MQandDP'], alt_alleles_range_array_field='alt_alleles_range_array', treat_fields_as_allele_specific=False)[source]
+

Return an allele-specific annotation Struct containing typical VCF INFO fields from GVCF INFO fields stored in the MT entries.

+
+

Note

+
    +
  • If SB is specified in array_sum_agg_fields, it will be aggregated as +AS_SB_TABLE, according to GATK standard nomenclature.

  • +
  • If RAW_MQandDP is specified in array_sum_agg_fields, it will be used for +the MQ calculation and then dropped according to GATK recommendation.

  • +
  • If RAW_MQ and MQ_DP are given, they will be used for the MQ calculation +and then dropped according to GATK recommendation.

  • +
  • If the fields to be aggregated (sum_agg_fields, int32_sum_agg_fields, +median_agg_fields) are passed as a list of str, then they should correspond +to entry fields in mt or in mt.gvcf_info.

  • +
  • Priority is given to entry fields in mt over those in mt.gvcf_info in +case of a name clash.

  • +
  • If treat_fields_as_allele_specific is False, it’s expected that there is a +single value for each entry field to be aggregated. Then when performing the +aggregation per global alternate allele, that value is included in the +aggregation if the global allele is present in the entry’s list of local +alleles. If treat_fields_as_allele_specific is True, it’s expected that +each entry field to be aggregated has one value per local allele, and each +of those is mapped to a global allele for aggregation.

  • +
+
+
+
Parameters:
+
    +
  • mt (MatrixTable) – Input Matrix Table

  • +
  • sum_agg_fields (Union[List[str], Dict[str, NumericExpression]]) – Fields to aggregate using sum.

  • +
  • int32_sum_agg_fields (Union[List[str], Dict[str, NumericExpression]]) – Fields to aggregate as int32 using sum.

  • +
  • median_agg_fields (Union[List[str], Dict[str, NumericExpression]]) – Fields to aggregate using (approximate) median.

  • +
  • array_sum_agg_fields (Union[List[str], Dict[str, ArrayNumericExpression]]) – Fields to aggregate using array sum.

  • +
  • alt_alleles_range_array_field (str) – Annotation containing an array of the range +of alternate alleles e.g., hl.range(1, hl.len(mt.alleles))

  • +
  • treat_fields_as_allele_specific (bool) – Treat info fields as allele-specific. +Defaults to False.

  • +
+
+
Return type:
+

StructExpression

+
+
Returns:
+

Expression containing the AS info fields

+
+
+
+ +
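A minimal sketch, assuming a sparse MT whose entries carry GVCF INFO fields in gvcf_info (the input path is hypothetical). The default alt_alleles_range_array_field must exist as a row annotation, so it is added first:

import hail as hl
from gnomad.utils.sparse_mt import get_as_info_expr

mt = hl.read_matrix_table("gs://my-bucket/sparse.mt")  # hypothetical sparse MT
mt = mt.annotate_rows(alt_alleles_range_array=hl.range(1, hl.len(mt.alleles)))
mt = mt.annotate_rows(info=get_as_info_expr(mt))
info_ht = mt.rows()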
+
+gnomad.utils.sparse_mt.get_site_info_expr(mt, sum_agg_fields=['QUALapprox'], int32_sum_agg_fields=['VarDP'], median_agg_fields=['ReadPosRankSum', 'MQRankSum'], array_sum_agg_fields=['SB', 'RAW_MQandDP'])[source]
+

Create a site-level annotation Struct aggregating typical VCF INFO fields from GVCF INFO fields stored in the MT entries.

+
+

Note

+
    +
  • If RAW_MQandDP is specified in array_sum_agg_fields, it will be used for +the MQ calculation and then dropped according to GATK recommendation.

  • +
  • If RAW_MQ and MQ_DP are given, they will be used for the MQ calculation +and then dropped according to GATK recommendation.

  • +
  • If the fields to be aggregated (sum_agg_fields, int32_sum_agg_fields, +median_agg_fields) are passed as a list of str, then they should correspond +to entry fields in mt or in mt.gvcf_info.

  • +
  • Priority is given to entry fields in mt over those in mt.gvcf_info in +case of a name clash.

  • +
+
+
+
Parameters:
+
    +
  • mt (MatrixTable) – Input Matrix Table

  • +
  • sum_agg_fields (Union[List[str], Dict[str, NumericExpression]]) – Fields to aggregate using sum.

  • +
  • int32_sum_agg_fields (Union[List[str], Dict[str, NumericExpression]]) – Fields to aggregate as int32 using sum.

  • +
  • median_agg_fields (Union[List[str], Dict[str, NumericExpression]]) – Fields to aggregate using (approximate) median.

  • +
  • array_sum_agg_fields (Union[List[str], Dict[str, ArrayNumericExpression]]) – Fields to aggregate using array sum.

  • +
+
+
Return type:
+

StructExpression

+
+
Returns:
+

Expression containing the site-level info fields

+
+
+
+ +
+
+gnomad.utils.sparse_mt.default_compute_info(mt, site_annotations=False, as_annotations=False, quasi_as_annotations=True, n_partitions=5000, lowqual_indel_phred_het_prior=40, ac_filter_groups=None)[source]
+

Compute a HT with the typical GATK allele-specific (AS) info fields as well as ACs and lowqual fields.

+
+

Note

+
    +
  • This table doesn’t split multi-allelic sites.

  • +
  • At least one of site_annotations, as_annotations or quasi_as_annotations +must be True.

  • +
+
+
+
Parameters:
+
    +
  • mt (MatrixTable) – Input MatrixTable. Note that this table should be filtered to nonref sites.

  • +
  • site_annotations (bool) – Whether to generate site level info fields. Default is False.

  • +
  • as_annotations (bool) – Whether to generate allele-specific info fields using +allele-specific annotations in gvcf_info. Default is False.

  • +
  • quasi_as_annotations (bool) – Whether to generate allele-specific info fields using +non-allele-specific annotations in gvcf_info, but performing per allele +aggregations. This method can be used in cases where genotype data doesn’t +contain allele-specific annotations to approximate allele-specific annotations. +Default is True.

  • +
  • n_partitions (Optional[int]) – Optional number of desired partitions for output Table. If +specified, naive_coalesce is performed. Default is 5000.

  • +
  • lowqual_indel_phred_het_prior (int) – Phred-scaled prior for a het genotype at a +site with a low quality indel. Default is 40. We use 1/10k bases (phred=40) to +be more consistent with the filtering used by Broad’s Data Sciences Platform +for VQSR.

  • +
  • ac_filter_groups (Optional[Dict[str, Expression]]) – Optional dictionary of sample filter expressions to compute +additional groupings of ACs. Default is None.

  • +
+
+
Returns:
+

Table with info fields

+
+
Return type:
+

Table

+
+
+
+ +
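A minimal usage sketch (the input path is hypothetical; per the note above, the MT should be filtered to non-ref sites first):

import hail as hl
from gnomad.utils.sparse_mt import default_compute_info

mt = hl.read_matrix_table("gs://my-bucket/sparse.mt")  # hypothetical, filtered to non-ref sites
info_ht = default_compute_info(mt, quasi_as_annotations=True, n_partitions=1000)
info_ht.write("gs://my-bucket/info.ht")  # hypothetical output path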
+
+gnomad.utils.sparse_mt.split_info_annotation(info_expr, a_index)[source]
+

Split multi-allelic allele-specific info fields.

+
+
Parameters:
+
    +
  • info_expr (StructExpression) – Field containing info struct.

  • +
  • a_index (Int32Expression) – Allele index. Output by hl.split_multi or hl.split_multi_hts.

  • +
+
+
Return type:
+

StructExpression

+
+
Returns:
+

Info struct with split annotations.

+
+
+
+ +
+
+gnomad.utils.sparse_mt.split_lowqual_annotation(lowqual_expr, a_index)[source]
+

Split multi-allelic low QUAL annotation.

+
+
Parameters:
+
    +
  • lowqual_expr (ArrayExpression) – Field containing low QUAL annotation.

  • +
  • a_index (Int32Expression) – Allele index. Output by hl.split_multi or hl.split_multi_hts.

  • +
+
+
Return type:
+

BooleanExpression

+
+
Returns:
+

Low QUAL expression for particular allele.

+
+
+
+ +
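A minimal sketch of how both split helpers are typically applied after hl.split_multi (the input path and the AS_lowqual field name are assumptions):

import hail as hl
from gnomad.utils.sparse_mt import split_info_annotation, split_lowqual_annotation

ht = hl.read_table("gs://my-bucket/info.ht")  # hypothetical multi-allelic info HT
ht = hl.split_multi(ht)
ht = ht.annotate(
    info=split_info_annotation(ht.info, ht.a_index),
    AS_lowqual=split_lowqual_annotation(ht.AS_lowqual, ht.a_index),  # assumed field name
)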
+
+gnomad.utils.sparse_mt.impute_sex_ploidy(mt, excluded_calling_intervals=None, included_calling_intervals=None, normalization_contig='chr20', chr_x=None, chr_y=None, use_only_variants=False)[source]
+

Impute sex ploidy from a sparse MatrixTable.

+

Sex ploidy is imputed by normalizing the coverage of chromosomes X and Y using the coverage of an autosomal +chromosome (by default chr20).

+

Coverage is computed using the median block coverage (summed over the block size) and the non-ref coverage at +non-ref genotypes. If the use_only_variants argument is set to True, the mean coverage defined by only the +variants is used instead.

+
+
Parameters:
+
    +
  • mt (MatrixTable) – Input sparse Matrix Table

  • +
  • excluded_calling_intervals (Optional[Table]) – Optional table of intervals to exclude from the computation. Used only when +determining contig size (not used when computing chromosome depth) when use_only_variants is False.

  • +
  • included_calling_intervals (Optional[Table]) – Optional table of intervals to use in the computation. Used only when +determining contig size (not used when computing chromosome depth) when use_only_variants is False.

  • +
  • normalization_contig (str) – Which chromosome to normalize by

  • +
  • chr_x (Optional[str]) – Optional X Chromosome contig name (by default uses the X contig in the reference)

  • +
  • chr_y (Optional[str]) – Optional Y Chromosome contig name (by default uses the Y contig in the reference)

  • +
  • use_only_variants (bool) – Whether to use depth of variant data within calling intervals instead of reference data. +Default will only use reference data.

  • +
+
+
Return type:
+

Table

+
+
Returns:
+

Table with mean coverage over chromosomes 20, X, and Y, and sex chromosome ploidies based on normalized coverage.

+
+
+
+ +
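A minimal usage sketch (the input path is hypothetical):

import hail as hl
from gnomad.utils.sparse_mt import impute_sex_ploidy

mt = hl.read_matrix_table("gs://my-bucket/sparse.mt")  # hypothetical sparse MT
ploidy_ht = impute_sex_ploidy(mt, normalization_contig="chr20")
ploidy_ht.show(5)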
+
+gnomad.utils.sparse_mt.densify_all_reference_sites(mtds, reference_ht, interval_ht=None, row_key_fields=('locus',), entry_keep_fields=('GT',))[source]
+

Densify a VariantDataset or Sparse MatrixTable at all sites in a reference Table.

+
+
Parameters:
+
    +
  • mtds (Union[MatrixTable, VariantDataset]) – Input sparse MatrixTable or VariantDataset.

  • +
  • reference_ht (Table) – Table of reference sites.

  • +
  • interval_ht (Optional[Table]) – Optional Table of intervals to filter to.

  • +
  • row_key_fields (Union[Tuple[str], List[str], Set[str]]) – Fields to use as row key. Defaults to locus.

  • +
  • entry_keep_fields (Union[Tuple[str], List[str], Set[str]]) – Fields to keep in entries before performing the +densification. Defaults to GT.

  • +
+
+
Return type:
+

MatrixTable

+
+
Returns:
+

Densified MatrixTable.

+
+
+
+ +
+
+gnomad.utils.sparse_mt.compute_stats_per_ref_site(mtds, reference_ht, entry_agg_funcs, row_key_fields=('locus',), interval_ht=None, entry_keep_fields=None, row_keep_fields=None, entry_agg_group_membership=None, strata_expr=None, group_membership_ht=None, sex_karyotype_field=None)[source]
+

Compute stats per site in a reference Table.

+
+
Parameters:
+
    +
  • mtds (Union[MatrixTable, VariantDataset]) – Input sparse Matrix Table or VariantDataset.

  • +
  • reference_ht (Table) – Table of reference sites.

  • +
  • entry_agg_funcs (Dict[str, Tuple[Callable, Callable]]) – Dict of entry aggregation functions to perform on the +VariantDataset/MatrixTable. The keys of the dict are the names of the +annotations and the values are tuples of functions. The first function is used +to transform the mt entries in some way, and the second function is used to +aggregate the output from the first function.

  • +
  • row_key_fields (Union[Tuple[str], List[str]]) – Fields to use as row key. Defaults to locus.

  • +
  • interval_ht (Optional[Table]) – Optional table of intervals to filter to.

  • +
  • entry_keep_fields (Union[Tuple[str], List[str], Set[str]]) – Fields to keep in entries before performing the +densification in densify_all_reference_sites. Should include any fields +needed for the functions in entry_agg_funcs. By default, only GT or LGT is +kept.

  • +
  • row_keep_fields (Union[Tuple[str], List[str], Set[str]]) – Fields to keep in rows after performing the stats +aggregation. By default, only the row key fields are kept.

  • +
  • entry_agg_group_membership (Optional[Dict[str, List[dict[str, str]]]]) – Optional dict indicating the subset of group +strata in ‘freq_meta’ to use the entry aggregation functions on. The keys of +the dict can be any of the keys in entry_agg_funcs and the values are lists +of dicts. Each dict in the list contains the strata in ‘freq_meta’ to use for +the corresponding entry aggregation function. If provided, ‘freq_meta’ must be +present in group_membership_ht and represent the same strata as those in +‘group_membership’. If not provided, all entries of the ‘group_membership’ +annotation will have the entry aggregation functions applied to them.

  • +
  • strata_expr (Optional[List[Dict[str, StringExpression]]]) – Optional list of dicts of expressions to stratify by.

  • +
  • group_membership_ht (Optional[Table]) – Optional Table of group membership annotations.

  • +
  • sex_karyotype_field (Optional[str]) – Optional field to use to adjust genotypes for sex +karyotype before stats aggregation. If provided, the field must be present in +the columns of mtds (variant_data MT if mtds is a VDS) and use “XX” and +“XY” as values. If not provided, no sex karyotype adjustment is performed. +Default is None.

  • +
+
+
Return type:
+

Table

+
+
Returns:
+

Table of stats per site.

+
+
+
+ +
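A minimal sketch illustrating the (transform, aggregation) shape of entry_agg_funcs (the paths are hypothetical, and the 'n_called' annotation is an illustrative choice assuming a GT entry field):

import hail as hl
from gnomad.utils.sparse_mt import compute_stats_per_ref_site

vds = hl.vds.read_vds("gs://my-bucket/dataset.vds")  # hypothetical VDS
reference_ht = hl.read_table("gs://my-bucket/reference.ht")  # hypothetical reference sites

# The first function transforms each entry; the second aggregates the transformed values.
entry_agg_funcs = {"n_called": (lambda t: hl.is_defined(t.GT), hl.agg.count_where)}
stats_ht = compute_stats_per_ref_site(vds, reference_ht, entry_agg_funcs)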
+
+gnomad.utils.sparse_mt.compute_coverage_stats(mtds, reference_ht, interval_ht=None, coverage_over_x_bins=[1, 5, 10, 15, 20, 25, 30, 50, 100], row_key_fields=['locus'], strata_expr=None, group_membership_ht=None)[source]
+

Compute coverage statistics for every base of the reference_ht provided.

+
+
The following coverage stats are calculated:
    +
  • mean

  • +
  • median

  • +
  • total DP

  • +
  • fraction of samples with coverage above X, for each x in coverage_over_x_bins

  • +
+
+
+

The reference_ht is a Table that contains a row for each locus that coverage should be +computed on. It needs to be keyed by locus. The reference_ht can, e.g., be +created using get_reference_ht.

+
+
Parameters:
+
    +
  • mtds (Union[MatrixTable, VariantDataset]) – Input sparse MT or VDS

  • +
  • reference_ht (Table) – Input reference HT

  • +
  • interval_ht (Optional[Table]) – Optional Table containing intervals to filter to

  • +
  • coverage_over_x_bins (List[int]) – List of boundaries for computing samples over X

  • +
  • row_key_fields (List[str]) – List of row key fields to use for joining mtds with +reference_ht

  • +
  • strata_expr (Optional[List[Dict[str, StringExpression]]]) – Optional list of dicts containing expressions to stratify the +coverage stats by. Only one of group_membership_ht or strata_expr can be +specified.

  • +
  • group_membership_ht (Optional[Table]) – Optional Table containing group membership annotations +to stratify the coverage stats by. Only one of group_membership_ht or +strata_expr can be specified.

  • +
+
+
Return type:
+

Table

+
+
Returns:
+

Table with per-base coverage stats.

+
+
+
+ +
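A minimal sketch tying compute_coverage_stats to a reference Table built with get_reference_ht (the VDS path is hypothetical):

import hail as hl
from gnomad.utils.reference_genome import get_reference_ht
from gnomad.utils.sparse_mt import compute_coverage_stats

vds = hl.vds.read_vds("gs://my-bucket/dataset.vds")  # hypothetical VDS
ref_ht = get_reference_ht(hl.get_reference("GRCh38"), contigs=["chr20"])
coverage_ht = compute_coverage_stats(vds, ref_ht)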
+
+gnomad.utils.sparse_mt.get_allele_number_agg_func(gt_field='GT')[source]
+

Get a transformation and aggregation function for computing the allele number.

+

Can be used as an entry aggregation function in compute_stats_per_ref_site.

+
+
Parameters:
+

gt_field (str) – Genotype field to use for computing the allele number.

+
+
Return type:
+

Tuple[Callable, Callable]

+
+
Returns:
+

Tuple of functions to transform and aggregate the allele number.

+
+
+
+ +
+
+gnomad.utils.sparse_mt.compute_allele_number_per_ref_site(mtds, reference_ht, **kwargs)[source]
+

Compute the allele number per reference site.

+
+
Parameters:
+
    +
  • mtds (Union[MatrixTable, VariantDataset]) – Input sparse Matrix Table or VariantDataset.

  • +
  • reference_ht (Table) – Table of reference sites.

  • +
  • kwargs – Keyword arguments to pass to compute_stats_per_ref_site.

  • +
+
+
Return type:
+

Table

+
+
Returns:
+

Table of allele number per reference site.

+
+
+
+ +
+
+gnomad.utils.sparse_mt.filter_ref_blocks(t)[source]
+

Filter ref blocks out of the Table or MatrixTable.

+
+
Parameters:
+

t (Union[MatrixTable, Table]) – Input MT/HT

+
+
Return type:
+

Union[MatrixTable, Table]

+
+
Returns:
+

MT/HT with ref blocks removed

+
+
+
+ +
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/api_reference/utils/transcript_annotation.html b/api_reference/utils/transcript_annotation.html new file mode 100644 index 000000000..5973c1193 --- /dev/null +++ b/api_reference/utils/transcript_annotation.html @@ -0,0 +1,445 @@ + + + + + + + gnomad.utils.transcript_annotation — gnomad master documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

gnomad.utils.transcript_annotation

+

Utils module containing generic functions that are useful for adding transcript expression-aware annotations.

+ + + + + + + + + + + + + + + + + + + + + + + + + + + +

gnomad.utils.transcript_annotation.summarize_transcript_expression(mt)

Summarize a transcript expression MatrixTable by transcript, gene, and tissue.

gnomad.utils.transcript_annotation.get_expression_proportion(ht)

Calculate the proportion of expression of transcript to gene per tissue.

gnomad.utils.transcript_annotation.filter_expression_ht_by_tissues(ht)

Filter a Table with a row annotation for each tissue to only include specified tissues.

gnomad.utils.transcript_annotation.tissue_expression_ht_to_array(ht)

Convert a Table with a row annotation for each tissue to a Table with tissues as an array.

gnomad.utils.transcript_annotation.tx_filter_variants_by_csqs(ht)

Prepare a Table of variants with VEP transcript consequences for annotation.

gnomad.utils.transcript_annotation.tx_annotate_variants(ht, ...)

Annotate variants with transcript-based expression values or expression proportion from GTEx.

gnomad.utils.transcript_annotation.tx_aggregate_variants(ht)

Aggregate transcript-based expression values or expression proportion from GTEx.

gnomad.utils.transcript_annotation.perform_tx_annotation_pipeline(ht, ...)

One-stop usage of tx_filter_variants_by_csqs, tx_annotate_variants and tx_aggregate_variants.

+

Utils module containing generic functions that are useful for adding transcript expression-aware annotations.

+
+
+gnomad.utils.transcript_annotation.summarize_transcript_expression(mt, transcript_expression_expr='transcript_tpm', tissue_expr='tissue', summary_agg_func=None)[source]
+

Summarize a transcript expression MatrixTable by transcript, gene, and tissue.

+

The summary_agg_func argument allows the user to specify a Hail aggregation +function to use to summarize the expression by tissue. By default, the median is +used.

+

The returned Table has a row annotation for each tissue containing a struct with the +summarized tissue expression value (‘transcript_expression’) and the proportion of +expression of transcript to gene per tissue (‘expression_proportion’).

+

Returned Table Schema example:

+
Row fields:
+    'transcript_id': str
+    'gene_id': str
+    'tissue_1': struct {
+      transcript_expression: float64,
+      expression_proportion: float64
+    }
+    'tissue_2': struct {
+      transcript_expression: float64,
+      expression_proportion: float64
+    }
+
+Key: ['transcript_id', 'gene_id']
+
+
+
+
Parameters:
+
    +
  • mt (MatrixTable) – MatrixTable of transcript (rows) expression quantifications (entry) by +sample (columns).

  • +
  • transcript_expression_expr (Union[NumericExpression, str]) – Entry expression indicating transcript expression +quantification. Default is ‘transcript_tpm’.

  • +
  • tissue_expr (Union[StringExpression, str]) – Column expression indicating tissue type. Default is ‘tissue’.

  • +
  • summary_agg_func (Optional[Callable]) – Optional aggregation function to use to summarize the +transcript expression quantification by tissue. Example: hl.agg.mean. Default +is None, which will use a median aggregation.

  • +
+
+
Return type:
+

Table

+
+
Returns:
+

A Table of summarized transcript expression by tissue.

+
+
+
+ +
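A minimal usage sketch (the GTEx MatrixTable path is hypothetical; hl.agg.mean overrides the default median aggregation, as described above):

import hail as hl
from gnomad.utils.transcript_annotation import summarize_transcript_expression

mt = hl.read_matrix_table("gs://my-bucket/gtex_transcript_tpm.mt")  # hypothetical input
tx_ht = summarize_transcript_expression(mt, summary_agg_func=hl.agg.mean)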
+
+gnomad.utils.transcript_annotation.get_expression_proportion(ht)[source]
+

Calculate the proportion of expression of transcript to gene per tissue.

+
+
Parameters:
+

ht (Table) – Table of summarized transcript expression by tissue.

+
+
Return type:
+

StructExpression

+
+
Returns:
+

StructExpression containing the proportion of expression of transcript to +gene per tissue.

+
+
+
+ +
+
+gnomad.utils.transcript_annotation.filter_expression_ht_by_tissues(ht, tissues_to_keep=None, tissues_to_filter=None)[source]
+

Filter a Table with a row annotation for each tissue to only include specified tissues.

+
+
Parameters:
+
    +
  • ht (Table) – Table with a row annotation for each tissue.

  • +
  • tissues_to_keep (Optional[List[str]]) – Optional list of tissues to keep in the Table. Default is +all non-key rows in the Table.

  • +
  • tissues_to_filter (Optional[List[str]]) – Optional list of tissues to exclude from the Table.

  • +
+
+
Return type:
+

Table

+
+
Returns:
+

Table with only specified tissues.

+
+
+
+ +
+
+gnomad.utils.transcript_annotation.tissue_expression_ht_to_array(ht, tissues_to_keep=None, tissues_to_filter=None, annotations_to_extract=('transcript_expression', 'expression_proportion'))[source]
+

Convert a Table with a row annotation for each tissue to a Table with tissues as an array.

+
+
The output is a Table with one of the two formats:
    +
  • An annotation of ‘tissue_expression’ containing an array of structs by +tissue, where each element of the array is the Table’s row value for a given +tissue.

    +
    +

    Example:

    +
    tissue_expression': array<struct {
    +    transcript_expression: float64,
    +    expression_proportion: float64
    +}>
    +
    +
    +
    +
  • +
  • One array annotation for each field defined in the ‘annotations_to_extract’ +argument, where each array is an array of the given field values by tissue.

    +
    +

    Example:

    +
    'transcript_expression': array<float64>
    +'expression_proportion': array<float64>
    +
    +
    +
    +
  • +
+
+
+

The order of tissues in the array is indicated by the “tissues” global annotation.

+
+
Parameters:
+
    +
  • ht (Table) – Table with a row annotation for each tissue.

  • +
  • tissues_to_keep (Optional[List[str]]) – Optional list of tissues to keep in the tissue expression +array. Default is all non-key rows in the Table.

  • +
  • tissues_to_filter (Optional[List[str]]) – Optional list of tissues to exclude from the +tissue expression array.

  • +
  • annotations_to_extract (Union[Tuple[str], List[str], None]) – Optional list of tissue struct fields to extract +into top level array annotations. If None, the returned Table will contain a +single top level annotation ‘tissue_expression’ that contains an array of +structs by tissue. Default is (‘transcript_expression’, ‘expression_proportion’).

  • +
+
+
Return type:
+

Table

+
+
Returns:
+

Table with requested tissue struct annotations pulled into arrays of +tissue values and a ‘tissues’ global annotation indicating the order of tissues +in the arrays.

+
+
+
+ +
+
+gnomad.utils.transcript_annotation.tx_filter_variants_by_csqs(ht, filter_to_cds=True, gencode_ht=None, filter_to_genes=None, match_by_gene_symbol=False, filter_to_csqs=None, ignore_splicing=True, filter_to_protein_coding=True, vep_root='vep')[source]
+

Prepare a Table of variants with VEP transcript consequences for annotation.

+
+

Note

+

When filter_to_cds is set to True, the returned Table will be further +filtered by a defined ‘amino_acids’ annotation. This filters out certain +consequences, such as ‘stop_retained_variant’, that are kept by all CDS +intervals but don’t belong to the CDS of the transcript they fall on.

+
+
+
Parameters:
+
    +
  • ht (Table) – Table of variants with ‘vep’ annotations.

  • +
  • gencode_ht (Optional[Table]) – Optional Gencode resource Table containing CDS interval +information. This is only used when filter_to_cds is set to True. Default is +None, which will use the default version of the Gencode Table resource for +the reference build of the input Table ht.

  • +
  • filter_to_cds (bool) – Whether to filter to CDS regions. Default is True. The result +will be further filtered by a defined ‘amino_acids’ annotation.

  • +
  • filter_to_genes (Optional[List[str]]) – Optional list of genes to filter to. Default is None.

  • +
  • match_by_gene_symbol (bool) – Whether to match by gene symbol instead of gene ID. +Default is False.

  • +
  • filter_to_csqs (Optional[List[str]]) – Optional list of consequences to filter to. Default is None.

  • +
  • ignore_splicing (bool) – If True, ignore splice consequences. Default is True.

  • +
  • filter_to_protein_coding (bool) – Whether to filter to protein coding transcripts. +Default is True.

  • +
  • vep_root (str) – Name used for root VEP annotation. Default is ‘vep’.

  • +
+
+
Return type:
+

Table

+
+
Returns:
+

Table of variants with preprocessed/filtered transcript consequences +prepared for annotation.

+
+
+
+ +
+
+gnomad.utils.transcript_annotation.tx_annotate_variants(ht, tx_ht, tissues_to_filter=None, vep_root='vep', vep_annotation='transcript_consequences')[source]
+

Annotate variants with transcript-based expression values or expression proportion from GTEx.

+
+
Parameters:
+
    +
  • ht (Table) – Table of variants to annotate; it should contain the nested fields: +{vep_root}.{vep_annotation}.

  • +
  • tx_ht (Table) – Table of transcript expression information.

  • +
  • tissues_to_filter (Optional[List[str]]) – Optional list of tissues to exclude from the output. +Default is None.

  • +
  • vep_root (str) – Name used for root VEP annotation. Default is ‘vep’.

  • +
  • vep_annotation (str) – Name of annotation under vep_root, one of the processed +consequences: [“transcript_consequences”, “worst_csq_by_gene”, +“worst_csq_for_variant”, “worst_csq_by_gene_canonical”, +“worst_csq_for_variant_canonical”]. For example, if you want to annotate +each variant with the worst consequence in each gene it falls on and the +transcript expression, you would use “worst_csq_by_gene”. Default is +“transcript_consequences”.

  • +
+
+
Return type:
+

Table

+
+
Returns:
+

Input Table with transcript expression information annotated.

+
+
+
+ +
+
+gnomad.utils.transcript_annotation.tx_aggregate_variants(ht, additional_group_by=('alleles', 'gene_symbol', 'most_severe_consequence', 'lof', 'lof_flags'))[source]
+

Aggregate transcript-based expression values or expression proportion from GTEx.

+
+
Parameters:
+
    +
  • ht (Table) – Table of variants annotated with transcript expression information.

  • +
  • additional_group_by (Union[Tuple[str], List[str], None]) – Optional list of additional fields to group by before +sum aggregation. If None, the returned Table will be grouped by only “locus” +and “gene_id” before the sum aggregation.

  • +
+
+
Return type:
+

Table

+
+
Returns:
+

Table of variants with transcript expression information aggregated.

+
+
+
+ +
+
+gnomad.utils.transcript_annotation.perform_tx_annotation_pipeline(ht, tx_ht, tissues_to_filter=None, vep_root='vep', vep_annotation='transcript_consequences', filter_to_csqs=['transcript_ablation', 'splice_acceptor_variant', 'splice_donor_variant', 'stop_gained', 'frameshift_variant', 'stop_lost', 'start_lost', 'initiator_codon_variant', 'transcript_amplification', 'inframe_insertion', 'inframe_deletion', 'missense_variant', 'protein_altering_variant', 'splice_region_variant', 'incomplete_terminal_codon_variant', 'start_retained_variant', 'stop_retained_variant', 'synonymous_variant', 'coding_sequence_variant'], additional_group_by=('alleles', 'gene_symbol', 'most_severe_consequence', 'lof', 'lof_flags'), **kwargs)[source]
+

One-stop usage of tx_filter_variants_by_csqs, tx_annotate_variants and tx_aggregate_variants.

+
+
Parameters:
+
    +
  • ht (Table) – Table of variants to annotate; it should contain the nested fields: +{vep_root}.{vep_annotation}.

  • +
  • tx_ht (Table) – Table of transcript expression information.

  • +
  • tissues_to_filter (Optional[List[str]]) – Optional list of tissues to exclude from the output.

  • +
  • vep_root (str) – Name used for root VEP annotation. Default is ‘vep’.

  • +
  • vep_annotation (str) – Name of annotation under vep_root. Default is +‘transcript_consequences’.

  • +
  • filter_to_csqs (Optional[List[str]]) – Optional list of consequences to filter to. Default is the +list of coding consequences shown in the signature.

  • +
  • additional_group_by (Union[Tuple[str], List[str], None]) – Optional list of additional fields to group by before +sum aggregation. If None, the returned Table will be grouped by only “locus” +and “gene_id” before the sum aggregation.

  • +
+
+
Return type:
+

Table

+
+
Returns:
+

Table of variants with transcript expression information aggregated.

+
+
+
+ +
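A minimal sketch of the one-stop pipeline (both input paths are hypothetical; ht must carry VEP annotations and tx_ht summarized transcript expression):

import hail as hl
from gnomad.utils.transcript_annotation import perform_tx_annotation_pipeline

ht = hl.read_table("gs://my-bucket/variants_vep.ht")  # hypothetical VEP-annotated variants
tx_ht = hl.read_table("gs://my-bucket/tx_expression.ht")  # hypothetical expression Table
tx_annotated_ht = perform_tx_annotation_pipeline(ht, tx_ht)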
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/api_reference/utils/vcf.html b/api_reference/utils/vcf.html new file mode 100644 index 000000000..25b5d9200 --- /dev/null +++ b/api_reference/utils/vcf.html @@ -0,0 +1,728 @@ + + + + + + + gnomad.utils.vcf — gnomad master documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

gnomad.utils.vcf

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

gnomad.utils.vcf.SORT_ORDER

Order to sort subgroupings during VCF export.

gnomad.utils.vcf.GROUPS

Group names used to generate labels for high quality genotypes and all raw genotypes.

gnomad.utils.vcf.HISTS

Quality histograms used in VCF export.

gnomad.utils.vcf.FAF_POPS

Global populations that are included in filtering allele frequency (faf) calculations.

gnomad.utils.vcf.SEXES

Sample sexes used in VCF export.

gnomad.utils.vcf.AS_FIELDS

Allele-specific variant annotations.

gnomad.utils.vcf.SITE_FIELDS

Site level variant annotations.

gnomad.utils.vcf.ALLELE_TYPE_FIELDS

Allele-type annotations.

gnomad.utils.vcf.REGION_FLAG_FIELDS

Annotations about variant region type.

gnomad.utils.vcf.JOINT_REGION_FLAG_FIELDS

Annotations about variant region type that are specifically created for joint dataset of exomes and genomes from gnomAD v4.1.

gnomad.utils.vcf.RF_FIELDS

Annotations specific to the variant QC using a random forest model.

gnomad.utils.vcf.AS_VQSR_FIELDS

Allele-specific VQSR annotations.

gnomad.utils.vcf.VQSR_FIELDS

Annotations specific to VQSR.

gnomad.utils.vcf.INFO_DICT

Dictionary used during VCF export to export row (variant) annotations.

gnomad.utils.vcf.IN_SILICO_ANNOTATIONS_INFO_DICT

Dictionary with in silico score descriptions to include in the VCF INFO header.

gnomad.utils.vcf.VRS_FIELDS_DICT

Dictionary with VRS annotations to include in the VCF INFO field and VCF header.

gnomad.utils.vcf.ENTRIES

Densified entries to be selected during VCF export.

gnomad.utils.vcf.SPARSE_ENTRIES

Sparse entries to be selected and densified during VCF export.

gnomad.utils.vcf.FORMAT_DICT

Dictionary used during VCF export to export MatrixTable entries.

gnomad.utils.vcf.adjust_vcf_incompatible_types(ht)

Create a Table ready for vcf export.

gnomad.utils.vcf.make_label_combos(label_groups)

Make combinations of all possible labels for a supplied dictionary of label groups.

gnomad.utils.vcf.index_globals(...[, ...])

Create a dictionary keyed by the specified label groupings with values describing the corresponding index of each grouping entry in the meta_array annotation.

gnomad.utils.vcf.make_combo_header_text(...)

Programmatically generate text to populate the VCF header description for a given variant annotation with specific groupings and subset.

gnomad.utils.vcf.create_label_groups(pops[, ...])

Generate a list of label group dictionaries needed to populate info dictionary.

gnomad.utils.vcf.make_info_dict([prefix, ...])

Generate dictionary of Number and Description attributes of VCF INFO fields.

gnomad.utils.vcf.add_as_info_dict([...])

Update info dictionary with allele-specific terms and their descriptions.

gnomad.utils.vcf.make_vcf_filter_dict(...[, ...])

Generate dictionary of Number and Description attributes to be used in the VCF header, specifically for FILTER annotations.

gnomad.utils.vcf.make_hist_bin_edges_expr(ht)

Create dictionaries containing variant histogram annotations and their associated bin edges, formatted into a string separated by pipe delimiters.

gnomad.utils.vcf.make_hist_dict(bin_edges, adj)

Generate dictionary of Number and Description attributes to be used in the VCF header, specifically for histogram annotations.

gnomad.utils.vcf.set_female_y_metrics_to_na(t)

Set AC, AN, and nhomalt chrY variant annotations for females to NA (instead of 0).

gnomad.utils.vcf.build_vcf_export_reference(name)

Create export reference based on reference genome defined by build.

gnomad.utils.vcf.rekey_new_reference(t, ...)

Re-key Table or MatrixTable with a new reference genome.

+
+
+gnomad.utils.vcf.SORT_ORDER = ['subset', 'downsampling', 'popmax', 'grpmax', 'pop', 'gen_anc', 'subpop', 'sex', 'group']
+

Order to sort subgroupings during VCF export. +Ensures that INFO labels in VCF are in desired order (e.g., raw_AC_afr_female).

+
+ +
+
+gnomad.utils.vcf.GROUPS = ['adj', 'raw']
+

Group names used to generate labels for high quality genotypes and all raw genotypes. Used in VCF export.

+
+ +
+
+gnomad.utils.vcf.HISTS = ['gq_hist_alt', 'gq_hist_all', 'dp_hist_alt', 'dp_hist_all', 'ab_hist_alt']
+

Quality histograms used in VCF export.

+
+ +
+
+gnomad.utils.vcf.FAF_POPS = {'v3': ['afr', 'amr', 'eas', 'nfe', 'sas'], 'v4': ['afr', 'amr', 'eas', 'mid', 'nfe', 'sas']}
+

Global populations that are included in filtering allele frequency (faf) calculations. Used in VCF export.

+
+ +
+
+gnomad.utils.vcf.SEXES = ['XX', 'XY']
+

Sample sexes used in VCF export.

+

Used to stratify frequency annotations (AC, AN, AF) for each sex. +Note that sample sexes in gnomAD v3 and earlier were ‘male’ and ‘female’.

+
+ +
+
+gnomad.utils.vcf.AS_FIELDS = ['AS_FS', 'AS_MQ', 'AS_MQRankSum', 'AS_pab_max', 'AS_QUALapprox', 'AS_QD', 'AS_ReadPosRankSum', 'AS_SB_TABLE', 'AS_SOR', 'AS_VarDP', 'InbreedingCoeff']
+

Allele-specific variant annotations.

+
+ +
+
+gnomad.utils.vcf.SITE_FIELDS = ['FS', 'MQ', 'MQRankSum', 'QUALapprox', 'QD', 'ReadPosRankSum', 'SB', 'SOR', 'VarDP']
+

Site level variant annotations.

+
+ +
+
+gnomad.utils.vcf.ALLELE_TYPE_FIELDS = ['allele_type', 'has_star', 'n_alt_alleles', 'original_alleles', 'variant_type', 'was_mixed']
+

Allele-type annotations.

+
+ +
+
+gnomad.utils.vcf.REGION_FLAG_FIELDS = ['decoy', 'lcr', 'nonpar', 'non_par', 'segdup']
+

Annotations about variant region type.

+
+

Note

+

decoy resource files do not currently exist for GRCh38/hg38.

+
+
+ +
+
+gnomad.utils.vcf.JOINT_REGION_FLAG_FIELDS = ['fail_interval_qc', 'outside_broad_capture_region', 'outside_ukb_capture_region', 'outside_broad_calling_region', 'outside_ukb_calling_region', 'not_called_in_exomes', 'not_called_in_genomes']
+

Annotations about variant region type that are specifically created for joint dataset of exomes and genomes from gnomAD v4.1.

+
+ +
+
+gnomad.utils.vcf.RF_FIELDS = ['rf_positive_label', 'rf_negative_label', 'rf_label', 'rf_train', 'rf_tp_probability']
+

Annotations specific to the variant QC using a random forest model.

+
+ +
+
+gnomad.utils.vcf.AS_VQSR_FIELDS = ['AS_culprit', 'AS_VQSLOD']
+

Allele-specific VQSR annotations.

+
+ +
+
+gnomad.utils.vcf.VQSR_FIELDS = ['AS_culprit', 'AS_VQSLOD', 'NEGATIVE_TRAIN_SITE', 'POSITIVE_TRAIN_SITE']
+

Annotations specific to VQSR.

+
+ +
+
+gnomad.utils.vcf.INFO_DICT = {'AS_SB_TABLE': {'Description': 'Allele-specific forward/reverse read counts for strand bias tests', 'Number': '.'}, 'AS_pab_max': {'Description': 'Maximum p-value over callset for binomial test of observed allele balance for a heterozygous genotype, given expectation of 0.5', 'Number': 'A'}, 'BaseQRankSum': {'Description': 'Z-score from Wilcoxon rank sum test of alternate vs. reference base qualities'}, 'FS': {'Description': "Phred-scaled p-value of Fisher's exact test for strand bias"}, 'InbreedingCoeff': {'Description': 'Inbreeding coefficient, the excess heterozygosity at a variant site, computed as 1 - (the number of heterozygous genotypes)/(the number of heterozygous genotypes expected under Hardy-Weinberg equilibrium)', 'Number': 'A'}, 'MQ': {'Description': 'Root mean square of the mapping quality of reads across all samples'}, 'MQRankSum': {'Description': 'Z-score from Wilcoxon rank sum test of alternate vs. reference read mapping qualities'}, 'NEGATIVE_TRAIN_SITE': {'Description': 'Variant was used to build the negative training set of low-quality variants for VQSR'}, 'POSITIVE_TRAIN_SITE': {'Description': 'Variant was used to build the positive training set of high-quality variants for VQSR'}, 'QD': {'Description': 'Variant call confidence normalized by depth of sample reads supporting a variant'}, 'QUALapprox': {'Description': 'Sum of PL[0] values; used to approximate the QUAL score', 'Number': '1'}, 'ReadPosRankSum': {'Description': 'Z-score from Wilcoxon rank sum test of alternate vs. reference read position bias'}, 'SOR': {'Description': 'Strand bias estimated by the symmetric odds ratio test'}, 'VQSLOD': {'Description': 'Log-odds ratio of being a true variant versus being a false positive under the trained VQSR Gaussian mixture model'}, 'VarDP': {'Description': 'Depth over variant genotypes (does not include depth of reference samples)'}, 'allele_type': {'Description': 'Allele type (snv, insertion, deletion, or mixed)'}, 'culprit': {'Description': 'Worst-performing annotation in the VQSR Gaussian mixture model'}, 'decoy': {'Description': 'Variant falls within a reference decoy region'}, 'fail_interval_qc': {'Description': 'Less than 85 percent of samples meet 20X coverage if variant is in autosomal or PAR regions or 10X coverage for non-PAR regions of chromosomes X and Y.'}, 'has_star': {'Description': 'Variant locus coincides with a spanning deletion (represented by a star) observed elsewhere in the callset'}, 'inbreeding_coeff': {'Description': 'Inbreeding coefficient, the excess heterozygosity at a variant site, computed as 1 - (the number of heterozygous genotypes)/(the number of heterozygous genotypes expected under Hardy-Weinberg equilibrium)', 'Number': 'A'}, 'lcr': {'Description': 'Variant falls within a low complexity region'}, 'monoallelic': {'Description': 'All samples are homozygous alternate for the variant'}, 'n_alt_alleles': {'Description': 'Total number of alternate alleles observed at variant locus', 'Number': '1'}, 'negative_train_site': {'Description': 'Variant was used to build the negative training set of low-quality variants for VQSR'}, 'non_par': {'Description': 'Variant (on sex chromosome) falls outside a pseudoautosomal region'}, 'nonpar': {'Description': 'Variant (on sex chromosome) falls outside a pseudoautosomal region'}, 'only_het': {'Description': 'All samples are heterozygous for the variant'}, 'original_alleles': {'Description': 'Alleles before splitting multiallelics'}, 'outside_broad_capture_region': 
{'Description': 'Variant falls outside of Broad exome capture regions.'}, 'outside_ukb_capture_region': {'Description': 'Variant falls outside of UK Biobank exome capture regions.'}, 'positive_train_site': {'Description': 'Variant was used to build the positive training set of high-quality variants for VQSR'}, 'rf_label': {'Description': 'Random forest training label'}, 'rf_negative_label': {'Description': 'Variant was labelled as a negative example for training of random forest model'}, 'rf_positive_label': {'Description': 'Variant was labelled as a positive example for training of random forest model'}, 'rf_tp_probability': {'Description': 'Probability of a called variant being a true variant as determined by random forest model'}, 'rf_train': {'Description': 'Variant was used in training random forest model'}, 'segdup': {'Description': 'Variant falls within a segmental duplication region'}, 'sibling_singleton': {'Description': 'Variant was a callset-wide doubleton that was present only in two siblings (i.e., a singleton amongst unrelated samples in cohort).'}, 'transmitted_singleton': {'Description': 'Variant was a callset-wide doubleton that was transmitted within a family from a parent to a child (i.e., a singleton amongst unrelated samples in cohort)'}, 'variant_type': {'Description': 'Variant type (snv, indel, multi-snv, multi-indel, or mixed)'}, 'was_mixed': {'Description': 'Variant type was mixed'}}
+

Dictionary used during VCF export to export row (variant) annotations.

+
+ +
+
+gnomad.utils.vcf.IN_SILICO_ANNOTATIONS_INFO_DICT = {'cadd_phred': {'Description': "Cadd Phred-like scores ('scaled C-scores') ranging from 1 to 99, based on the rank of each variant relative to all possible 8.6 billion substitutions in the human reference genome. Larger values are more deleterious.", 'Number': '1'}, 'cadd_raw_score': {'Description': "Raw CADD scores are interpretable as the extent to which the annotation profile for a given variant suggests that the variant is likely to be 'observed' (negative values) vs 'simulated' (positive values). Larger values are more deleterious.", 'Number': '1'}, 'pangolin_largest_ds': {'Description': "Pangolin's largest delta score across 2 splicing consequences, which reflects the probability of the variant being splice-altering", 'Number': '1'}, 'phylop': {'Description': 'Base-wise conservation score across the 241 placental mammals in the Zoonomia project. Score ranges from -20 to 9.28, and reflects acceleration (faster evolution than expected under neutral drift, assigned negative scores) as well as conservation (slower than expected evolution, assigned positive scores).', 'Number': '1'}, 'polyphen_max': {'Description': 'Score that predicts the possible impact of an amino acid substitution on the structure and function of a human protein, ranging from 0.0 (tolerated) to 1.0 (deleterious).  We prioritize max scores for MANE Select transcripts where possible and otherwise report a score for the canonical transcript.', 'Number': '1'}, 'revel_max': {'Description': "The maximum REVEL score at a site's MANE Select or canonical transcript. It's an ensemble score for predicting the pathogenicity of missense variants (based on 13 other variant predictors). Scores ranges from 0 to 1. Variants with higher scores are predicted to be more likely to be deleterious.", 'Number': '1'}, 'sift_max': {'Description': 'Score reflecting the scaled probability of the amino acid substitution being tolerated, ranging from 0 to 1. Scores below 0.05 are predicted to impact protein function. We prioritize max scores for MANE Select transcripts where possible and otherwise report a score for the canonical transcript.', 'Number': '1'}, 'spliceai_ds_max': {'Description': "Illumina's SpliceAI max delta score; interpreted as the probability of the variant being splice-altering.", 'Number': '1'}}
+

Dictionary with in silico score descriptions to include in the VCF INFO header.

+
+ +
+
+gnomad.utils.vcf.VRS_FIELDS_DICT = {'VRS_Allele_IDs': {'Description': 'The computed identifiers for the GA4GH VRS Alleles corresponding to the values in the REF and ALT fields', 'Number': 'R'}, 'VRS_Ends': {'Description': 'Interresidue coordinates used as the location ends for the GA4GH VRS Alleles corresponding to the values in the REF and ALT fields', 'Number': 'R'}, 'VRS_Starts': {'Description': 'Interresidue coordinates used as the location starts for the GA4GH VRS Alleles corresponding to the values in the REF and ALT fields', 'Number': 'R'}, 'VRS_States': {'Description': 'The literal sequence states used for the GA4GH VRS Alleles corresponding to the values in the REF and ALT fields', 'Number': '.'}}
+

Dictionary with VRS annotations to include in the VCF INFO field and VCF header.

+
+ +
+
+gnomad.utils.vcf.ENTRIES = ['GT', 'GQ', 'DP', 'AD', 'MIN_DP', 'PGT', 'PID', 'PL', 'SB']
+

Densified entries to be selected during VCF export.

+
+ +
+
+gnomad.utils.vcf.SPARSE_ENTRIES = ['END', 'DP', 'GQ', 'LA', 'LAD', 'LGT', 'LPGT', 'LPL', 'MIN_DP', 'PID', 'RGQ', 'SB']
+

Sparse entries to be selected and densified during VCF export.

+
+ +
+
+gnomad.utils.vcf.FORMAT_DICT = {'AD': {'Description': 'Allelic depths for the ref and alt alleles in the order listed', 'Number': 'R', 'Type': 'Integer'}, 'DP': {'Description': 'Approximate read depth (reads with MQ=255 or with bad mates are filtered)', 'Number': '1', 'Type': 'Integer'}, 'GQ': {'Description': 'Phred-scaled confidence that the genotype assignment is correct. Value is the difference between the second lowest PL and the lowest PL (always normalized to 0).', 'Number': '1', 'Type': 'Integer'}, 'GT': {'Description': 'Genotype', 'Number': '1', 'Type': 'String'}, 'MIN_DP': {'Description': 'Minimum DP observed within the GVCF block', 'Number': '1', 'Type': 'Integer'}, 'PGT': {'Description': 'Physical phasing haplotype information, describing how the alternate alleles are phased in relation to one another', 'Number': '1', 'Type': 'String'}, 'PID': {'Description': 'Physical phasing ID information, where each unique ID within a given sample (but not across samples) connects records within a phasing group', 'Number': '1', 'Type': 'String'}, 'PL': {'Description': 'Normalized, phred-scaled likelihoods for genotypes as defined in the VCF specification', 'Number': 'G', 'Type': 'Integer'}, 'SB': {'Description': "Per-sample component statistics which comprise the Fisher's exact test to detect strand bias. Values are: depth of reference allele on forward strand, depth of reference allele on reverse strand, depth of alternate allele on forward strand, depth of alternate allele on reverse strand.", 'Number': '4', 'Type': 'Integer'}}
+

Dictionary used during VCF export to export MatrixTable entries.

+
+ +
+
+gnomad.utils.vcf.adjust_vcf_incompatible_types(ht, pipe_delimited_annotations=['AS_QUALapprox', 'AS_VarDP', 'AS_MQ_DP', 'AS_RAW_MQ', 'AS_SB_TABLE'])[source]
+

Create a Table ready for vcf export.

+
+
In particular, the following conversions are done:
    +
  • All int64 are coerced to int32

  • +
  • Fields specified by pipe_delimited_annotations are converted from arrays to pipe-delimited strings

  • +
+
+
+
+
Parameters:
+
    +
  • ht (Table) – Input Table.

  • +
  • pipe_delimited_annotations (List[str]) – List of info fields (they must be fields of the ht.info Struct).

  • +
+
+
Return type:
+

Table

+
+
Returns:
+

Table ready for VCF export.

+
+
+
+ +
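A minimal sketch of the typical export flow (the paths are hypothetical; the pipe-delimited fields must live in ht.info as noted above):

import hail as hl
from gnomad.utils.vcf import adjust_vcf_incompatible_types

ht = hl.read_table("gs://my-bucket/release_sites.ht")  # hypothetical sites HT with an info struct
ht = adjust_vcf_incompatible_types(ht)
hl.export_vcf(ht, "gs://my-bucket/release.vcf.bgz")  # hypothetical output path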
+
+gnomad.utils.vcf.make_label_combos(label_groups, sort_order=['subset', 'downsampling', 'popmax', 'grpmax', 'pop', 'gen_anc', 'subpop', 'sex', 'group'], label_delimiter='_')[source]
+

Make combinations of all possible labels for a supplied dictionary of label groups.

+

For example, if label_groups is {“sex”: [“male”, “female”], “pop”: [“afr”, “nfe”, “amr”]}, +this function will return [“afr_male”, “afr_female”, “nfe_male”, “nfe_female”, “amr_male”, “amr_female”]

+
+
Parameters:
+
    +
  • label_groups (Dict[str, List[str]]) – Dictionary containing an entry for each label group, where key is the name of the grouping, +e.g. “sex” or “pop”, and value is a list of all possible values for that grouping (e.g. [“male”, “female”] or [“afr”, “nfe”, “amr”]).

  • +
  • sort_order (List[str]) – List containing order to sort label group combinations. Default is SORT_ORDER.

  • +
  • label_delimiter (str) – String to use as delimiter when making group label combinations.

  • +
+
+
Return type:
+

List[str]

+
+
Returns:
+

list of all possible combinations of values for the supplied label groupings.

+
+
+
+ +
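For instance, mirroring the example above:

from gnomad.utils.vcf import make_label_combos
+
+combos = make_label_combos({"sex": ["male", "female"], "pop": ["afr", "nfe"]})
+# ['afr_male', 'afr_female', 'nfe_male', 'nfe_female']
+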
+
+gnomad.utils.vcf.index_globals(globals_array, label_groups, label_delimiter='_')[source]
+

Create a dictionary keyed by the specified label groupings with values describing the corresponding index of each grouping entry in the globals_array annotation.

+
+
Parameters:
+
    +
  • globals_array (List[Dict[str, str]]) – Ordered list containing dictionary entries describing all the grouping combinations contained in the globals_array annotation. +Keys are the grouping type (e.g., ‘group’, ‘pop’, ‘sex’) and values are the grouping attribute (e.g., ‘adj’, ‘eas’, ‘XY’).

  • +
  • label_groups (Dict[str, List[str]]) – Dictionary containing an entry for each label group, where key is the name of the grouping, +e.g. “sex” or “pop”, and value is a list of all possible values for that grouping (e.g. [“male”, “female”] or [“afr”, “nfe”, “amr”])

  • +
  • label_delimiter (str) – String used as delimiter when making group label combinations.

  • +
+
+
Return type:
+

Dict[str, int]

+
+
Returns:
+

Dictionary keyed by specified label grouping combinations, with values describing the corresponding index +of each grouping entry in the globals

+
+
+
+ +
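A small sketch (the freq_meta contents are illustrative):

from gnomad.utils.vcf import index_globals
+
+freq_meta = [{"group": "adj"}, {"group": "raw"}, {"group": "adj", "pop": "afr"}]
+idx = index_globals(freq_meta, {"group": ["adj"], "pop": ["afr"]})
+# Maps each label combination to its index in freq_meta, e.g. {'afr_adj': 2}.
+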
+
+gnomad.utils.vcf.make_combo_header_text(preposition, combo_dict, pop_names)[source]
+

Programmatically generate text to populate the VCF header description for a given variant annotation with specific groupings and subset.

+

For example, if preposition is “for” and combo_dict is {“group”: “adj”, “pop”: “afr”, “sex”: “female”}, this function will return the string “ for female samples in the African-American/African genetic ancestry group”.

+
+
Parameters:
+
    +
  • preposition (str) – Relevant preposition to precede automatically generated text.

  • +
  • combo_dict (Dict[str, str]) – Dict with grouping types as keys and values for grouping type as values. This function generates text for these values. +Possible grouping types are: “group”, “pop”, “sex”, and “subpop”. +Example input: {“pop”: “afr”, “sex”: “female”}

  • +
  • pop_names (Dict[str, str]) – Dict with global population names (keys) and population descriptions (values).

  • +
+
+
Return type:
+

str

+
+
Returns:
+

String with automatically generated description text for a given set of combo fields.

+
+
+
+ +
+
+gnomad.utils.vcf.create_label_groups(pops, sexes=['XX', 'XY'], all_groups=['adj', 'raw'], pop_sex_groups=['adj'])[source]
+

Generate a list of label group dictionaries needed to populate info dictionary.

+

Label dictionaries are passed as input to make_info_dict.

+
+
Parameters:
+
    +
  • pops (List[str]) – List of population names.

  • +
  • sexes (List[str]) – List of sample sexes.

  • +
  • all_groups (List[str]) – List of data types (raw, adj). Default is GROUPS, which is [“raw”, “adj”].

  • +
  • pop_sex_groups (List[str]) – List of data types (raw, adj) to populate with pops and sexes. Default is [“adj”].

  • +
+
+
Return type:
+

List[Dict[str, List[str]]]

+
+
Returns:
+

List of label group dictionaries.

+
+
+
+ +
+
+gnomad.utils.vcf.make_info_dict(prefix='', suffix='', prefix_before_metric=True, pop_names={'afr': 'African/African-American', 'ami': 'Amish', 'amr': 'Admixed American', 'asj': 'Ashkenazi Jewish', 'bgr': 'Bulgarian (Eastern European)', 'consanguineous': 'South Asian (F > 0.05)', 'eas': 'East Asian', 'est': 'Estonian', 'eur': 'European', 'exac': 'ExAC', 'fin': 'Finnish', 'gbr': 'British', 'jpn': 'Japanese', 'kor': 'Korean', 'mde': 'Middle Eastern', 'mid': 'Middle Eastern', 'nfe': 'Non-Finnish European', 'nwe': 'North-Western European', 'oea': 'Other East Asian', 'oeu': 'Other European', 'onf': 'Other Non-Finnish European', 'oth': 'Other', 'remaining': 'Remaining individuals', 'sas': 'South Asian', 'sas_non_consang': 'South Asian (F < 0.05)', 'seu': 'Southern European', 'sgp': 'Singaporean', 'swe': 'Swedish', 'uniform': 'Uniform', 'unk': 'Unknown'}, label_groups=None, label_delimiter='_', bin_edges=None, faf=False, popmax=False, grpmax=False, fafmax=False, callstats=False, freq_ctt=False, freq_cmh=False, description_text='', age_hist_distribution=None, sort_order=['subset', 'downsampling', 'popmax', 'grpmax', 'pop', 'gen_anc', 'subpop', 'sex', 'group'])[source]
+

Generate dictionary of Number and Description attributes of VCF INFO fields.

+

Used to populate the INFO fields of the VCF header during export.

+
+
Creates:
    +
  • INFO fields for age histograms (bin freq, n_smaller, and n_larger for heterozygous and homozygous variant carriers)

  • +
  • INFO fields for popmax AC, AN, AF, nhomalt, and popmax population

  • +
  • INFO fields for AC, AN, AF, nhomalt for each combination of sample population, sex, and subpopulation, both for adj and raw data

  • +
  • INFO fields for filtering allele frequency (faf) annotations

  • +
+
+
+
+
Parameters:
+
    +
  • prefix (str) – Prefix string for data, e.g. “gnomAD”. Default is empty string.

  • +
  • suffix (str) – Suffix string for data, e.g. “gnomAD”. Default is empty string.

  • +
  • prefix_before_metric (bool) – Whether prefix should be added before the metric (AC, AN, AF, nhomalt, faf95, faf99) in INFO field. Default is True.

  • +
  • pop_names (Dict[str, str]) – Dict with global population names (keys) and population descriptions (values). Default is POP_NAMES.

  • +
  • label_groups (Dict[str, List[str]]) – Dictionary containing an entry for each label group, where key is the name of the grouping, +e.g. “sex” or “pop”, and value is a list of all possible values for that grouping (e.g. [“male”, “female”] or [“afr”, “nfe”, “amr”]).

  • +
  • label_delimiter (str) – String to use as delimiter when making group label combinations.

  • +
  • bin_edges (Dict[str, str]) – Dictionary keyed by annotation type, with values that reflect the bin edges corresponding to the annotation.

  • +
  • faf (bool) – If True, use alternate logic to auto-populate dictionary values associated with filter allele frequency annotations.

  • +
  • popmax (bool) – If True, use alternate logic to auto-populate dictionary values associated with popmax annotations.

  • +
  • grpmax (bool) – If True, use alternate logic to auto-populate dictionary values associated with grpmax annotations.

  • +
  • fafmax (bool) – If True, use alternate logic to auto-populate dictionary values associated with fafmax annotations.

  • +
  • callstats (bool) – If True, use alternate logic to auto-populate dictionary values associated with callstats annotations.

  • +
  • freq_ctt (bool) – If True, use alternate logic to auto-populate dictionary values associated with frequency contingency table test (CTT) annotations.

  • +
  • freq_cmh (bool) – If True, use alternate logic to auto-populate dictionary values associated with frequency Cochran-Mantel-Haenszel (CMH) annotations.

  • +
  • description_text (str) – Optional text to append to the end of descriptions. Needs to start with a space if specified.

  • +
  • age_hist_distribution (str) – Pipe-delimited string of overall age distribution.

  • +
  • sort_order (List[str]) – List containing order to sort label group combinations. Default is SORT_ORDER.

  • +
+
+
Return type:
+

Dict[str, Dict[str, str]]

+
+
Returns:
+

Dictionary keyed by VCF INFO annotations, where values are dictionaries of Number and Description attributes.

+
+
+
+ +
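A hedged sketch of how label groups and make_info_dict are typically combined when assembling a VCF header (all parameter values are illustrative):

from gnomad.utils.vcf import create_label_groups, make_info_dict
+
+vcf_info_dict = {}
+for label_group in create_label_groups(pops=["afr", "nfe"]):
+    # Add AC/AN/AF/nhomalt INFO entries for each label combination.
+    vcf_info_dict.update(make_info_dict(label_groups=label_group, callstats=True))
+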
+
+gnomad.utils.vcf.add_as_info_dict(info_dict={'AS_SB_TABLE': {'Description': 'Allele-specific forward/reverse read counts for strand bias tests', 'Number': '.'}, 'AS_pab_max': {'Description': 'Maximum p-value over callset for binomial test of observed allele balance for a heterozygous genotype, given expectation of 0.5', 'Number': 'A'}, 'BaseQRankSum': {'Description': 'Z-score from Wilcoxon rank sum test of alternate vs. reference base qualities'}, 'FS': {'Description': "Phred-scaled p-value of Fisher's exact test for strand bias"}, 'InbreedingCoeff': {'Description': 'Inbreeding coefficient, the excess heterozygosity at a variant site, computed as 1 - (the number of heterozygous genotypes)/(the number of heterozygous genotypes expected under Hardy-Weinberg equilibrium)', 'Number': 'A'}, 'MQ': {'Description': 'Root mean square of the mapping quality of reads across all samples'}, 'MQRankSum': {'Description': 'Z-score from Wilcoxon rank sum test of alternate vs. reference read mapping qualities'}, 'NEGATIVE_TRAIN_SITE': {'Description': 'Variant was used to build the negative training set of low-quality variants for VQSR'}, 'POSITIVE_TRAIN_SITE': {'Description': 'Variant was used to build the positive training set of high-quality variants for VQSR'}, 'QD': {'Description': 'Variant call confidence normalized by depth of sample reads supporting a variant'}, 'QUALapprox': {'Description': 'Sum of PL[0] values; used to approximate the QUAL score', 'Number': '1'}, 'ReadPosRankSum': {'Description': 'Z-score from Wilcoxon rank sum test of alternate vs. reference read position bias'}, 'SOR': {'Description': 'Strand bias estimated by the symmetric odds ratio test'}, 'VQSLOD': {'Description': 'Log-odds ratio of being a true variant versus being a false positive under the trained VQSR Gaussian mixture model'}, 'VarDP': {'Description': 'Depth over variant genotypes (does not include depth of reference samples)'}, 'allele_type': {'Description': 'Allele type (snv, insertion, deletion, or mixed)'}, 'culprit': {'Description': 'Worst-performing annotation in the VQSR Gaussian mixture model'}, 'decoy': {'Description': 'Variant falls within a reference decoy region'}, 'fail_interval_qc': {'Description': 'Less than 85 percent of samples meet 20X coverage if variant is in autosomal or PAR regions or 10X coverage for non-PAR regions of chromosomes X and Y.'}, 'has_star': {'Description': 'Variant locus coincides with a spanning deletion (represented by a star) observed elsewhere in the callset'}, 'inbreeding_coeff': {'Description': 'Inbreeding coefficient, the excess heterozygosity at a variant site, computed as 1 - (the number of heterozygous genotypes)/(the number of heterozygous genotypes expected under Hardy-Weinberg equilibrium)', 'Number': 'A'}, 'lcr': {'Description': 'Variant falls within a low complexity region'}, 'monoallelic': {'Description': 'All samples are homozygous alternate for the variant'}, 'n_alt_alleles': {'Description': 'Total number of alternate alleles observed at variant locus', 'Number': '1'}, 'negative_train_site': {'Description': 'Variant was used to build the negative training set of low-quality variants for VQSR'}, 'non_par': {'Description': 'Variant (on sex chromosome) falls outside a pseudoautosomal region'}, 'nonpar': {'Description': 'Variant (on sex chromosome) falls outside a pseudoautosomal region'}, 'only_het': {'Description': 'All samples are heterozygous for the variant'}, 'original_alleles': {'Description': 'Alleles before splitting multiallelics'}, 
'outside_broad_capture_region': {'Description': 'Variant falls outside of Broad exome capture regions.'}, 'outside_ukb_capture_region': {'Description': 'Variant falls outside of UK Biobank exome capture regions.'}, 'positive_train_site': {'Description': 'Variant was used to build the positive training set of high-quality variants for VQSR'}, 'rf_label': {'Description': 'Random forest training label'}, 'rf_negative_label': {'Description': 'Variant was labelled as a negative example for training of random forest model'}, 'rf_positive_label': {'Description': 'Variant was labelled as a positive example for training of random forest model'}, 'rf_tp_probability': {'Description': 'Probability of a called variant being a true variant as determined by random forest model'}, 'rf_train': {'Description': 'Variant was used in training random forest model'}, 'segdup': {'Description': 'Variant falls within a segmental duplication region'}, 'sibling_singleton': {'Description': 'Variant was a callset-wide doubleton that was present only in two siblings (i.e., a singleton amongst unrelated samples in cohort).'}, 'transmitted_singleton': {'Description': 'Variant was a callset-wide doubleton that was transmitted within a family from a parent to a child (i.e., a singleton amongst unrelated samples in cohort)'}, 'variant_type': {'Description': 'Variant type (snv, indel, multi-snv, multi-indel, or mixed)'}, 'was_mixed': {'Description': 'Variant type was mixed'}}, as_fields=['AS_FS', 'AS_MQ', 'AS_MQRankSum', 'AS_pab_max', 'AS_QUALapprox', 'AS_QD', 'AS_ReadPosRankSum', 'AS_SB_TABLE', 'AS_SOR', 'AS_VarDP', 'InbreedingCoeff'])[source]
+

Update info dictionary with allele-specific terms and their descriptions.

+

Used in VCF export.

+
+
Parameters:
+
    +
  • info_dict (Dict[str, Dict[str, str]]) – Dictionary containing site-level annotations and their descriptions. Default is INFO_DICT.

  • +
  • as_fields (List[str]) – List containing allele-specific fields to be added to info_dict. Default is AS_FIELDS.

  • +
+
+
Return type:
+

Dict[str, Dict[str, str]]

+
+
Returns:
+

Dictionary with allele-specific annotations, their descriptions, and their VCF Number field.

+
+
+
+ +
+
+gnomad.utils.vcf.make_vcf_filter_dict(snp_cutoff, indel_cutoff, inbreeding_cutoff, variant_qc_filter='RF')[source]
+

Generate dictionary of Number and Description attributes to be used in the VCF header, specifically for FILTER annotations.

+
+
Generates descriptions for:
    +
  • AC0 filter

  • +
  • InbreedingCoeff filter

  • +
  • Variant QC filter (RF or AS_VQSR)

  • +
  • PASS (passed all variant filters)

  • +
+
+
+
+
Parameters:
+
    +
  • snp_cutoff (float) – Minimum SNP cutoff score from random forest model.

  • +
  • indel_cutoff (float) – Minimum indel cutoff score from random forest model.

  • +
  • inbreeding_cutoff (float) – Inbreeding coefficient hard cutoff.

  • +
  • variant_qc_filter (str) – Method used for variant QC filter. One of ‘RF’ or ‘AS_VQSR’. Default is ‘RF’.

  • +
+
+
Return type:
+

Dict[str, str]

+
+
Returns:
+

Dictionary keyed by VCF FILTER annotations, where values are Dictionaries of Number and Description attributes.

+
+
+
+ +
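For example (the cutoff values are hypothetical and would come from your variant QC evaluation):

from gnomad.utils.vcf import make_vcf_filter_dict
+
+filter_dict = make_vcf_filter_dict(
+    snp_cutoff=-0.5,
+    indel_cutoff=-1.0,
+    inbreeding_cutoff=-0.3,
+    variant_qc_filter="AS_VQSR",
+)
+# Passed to hl.export_vcf via metadata={"filter": filter_dict, ...}.
+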
+
+gnomad.utils.vcf.make_hist_bin_edges_expr(ht, hists=['gq_hist_alt', 'gq_hist_all', 'dp_hist_alt', 'dp_hist_all', 'ab_hist_alt'], ann_with_hists=None, prefix='', label_delimiter='_', include_age_hists=True)[source]
+

Create dictionaries containing variant histogram annotations and their associated bin edges, formatted into a string separated by pipe delimiters.

+
+
Parameters:
+
    +
  • ht (Table) – Table containing histogram variant annotations.

  • +
  • hists (List[str]) – List of variant histogram annotations. Default is HISTS.

  • +
  • ann_with_hists (Optional[str]) – Name of row annotation containing histogram data. In the exomes or genomes release HT, histograms are top-level row annotations, but in the joint release HT they are nested under the exomes, genomes, or joint row annotation.

  • +
  • prefix (str) – Prefix text for age histogram bin edges. Default is empty string.

  • +
  • label_delimiter (str) – String used as delimiter between prefix and histogram annotation.

  • +
  • include_age_hists (bool) – Include age histogram annotations.

  • +
+
+
Return type:
+

Dict[str, str]

+
+
Returns:
+

Dictionary keyed by histogram annotation name, with corresponding +reformatted bin edges for values.

+
+
+
+ +
+
+gnomad.utils.vcf.make_hist_dict(bin_edges, adj, hist_metric_list=['gq_hist_alt', 'gq_hist_all', 'dp_hist_alt', 'dp_hist_all', 'ab_hist_alt'], label_delimiter='_', drop_n_smaller_larger=False, prefix='', suffix='', description_text='')[source]
+

Generate dictionary of Number and Description attributes to be used in the VCF header, specifically for histogram annotations.

+
+
Parameters:
+
    +
  • bin_edges (Dict[str, Dict[str, str]]) – Dictionary keyed by histogram annotation name, with corresponding string-reformatted bin edges for values.

  • +
  • adj (bool) – Whether to create a header dict for adj (True) or raw (False) quality histograms.

  • +
  • hist_metric_list (List[str]) – List of histograms for which to build the hist info dict.

  • +
  • label_delimiter (str) – String used as delimiter in values stored in hist_metric_list.

  • +
  • drop_n_smaller_larger (bool) – Whether to drop n_smaller and n_larger annotations from header dict. Default is False.

  • +
  • prefix (str) – Prefix text for histogram annotations. Default is empty string.

  • +
  • suffix (str) – Suffix text for histogram annotations. Default is empty string.

  • +
  • description_text (str) – Optional text to append to the end of descriptions. Needs to start with a space if specified.

  • +
+
+
Return type:
+

Dict[str, str]

+
+
Returns:
+

Dictionary keyed by VCF INFO annotations, where values are Dictionaries of Number and Description attributes.

+
+
+
+ +
+
+gnomad.utils.vcf.set_female_y_metrics_to_na(t)[source]
+

Set AC, AN, and nhomalt chrY variant annotations for females to NA (instead of 0).

+
+
Parameters:
+

t (Union[Table, MatrixTable]) – Table/MatrixTable containing female variant annotations.

+
+
Return type:
+

Dict[str, Int32Expression]

+
+
Returns:
+

Dictionary with reset annotations

+
+
+
+ +
+
+gnomad.utils.vcf.build_vcf_export_reference(name, build='GRCh38', keep_contigs=['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19', 'chr20', 'chr21', 'chr22', 'chrX', 'chrY'], keep_chrM=True)[source]
+

Create export reference based on reference genome defined by build.

+

By default this will return a new reference with all non-standard contigs eliminated. Keeps chr1-22, chrX, chrY, and (if keep_chrM is True) chrM.

+

An example of a non-standard contig is: ##contig=<ID=chr3_GL000221v1_random,length=155397,assembly=GRCh38>

+
+
Parameters:
+
    +
  • name (str) – Name to use for new reference.

  • +
  • build (str) – Reference genome build to use as starting reference genome.

  • +
  • keep_contigs (List[str]) – Contigs to keep from reference genome defined by build. Default is autosomes and sex chromosomes.

  • +
  • keep_chrM (bool) – Whether to keep chrM. Default is True.

  • +
+
+
Return type:
+

ReferenceGenome

+
+
Returns:
+

Reference genome for VCF export containing only contigs in keep_contigs.

+
+
+
+ +
+
+gnomad.utils.vcf.rekey_new_reference(t, reference)[source]
+

Re-key Table or MatrixTable with a new reference genome.

+
+
Parameters:
+
    +
  • t (Union[Table, MatrixTable]) – Input Table or MatrixTable to re-key.

  • +
  • reference (ReferenceGenome) – New reference genome, e.g. one built with build_vcf_export_reference.

  • +
+
Return type:
+

Union[Table, MatrixTable]

+
+
Returns:
+

Re-keyed Table/MatrixTable

+
+
+
+ +
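These two functions are typically used together; a minimal sketch (assumes ht is the Table being exported):

from gnomad.utils.vcf import build_vcf_export_reference, rekey_new_reference
+
+export_reference = build_vcf_export_reference("gnomAD_GRCh38")
+# Re-key the rows so loci use the trimmed reference before VCF export.
+ht = rekey_new_reference(ht, export_reference)
+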
+ + +
+
+ +
+
+
+
diff --git a/api_reference/utils/vep.html b/api_reference/utils/vep.html
new file mode 100644
index 000000000..c2f8bd676
--- /dev/null
+++ b/api_reference/utils/vep.html
@@ -0,0 +1,646 @@
+gnomad.utils.vep — gnomad master documentation
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

gnomad.utils.vep

+

gnomad.utils.vep.CURRENT_VEP_VERSION

Versions of VEP used in gnomAD data; the latest version is 105.

gnomad.utils.vep.CSQ_CODING

Constant containing all coding consequences.

gnomad.utils.vep.CSQ_SPLICE

Constant containing all splice consequences.

gnomad.utils.vep.POSSIBLE_REFS

Constant containing supported references

gnomad.utils.vep.VEP_CONFIG_PATH

Constant that contains the local path to the VEP config file

gnomad.utils.vep.VEP_CSQ_FIELDS

Constant that defines the order of VEP annotations used in VCF export, currently stored in a dictionary with the VEP version as the key.

gnomad.utils.vep.VEP_CSQ_HEADER

Constant that contains description for VEP used in VCF export.

gnomad.utils.vep.LOFTEE_LABELS

Constant that contains annotations added by LOFTEE.

gnomad.utils.vep.LOF_CSQ_SET

Set containing loss-of-function consequence strings.

gnomad.utils.vep.get_vep_help([vep_config_path])

Return the output of vep --help which includes the VEP version.

gnomad.utils.vep.get_vep_context([ref])

Get VEP context resource for the genome build ref.

gnomad.utils.vep.vep_or_lookup_vep(ht[, ...])

VEP a table, or lookup variants in a reference database.

gnomad.utils.vep.add_most_severe_consequence_to_consequence(tc)

Add most_severe_consequence annotation to transcript consequences.

gnomad.utils.vep.process_consequences(mt[, ...])

Add most_severe_consequence into [vep_root].transcript_consequences, and worst_csq_by_gene, any_lof into [vep_root].

gnomad.utils.vep.filter_vep_to_canonical_transcripts(mt)

Filter VEP transcript consequences to those in the canonical transcript.

gnomad.utils.vep.filter_vep_to_mane_select_transcripts(mt)

Filter VEP transcript consequences to those in the MANE Select transcript.

gnomad.utils.vep.filter_vep_to_synonymous_variants(mt)

Filter VEP transcript consequences to those with a most severe consequence of 'synonymous_variant'.

gnomad.utils.vep.filter_vep_to_gene_list(t, ...)

Filter VEP transcript consequences to those in a set of genes.

gnomad.utils.vep.vep_struct_to_csq(vep_expr)

Given a VEP Struct, returns an array of VEP VCF CSQ strings (one per consequence in the struct).

gnomad.utils.vep.get_most_severe_consequence_for_summary(ht)

Prepare a hail Table for summary statistics generation.

gnomad.utils.vep.filter_vep_transcript_csqs(t)

Filter VEP transcript consequences based on specified criteria, and optionally filter to variants where transcript consequences is not empty after filtering.

gnomad.utils.vep.add_most_severe_csq_to_tc_within_vep_root(t)

Add most_severe_consequence annotation to 'transcript_consequences' within the vep root annotation.

gnomad.utils.vep.explode_by_vep_annotation(t)

Explode the specified VEP annotation on the input Table/MatrixTable.

+
+
+gnomad.utils.vep.CURRENT_VEP_VERSION = '105'
+

Versions of VEP used in gnomAD data; the latest version is 105.

+
+ +
+
+gnomad.utils.vep.CSQ_CODING = ['transcript_ablation', 'splice_acceptor_variant', 'splice_donor_variant', 'stop_gained', 'frameshift_variant', 'stop_lost', 'start_lost', 'initiator_codon_variant', 'transcript_amplification', 'inframe_insertion', 'inframe_deletion', 'missense_variant', 'protein_altering_variant', 'splice_region_variant', 'incomplete_terminal_codon_variant', 'start_retained_variant', 'stop_retained_variant', 'synonymous_variant', 'coding_sequence_variant']
+

Constant containing all coding consequences.

+
+ +
+
+gnomad.utils.vep.CSQ_SPLICE = ['splice_acceptor_variant', 'splice_donor_variant', 'splice_region_variant']
+

Constant containing all splice consequences.

+
+ +
+
+gnomad.utils.vep.POSSIBLE_REFS = ('GRCh37', 'GRCh38')
+

Constant containing supported references

+
+ +
+
+gnomad.utils.vep.VEP_CONFIG_PATH = 'file:///vep_data/vep-gcloud.json'
+

Constant that contains the local path to the VEP config file

+
+ +
+
+gnomad.utils.vep.VEP_CSQ_FIELDS = {'101': 'Allele|Consequence|IMPACT|SYMBOL|Gene|Feature_type|Feature|BIOTYPE|EXON|INTRON|HGVSc|HGVSp|cDNA_position|CDS_position|Protein_position|Amino_acids|Codons|ALLELE_NUM|DISTANCE|STRAND|VARIANT_CLASS|MINIMISED|SYMBOL_SOURCE|HGNC_ID|CANONICAL|TSL|APPRIS|CCDS|ENSP|SWISSPROT|TREMBL|UNIPARC|GENE_PHENO|SIFT|PolyPhen|DOMAINS|HGVS_OFFSET|MOTIF_NAME|MOTIF_POS|HIGH_INF_POS|MOTIF_SCORE_CHANGE|LoF|LoF_filter|LoF_flags|LoF_info', '105': 'Allele|Consequence|IMPACT|SYMBOL|Gene|Feature_type|Feature|BIOTYPE|EXON|INTRON|HGVSc|HGVSp|cDNA_position|CDS_position|Protein_position|Amino_acids|Codons|ALLELE_NUM|DISTANCE|STRAND|FLAGS|VARIANT_CLASS|SYMBOL_SOURCE|HGNC_ID|CANONICAL|MANE_SELECT|MANE_PLUS_CLINICAL|TSL|APPRIS|CCDS|ENSP|UNIPROT_ISOFORM|SOURCE|SIFT|PolyPhen|DOMAINS|miRNA|HGVS_OFFSET|PUBMED|MOTIF_NAME|MOTIF_POS|HIGH_INF_POS|MOTIF_SCORE_CHANGE|TRANSCRIPTION_FACTORS|LoF|LoF_filter|LoF_flags|LoF_info'}
+

Constant that defines the order of VEP annotations used in VCF export, currently stored in a dictionary with the VEP version as the key.

+
+ +
+
+gnomad.utils.vep.VEP_CSQ_HEADER = 'Consequence annotations from Ensembl VEP. Format: Allele|Consequence|IMPACT|SYMBOL|Gene|Feature_type|Feature|BIOTYPE|EXON|INTRON|HGVSc|HGVSp|cDNA_position|CDS_position|Protein_position|Amino_acids|Codons|ALLELE_NUM|DISTANCE|STRAND|FLAGS|VARIANT_CLASS|SYMBOL_SOURCE|HGNC_ID|CANONICAL|MANE_SELECT|MANE_PLUS_CLINICAL|TSL|APPRIS|CCDS|ENSP|UNIPROT_ISOFORM|SOURCE|SIFT|PolyPhen|DOMAINS|miRNA|HGVS_OFFSET|PUBMED|MOTIF_NAME|MOTIF_POS|HIGH_INF_POS|MOTIF_SCORE_CHANGE|TRANSCRIPTION_FACTORS|LoF|LoF_filter|LoF_flags|LoF_info'
+

Constant that contains description for VEP used in VCF export.

+
+ +
+
+gnomad.utils.vep.LOFTEE_LABELS = ['HC', 'LC', 'OS']
+

Constant that contains annotations added by LOFTEE.

+
+ +
+
+gnomad.utils.vep.LOF_CSQ_SET = {'frameshift_variant', 'splice_acceptor_variant', 'splice_donor_variant', 'stop_gained'}
+

Set containing loss-of-function consequence strings.

+
+ +
+
+gnomad.utils.vep.get_vep_help(vep_config_path=None)[source]
+

Return the output of vep --help which includes the VEP version.

+
+

Warning

+

If no vep_config_path is supplied, this function will only work for Dataproc clusters created with hailctl dataproc start --vep. It assumes that the command is /path/to/vep.

+
+
+
Parameters:
+

vep_config_path (Optional[str]) – Optional path to use as the VEP config file. If None, VEP_CONFIG_URI environment variable is used

+
+
Returns:
+

VEP help string

+
+
+
+ +
+
+gnomad.utils.vep.get_vep_context(ref=None)[source]
+

Get VEP context resource for the genome build ref.

+
+
Parameters:
+

ref (Optional[str]) – Genome build. If None, hl.default_reference is used

+
+
Return type:
+

VersionedTableResource

+
+
Returns:
+

VEPed context resource

+
+
+
+ +
+
+gnomad.utils.vep.vep_or_lookup_vep(ht, reference_vep_ht=None, reference=None, vep_config_path=None, vep_version=None)[source]
+

VEP a table, or lookup variants in a reference database.

+
+

Warning

+

If reference_vep_ht is supplied, no check is performed to confirm reference_vep_ht was +generated with the same version of VEP / VEP configuration as the VEP referenced in vep_config_path.

+
+
+
Parameters:
+
    +
  • ht – Input Table

  • +
  • reference_vep_ht – A reference database with VEP annotations (must be in top-level vep)

  • +
  • reference – If reference_vep_ht is not specified, find a suitable one in reference (if None, grabs from hl.default_reference)

  • +
  • vep_config_path – vep_config to pass to hl.vep (if None, a suitable one for reference is chosen)

  • +
  • vep_version – Version of VEPed context Table to use (if None, the default vep_context resource will be used)

  • +
+
+
Returns:
+

VEPed Table

+
+
+
+ +
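A minimal sketch (assumes a Hail Table ht keyed by locus and alleles):

from gnomad.utils.vep import vep_or_lookup_vep
+
+# Looks up variants in the released VEP context table where possible and
+# only runs hl.vep on the variants missing from it.
+ht = vep_or_lookup_vep(ht, reference="GRCh38")
+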
+
+gnomad.utils.vep.add_most_severe_consequence_to_consequence(tc)[source]
+

Add most_severe_consequence annotation to transcript consequences.

+

This is for a given transcript, as there are often multiple annotations for a single transcript: +e.g. splice_region_variant&intron_variant -> splice_region_variant

+
+
Parameters:
+

tc (StructExpression) – Transcript consequence StructExpression to annotate.

+
+
Return type:
+

StructExpression

+
+
+
+ +
+
+gnomad.utils.vep.process_consequences(mt, vep_root='vep', penalize_flags=True, csq_order=None, has_polyphen=True)[source]
+

Add most_severe_consequence into [vep_root].transcript_consequences, and worst_csq_by_gene, any_lof into [vep_root].

+

most_severe_consequence is the worst consequence for a transcript.

+
+

Note

+

From gnomAD v4.0 on, the PolyPhen annotation was removed from the VEP Struct +in the release HTs. When using this function with gnomAD v4.0 or later, +set has_polyphen to False.

+
+
+
Parameters:
+
    +
  • mt (Union[MatrixTable, Table]) – Input Table or MatrixTable.

  • +
  • vep_root (str) – Root for VEP annotation (probably “vep”).

  • +
  • penalize_flags (bool) – Whether to penalize LOFTEE flagged variants, or treat them +as equal to HC.

  • +
  • csq_order (Optional[List[str]]) – Optional list indicating the order of VEP consequences, sorted +from high to low impact. Default is None, which uses the value of the +CSQ_ORDER global.

  • +
  • has_polyphen (bool) – Whether the input VEP Struct has a PolyPhen annotation which +will be used to modify the consequence score. Default is True.

  • +
+
+
Return type:
+

Union[MatrixTable, Table]

+
+
Returns:
+

MT with better formatted consequences.

+
+
+
+ +
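For example, following the note above for gnomAD v4.0+ release HTs:

from gnomad.utils.vep import process_consequences
+
+ht = process_consequences(ht, vep_root="vep", has_polyphen=False)
+worst_by_gene = ht.vep.worst_csq_by_gene  # added by process_consequences
+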
+
+gnomad.utils.vep.filter_vep_to_canonical_transcripts(mt, vep_root='vep', filter_empty_csq=False)[source]
+

Filter VEP transcript consequences to those in the canonical transcript.

+
+
Parameters:
+
    +
  • mt (Union[MatrixTable, Table]) – Input Table or MatrixTable.

  • +
  • vep_root (str) – Name used for VEP annotation. Default is ‘vep’.

  • +
  • filter_empty_csq (bool) – Whether to filter out rows where ‘transcript_consequences’ is empty. Default is False.

  • +
+
+
Return type:
+

Union[MatrixTable, Table]

+
+
Returns:
+

Table or MatrixTable with VEP transcript consequences filtered.

+
+
+
+ +
+
+gnomad.utils.vep.filter_vep_to_mane_select_transcripts(mt, vep_root='vep', filter_empty_csq=False)[source]
+

Filter VEP transcript consequences to those in the MANE Select transcript.

+
+
Parameters:
+
    +
  • mt (Union[MatrixTable, Table]) – Input Table or MatrixTable.

  • +
  • vep_root (str) – Name used for VEP annotation. Default is ‘vep’.

  • +
  • filter_empty_csq (bool) – Whether to filter out rows where ‘transcript_consequences’ is empty. Default is False.

  • +
+
+
Return type:
+

Union[MatrixTable, Table]

+
+
Returns:
+

Table or MatrixTable with VEP transcript consequences filtered.

+
+
+
+ +
+
+gnomad.utils.vep.filter_vep_to_synonymous_variants(mt, vep_root='vep', filter_empty_csq=False)[source]
+

Filter VEP transcript consequences to those with a most severe consequence of ‘synonymous_variant’.

+
+
Parameters:
+
    +
  • mt (Union[MatrixTable, Table]) – Input Table or MatrixTable.

  • +
  • vep_root (str) – Name used for VEP annotation. Default is ‘vep’.

  • +
  • filter_empty_csq (bool) – Whether to filter out rows where ‘transcript_consequences’ is empty. Default is False.

  • +
+
+
Return type:
+

Union[MatrixTable, Table]

+
+
Returns:
+

Table or MatrixTable with VEP transcript consequences filtered.

+
+
+
+ +
+
+gnomad.utils.vep.filter_vep_to_gene_list(t, genes, match_by_gene_symbol=False, vep_root='vep', filter_empty_csq=False)[source]
+

Filter VEP transcript consequences to those in a set of genes.

+
+

Note

+

Filtering to a list of genes by their ‘gene_id’ or ‘gene_symbol’ will filter to +all variants that are annotated to the gene, including +[‘upstream_gene_variant’, ‘downstream_gene_variant’], which will not be the +same as if you filter to a gene interval. If you only want variants inside +certain gene boundaries and a faster filter, you can first filter t to an +interval list and then apply this filter.

+
+
+
Parameters:
+
    +
  • t (Union[MatrixTable, Table]) – Input Table or MatrixTable.

  • +
  • genes (List[str]) – Genes of interest to filter VEP transcript consequences to.

  • +
  • match_by_gene_symbol (bool) – Whether to match values in genes to VEP transcript +consequences by ‘gene_symbol’ instead of ‘gene_id’. Default is False.

  • +
  • vep_root (str) – Name used for VEP annotation. Default is ‘vep’.

  • +
  • filter_empty_csq (bool) – Whether to filter out rows where ‘transcript_consequences’ +is empty. Default is False.

  • +
+
+
Returns:
+

Table or MatrixTable with VEP transcript consequences filtered.

+
+
+
+ +
+
+gnomad.utils.vep.vep_struct_to_csq(vep_expr, csq_fields='Allele|Consequence|IMPACT|SYMBOL|Gene|Feature_type|Feature|BIOTYPE|EXON|INTRON|HGVSc|HGVSp|cDNA_position|CDS_position|Protein_position|Amino_acids|Codons|ALLELE_NUM|DISTANCE|STRAND|FLAGS|VARIANT_CLASS|SYMBOL_SOURCE|HGNC_ID|CANONICAL|MANE_SELECT|MANE_PLUS_CLINICAL|TSL|APPRIS|CCDS|ENSP|UNIPROT_ISOFORM|SOURCE|SIFT|PolyPhen|DOMAINS|miRNA|HGVS_OFFSET|PUBMED|MOTIF_NAME|MOTIF_POS|HIGH_INF_POS|MOTIF_SCORE_CHANGE|TRANSCRIPTION_FACTORS|LoF|LoF_filter|LoF_flags|LoF_info', has_polyphen_sift=True)[source]
+

Given a VEP Struct, returns an array of VEP VCF CSQ strings (one per consequence in the struct).

+

The fields and their order will correspond to those passed in csq_fields, which corresponds to the +VCF header that is required to interpret the VCF CSQ INFO field.

+

Note that the order is flexible and that all fields that are in the default value are supported. +These fields will be formatted in the same way that their VEP CSQ counterparts are.

+

Other fields can be added if their names are the same as those in the struct; their values will be the result of calling hl.str(), so they may differ from their usual VEP CSQ representation.

+
+
Parameters:
+
    +
  • vep_expr (StructExpression) – The input VEP Struct

  • +
  • csq_fields (str) – The | delimited list of fields to include in the CSQ (in that order), default is the CSQ fields of the CURRENT_VEP_VERSION.

  • +
  • has_polyphen_sift (bool) – Whether the input VEP Struct has PolyPhen and SIFT annotations. Default is True.

  • +
+
+
Return type:
+

ArrayExpression

+
+
Returns:
+

The corresponding CSQ strings

+
+
+
+ +
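One common pattern is to store the CSQ strings in the INFO struct before export (a sketch; the field name vep is a convention, not a requirement):

from gnomad.utils.vep import VEP_CSQ_HEADER, vep_struct_to_csq
+
+ht = ht.annotate(info=ht.info.annotate(vep=vep_struct_to_csq(ht.vep)))
+header = {"info": {"vep": {"Description": VEP_CSQ_HEADER, "Number": "."}}}
+# hl.export_vcf(ht, path, metadata=header)
+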
+
+gnomad.utils.vep.get_most_severe_consequence_for_summary(ht, csq_order=['transcript_ablation', 'splice_acceptor_variant', 'splice_donor_variant', 'stop_gained', 'frameshift_variant', 'stop_lost', 'start_lost', 'initiator_codon_variant', 'transcript_amplification', 'inframe_insertion', 'inframe_deletion', 'missense_variant', 'protein_altering_variant', 'splice_region_variant', 'incomplete_terminal_codon_variant', 'start_retained_variant', 'stop_retained_variant', 'synonymous_variant', 'coding_sequence_variant', 'mature_miRNA_variant', '5_prime_UTR_variant', '3_prime_UTR_variant', 'non_coding_transcript_exon_variant', 'non_coding_exon_variant', 'intron_variant', 'NMD_transcript_variant', 'non_coding_transcript_variant', 'nc_transcript_variant', 'upstream_gene_variant', 'downstream_gene_variant', 'TFBS_ablation', 'TFBS_amplification', 'TF_binding_site_variant', 'regulatory_region_ablation', 'regulatory_region_amplification', 'feature_elongation', 'regulatory_region_variant', 'feature_truncation', 'intergenic_variant'], loftee_labels=['HC', 'LC', 'OS'])[source]
+

Prepare a hail Table for summary statistics generation.

+
+
Adds the following annotations:
    +
  • most_severe_csq: Most severe consequence for variant

  • +
  • protein_coding: Whether the variant is present on a protein-coding transcript

  • +
  • lof: Whether the variant is a loss-of-function variant

  • +
  • no_lof_flags: Whether the variant has any LOFTEE flags (True if no flags)

  • +
+
+
+

Assumes input Table is annotated with VEP and that VEP annotations have been filtered to canonical transcripts.

+
+
Parameters:
+
    +
  • ht (Table) – Input Table.

  • +
  • csq_order (List[str]) – Order of VEP consequences, sorted from high to low impact. Default is CSQ_ORDER.

  • +
  • loftee_labels (List[str]) – Annotations added by LOFTEE. Default is LOFTEE_LABELS.

  • +
+
+
Return type:
+

Table

+
+
Returns:
+

Table annotated with VEP summary annotations.

+
+
+
+ +
+
+gnomad.utils.vep.filter_vep_transcript_csqs(t, vep_root='vep', synonymous=True, canonical=True, mane_select=False, filter_empty_csq=True, ensembl_only=True, protein_coding=False, csqs=None, keep_csqs=True, genes=None, keep_genes=True, match_by_gene_symbol=False, additional_filtering_criteria=None)[source]
+

Filter VEP transcript consequences based on specified criteria, and optionally filter to variants where transcript consequences is not empty after filtering.

+

Transcript consequences can be filtered to those where ‘most_severe_consequence’ is +‘synonymous_variant’ and/or the transcript is the canonical transcript, if the +synonymous and canonical parameter are set to True, respectively.

+

If filter_empty_csq parameter is set to True, the Table/MatrixTable is filtered +to variants where ‘transcript_consequences’ within the VEP annotation is not empty +after the specified filtering criteria is applied.

+
+
Parameters:
+
    +
  • t (Union[Table, MatrixTable]) – Input Table or MatrixTable.

  • +
  • vep_root (str) – Name used for VEP annotation. Default is ‘vep’.

  • +
  • synonymous (bool) – Whether to filter to variants where the most severe consequence +is ‘synonymous_variant’. Default is True.

  • +
  • canonical (bool) – Whether to filter to only canonical transcripts. Default is True.

  • +
  • mane_select (bool) – Whether to filter to only MANE Select transcripts. Default is +False.

  • +
  • filter_empty_csq (bool) – Whether to filter out rows where ‘transcript_consequences’ +is empty, after filtering ‘transcript_consequences’ to the specified criteria. +Default is True.

  • +
  • ensembl_only (bool) – Whether to filter to only Ensembl transcripts. This option is useful for deduplicating transcripts that are the same between RefSeq and Ensembl. Default is True.

  • +
  • protein_coding (bool) – Whether to filter to only protein-coding transcripts. +Default is False.

  • +
  • csqs (List[str]) – Optional list of consequence terms to filter to. Transcript +consequences are filtered to those where ‘most_severe_consequence’ is in the +list of consequence terms csqs. Default is None.

  • +
  • keep_csqs (bool) – Whether to keep transcript consequences that are in csqs. If +set to False, transcript consequences that are in csqs will be removed. +Default is True.

  • +
  • genes (Optional[List[str]]) – Optional list of genes to filter VEP transcript consequences to. +Default is None.

  • +
  • keep_genes (bool) – Whether to keep transcript consequences that are in genes. If +set to False, transcript consequences that are in genes will be removed. +Default is True.

  • +
  • match_by_gene_symbol (bool) – Whether to match values in genes to VEP transcript +consequences by ‘gene_symbol’ instead of ‘gene_id’. Default is False.

  • +
  • additional_filtering_criteria (Optional[List[Callable]]) – Optional list of additional filtering +criteria to apply to the VEP transcript consequences.

  • +
+
+
Return type:
+

Union[Table, MatrixTable]

+
+
Returns:
+

Table or MatrixTable filtered to specified criteria.

+
+
+
+ +
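For example, to keep canonical protein-coding missense consequences instead of the synonymous default (settings are illustrative):

from gnomad.utils.vep import filter_vep_transcript_csqs
+
+ht = filter_vep_transcript_csqs(
+    ht,
+    synonymous=False,
+    canonical=True,
+    protein_coding=True,
+    csqs=["missense_variant"],
+)
+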
+
+gnomad.utils.vep.add_most_severe_csq_to_tc_within_vep_root(t, vep_root='vep')[source]
+

Add most_severe_consequence annotation to ‘transcript_consequences’ within the vep root annotation.

+
+
Parameters:
+
    +
  • t (Union[Table, MatrixTable]) – Input Table or MatrixTable.

  • +
  • vep_root (str) – Root for vep annotation (probably vep).

  • +
+
+
Return type:
+

Union[Table, MatrixTable]

+
+
Returns:
+

Table or MatrixTable with most_severe_consequence annotation added.

+
+
+
+ +
+
+gnomad.utils.vep.explode_by_vep_annotation(t, vep_annotation='transcript_consequences', vep_root='vep')[source]
+

Explode the specified VEP annotation on the input Table/MatrixTable.

+
+
Parameters:
+
    +
  • t (Union[Table, MatrixTable]) – Input Table or MatrixTable.

  • +
  • vep_annotation (str) – Name of annotation in vep_root to explode.

  • +
  • vep_root (str) – Name used for root VEP annotation. Default is ‘vep’.

  • +
+
+
Return type:
+

Union[Table, MatrixTable]

+
+
Returns:
+

Table or MatrixTable with exploded VEP annotation.

+
+
+
+ +
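A short sketch (assumes the exploded annotation is surfaced as a top-level field named after vep_annotation):

from gnomad.utils.vep import explode_by_vep_annotation
+
+ht = explode_by_vep_annotation(ht)  # one row per transcript consequence
+ht = ht.select(gene_symbol=ht.transcript_consequences.gene_symbol)
+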
+ + +
+
+ +
+
+
+
diff --git a/api_reference/variant_qc/evaluation.html b/api_reference/variant_qc/evaluation.html
new file mode 100644
index 000000000..394f90fd2
--- /dev/null
+++ b/api_reference/variant_qc/evaluation.html
@@ -0,0 +1,348 @@
+gnomad.variant_qc.evaluation — gnomad master documentation
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

gnomad.variant_qc.evaluation

+

gnomad.variant_qc.evaluation.compute_ranked_bin(ht, ...)

Return a table with a bin for each row based on the ranking of score_expr.

gnomad.variant_qc.evaluation.compute_grouped_binned_ht(bin_ht)

Group a Table that has been annotated with bins (compute_ranked_bin or create_binned_ht).

gnomad.variant_qc.evaluation.compute_binned_truth_sample_concordance(ht, ...)

Determine the concordance (TP, FP, FN) between a truth sample within the callset and the sample’s truth data, grouped by bins computed using compute_ranked_bin.

gnomad.variant_qc.evaluation.create_truth_sample_ht(mt, ...)

Compute a table comparing a truth sample in callset vs the truth.

gnomad.variant_qc.evaluation.add_rank(ht, ...)

Add rank based on the score_expr.

+
+
+gnomad.variant_qc.evaluation.compute_ranked_bin(ht, score_expr, bin_expr={'bin': True}, compute_snv_indel_separately=True, n_bins=100, desc=True)[source]
+

Return a table with a bin for each row based on the ranking of score_expr.

+

The bin is computed by dividing the score_expr into n_bins bins containing approximately equal numbers of elements. +This is done by ranking the rows by score_expr (and a random number in cases where multiple variants have the same score) +and then assigning the variant to a bin based on its ranking.

+

If compute_snv_indel_separately is True, all items in bin_expr will be stratified by SNVs / indels for the ranking and bin calculation. Because SNV and indel rows are mutually exclusive, they are re-combined into a single annotation. For example, if we have the following four variants and scores with n_bins of 2:

Variant  Type   Score  bin (compute_snv_indel_separately=False)  bin (compute_snv_indel_separately=True)
Var1     SNV    0.1    1                                          1
Var2     SNV    0.2    1                                          2
Var3     Indel  0.3    2                                          1
Var4     Indel  0.4    2                                          2

+
+

Note

+

The bin_expr defines which data the bin(s) should be computed on. E.g., to get biallelic specific binning +and singleton specific binning, the following could be used:

+
bin_expr={
+    'biallelic_bin': ~ht.was_split,
+    'singleton_bin': ht.singleton
+}
+
+
+
+
+
Parameters:
+
    +
  • ht (Table) – Input Table

  • +
  • score_expr (NumericExpression) – Expression containing the score

  • +
  • bin_expr (Dict[str, BooleanExpression]) – Specific row grouping(s) to perform ranking and binning on (see note)

  • +
  • compute_snv_indel_separately (bool) – Should all bin_expr items be stratified by SNVs / indels

  • +
  • n_bins (int) – Number of bins to bin the data into

  • +
  • desc (bool) – Whether to bin the score in descending order

  • +
+
+
Return type:
+

Table

+
+
Returns:
+

Table with the requested bin annotations

+
+
+
+ +
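Putting the pieces together (a sketch; assumes ht has score and singleton annotations):

from gnomad.variant_qc.evaluation import compute_ranked_bin
+
+bin_ht = compute_ranked_bin(
+    ht,
+    score_expr=ht.score,
+    bin_expr={"bin": True, "singleton_bin": ht.singleton},
+    n_bins=100,
+)
+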
+
+gnomad.variant_qc.evaluation.compute_grouped_binned_ht(bin_ht, checkpoint_path=None)[source]
+

Group a Table that has been annotated with bins (compute_ranked_bin or create_binned_ht).

+

The table will be grouped by bin_id (bin, biallelic, etc.), contig, snv, bi_allelic and singleton.

+
+

Note

+

If performing an aggregation following this grouping (such as score_bin_agg), the aggregation function will need to use ht._parent to get the original Table from the GroupedTable for the aggregation.

+
+
+
Parameters:
+
    +
  • bin_ht (Table) – Input Table with a bin_id annotation

  • +
  • checkpoint_path (Optional[str]) – If provided an intermediate checkpoint table is created with all required annotations before shuffling.

  • +
+
+
Return type:
+

GroupedTable

+
+
Returns:
+

Table grouped by bins(s)

+
+
+
+ +
+
+gnomad.variant_qc.evaluation.compute_binned_truth_sample_concordance(ht, binned_score_ht, n_bins=100, add_bins={})[source]
+

Determine the concordance (TP, FP, FN) between a truth sample within the callset and the sample’s truth data, grouped by bins computed using compute_ranked_bin.

+
+

Note

+
+
The input ht should contain three row fields:
    +
  • score: value to use for binning

  • +
  • GT: a CallExpression containing the genotype of the evaluation data for the sample

  • +
  • truth_GT: a CallExpression containing the genotype of the truth sample

  • +
+
+
The input binned_score_ht should contain:
    +
  • score: value used to bin the full callset

  • +
  • bin: the full callset bin

  • +
+
+
+
+

add_bins can be used to add additional global and truth sample binning to the final binned truth sample concordance HT. The keys in add_bins must be present in binned_score_ht, and the values in add_bins should be expressions on ht that define a subset of variants to bin in the truth sample. For example, to look at the global and truth sample binning on only bi-allelic variants, add_bins could be set to {‘biallelic_bin’: ht.biallelic}.

+

The table is grouped by global/truth sample bin and variant type and contains TP, FP and FN.

+
+
Parameters:
+
    +
  • ht (Table) – Input HT

  • +
  • binned_score_ht (Table) – Table with the bin annotation for each variant

  • +
  • n_bins (int) – Number of bins to bin the data into

  • +
  • add_bins (Dict[str, BooleanExpression]) – Dictionary of additional global bin columns (key) and the expr to use for binning the truth sample (value)

  • +
+
+
Return type:
+

Table

+
+
Returns:
+

Binned truth sample concordance HT

+
+
+
+ +
+
+gnomad.variant_qc.evaluation.create_truth_sample_ht(mt, truth_mt, high_confidence_intervals_ht)[source]
+

Compute a table comparing a truth sample in callset vs the truth.

+
+
Parameters:
+
    +
  • mt (MatrixTable) – MT of truth sample from callset to be compared to truth

  • +
  • truth_mt (MatrixTable) – MT of truth sample

  • +
  • high_confidence_intervals_ht (Table) – High confidence interval HT

  • +
+
+
Return type:
+

Table

+
+
Returns:
+

Table containing both the callset truth sample and the truth data

+
+
+
+ +
+
+gnomad.variant_qc.evaluation.add_rank(ht, score_expr, subrank_expr=None)[source]
+

Add rank based on the score_expr. Rank is added for SNVs and indels separately.

+

If one or more subrank_expr are provided, then subrank is added based on all sites for which the boolean expression is true.

+

In addition, variant counts (SNVs and indels separately) are added as a global (rank_variant_counts).

+
+
Parameters:
+
    +
  • ht (Table) – input Hail Table containing variants (with QC annotations) to be ranked

  • +
  • score_expr (NumericExpression) – the Table annotation by which ranking should be scored

  • +
  • subrank_expr (Optional[Dict[str, BooleanExpression]]) – Any subranking to be added in the form name_of_subrank: subrank_filtering_expr

  • +
+
+
Return type:
+

Table

+
+
Returns:
+

Table with rankings added

+
+
+
+ +
+ + +
+
+ +
+
+
+
diff --git a/api_reference/variant_qc/index.html b/api_reference/variant_qc/index.html
new file mode 100644
index 000000000..2370b8dea
--- /dev/null
+++ b/api_reference/variant_qc/index.html
@@ -0,0 +1,174 @@
+gnomad.variant_qc — gnomad master documentation
diff --git a/api_reference/variant_qc/ld.html b/api_reference/variant_qc/ld.html
new file mode 100644
index 000000000..99b020abe
--- /dev/null
+++ b/api_reference/variant_qc/ld.html
@@ -0,0 +1,229 @@
+gnomad.variant_qc.ld — gnomad master documentation
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

gnomad.variant_qc.ld

+

gnomad.variant_qc.ld.get_r_human_readable(...)

gnomad.variant_qc.ld.get_r_for_pair_of_variants(bm, ...)

Get r value (LD) for pair of variants var1 and var2.

gnomad.variant_qc.ld.get_r_within_gene_in_pop(...)

Get LD information (r) for all pairs of variants within gene for a given pop.

gnomad.variant_qc.ld.get_r_within_gene(bm, ...)

Get LD information (r) for all pairs of variants within gene.

+
+
+gnomad.variant_qc.ld.get_r_human_readable(pop, var1, var2, ref_genome='GRCh37')[source]
+
+
Parameters:
+
    +
  • pop (str) – Population for which to get LD information.

  • +
  • var1 (str) – First variant, as a string.

  • +
  • var2 (str) – Second variant, as a string.

  • +
  • ref_genome (str) – Reference genome build. Default is ‘GRCh37’.

  • +
+
+
+
+ +
+
+gnomad.variant_qc.ld.get_r_for_pair_of_variants(bm, ld_index, var1, var2)[source]
+

Get r value (LD) for pair of variants var1 and var2.

+
bm = get_ld_matrix('nfe')
+ld_index = get_ld_index('nfe')
+var1 = (hl.parse_locus('1:10146', 'GRCh37'), ['AC', 'A'])
+var2 = (hl.parse_locus('1:10151', 'GRCh37'), ['TA', 'T'])
+get_r_for_pair_of_variants(bm, ld_index, var1, var2)
+# 0.01789767935482124
+
+
+
+
Parameters:
+
    +
  • bm (BlockMatrix) – Input BlockMatrix

  • +
  • ld_index (Table) – Corresponding index table

  • +
  • var1 ((tlocus, tarray)) – Tuple of locus and alleles

  • +
  • var2 ((tlocus, tarray)) – Tuple of locus and alleles

  • +
+
+
Returns:
+

Correlation (r) between two variants

+
+
+
+ +
+
+gnomad.variant_qc.ld.get_r_within_gene_in_pop(pop, gene)[source]
+

Get LD information (r) for all pairs of variants within gene for a given pop.

+

Warning: this returns a table quadratic in the number of variants. Exercise caution with large genes.

+
+
Parameters:
+
    +
  • pop (str) – Population for which to get LD information

  • +
  • gene (str) – Gene symbol as string

  • +
+
+
Returns:
+

Table with pairs of variants

+
+
+
+ +
+
+gnomad.variant_qc.ld.get_r_within_gene(bm, ld_index, gene, vep_ht=None, reference_genome=None)[source]
+

Get LD information (r) for all pairs of variants within gene.

+

Warning: this returns a table quadratic in the number of variants. Exercise caution with large genes.

+
+
Parameters:
+
    +
  • bm (BlockMatrix) – Input Block Matrix

  • +
  • ld_index (Table) – Corresponding index table

  • +
  • gene (str) – Gene symbol as string

  • +
  • vep_ht (Table) – Table with VEP annotations (if None, gets from get_gnomad_public_data())

  • +
  • reference_genome (str) – Reference genome to pass to get_gene_intervals for fast filtering to gene

  • +
+
+
Returns:
+

Table with pairs of variants

+
+
+
+ +
+ + +
+
+ +
+
+
+
diff --git a/api_reference/variant_qc/pipeline.html b/api_reference/variant_qc/pipeline.html
new file mode 100644
index 000000000..20827be29
--- /dev/null
+++ b/api_reference/variant_qc/pipeline.html
@@ -0,0 +1,400 @@
+gnomad.variant_qc.pipeline — gnomad master documentation
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

gnomad.variant_qc.pipeline

+

gnomad.variant_qc.pipeline.create_binned_ht(ht)

Annotate each row of ht with a bin based on binning the score annotation into n_bins equally-sized bins.

gnomad.variant_qc.pipeline.score_bin_agg(ht, ...)

Make dict of aggregations for min/max of score, number of ClinVar variants, number of truth variants, and family statistics.

gnomad.variant_qc.pipeline.generate_trio_stats(mt)

Run generate_trio_stats_expr with variant QC pipeline defaults to get trio stats stratified by raw and adj.

gnomad.variant_qc.pipeline.generate_sib_stats(mt, ...)

Generate a hail table with counts of variants shared by pairs of siblings in relatedness_ht.

gnomad.variant_qc.pipeline.train_rf_model(ht, ...)

Perform random forest (RF) training using a Table annotated with features and training data.

+
+
+gnomad.variant_qc.pipeline.create_binned_ht(ht, n_bins=100, singleton=True, biallelic=True, adj=True, add_substrat=None)[source]
+

Annotate each row of ht with a bin based on binning the score annotation into n_bins equally-sized bins.

+

This is meant as a default wrapper for compute_ranked_bin.

+
+

Note

+
+
The following fields should be present:
    +
  • score

  • +
  • ac - expected that this is the adj filtered allele count

  • +
  • ac_raw - expected that this is the raw allele count before adj filtering

  • +
+
+
+
+
+
Computes bin numbers stratified by SNV / Indels and with the following optional sub bins
    +
  • singletons

  • +
  • biallelics

  • +
  • biallelic singletons

  • +
  • adj

  • +
  • adj biallelics

  • +
  • adj singletons

  • +
  • adj biallelic singletons

  • +
+
+
+
+
Parameters:
+
    +
  • ht (Table) – Input table

  • +
  • n_bins (int) – Number of bins to bin into

  • +
  • singleton (bool) – Should bins be stratified by singletons

  • +
  • biallelic (bool) – Should bins be stratified by bi-alleleic variants

  • +
  • adj (bool) – Should bins be stratified by adj filtering

  • +
  • add_substrat (Optional[Dict[str, BooleanExpression]]) – Any additional stratifications for adding bins

  • +
+
+
Return type:
+

Table

+
+
Returns:
+

table with bin number for each variant

+
+
+
+ +
+
+gnomad.variant_qc.pipeline.score_bin_agg(ht, fam_stats_ht)[source]
+

Make dict of aggregations for min/max of score, number of ClinVar variants, number of truth variants, and family statistics.

+
+

Note

+

This function uses ht._parent to get the original Table from the GroupedTable for the aggregation.

+
+

This can easily be combined with the GroupedTable returned by compute_grouped_binned_ht. For example:

+
binned_ht = create_binned_ht(...)
+grouped_binned_ht = compute_grouped_binned_ht(binned_ht)
+agg_ht = grouped_binned_ht.aggregate(score_bin_agg(**grouped_binned_ht, ...))
+
+
+
+

Note

+

The following annotations should be present:

+
+
In ht:
    +
  • score

  • +
  • singleton

  • +
  • positive_train_site

  • +
  • negative_train_site

  • +
  • ac_raw - expected that this is the raw allele count before adj filtering

  • +
  • ac - expected that this is the allele count after adj filtering

  • +
  • ac_qc_samples_unrelated_raw - allele count before adj filtering for unrelated samples passing sample QC

  • +
  • info - struct that includes QD, FS, and MQ in order to add an annotation for fail_hard_filters

  • +
+
+
In truth_ht:
    +
  • omni

  • +
  • mills

  • +
  • hapmap

  • +
  • kgp_phase1_hc

  • +
+
+
In fam_stats_ht:
    +
  • n_de_novos_adj

  • +
  • n_de_novos_raw

  • +
  • n_transmitted_raw

  • +
  • n_untransmitted_raw

  • +
+
+
+
+
+
Automatic aggregations that will be done are:
    +
  • min_score - minimum of score annotation per group

  • +
  • max_score - maximum of score annotation per group

  • +
  • n - count of variants per group

  • +
  • n_ins - count of insertions per group

  • +
  • n_del - count of deletions per group

  • +
  • n_ti - count of transitions per group

  • +
  • n_tv - count of transversions per group

  • +
  • n_1bp_indel - count of one base pair indels per group

  • +
  • n_mod3bp_indel - count of indels with a length divisible by three per group

  • +
  • n_singleton - count of singletons per group

  • +
  • fail_hard_filters - count of variants per group with QD < 2 | FS > 60 | MQ < 30

  • +
  • n_vqsr_pos_train - count of variants that were a VQSR positive train site per group

  • +
  • n_vqsr_neg_train - count of variants that were a VQSR negative train site per group

  • +
  • n_clinvar - count of clinvar variants

  • +
  • n_de_novos_singleton_adj - count of singleton de novo variants after adj filtration

  • +
  • n_de_novo_singleton - count of raw unfiltered singleton de novo variants

  • +
  • n_de_novos_adj - count of adj filtered de novo variants

  • +
  • n_de_novos - count of raw unfiltered de novo variants

  • +
  • n_trans_singletons - count of transmitted singletons

  • +
  • n_untrans_singletons - count of untransmitted singletons

  • +
  • n_omni - count of omni truth variants

  • +
  • n_mills - count of mills truth variants

  • +
  • n_hapmap - count of hapmap truth variants

  • +
  • n_kgp_phase1_hc - count of 1000 genomes phase 1 high confidence truth variants

  • +
+
+
+
+
Parameters:
+
    +
  • ht (GroupedTable) – Table that aggregation will be performed on

  • +
  • fam_stats_ht (Table) – Path to family statistics HT

  • +
+
+
Return type:
+

Dict[str, Aggregation]

+
+
Returns:
+

a dictionary containing aggregations to perform on ht

+
+
+
+ +
+
+gnomad.variant_qc.pipeline.generate_trio_stats(mt, autosomes_only=True, bi_allelic_only=True)[source]
+

Run generate_trio_stats_expr with variant QC pipeline defaults to get trio stats stratified by raw and adj.

+
+

Note

+

Expects that mt is a trio matrix table that was annotated with adj. If dealing with a sparse MT, hl.experimental.densify must be run first.

+

By default this pipeline function will filter mt to only autosomes and bi-allelic sites.

+
+
+
Parameters:
  • mt (MatrixTable) – A Trio Matrix Table returned from hl.trio_matrix. Must be dense.
  • autosomes_only (bool) – If set, only autosomal intervals are used.
  • bi_allelic_only (bool) – If set, only bi-allelic sites are used for the computation.

Return type:
  Table

Returns:
  Table with trio stats
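
A minimal usage sketch, assuming trio_mt is a dense trio MatrixTable from hl.trio_matrix that has already been annotated with adj:

from gnomad.variant_qc.pipeline import generate_trio_stats

trio_stats_ht = generate_trio_stats(trio_mt, autosomes_only=True, bi_allelic_only=True)
trio_stats_ht.describe()
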
gnomad.variant_qc.pipeline.generate_sib_stats(mt, relatedness_ht, i_col='i', j_col='j', relationship_col='relationship', autosomes_only=True, bi_allelic_only=True)[source]

Generate a hail table with counts of variants shared by pairs of siblings in relatedness_ht.

This is meant as a default wrapper for generate_sib_stats_expr.

This function takes a hail Table with a row for each pair of individuals i,j in the data that are related (it's OK to have unrelated samples too).

The relationship_col should be a column specifying the relationship between each two samples as defined by the constants in gnomad.utils.relatedness. This relationship_col will be used to filter to only pairs of samples that are annotated as SIBLINGS.

Note

By default this pipeline function will filter mt to only autosomes and bi-allelic sites.

Parameters:
  • mt (MatrixTable) – Input Matrix table
  • relatedness_ht (Table) – Input relationship table
  • i_col (str) – Column containing the 1st sample of the pair in the relationship table
  • j_col (str) – Column containing the 2nd sample of the pair in the relationship table
  • relationship_col (str) – Column containing the relationship for the sample pair as defined in this module constants.
  • autosomes_only (bool) – If set, only autosomal intervals are used.
  • bi_allelic_only (bool) – If set, only bi-allelic sites are used for the computation

Return type:
  Table

Returns:
  A Table with the sibling shared variant counts
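
A minimal sketch, assuming relatedness_ht uses the default i/j/relationship column names:

from gnomad.variant_qc.pipeline import generate_sib_stats

sib_stats_ht = generate_sib_stats(mt, relatedness_ht)
sib_stats_ht.describe()
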
gnomad.variant_qc.pipeline.train_rf_model(ht, rf_features, tp_expr, fp_expr, fp_to_tp=1.0, num_trees=500, max_depth=5, test_expr=False)[source]

Perform random forest (RF) training using a Table annotated with features and training data.

Note

This function uses train_rf and extends it by:
  • Adding an option to apply the resulting model to test variants which are withheld from training.
  • Using a false positive (FP) to true positive (TP) ratio to determine what variants to use for RF training.

The returned Table includes the following annotations:
  • rf_train: indicates if the variant was used for training of the RF model.
  • rf_label: indicates if the variant is a TP or FP.
  • rf_test: indicates if the variant was used in testing of the RF model.
  • features: global annotation of the features used for the RF model.
  • features_importance: global annotation of the importance of each feature in the model.
  • test_results: results from testing the model on variants defined by test_expr.

Parameters:
  • ht (Table) – Table annotated with features for the RF model and the positive and negative training data.
  • rf_features (List[str]) – List of column names to use as features in the RF training.
  • tp_expr (BooleanExpression) – TP training expression.
  • fp_expr (BooleanExpression) – FP training expression.
  • fp_to_tp (float) – Ratio of FPs to TPs for creating the RF model. If set to 0, all training examples are used.
  • num_trees (int) – Number of trees in the RF model.
  • max_depth (int) – Maximum tree depth in the RF model.
  • test_expr (BooleanExpression) – An expression specifying variants to hold out for testing and use for evaluation only.

Return type:
  Tuple[Table, PipelineModel]

Returns:
  Table with TP and FP training sets used in the RF training and the resulting RF model.
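
A hedged sketch of a call; the feature names and truth annotations below are hypothetical stand-ins for whatever ht actually carries:

from gnomad.variant_qc.pipeline import train_rf_model

rf_ht, rf_model = train_rf_model(
    ht,
    rf_features=["info_QD", "info_MQ"],    # hypothetical feature columns
    tp_expr=ht.omni | ht.mills,            # hypothetical TP truth annotations
    fp_expr=ht.fail_hard_filters,          # hypothetical FP annotation
    test_expr=ht.locus.contig == "chr20",  # hold out chr20 for testing
)
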
diff --git a/api_reference/variant_qc/random_forest.html b/api_reference/variant_qc/random_forest.html
new file mode 100644
index 000000000..6460974f2
--- /dev/null
+++ b/api_reference/variant_qc/random_forest.html
@@ -0,0 +1,536 @@

gnomad.variant_qc.random_forest

  • gnomad.variant_qc.random_forest.run_rf_test(mt) – Run a dummy test RF on a given MT.
  • gnomad.variant_qc.random_forest.check_ht_fields_for_spark(ht, ...) – Check specified fields of a hail table for Spark DataFrame conversion (type and name).
  • gnomad.variant_qc.random_forest.get_columns_quantiles(ht, ...) – Compute approximate quantiles of specified numeric fields from non-missing values.
  • gnomad.variant_qc.random_forest.median_impute_features(ht) – Numerical features in the Table are median-imputed by Hail's approx_median.
  • gnomad.variant_qc.random_forest.ht_to_rf_df(ht, ...) – Create a Spark dataframe ready for RF from a HT.
  • gnomad.variant_qc.random_forest.get_features_importance(...) – Extract the features importance from a Pipeline model containing a RandomForestClassifier stage.
  • gnomad.variant_qc.random_forest.get_labels(...) – Return the labels from the StringIndexer stage at index 0 from an RF pipeline model.
  • gnomad.variant_qc.random_forest.test_model(ht, ...) – A wrapper to test a model on a set of examples with known labels.
  • gnomad.variant_qc.random_forest.apply_rf_model(ht, ...) – Apply a Random Forest (RF) pipeline model to a Table and annotate the RF probabilities and predictions.
  • gnomad.variant_qc.random_forest.save_model(...) – Save a Random Forest pipeline model.
  • gnomad.variant_qc.random_forest.load_model(...) – Load a Random Forest pipeline model.
  • gnomad.variant_qc.random_forest.train_rf(ht, ...) – Train a Random Forest (RF) pipeline model.
  • gnomad.variant_qc.random_forest.get_rf_runs(...) – Load RF run data from JSON file.
  • gnomad.variant_qc.random_forest.get_run_data(...) – Create a Dict containing information about the RF input arguments and feature importance.
  • gnomad.variant_qc.random_forest.pretty_print_runs(runs) – Print the information for the RF runs loaded from the json file storing the RF run hashes -> info.

gnomad.variant_qc.random_forest.run_rf_test(mt, output='/tmp')[source]

Run a dummy test RF on a given MT.

  1. Creates row annotations and labels to run model on
  2. Trains a RF pipeline model (including median imputation of missing values in created annotations)
  3. Saves the RF pipeline model
  4. Applies the model to the MT and prints features importance

Parameters:
  • mt (MatrixTable) – Input MT
  • output (str) – Output files prefix to save the RF model

Return type:
  Tuple[PipelineModel, Table]

Returns:
  RF model and Table after applying the RF model

gnomad.variant_qc.random_forest.check_ht_fields_for_spark(ht, fields)[source]

Check specified fields of a hail table for Spark DataFrame conversion (type and name).

Parameters:
  • ht (Table) – input Table
  • fields (List[str]) – Fields to test

Return type:
  None

Returns:
  None

gnomad.variant_qc.random_forest.get_columns_quantiles(ht, fields, quantiles, relative_error=0.001)[source]

Compute approximate quantiles of specified numeric fields from non-missing values. Non-numeric fields are ignored.

This function returns a Dict of column name -> list of quantiles in the same order specified. If a column only has NAs, None is returned.

Parameters:
  • ht (Table) – input HT
  • fields (List[str]) – list of fields to compute quantiles for
  • quantiles (List[float]) – list of quantiles to return (e.g. [0.5] would return the median)
  • relative_error (int) – The relative error on the quantile approximation

Return type:
  Dict[str, List[float]]

Returns:
  Dict of column -> quantiles
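
A minimal usage sketch; the field names are hypothetical, and any numeric row fields on ht would do:

from gnomad.variant_qc.random_forest import get_columns_quantiles

# Approximate median and quartiles for two (hypothetical) numeric fields.
quantile_dict = get_columns_quantiles(ht, fields=["info_QD", "info_MQ"], quantiles=[0.25, 0.5, 0.75])
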
gnomad.variant_qc.random_forest.median_impute_features(ht, strata=None)[source]

Numerical features in the Table are median-imputed by Hail's approx_median.

If a strata dict is given, imputation is done based on the median of each stratification.

The annotations that are added to the Table are:
  • feature_imputed - A row annotation indicating if each numerical feature was imputed or not.
  • features_median - A global annotation containing the median of the numerical features. If strata is given, this struct will also be broken down by the given strata.
  • variants_by_strata - An additional global annotation with the variant counts by strata that will only be added if imputing by a given strata.

Parameters:
  • ht (Table) – Table containing all samples and features for median imputation.
  • strata (Optional[Dict[str, Expression]]) – Optional dict of expressions specifying strata to impute features medians by (default None).

Return type:
  Table

Returns:
  Feature Table imputed using approximate median values.
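
A hedged sketch of a stratified call; the variant_type annotation is a hypothetical stand-in:

from gnomad.variant_qc.random_forest import median_impute_features

# Impute missing feature values using per-stratum medians.
imputed_ht = median_impute_features(ht, strata={"variant_type": ht.variant_type})
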
gnomad.variant_qc.random_forest.ht_to_rf_df(ht, features, label=None, index=None)[source]

Create a Spark dataframe ready for RF from a HT.

Rows with any missing features are dropped. Missing labels are replaced with 'NA'.

Note

Only basic types are supported!

Parameters:
  • ht (Table) – Input HT
  • features (List[str]) – Features that will be used for RF
  • label (str) – Optional label column that will be predicted by RF
  • index (str) – Optional index column to keep (e.g. for joining results back at a later stage)

Return type:
  DataFrame

Returns:
  Spark Dataframe
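
For example, a features Table might be converted like so (column names are hypothetical; the index column keeps a handle for joining predictions back later):

from gnomad.variant_qc.random_forest import ht_to_rf_df

rf_df = ht_to_rf_df(ht, features=["info_QD", "info_MQ"], label="rf_label", index="rf_idx")
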
gnomad.variant_qc.random_forest.get_features_importance(rf_pipeline, rf_index=-2, assembler_index=-3)[source]

Extract the features importance from a Pipeline model containing a RandomForestClassifier stage.

Parameters:
  • rf_pipeline (PipelineModel) – Input pipeline
  • rf_index (int) – index of the RandomForestClassifier stage
  • assembler_index (int) – index of the VectorAssembler stage

Return type:
  Dict[str, float]

Returns:
  feature importance for each feature in the RF model

gnomad.variant_qc.random_forest.get_labels(rf_pipeline)[source]

Return the labels from the StringIndexer stage at index 0 from an RF pipeline model.

Parameters:
  rf_pipeline (PipelineModel) – Input pipeline

Return type:
  List[str]

Returns:
  labels

gnomad.variant_qc.random_forest.test_model(ht, rf_model, features, label, prediction_col_name='rf_prediction')[source]

A wrapper to test a model on a set of examples with known labels.

  1. Runs the model on the data
  2. Prints confusion matrix and accuracy
  3. Returns confusion matrix as a list of struct

Parameters:
  • ht (Table) – Input table
  • rf_model (PipelineModel) – RF Model
  • features (List[str]) – Columns containing features that were used in the model
  • label (str) – Column containing label to be predicted
  • prediction_col_name (str) – Where to store the prediction

Return type:
  List[tstruct]

Returns:
  A list containing structs with {label, prediction, n}
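
A hedged evaluation sketch, assuming test_ht carries the same hypothetical feature columns the model was trained on plus a known rf_label:

from gnomad.variant_qc.random_forest import test_model

# Returns confusion-matrix structs of {label, prediction, n}.
results = test_model(test_ht, rf_model, features=["info_QD", "info_MQ"], label="rf_label")
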
gnomad.variant_qc.random_forest.apply_rf_model(ht, rf_model, features, label=None, probability_col_name='rf_probability', prediction_col_name='rf_prediction')[source]

Apply a Random Forest (RF) pipeline model to a Table and annotate the RF probabilities and predictions.

Parameters:
  • ht (Table) – Input HT
  • rf_model (PipelineModel) – Random Forest pipeline model
  • features (List[str]) – List of feature columns in the pipeline. !Should match the model list of features!
  • label (str) – Optional column containing labels. !Should match the model labels!
  • probability_col_name (str) – Name of the column that will store the RF probabilities
  • prediction_col_name (str) – Name of the column that will store the RF predictions

Return type:
  Table

Returns:
  Table with RF columns

gnomad.variant_qc.random_forest.save_model(rf_pipeline, out_path, overwrite=False)[source]

Save a Random Forest pipeline model.

Parameters:
  • rf_pipeline (PipelineModel) – Pipeline to save
  • out_path (str) – Output path
  • overwrite (bool) – If set, will overwrite existing file(s) at output location

Return type:
  None

Returns:
  Nothing

gnomad.variant_qc.random_forest.load_model(input_path)[source]

Load a Random Forest pipeline model.

Parameters:
  input_path (str) – Location of model to load

Return type:
  PipelineModel

Returns:
  Random Forest pipeline model

gnomad.variant_qc.random_forest.train_rf(ht, features, label, num_trees=500, max_depth=5)[source]

Train a Random Forest (RF) pipeline model.

Parameters:
  • ht (Table) – Input HT
  • features (List[str]) – List of columns to be used as features
  • label (str) – Column containing the label to predict
  • num_trees (int) – Number of trees to use
  • max_depth (int) – Maximum tree depth

Return type:
  PipelineModel

Returns:
  Random Forest pipeline model
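
A minimal end-to-end sketch tying train_rf, apply_rf_model and save_model together; the feature and label columns are hypothetical and assume ht has already been labeled (e.g. by sample_training_examples):

from gnomad.variant_qc.random_forest import train_rf, apply_rf_model, save_model

features = ["info_QD", "info_MQ"]  # hypothetical feature columns on ht
rf_model = train_rf(ht, features=features, label="rf_label")
ht = apply_rf_model(ht, rf_model, features=features, label="rf_label")
save_model(rf_model, "gs://my-bucket/rf.model", overwrite=True)  # hypothetical path
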
gnomad.variant_qc.random_forest.get_rf_runs(rf_json_fp)[source]

Load RF run data from JSON file.

Parameters:
  rf_json_fp (str) – File path to rf json file.

Return type:
  Dict

Returns:
  Dictionary containing the content of the JSON file, or an empty dictionary if the file wasn't found.
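
This pairs naturally with pretty_print_runs (documented below); a short sketch with a hypothetical path:

from gnomad.variant_qc.random_forest import get_rf_runs, pretty_print_runs

runs = get_rf_runs("gs://my-bucket/rf_runs.json")  # hypothetical path
pretty_print_runs(runs)
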
gnomad.variant_qc.random_forest.get_run_data(input_args, test_intervals, features_importance, test_results)[source]

Create a Dict containing information about the RF input arguments and feature importance.

Parameters:
  • input_args (Dict[str, bool]) – Dictionary of model input arguments
  • test_intervals (List[str]) – Intervals withheld from training to be used in testing
  • features_importance (Dict[str, float]) – Feature importance returned by the RF
  • test_results (List[tstruct]) – Accuracy results from applying RF model to the test intervals

Return type:
  Dict

Returns:
  Dict of RF information

gnomad.variant_qc.random_forest.pretty_print_runs(runs, label_col='rf_label', prediction_col_name='rf_prediction')[source]

Print the information for the RF runs loaded from the json file storing the RF run hashes -> info.

Parameters:
  • runs (Dict) – Dictionary containing JSON input loaded from RF run file
  • label_col (str) – Name of the RF label column
  • prediction_col_name (str) – Name of the RF prediction column

Return type:
  None

Returns:
  Nothing – only prints information
diff --git a/api_reference/variant_qc/training.html b/api_reference/variant_qc/training.html
new file mode 100644
index 000000000..c36d864b8
--- /dev/null
+++ b/api_reference/variant_qc/training.html
@@ -0,0 +1,176 @@

gnomad.variant_qc.training

gnomad.variant_qc.training.sample_training_examples(ht, ...)

Return a Table of all positive and negative training examples in ht with an annotation indicating those that should be used for training.

gnomad.variant_qc.training.sample_training_examples(ht, tp_expr, fp_expr, fp_to_tp=1.0, test_expr=None)[source]

Return a Table of all positive and negative training examples in ht with an annotation indicating those that should be used for training.

If fp_to_tp is greater than 0, this true positive (TP) to false positive (FP) ratio will be used to determine sampling of training variants.

The returned Table has the following annotations:
  • train: indicates if the variant should be used for training. A row is given False for the annotation if True for test_expr, True for both tp_expr and fp_expr, or it is pruned out to obtain the desired fp_to_tp ratio.
  • label: indicates if a variant is a 'TP' or 'FP' and will also be labeled as such for variants defined by test_expr.

Note

  • This function does not support multi-allelic variants.
  • The function will give some stats about the TPs/FPs provided (Ti, Tv, indels).

Parameters:
  • ht (Table) – Input Table.
  • tp_expr (BooleanExpression) – Expression for TP examples.
  • fp_expr (BooleanExpression) – Expression for FP examples.
  • fp_to_tp (float) – FP to TP ratio. If set to <= 0, all training examples are used.
  • test_expr (Optional[BooleanExpression]) – Optional expression to exclude a set of variants from training set. Still contains TP/FP label annotation.

Return type:
  Table

Returns:
  Table subset with corresponding TP and FP examples with desired FP to TP ratio.
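
A hedged sketch of building a training set; the truth annotations and contig filter are hypothetical stand-ins:

from gnomad.variant_qc.training import sample_training_examples

trained_ht = sample_training_examples(
    ht,
    tp_expr=ht.omni | ht.mills,         # hypothetical truth annotations
    fp_expr=ht.fail_hard_filters,       # hypothetical FP annotation
    test_expr=ht.locus.contig == "20",  # hold out one contig for testing
)
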
diff --git a/examples/index.html b/examples/index.html
new file mode 100644
index 000000000..9fad193ba
--- /dev/null
+++ b/examples/index.html
@@ -0,0 +1,118 @@

Examples

  • Variant Effect Predictor (VEP)

diff --git a/examples/vep.html b/examples/vep.html
new file mode 100644
index 000000000..b259b27a9
--- /dev/null
+++ b/examples/vep.html
@@ -0,0 +1,147 @@

Variant Effect Predictor (VEP)

To use the Ensembl Variant Effect Predictor with Hail on Google Dataproc, the --vep flag must be included when starting the cluster. Note that a cluster's VEP configuration is tied to a specific reference genome.

hailctl dataproc start cluster-name --vep GRCh37 --packages gnomad

Note

VEP data is stored in requester pays buckets. Reading from these buckets will bill charges to the project in which the cluster is created.

Import variants into a sites-only Hail Table:

import hail as hl

ds = hl.import_vcf("/path/to/data.vcf.gz", reference_genome="GRCh37", drop_samples=True).rows()

Annotate variants with VEP consequences:

from gnomad.utils.vep import vep_or_lookup_vep

ds = vep_or_lookup_vep(ds, reference="GRCh37")

vep_or_lookup_vep uses a precomputed dataset to drastically speed up this process.

Identify the most severe consequence for each variant:

from gnomad.utils.vep import process_consequences

ds = process_consequences(ds)

process_consequences adds worst_consequence_term, worst_csq_for_variant, worst_csq_by_gene and other fields to ds.vep.
diff --git a/genindex.html b/genindex.html
new file mode 100644
index 000000000..58e4ce187
--- /dev/null
+++ b/genindex.html
@@ -0,0 +1,1427 @@

Index

A | B | C | D | E | F | G | H | I | J | L | M | N | O | P | Q | R | S | T | U | V | W
diff --git a/getting_started.html b/getting_started.html
new file mode 100644
index 000000000..69956df8f
--- /dev/null
+++ b/getting_started.html
@@ -0,0 +1,160 @@

Getting Started

  1. Install Hail:

     pip install hail

  2. Use hailctl to start a Google Dataproc cluster with the gnomad package installed (see Hail on the Cloud for more detail on hailctl):

     hailctl dataproc start cluster-name --packages gnomad

  3. Connect to a Jupyter Notebook on the cluster:

     hailctl dataproc connect cluster-name notebook

  4. Import gnomAD data in Hail Table format:

     • gnomAD v2.1.1 variants:

       from gnomad.resources.grch37 import gnomad

       gnomad_v2_exomes = gnomad.public_release("exomes")
       exomes_ht = gnomad_v2_exomes.ht()
       exomes_ht.describe()

       gnomad_v2_genomes = gnomad.public_release("genomes")
       genomes_ht = gnomad_v2_genomes.ht()
       genomes_ht.describe()

     • gnomAD v3 variants:

       from gnomad.resources.grch38 import gnomad

       gnomad_v3_genomes = gnomad.public_release("genomes")
       ht = gnomad_v3_genomes.ht()
       ht.describe()

  5. Shut down the cluster when finished with it:

     hailctl dataproc stop cluster-name
diff --git a/index.html b/index.html
new file mode 100644
index 000000000..b995cd5bb
--- /dev/null
+++ b/index.html
@@ -0,0 +1,180 @@
diff --git a/objects.inv b/objects.inv
new file mode 100644
index 000000000..18fab3029
Binary files /dev/null and b/objects.inv differ
diff --git a/py-modindex.html b/py-modindex.html
new file mode 100644
index 000000000..9eb80f33c
--- /dev/null
+++ b/py-modindex.html
@@ -0,0 +1,301 @@

Python Module Index

g
  gnomad
    gnomad.assessment.summary_stats
    gnomad.assessment.validity_checks
    gnomad.resources.config
    gnomad.resources.grch37.gnomad
    gnomad.resources.grch37.gnomad_ld
    gnomad.resources.grch37.reference_data
    gnomad.resources.grch38.gnomad
    gnomad.resources.grch38.reference_data
    gnomad.resources.import_resources
    gnomad.resources.resource_utils
    gnomad.sample_qc.ancestry
    gnomad.sample_qc.filtering
    gnomad.sample_qc.pipeline
    gnomad.sample_qc.platform
    gnomad.sample_qc.relatedness
    gnomad.sample_qc.sex
    gnomad.utils.annotations
    gnomad.utils.constraint
    gnomad.utils.file_utils
    gnomad.utils.filtering
    gnomad.utils.gen_stats
    gnomad.utils.intervals
    gnomad.utils.liftover
    gnomad.utils.plotting
    gnomad.utils.reference_genome
    gnomad.utils.release
    gnomad.utils.slack
    gnomad.utils.sparse_mt
    gnomad.utils.transcript_annotation
    gnomad.utils.vcf
    gnomad.utils.vep
    gnomad.variant_qc.evaluation
    gnomad.variant_qc.ld
    gnomad.variant_qc.pipeline
    gnomad.variant_qc.random_forest
    gnomad.variant_qc.training
diff --git a/resource_sources.html b/resource_sources.html
new file mode 100644
index 000000000..3368afb84
--- /dev/null
+++ b/resource_sources.html
@@ -0,0 +1,156 @@

Resource Sources

gnomAD data is available through multiple cloud providers' public datasets programs.

The functions in the gnomad.resources package can be configured to load data from different sources.

If Hail determines that it is running in a cloud provider's Spark environment, resources will default to being read from that cloud provider's datasets program. For example, resources will be read from Azure Open Datasets if Hail determines that it is running on an Azure HDInsight cluster. Otherwise, resources will default to being read from Google Cloud Public Datasets. This can be configured using the GNOMAD_DEFAULT_PUBLIC_RESOURCE_SOURCE environment variable.

To load resources from a different source (for example, the gnomAD project's public GCS bucket), use:

from gnomad.resources.config import gnomad_public_resource_configuration, GnomadPublicResourceSource

gnomad_public_resource_configuration.source = GnomadPublicResourceSource.GNOMAD

To see all available public sources for gnomAD resources, use:

from gnomad.resources.config import GnomadPublicResourceSource

list(GnomadPublicResourceSource)

Note

The gnomAD project's bucket (gs://gnomad-public-requester-pays) is requester pays, meaning that charges for data access and transfer will be billed to your Google Cloud project.

Clusters must be configured to read requester pays buckets during creation. For example,

hailctl dataproc start cluster-name --packages gnomad --requester-pays-allow-buckets gnomad-public-requester-pays

Custom Sources

Alternatively, instead of using one of the pre-defined public sources, a custom source can be provided.

from gnomad.resources.config import gnomad_public_resource_configuration

gnomad_public_resource_configuration.source = "gs://my-bucket/gnomad-resources"

Environment Configuration

The default source can be configured through the GNOMAD_DEFAULT_PUBLIC_RESOURCE_SOURCE environment variable. This variable can be set to either the name of one of the public datasets programs or the URL of a custom source.

Examples:

  • GNOMAD_DEFAULT_PUBLIC_RESOURCE_SOURCE="Google Cloud Public Datasets"
  • GNOMAD_DEFAULT_PUBLIC_RESOURCE_SOURCE="gs://my-bucket/gnomad-resources"
diff --git a/search.html b/search.html
new file mode 100644
index 000000000..b5efd8491
--- /dev/null
+++ b/search.html
@@ -0,0 +1,121 @@

Search

diff --git a/searchindex.js b/searchindex.js
new file mode 100644
index 000000000..b7aa1f0d2
--- /dev/null
+++ b/searchindex.js
@@ -0,0 +1 @@
[16, 22, 38], "critic": 16, "more": [16, 20, 21, 22, 23, 24, 25, 34, 36, 38, 46], "collectionexpress": 16, "well": [16, 22, 23, 29, 34, 36], "determine_nearest_neighbor": [16, 17], "compute_stratified_sample_qc": [16, 17], "mtd": [16, 18, 25, 34], "tmp_ht_prefix": 16, "gt_col": 16, "merg": [16, 22, 26, 28], "overlap": [16, 19, 26, 28, 34], "v": [16, 36, 38], "intermedi": [16, 38], "recommend": [16, 21, 34], "larger": [16, 20, 36], "gt": [16, 18, 22, 25, 34, 36, 38], "strat": 16, "merge_sample_qc_expr": [16, 17], "sample_qc_expr": 16, "note": [16, 18, 20, 21, 22, 23, 26, 34, 36, 37, 38, 45], "regard": 16, "dp_stat": 16, "gq_stat": 16, "n": [16, 20, 30, 31, 41, 42], "stdev": [16, 21], "n_call": [16, 18], "veri": 16, "standard": [16, 18, 20, 21, 23, 34, 36], "gatk": [16, 18, 22, 34], "essenti": 16, "dp": [16, 18, 22, 34, 36], "gq": [16, 18, 22, 36], "do": [16, 20, 22, 24, 34, 36], "assumpt": 16, "some": [16, 20, 22, 33, 34, 43], "shouldn": 16, "matter": 16, "too": [16, 18, 20, 41], "much": [16, 20, 24], "scores_expr": 16, "n_neighbor": 16, "n_job": 16, "add_neighbor_dist": 16, "distance_metr": 16, "euclidean": 16, "use_approxim": 16, "n_tree": 16, "nearest": 16, "neighbor": 16, "limit": 16, "nearest_neighbor": 16, "nearest_neighbor_dist": 16, "identifi": [16, 20, 33, 36, 45], "thread": 16, "find": [16, 37], "cpu": 16, "head": 16, "node": 16, "distanc": [16, 37], "scikit": 16, "learn": 16, "cityblock": 16, "cosin": 16, "haversin": 16, "l1": 16, "l2": 16, "manhattan": 16, "annoi": 16, "angular": 16, "ham": 16, "dot": 16, "packag": [16, 45, 46, 47, 48], "approxim": [16, 34, 36, 38, 42], "nearestneighbor": 16, "faster": [16, 36, 37], "000": 16, "approach": 16, "build": [16, 22, 23, 25, 29, 35, 36, 37], "time": [16, 23], "affect": 16, "size": [16, 18, 22, 24, 30, 32, 34, 41], "accur": 16, "filter_rows_for_qc": [17, 18], "get_qc_mt": [17, 18], "infer_sex_karyotyp": [17, 18], "annotate_sex": [17, 18], "compute_callrate_mt": [17, 19], "run_platform_pca": [17, 19], "assign_platform_from_pc": [17, 19], "unrel": [17, 20, 36, 41], "second_degree_rel": [17, 20], "parent_child": [17, 20], "sibl": [17, 20, 36, 41], "duplicate_or_twin": [17, 20], "ambiguous_relationship": [17, 20], "get_duplicated_sampl": [17, 20], "get_duplicated_samples_ht": [17, 20], "explode_duplicate_samples_ht": [17, 20], "get_relationship_expr": [17, 20], "get_slope_int_relationship_expr": [17, 20], "infer_famili": [17, 20], "create_fake_pedigre": [17, 20], "compute_related_samples_to_drop": [17, 20], "filter_mt_to_trio": [17, 20], "generate_trio_stats_expr": [17, 20, 41], "generate_sib_stats_expr": [17, 20, 41], "adjusted_sex_ploidy_expr": [17, 21, 22], "adjust_sex_ploidi": [17, 21], "gaussian_mixture_model_karyotype_assign": [17, 21], "get_ploidy_cutoff": [17, 18, 21], "get_chr_x_hom_alt_cutoff": [17, 21], "get_sex_expr": [17, 18, 21], "min_af": 18, "001": [18, 22, 23, 42], "min_callr": 18, "99": [18, 22, 36], "min_inbreeding_coeff_threshold": 18, "min_hardy_weinberg_threshold": 18, "1e": [18, 22], "08": 18, "apply_hard_filt": 18, "bi_allelic_onli": [18, 19, 41], "snv_onli": 18, "sites_callr": 18, "site_inbreeding_coeff": 18, "callrat": [18, 19], "taken": [18, 23, 25], "coeff": 18, "mq": [18, 34, 36, 41], "f": [18, 21, 22, 36, 41], "qd": [18, 36, 41], "best": 18, "practic": 18, "hard": [18, 20, 36], "Not": 18, "hw": 18, "gakt": 18, "60": [18, 23, 41], "30": [18, 22, 23, 34, 41], "adj_onli": 18, "ld_r2": 18, "filter_lcr": [18, 25], "filter_segdup": [18, 25], "filter_exome_low_coverage_region": [18, 25], "high_conf_region": [18, 25], 
"checkpoint_path": [18, 38], "n_partit": [18, 34], "block_siz": 18, "readi": [18, 36, 42], "outsid": [18, 25, 30, 36], "maf": 18, "ab": [18, 22], "prune": [18, 43], "r2": 18, "segment": [18, 36], "duplic": [18, 20, 36], "persist": 18, "repartit": 18, "written": 18, "reread": 18, "new": [18, 24, 36], "block": [18, 33, 34, 36, 40], "ploidy_ht": 18, "f_stat_cutoff": [18, 21], "use_gaussian_mixture_model": 18, "normal_ploidy_cutoff": [18, 21], "aneuploidy_cutoff": [18, 21], "6": [18, 21, 36], "chr_x_frac_hom_alt_expr": [18, 21], "normal_chr_x_hom_alt_cutoff": 18, "x_karyotyp": [18, 21], "y_karyotyp": [18, 21], "x": [18, 20, 21, 22, 23, 25, 34, 36], "ploidi": [18, 21, 22, 34], "karyotyp": [18, 21, 22, 34], "By": [18, 23, 25, 34, 35, 36, 41], "roughli": [18, 21], "gaussian": [18, 21, 36], "mixtur": [18, 21, 36], "divid": [18, 21, 38], "below": [18, 21, 23, 25, 36], "deviat": [18, 21, 23, 26], "aneuploidi": [18, 21], "altern": [18, 20, 21, 22, 23, 34, 36, 48], "alt": [18, 21, 22, 23, 36], "is_spars": 18, "excluded_interv": [18, 31], "included_interv": 18, "normalization_contig": [18, 34], "sites_ht": [18, 34], "aaf_expr": 18, "gt_expr": [18, 21, 22, 25], "aaf_threshold": 18, "variants_only_x_ploidi": 18, "variants_only_y_ploidi": 18, "variants_filter_lcr": 18, "variants_filter_segdup": 18, "variants_filter_decoi": 18, "variants_snv_onli": 18, "coverage_mt": 18, "compute_x_frac_variants_hom_alt": 18, "compute_fstat": 18, "infer_karyotyp": 18, "heterozygos": [18, 22, 36], "_mean_dp": 18, "float32": 18, "over": [18, 20, 23, 24, 29, 34, 36], "chrx_mean_dp": 18, "chry_mean_dp": 18, "chrx_ploidi": [18, 21], "chry_ploidi": [18, 21], "f_stat": [18, 21], "float64": [18, 22, 23, 35], "impute_sex": 18, "int64": [18, 36], "expected_hom": 18, "homozygot": [18, 23], "observed_hom": 18, "infer": [18, 20, 22, 29], "spars": [18, 20, 25, 34, 36, 41], "normal": [18, 22, 26, 30, 34, 36], "prior": [18, 20, 22, 23, 29, 34], "depth": [18, 22, 34, 36, 41, 42], "estim": [18, 23, 36], "nucleotid": 18, "precomput": [18, 45], "intervals_ht": 19, "sequenc": [19, 29, 31, 36], "interval_info": 19, "callrate_mt": 19, "binarization_threshold": 19, "25": [19, 20, 34], "binzarization_threshold": 19, "transform": [19, 22, 34], "scores_ht": 19, "platform_pca_scores_ht": 19, "pc_scores_ann": 19, "hdbscan_min_cluster_s": 19, "hdbscan_min_sampl": 19, "hbdscan": 19, "hdbscan": 19, "min_cluster_s": 19, "smallest": 19, "n_sampl": 19, "min_sampl": 19, "qc_platform": 19, "represent": [20, 37], "pair": [20, 23, 25, 40, 41], "individu": [20, 36, 41], "2nd": [20, 41], "degre": 20, "rel": [20, 21, 36, 42], "user": [20, 33, 35], "parent": [20, 36], "child": [20, 36], "twin": 20, "who": 20, "ident": 20, "mz": 20, "ambigu": 20, "relationship": [20, 41], "kinship": 20, "ibd": 20, "biolog": 20, "relationship_ht": 20, "i_col": [20, 41], "j_col": [20, 41], "j": [20, 41], "rel_col": 20, "extract": [20, 23, 35, 42], "ouput": 20, "pc_relat": 20, "1st": [20, 41], "duplicated_sampl": 20, "samples_rankings_ht": 20, "rank_ann": 20, "rank": [20, 36, 38], "better": [20, 34, 37], "samples_ranking_ht": 20, "dups_ht": 20, "so": [20, 21, 22, 23, 24, 37], "dup_filt": 20, "flatten": 20, "kin_expr": 20, "ibd0_expr": 20, "ibd1_expr": 20, "ibd2_expr": 20, "first_degree_kin_threshold": 20, "19": 20, "second_degree_min_kin": 20, "ibd0_0_max": 20, "025": 20, "ibd0_25_threshold": 20, "425": 20, "ibd1_0_threshold": 20, "15": [20, 34], "ibd1_50_threshold": 20, "275": 20, "75": 20, "ibd1_100_min": 20, "ibd2_0_max": 20, "125": 20, "ibd2_25_threshold": 20, "ibd2_100_threshold": 
20, "kin": 20, "ibdo": 20, "ibd1": 20, "ibd2": 20, "http": [20, 22, 24], "doc": 20, "genet": [20, 22, 23, 36], "html": 20, "highlight": 20, "idb2": 20, "min": [20, 41], "max": [20, 22, 36, 41], "ibd0": 20, "share": [20, 33, 41], "corner": 20, "00": 20, "constant": [20, 25, 37, 41], "y_expr": 20, "parent_child_max_i": 20, "second_degree_sibling_lower_cutoff_slop": 20, "second_degree_sibling_lower_cutoff_intercept": 20, "second_degree_upper_sibling_lower_cutoff_slop": 20, "second_degree_upper_sibling_lower_cutoff_intercept": 20, "duplicate_twin_min_kin": 20, "duplicate_twin_ibd1_min": 20, "duplicate_twin_ibd1_max": 20, "slope": [20, 23], "intercept": [20, 23], "axi": [20, 30], "independ": [20, 22], "ibs0": 20, "ibs2": 20, "No": [20, 22], "els": 20, "second_degree_sibling_lower_cutoff": 20, "second_degree_upper_sibling_lower_cutoff": 20, "met": [20, 23, 25], "08838835": 20, "bycroft": 20, "et": [20, 23], "al": [20, 23], "2018": 20, "theoret": 20, "distribut": [20, 22, 23, 26, 36], "duplicate_samples_ht": 20, "relationship_col": [20, 41], "trio": [20, 41], "ok": [20, 41], "member": 20, "sib": 20, "complet": [20, 22, 23], "father": 20, "mother": 20, "femal": [20, 21, 22, 36], "is_femal": 20, "TO": 20, "won": 20, "exactli": 20, "contatin": 20, "sample_list": 20, "exclude_real_proband": 20, "max_tri": 20, "real_pedigre": 20, "sample_list_stratif": 20, "made": 20, "3": [20, 22, 23, 25, 30, 38, 42], "children": 20, "fake": 20, "proband": 20, "real": 20, "ones": [20, 22, 31], "cannot": 20, "try": 20, "bail": 20, "prevent": 20, "infinit": 20, "w": 20, "r": [20, 36, 40], "pick": 20, "chosen": [20, 37], "relatedness_ht": [20, 41], "rank_ht": 20, "kin_threshold": 20, "filtered_sampl": 20, "min_related_hard_filt": 20, "keep_sampl": 20, "keep_samples_when_rel": 20, "maxim": [20, 23], "prefer": [20, 23], "setexpress": [20, 25], "appear": [20, 34], "error": [20, 24, 42], "among": 20, "don": [20, 23, 35], "fam_ht": 20, "trio_mt": 20, "transmitted_strata": 20, "de_novo_strata": 20, "ac_strata": 20, "proband_is_female_expr": 20, "wise": [20, 22, 36], "transmiss": 20, "transmit": [20, 36, 41], "de": [20, 41], "novo": [20, 41], "mutat": [20, 23], "suffix": [20, 36], "append": [20, 36], "dens": [20, 34, 41], "deal": [20, 41], "densifi": [20, 34, 36, 41], "trio_matrix": [20, 41], "dnm": 20, "sib_ht": 20, "stata": 20, "col": [20, 22], "anoth": [20, 29, 36], "karyotype_expr": [21, 22], "xy_karyotype_str": [21, 22], "xx_karyotype_str": [21, 22], "haploid": [21, 22], "callexpress": [21, 22, 38], "repres": [21, 22, 34, 36], "adjust": [21, 22, 23, 34], "sex_expr": [21, 22], "male_str": [21, 22], "female_str": 21, "point": [21, 23, 24], "chang": [21, 22, 23, 47], "fix": [21, 23], "sex_ht": 21, "chrx_ploidy_expr": 21, "chry_ploidy_expr": 21, "karyotype_output_prefix": 21, "gmm": 21, "compon": [21, 36], "_x_karyotyp": 21, "karyotype_output_prefix_y_karyotyp": 21, "_karyotyp": 21, "_y_karyotyp": 21, "yy": 21, "It": [21, 22, 23, 34, 36, 37], "group_by_expr": 21, "x_ploidy_cutoff": 21, "y_ploidy_cutoff": 21, "doubl": [21, 22], "tripl": 21, "final": [21, 38], "cutoff_stdev": 21, "chr_x_ploidi": 21, "chr_y_ploidi": 21, "chr_x_frac_hom_alt_cutoff": 21, "x0": 21, "pop_max_expr": [22, 27], "freq_meta": [22, 23, 25, 32, 34], "pops_to_exclud": 22, "pop_label": 22, "about": [22, 36, 42, 43], "highest": 22, "int32": [22, 34, 36], "homozygote_count": [22, 25], "calcluat": 22, "project_max_expr": [22, 27], "project_expr": 22, "alleles_expr": 22, "n_project": 22, "largest": [22, 36], "Will": 22, "itself": 22, "ti": [22, 43, 45], 
"guarante": 22, "projectmax": 22, "faf_expr": [22, 27], "faf_threshold": 22, "faf": [22, 32, 36], "see": [22, 24, 25, 38, 46, 48], "cardiodb": 22, "org": 22, "allelefrequencyapp": 22, "found": [22, 23, 25, 32, 42], "agg": [22, 30, 35], "call_stat": 22, "bottleneck": 22, "consanguin": [22, 36], "gen_anc_faf_max_expr": [22, 27], "faf_meta": [22, 32], "faf95_max": 22, "faf95_max_gen_anc": 22, "faf99_max": 22, "faf99_max_gen_anc": 22, "faf95": [22, 36], "faf99": [22, 36], "qual_hist_expr": [22, 27], "gq_expr": 22, "dp_expr": 22, "ad_expr": 22, "adj_expr": 22, "ab_expr": 22, "split_adj_and_raw": 22, "balanc": [22, 36], "here": [22, 23], "age_hists_expr": [22, 27], "age_expr": 22, "lowest_boundari": 22, "highest_boundari": 22, "80": 22, "n_bin": [22, 38, 41], "ag": [22, 36], "lowest": [22, 36], "younger": 22, "n_smaller": [22, 36], "older": 22, "n_larger": [22, 36], "age_hist_het": 22, "age_hist_hom": 22, "get_lowqual_expr": [22, 27], "qual_approx_expr": 22, "snv_phred_threshold": 22, "snv_phred_het_prior": 22, "indel_phred_threshold": 22, "indel_phred_het_prior": 22, "39": 22, "lowqual": [22, 34], "unsplit": 22, "qualapprox": [22, 34, 36], "as_qualapprox": [22, 36], "least": [22, 23, 25, 34], "stringent": 22, "mix": [22, 36], "thu": 22, "certain": [22, 35, 37], "phred": [22, 26, 34, 36], "scale": [22, 23, 26, 34, 36], "emiss": 22, "similar": [22, 23], "arraynumer": 22, "get_annotations_hist": [22, 27], "annotations_hist": 22, "log10_annot": 22, "json": [22, 37, 42], "meric": 22, "create_frequency_bins_expr": [22, 27], "prepar": [22, 29, 35, 37], "qual": [22, 34, 36], "singleton": [22, 23, 36, 38, 41], "doubleton": [22, 36], "00005": 22, "0001": 22, "0002": 22, "0005": 22, "002": 22, "005": 22, "01": 22, "02": 22, "05": [22, 23, 36], "annotate_and_index_source_mt_for_sex_ploidi": [22, 27], "downstream": [22, 25], "optim": 22, "get_is_haploid_expr": [22, 27], "in_non_par": 22, "x_nonpar": 22, "y_par": 22, "y_nonpar": 22, "get_gq_dp_adj_expr": [22, 27], "adj_gq": 22, "adj_dp": 22, "haploid_adj_dp": 22, "account": [22, 25], "int32express": [22, 23, 34, 36], "get_het_ab_adj_expr": [22, 27], "adj_ab": 22, "get_adj_expr": [22, 27], "annotate_adj": [22, 27], "diploid": 22, "add_variant_typ": [22, 27], "alt_allel": 22, "variant_typ": [22, 23, 36], "n_alt_allel": [22, 36], "annotate_allele_info": [22, 27], "allele_info": 22, "has_star": [22, 36], "star": [22, 25, 36], "allele_typ": [22, 36], "insert": [22, 36, 41], "delet": [22, 36, 41], "was_mix": [22, 36], "nonsplit_allel": 22, "annotation_type_is_numer": [22, 27], "annotation_type_in_vcf_info": [22, 27], "nativ": 22, "aren": 22, "bi_allelic_site_inbreeding_expr": [22, 27], "callstats_expr": 22, "inbreedingcoeff": [22, 36], "softwar": 22, "broadinstitut": 22, "document": 22, "articl": 22, "php": 22, "8032": 22, "place": 22, "float32express": 22, "fs_from_sb": [22, 27], "sb": [22, 34, 36], "min_cell_count": 22, "min_count": 22, "min_p_valu": 22, "320": 22, "fisher": [22, 36], "strand": [22, 23, 36, 37], "side": 22, "exact": [22, 36], "behavior": 22, "truncat": [22, 23, 30], "chi": 22, "squar": [22, 23, 36], "cell": 22, "four": [22, 38], "forward": [22, 36], "revers": [22, 23, 36], "ref": [22, 23, 31, 34, 36, 37], "fwd": 22, "rev": 22, "dimension": 22, "github": 22, "com": 22, "blob": 22, "master": 22, "src": 22, "java": 22, "hellbend": 22, "tool": [22, 30], "walker": 22, "fisherstrand": 22, "sq": 22, "fet": 22, "null": [22, 23], "sor_from_sb": [22, 27], "sor": [22, 36], "symmetr": [22, 36], "odd": [22, 36], "strandoddsratio": 22, "pab_max_expr": [22, 
27], "la_expr": 22, "n_alleles_expr": 22, "binomi": [22, 36], "pab": 22, "local": [22, 24, 34, 37], "bi_allelic_expr": [22, 27], "unphase_call_expr": [22, 27], "call_expr": 22, "unphas": 22, "phase": [22, 36, 41], "region_flag_expr": [22, 27], "non_par": [22, 36], "prob_region": 22, "region_flag": 22, "hg38": [22, 36], "self": 22, "chain": [22, 29], "yet": 22, "loci": [22, 25], "occur": 22, "pseudoautosom": [22, 36], "missing_callstats_expr": [22, 27], "set_female_y_metrics_to_na_expr": [22, 27], "freq_meta_expr": [22, 23], "freq_index_dict_expr": 22, "freq_index_dict": 22, "dictexpress": [22, 23], "hemi_expr": [22, 27], "hemizyg": 22, "merge_freq_arrai": [22, 27], "farrai": 22, "fmeta": 22, "oper": [22, 25], "set_negatives_to_zero": 22, "count_arrai": 22, "freq1": 22, "freq2": 22, "fmeta1": 22, "fmeta2": 22, "diff": 22, "thier": 22, "subtract": 22, "primari": 22, "valueerror": 22, "descriptor": 22, "merge_histogram": [22, 27], "fashion": 22, "bin_edg": [22, 36], "bin_freq": 22, "them": [22, 34, 37], "edg": [22, 36], "subpop_expr": 22, "additional_strata_expr": 22, "downsampling_expr": 22, "ds_pop_count": 22, "entry_agg_func": [22, 34], "annotate_mt": 22, "freq_meta_sample_count": [22, 25], "support": [22, 24, 31, 36, 37, 42, 43], "age_bin": 22, "replac": [22, 42], "enough": 22, "randomli": 22, "higher": [22, 36], "global_idx": 22, "pop_idx": 22, "annotate_downsampl": [22, 27], "wai": [22, 25, 34, 37], "adj_sampl": 22, "stratifict": 22, "sub": [22, 25, 41], "continent": 22, "full": [22, 23, 38], "build_freq_stratification_list": [22, 27], "primarili": 22, "generate_freq_group_membership_arrai": [22, 27], "strata_expr": [22, 25, 34], "remove_zero_sample_group": 22, "no_raw_group": 22, "group_membership": [22, 34], "pleas": 22, "compute_freq_by_strata": [22, 27], "select_field": 22, "group_membership_includes_raw_group": 22, "like": [22, 25, 36], "still": [22, 43], "membership": [22, 34], "agg_by_strata": [22, 27], "group_membership_ht": [22, 34], "entry_agg_group_membership": [22, 34], "update_structured_annot": [22, 27], "annotation_update_expr": 22, "annotation_update_label": 22, "updat": [22, 36], "recurs": 22, "add_gks_vr": [22, 27], "input_locu": 22, "input_vr": 22, "ga4gh": [22, 36], "collect": [22, 23], "python": [22, 24], "conform": 22, "add_gks_va": [22, 27], "input_struct": 22, "label_nam": 22, "label_vers": 22, "ancestry_group": 22, "ancestry_groups_dict": 22, "va": 22, "schema": [22, 35], "subcohort": 22, "mean_depth": 22, "ancillaryresult": 22, "gks_va_freq_dict": 22, "focusallel": 22, "fill": [22, 30], "caller": 22, "shorten": 22, "breakdown": 22, "script": [23, 47], "coverage_cutoff": [23, 27], "differenti": 23, "calibr": 23, "variat": 23, "annotate_with_mu": [23, 27], "mutation_ht": 23, "mu_annot": 23, "mu_snp": 23, "snp": [23, 36], "count_variants_by_group": [23, 27], "count_singleton": 23, "count_downsampl": 23, "additional_group": 23, "partition_hint": 23, "omit_methyl": 23, "use_table_group_bi": 23, "singleton_expr": 23, "max_af": 23, "context": [23, 25, 37], "methylation_level": 23, "detail": [23, 25, 46], "variant_count": 23, "trinucleotid": 23, "methyl": 23, "featur": [23, 37, 41, 42], "exome_coverag": 23, "omit": 23, "singleton_express": 23, "singleton_count": 23, "get_downsampling_freq_indic": [23, 27], "variant_qu": 23, "genetic_ancestry_label": 23, "downsampling_counts_expr": [23, 27], "annotate_mutation_typ": [23, 27], "context_length": 23, "num_scan_context_length": 23, "cpg": 23, "transit": [23, 41], "mutation_typ": 23, "transvers": 23, "mutation_type_model": 
23, "term": [23, 36, 37], "repo": 23, "trimer_from_heptam": [23, 27], "trim": 23, "heptam": 23, "trimer": 23, "collapse_strand": [23, 27], "dedupl": [23, 37], "dna": 23, "complement": 23, "was_flip": 23, "flip": 23, "build_model": [23, 27], "coverage_ht": 23, "weight": 23, "high_cov_definit": 23, "upper_cov_cutoff": 23, "skip_coverage_model": 23, "plateau": 23, "plateau_model": 23, "linear": [23, 26], "against": [23, 30], "substitut": [23, 31, 36], "remaind": 23, "observed_vari": 23, "observed_": 23, "possible_vari": 23, "built": [23, 25], "factor": [23, 26], "correct": [23, 36], "log10": 23, "high_coverage_scale_factor": 23, "get_proportion_observed_by_coverag": 23, "downsampling_counts_": 23, "cover": 23, "build_plateau_model": [23, 27], "cpg_expr": 23, "mu_snp_expr": 23, "observed_variants_expr": 23, "possible_variants_expr": 23, "pops_observed_variants_array_expr": 23, "predict": [23, 25, 36, 42], "nest": [23, 35], "build_coverage_model": [23, 27], "low_coverage_oe_expr": 23, "log_coverage_expr": 23, "get_all_pop_length": [23, 27], "obs_expr": 23, "categor": 23, "get_constraint_grouping_expr": [23, 27], "vep_annotation_expr": 23, "coverage_expr": 23, "include_transcript_group": 23, "include_canonical_group": 23, "include_mane_select_group": 23, "modifi": [23, 37], "classic": 23, "polyphen": [23, 37], "polyphen_predict": 23, "neither": 23, "gene_symbol": [23, 35, 37], "insid": [23, 37], "transcript_id": [23, 25, 35], "mane_select": [23, 37], "actual": 23, "annotate_exploded_vep_for_constraint_group": [23, 27], "vep_annot": [23, 35, 37], "worst_csq_by_gen": [23, 35, 37, 45], "ignor": [23, 35, 42], "unless": [23, 34], "compute_expected_vari": [23, 27], "plateau_models_expr": 23, "mu_expr": 23, "cov_corr_expr": 23, "oe_aggregation_expr": [23, 27], "filter_expr": [23, 25], "exclude_mu_sum": 23, "ob": 23, "mu": 23, "exp": 23, "pop_exp": 23, "pop_ob": 23, "expected_vari": 23, "expected_variants_": 23, "compute_pli": [23, 27], "exp_expr": 23, "expected_valu": 23, "min_diff_converg": 23, "pli": 23, "exac": [23, 36], "paper": 23, "lek": 23, "m": [23, 36], "karczewski": 23, "k": [23, 26], "minikel": 23, "706": 23, "natur": 23, "536": 23, "285": 23, "291": 23, "2016": 23, "intoler": 23, "em": 23, "algorithm": 23, "we": [23, 34, 36, 38], "state": [23, 36], "three": [23, 38, 41], "respect": [23, 25, 30, 37], "sensit": 23, "toler": [23, 36], "recess": 23, "rec": 23, "haploinsuffici": 23, "li": 23, "deplet": 23, "blekhman": 23, "clingen": 23, "dosag": 23, "supplementari": 23, "12": 23, "463": 23, "deriv": 23, "empir": 23, "diseas": 23, "089": 23, "fall": [23, 30, 35, 36], "pnull": 23, "unconstrain": 23, "prec": 23, "iter": [23, 33], "converg": 23, "oe_confidence_interv": [23, 27], "alpha": [23, 30], "around": [23, 30], "densiti": [23, 30], "poisson": 23, "dpoi": 23, "lambda": 23, "lamb": 23, "want": [23, 35, 37, 38], "extend": [23, 34, 41], "bound": 23, "captur": [23, 36], "cumul": [23, 25, 30], "signific": [23, 25], "calculate_raw_z_scor": [23, 27], "sign": 23, "z": [23, 36], "posit": [23, 28, 29, 34, 36, 41, 43], "had": 23, "fewer": 23, "get_constraint_flag": [23, 27], "raw_z_expr": 23, "raw_z_lower_threshold": 23, "raw_z_upper_threshold": 23, "flag_postfix": 23, "why": 23, "no_exp_": 23, "zero": [23, 25], "outlier_": 23, "postfix": 23, "calculate_raw_z_score_sd": [23, 27], "flag_expr": 23, "mirror_neg_raw_z": 23, "multipli": 23, "mirror": 23, "add_gencode_transcript_annot": [23, 27], "gencode_ht": [23, 25, 35], "transcript_typ": 23, "cds_length": 23, "num_coding_exon": 23, "becom": 23, 
"file_exist": [24, 27], "fname": 24, "parquet": 24, "_success": 24, "check_file_exists_raise_error": [24, 27], "error_if_exist": 24, "error_if_not_exist": 24, "error_if_exists_msg": 24, "error_if_not_exists_msg": 24, "therefor": [24, 25], "write_temp_gc": [24, 27], "gcs_path": 24, "temp_path": 24, "select_primitives_from_ht": [24, 27], "primit": 24, "particularli": 24, "get_file_stat": [24, 27], "url": [24, 48], "project_id": 24, "md5": 24, "read_list_data": [24, 27], "input_file_path": 24, "storag": 24, "gz": [24, 29, 45], "compress": 24, "repartition_for_join": [24, 27], "ht_path": 24, "new_partition_perc": 24, "via": 24, "_interv": 24, "join": [24, 32, 34, 42], "effici": 24, "discuss": 24, "room": 24, "improv": 24, "2278": 24, "initi": 24, "increas": 24, "filter_to_adj": [25, 27], "filter_by_frequ": [25, 27], "direct": 25, "allele_count": 25, "At": [25, 34], "filter_row": 25, "combine_funct": [25, 27], "func_list": 25, "operator_func": 25, "iand": 25, "left": 25, "right": 25, "filter_low_conf_region": [25, 27], "filter_telomeres_and_centromer": 25, "telomer": 25, "centromer": 25, "restrict": 25, "filter_to_autosom": [25, 27], "add_filters_expr": [25, 27], "current_filt": 25, "subset_samples_and_vari": [25, 27], "sample_path": 25, "table_kei": 25, "remove_dead_allel": 25, "filter_to_clinvar_pathogen": [25, 27], "clnrevstat_field": 25, "clnrevstat": 25, "clnsig_field": 25, "clnsig": 25, "clnsigconf_field": 25, "clnsigconf": 25, "remove_no_assert": 25, "remove_conflict": 25, "clinvar": [25, 41], "pathogen": [25, 36], "clinvar_ht": 25, "could": [25, 38], "review": 25, "clinic": 25, "signifc": 25, "conflict": 25, "undefin": 25, "assert": 25, "filter_to_gencode_cd": [25, 27], "cd": [25, 35], "filter_gencode_to_cd": 25, "analys": 25, "step": 25, "retain": 25, "remove_fields_from_const": [25, 27], "fields_to_remov": 25, "origin": [25, 29, 38, 41], "filter_x_nonpar": [25, 27], "filter_y_nonpar": [25, 27], "filter_by_numeric_expr_rang": [25, 27], "filter_rang": 25, "keep_between": 25, "inclus": 25, "themselv": 25, "filter_for_mu": [25, 27], "gerp_lower_cutoff": 25, "9885": 25, "gerp_upper_cutoff": 25, "6607": 25, "gerp": 25, "precalcul": 25, "5th": [25, 32], "95th": 25, "percentil": 25, "intron": [25, 37], "intergen": 25, "split_vds_by_strata": [25, 27], "vdss": 25, "variant_data": [25, 34], "filter_arrays_by_meta": [25, 27], "meta_expr": 25, "meta_indexed_expr": 25, "items_to_filt": 25, "combine_oper": 25, "exact_match": 25, "item": [25, 38], "papuan": 25, "index_glob": [25, 27, 36], "howev": 25, "often": [25, 37], "to_phr": [26, 27], "linear_expr": 26, "from_phr": [26, 27], "phred_score_expr": 26, "get_median_and_mad_expr": [26, 27], "metric_expr": 26, "4826": 26, "absolut": 26, "merge_stats_counters_expr": [26, 27], "counter": 26, "sort_interv": [27, 28], "union_interv": [27, 28], "interval_length": [27, 28], "grch37_to_grch38_chain": [27, 29], "grch38_to_grch37_chain": [27, 29], "get_liftover_genom": [27, 29], "liftover_expr": [27, 29], "default_lift_data": [27, 29], "liftover_using_gnomad_map": [27, 29], "new_show": [27, 30], "plot_hail_hist": [27, 30], "plot_multi_hail_hist": [27, 30], "plot_hail_hist_cumul": [27, 30], "plot_hail_hist_both": [27, 30], "set_font_s": [27, 30], "linear_and_log_tab": [27, 30], "plot_hail_file_metadata": [27, 30], "scale_file_s": [27, 30], "get_rows_data": [27, 30], "pair_plot": [27, 30], "get_reference_ht": [27, 31, 34], "add_reference_sequ": [27, 31], "get_reference_genom": [27, 31], "make_faf_index_dict": [27, 32], "make_freq_index_dict": [27, 32], 
"make_freq_index_dict_from_meta": [27, 32], "slackclient": [27, 33], "slack_notif": [27, 33], "compute_last_ref_block_end": [27, 34], "densify_sit": [27, 34], "get_as_info_expr": [27, 34], "get_site_info_expr": [27, 34], "default_compute_info": [27, 34], "split_info_annot": [27, 34], "split_lowqual_annot": [27, 34], "impute_sex_ploidi": [27, 34], "densify_all_reference_sit": [27, 34], "compute_stats_per_ref_sit": [27, 34], "compute_coverage_stat": [27, 34], "get_allele_number_agg_func": [27, 34], "compute_allele_number_per_ref_sit": [27, 34], "filter_ref_block": [27, 34], "summarize_transcript_express": [27, 35], "get_expression_proport": [27, 35], "filter_expression_ht_by_tissu": [27, 35], "tissue_expression_ht_to_arrai": [27, 35], "tx_filter_variants_by_csq": [27, 35], "tx_annotate_vari": [27, 35], "tx_aggregate_vari": [27, 35], "perform_tx_annotation_pipelin": [27, 35], "faf_pop": [27, 36], "as_field": [27, 36], "site_field": [27, 36], "allele_type_field": [27, 36], "region_flag_field": [27, 36], "joint_region_flag_field": [27, 36], "rf_field": [27, 36], "as_vqsr_field": [27, 36], "vqsr_field": [27, 36], "info_dict": [27, 36], "in_silico_annotations_info_dict": [27, 36], "vrs_fields_dict": [27, 36], "sparse_entri": [27, 36], "format_dict": [27, 36], "adjust_vcf_incompatible_typ": [27, 36], "make_label_combo": [27, 36], "make_combo_header_text": [27, 36], "create_label_group": [27, 36], "make_info_dict": [27, 36], "add_as_info_dict": [27, 36], "make_vcf_filter_dict": [27, 36], "make_hist_bin_edges_expr": [27, 36], "make_hist_dict": [27, 36], "set_female_y_metrics_to_na": [27, 36], "build_vcf_export_refer": [27, 36], "rekey_new_refer": [27, 36], "current_vep_vers": [27, 37], "csq_code": [27, 37], "csq_splice": [27, 37], "possible_ref": [27, 37], "vep_config_path": [27, 37], "vep_csq_field": [27, 37], "vep_csq_head": [27, 37], "loftee_label": [27, 37], "get_vep_help": [27, 37], "get_vep_context": [27, 37], "vep_or_lookup_vep": [27, 37, 45], "add_most_severe_consequence_to_consequ": [27, 37], "process_consequ": [27, 37, 45], "filter_vep_to_canonical_transcript": [27, 37], "filter_vep_to_mane_select_transcript": [27, 37], "filter_vep_to_synonymous_vari": [27, 37], "filter_vep_to_gene_list": [27, 37], "vep_struct_to_csq": [27, 37], "get_most_severe_consequence_for_summari": [27, 37], "filter_vep_transcript_csq": [27, 37], "add_most_severe_csq_to_tc_within_vep_root": [27, 37], "explode_by_vep_annot": [27, 37], "is_sort": 28, "grch37_to_grch38": 29, "lift": 29, "grch38_to_grch37": 29, "referencegenom": [29, 31, 36], "destination_refer": 29, "coordin": [29, 36], "original_locu": 29, "original_allel": [29, 36], "locus_fail_liftov": 29, "ref_allele_mismatch": 29, "remove_failed_sit": 29, "establish": 29, "shuffl": [29, 38], "40": [30, 34], "hist_data": 30, "titl": 30, "fill_color": 30, "033649": 30, "outlier_fill_color": 30, "036564": 30, "line_color": 30, "hover_mod": 30, "mous": 30, "hide_zero": 30, "come": 30, "straight": 30, "color": 30, "bar": 30, "hover": 30, "mode": 30, "vline": 30, "hline": 30, "figur": 30, "usag": [30, 33, 35], "group_bi": 30, "len": [30, 34], "line_width": 30, "font_siz": 30, "12pt": 30, "plot_func": 30, "tab": 30, "t_path": 30, "grid": 30, "Or": 30, "unord": 30, "directori": 30, "file_s": 30, "rows_fil": 30, "label_col": [30, 42], "save": [30, 42], "pan": 30, "box_zoom": 30, "reset": [30, 36], "wheel_zoom": 30, "box_select": 30, "lasso_select": 30, "help": [30, 37], "tooltip_col": 30, "diagon": 30, "scatter": 30, "color_dict": 30, "rgb": 30, "hex": 30, "tooltip": 30, 
"add_all_substitut": 31, "filter_n": 31, "obscur": 31, "slow": 31, "unknown": [31, 36], "fasta": 31, "add_sequ": 31, "label_delimit": [32, 36], "_": [32, 36], "look": [32, 38], "access": [32, 48], "nfe_xx": 32, "token": 33, "api": [33, 47], "client": 33, "send_fil": 33, "content": [33, 42], "filenam": 33, "txt": 33, "filetyp": 33, "comment": 33, "send": 33, "channel": 33, "upload": 33, "send_messag": 33, "icon_emoji": 33, "long": 33, "snippet": 33, "emoji": 33, "icon": 33, "notif": 33, "wrap": 33, "stack": 33, "trace": 33, "usernam": 33, "run_analysi": 33, "upstream": 34, "sinc": 34, "beyond": 34, "last_end_posit": 34, "last_end_positions_ht": 34, "semi_join_row": 34, "minim": 34, "furthest": 34, "tag": 34, "semi": 34, "filter_interv": 34, "few": 34, "sum_agg_field": 34, "int32_sum_agg_field": 34, "vardp": [34, 36], "median_agg_field": 34, "readposranksum": [34, 36], "mqranksum": [34, 36], "array_sum_agg_field": 34, "raw_mqanddp": 34, "alt_alleles_range_array_field": 34, "alt_alleles_range_arrai": 34, "treat_fields_as_allele_specif": 34, "gvcf": [34, 36], "as_sb_tabl": [34, 36], "accord": 34, "nomenclatur": 34, "raw_mq": 34, "mq_dp": 34, "gvcf_info": 34, "prioriti": 34, "clash": 34, "Then": 34, "AS": 34, "site_annot": 34, "as_annot": 34, "quasi_as_annot": 34, "lowqual_indel_phred_het_prior": 34, "ac_filter_group": 34, "nonref": 34, "naive_coalesc": 34, "10k": 34, "broad": [34, 36], "scienc": 34, "info_expr": 34, "a_index": 34, "split_multi": 34, "split_multi_ht": 34, "lowqual_expr": 34, "particular": [34, 36], "excluded_calling_interv": 34, "included_calling_interv": 34, "chr_x": 34, "chr_y": 34, "use_only_vari": 34, "reference_ht": 34, "row_key_field": 34, "entry_keep_field": 34, "densif": 34, "row_keep_field": 34, "sex_karyotype_field": 34, "lgt": [34, 36], "coverage_over_x_bin": 34, "gt_field": 34, "keyword": 34, "awar": 35, "transcript_expression_expr": 35, "transcript_tpm": 35, "tissue_expr": 35, "tissu": 35, "summary_agg_func": 35, "transcript_express": 35, "expression_proport": 35, "gene_id": [35, 37], "tissue_1": 35, "tissue_2": 35, "quantif": 35, "tissues_to_keep": 35, "tissues_to_filt": 35, "annotations_to_extract": 35, "tissue_express": 35, "filter_to_cd": 35, "filter_to_gen": 35, "match_by_gene_symbol": [35, 37], "filter_to_csq": 35, "ignore_spl": 35, "filter_to_protein_cod": 35, "vep_root": [35, 37], "further": 35, "amino_acid": [35, 37], "stop_retained_vari": [35, 37], "And": 35, "symbol": [35, 37, 40], "root": [35, 36, 37], "preprocess": 35, "gtex": 35, "process": [35, 45], "worst_csq_for_vari": [35, 45], "worst_csq_by_gene_canon": 35, "worst_csq_for_variant_canon": 35, "worst": [35, 36, 37], "additional_group_bi": 35, "lof_flag": [35, 37], "transcript_abl": [35, 37], "stop_lost": [35, 37], "start_lost": [35, 37], "initiator_codon_vari": [35, 37], "transcript_amplif": [35, 37], "inframe_insert": [35, 37], "inframe_delet": [35, 37], "protein_altering_vari": [35, 37], "splice_region_vari": [35, 37], "incomplete_terminal_codon_vari": [35, 37], "start_retained_vari": [35, 37], "coding_sequence_vari": [35, 37], "stop": [35, 46], "raw_ac_afr_femal": 36, "earlier": 36, "as_f": 36, "as_mq": 36, "as_mqranksum": 36, "as_pab_max": 36, "as_qd": 36, "as_readposranksum": 36, "as_sor": 36, "as_vardp": 36, "fail_interval_qc": 36, "outside_broad_capture_region": 36, "outside_ukb_capture_region": 36, "outside_broad_calling_region": 36, "outside_ukb_calling_region": 36, "not_called_in_exom": 36, "not_called_in_genom": 36, "joint": 36, "rf_positive_label": 36, "rf_negative_label": 36, 
"rf_label": [36, 41, 42], "rf_train": [36, 41], "rf_tp_probabl": 36, "as_culprit": 36, "as_vqslod": 36, "negative_train_sit": [36, 41], "positive_train_sit": [36, 41], "bia": 36, "baseqranksum": 36, "wilcoxon": 36, "excess": 36, "hardi": 36, "weinberg": 36, "equilibrium": 36, "pl": 36, "vqslod": 36, "versu": 36, "culprit": 36, "85": 36, "meet": 36, "20x": 36, "10x": 36, "coincid": 36, "span": 36, "elsewher": 36, "inbreeding_coeff": 36, "complex": 36, "only_het": 36, "uk": 36, "biobank": 36, "sibling_singleton": 36, "wide": 36, "amongst": 36, "transmitted_singleton": 36, "cadd_phr": 36, "cadd": 36, "c": 36, "billion": 36, "deleteri": 36, "cadd_raw_scor": 36, "extent": 36, "profil": 36, "suggest": 36, "simul": 36, "pangolin_largest_d": 36, "pangolin": 36, "delta": 36, "reflect": 36, "alter": 36, "phylop": 36, "conserv": 36, "241": 36, "placent": 36, "mammal": 36, "zoonomia": 36, "28": 36, "acceler": 36, "evolut": 36, "neutral": 36, "drift": 36, "slower": 36, "polyphen_max": 36, "impact": [36, 37], "amino": 36, "acid": 36, "priorit": 36, "report": 36, "revel_max": 36, "revel": 36, "13": 36, "predictor": [36, 44, 47], "sift_max": 36, "spliceai_ds_max": 36, "illumina": 36, "spliceai": 36, "silico": 36, "vrs_allele_id": 36, "vrs_end": 36, "interresidu": 36, "vrs_start": 36, "vrs_state": 36, "liter": 36, "min_dp": 36, "pgt": 36, "pid": 36, "la": 36, "lad": 36, "lpgt": 36, "lpl": 36, "rgq": 36, "255": 36, "bad": 36, "mate": 36, "physic": 36, "uniqu": 36, "connect": [36, 46], "record": 36, "likelihood": 36, "compris": 36, "detect": 36, "pipe_delimited_annot": 36, "as_mq_dp": 36, "as_raw_mq": 36, "done": [36, 38, 41, 42], "coerc": 36, "pipe": 36, "afr_mal": 36, "afr_femal": 36, "nfe_mal": 36, "nfe_femal": 36, "amr_mal": 36, "amr_femal": 36, "globals_arrai": 36, "meta_arrai": 36, "preposit": 36, "combo_dict": 36, "pop_nam": 36, "programmat": 36, "group_typ": 36, "combo_field": 36, "automat": [36, 41], "combo": 36, "all_group": 36, "pop_sex_group": 36, "prefix_before_metr": 36, "amish": 36, "admix": 36, "ashkenazi": 36, "jewish": 36, "bgr": 36, "bulgarian": 36, "eastern": 36, "south": 36, "asian": 36, "east": 36, "est": 36, "estonian": 36, "eur": 36, "jpn": 36, "kor": 36, "korean": 36, "mde": 36, "middl": 36, "nwe": 36, "north": 36, "western": 36, "oea": 36, "oeu": 36, "onf": 36, "sas_non_consang": 36, "seu": 36, "sgp": 36, "singaporean": 36, "swe": 36, "swedish": 36, "uniform": 36, "unk": 36, "freq_ctt": 36, "freq_cmh": 36, "description_text": 36, "age_hist_distribut": 36, "carrier": 36, "subpopul": 36, "auto": 36, "freq_conting": 36, "conting": 36, "ctt": 36, "cochran": 36, "mantel": 36, "haenszel": 36, "cmh": 36, "snp_cutoff": 36, "indel_cutoff": 36, "inbreeding_cutoff": 36, "variant_qc_filt": 36, "ann_with_hist": 36, "include_age_hist": 36, "reformat": 36, "hist_metric_list": 36, "drop_n_smaller_larg": 36, "keep_contig": 36, "chr2": 36, "chr3": 36, "chr4": 36, "chr5": 36, "chr6": 36, "chr7": 36, "chr8": 36, "chr9": 36, "chr10": 36, "chr11": 36, "chr12": 36, "chr13": 36, "chr14": 36, "chr15": 36, "chr16": 36, "chr17": 36, "chr18": 36, "chr19": 36, "chr21": 36, "chr22": 36, "keep_chrm": 36, "elimin": 36, "chr": 36, "22": 36, "chr3_gl000221v1_random": 36, "155397": 36, "assembli": 36, "chrm": 36, "re": [36, 38], "105": 37, "vep_data": 37, "gcloud": 37, "101": 37, "feature_typ": 37, "exon": 37, "hgvsc": 37, "hgvsp": 37, "cdna_posit": 37, "cds_posit": 37, "protein_posit": 37, "codon": 37, "allele_num": 37, "variant_class": 37, "minimis": 37, "symbol_sourc": 37, "hgnc_id": 37, "tsl": 37, "appri": 37, 
"ccd": 37, "ensp": 37, "swissprot": 37, "trembl": 37, "uniparc": 37, "gene_pheno": 37, "sift": 37, "domain": 37, "hgvs_offset": 37, "motif_nam": 37, "motif_po": 37, "high_inf_po": 37, "motif_score_chang": 37, "lof_filt": 37, "lof_info": 37, "mane_plus_clin": 37, "uniprot_isoform": 37, "mirna": 37, "pubm": 37, "transcription_factor": 37, "hc": 37, "lc": 37, "dataproc": [37, 45, 46, 48], "hailctl": [37, 45, 46, 48], "command": 37, "vep_config_uri": 37, "default_refer": 37, "vepe": 37, "reference_vep_ht": 37, "vep_vers": 37, "lookup": 37, "databas": 37, "confirm": 37, "referenc": 37, "suitabl": 37, "grab": 37, "vep_config": 37, "vep_context": 37, "tc": 37, "intron_vari": 37, "penalize_flag": 37, "csq_order": 37, "has_polyphen": 37, "any_lof": 37, "later": [37, 42], "penal": 37, "filter_empty_csq": 37, "upstream_gene_vari": 37, "downstream_gene_vari": 37, "vep_expr": 37, "has_polyphen_sift": 37, "flexibl": 37, "counterpart": 37, "while": 37, "Their": 37, "usual": 37, "mature_mirna_vari": 37, "5_prime_utr_vari": 37, "3_prime_utr_vari": 37, "non_coding_transcript_exon_vari": 37, "non_coding_exon_vari": 37, "nmd_transcript_vari": 37, "non_coding_transcript_vari": 37, "nc_transcript_vari": 37, "tfbs_ablat": 37, "tfbs_amplif": 37, "tf_binding_site_vari": 37, "regulatory_region_abl": 37, "regulatory_region_amplif": 37, "feature_elong": 37, "regulatory_region_vari": 37, "feature_trunc": 37, "intergenic_vari": 37, "most_severe_csq": 37, "protein_cod": 37, "no_lof_flag": 37, "ensembl_onli": 37, "keep_csq": 37, "keep_gen": 37, "additional_filtering_criteria": 37, "refseq": 37, "emsembl": 37, "compute_ranked_bin": [38, 39, 41], "score_expr": 38, "bin_expr": 38, "compute_snv_indel_separ": 38, "desc": 38, "mutual": 38, "exclus": 38, "var1": [38, 40], "var2": [38, 40], "var3": 38, "var4": 38, "biallel": [38, 41], "biallelic_bin": 38, "was_split": 38, "singleton_bin": 38, "descend": 38, "compute_grouped_binned_ht": [38, 39, 41], "bin_ht": 38, "create_binned_ht": [38, 39, 41], "bin_id": 38, "etc": 38, "bi_allel": 38, "score_bin_agg": [38, 39, 41], "_parent": [38, 41], "groupedt": [38, 41], "compute_binned_truth_sample_concord": [38, 39], "binned_score_ht": 38, "add_bin": 38, "concord": 38, "tp": [38, 41, 43], "fp": [38, 41, 43], "fn": 38, "truth_gt": 38, "create_truth_sample_ht": [38, 39], "truth_mt": 38, "high_confidence_intervals_ht": 38, "add_rank": [38, 39], "subrank_expr": 38, "subrank": 38, "rank_variant_count": 38, "form": 38, "name_of_subrank": 38, "subrank_filtering_expr": 38, "get_r_human_read": [39, 40], "get_r_for_pair_of_vari": [39, 40], "get_r_within_gene_in_pop": [39, 40], "get_r_within_gen": [39, 40], "generate_trio_stat": [39, 41], "generate_sib_stat": [39, 41], "train_rf_model": [39, 41], "run_rf_test": [39, 42], "check_ht_fields_for_spark": [39, 42], "get_columns_quantil": [39, 42], "median_impute_featur": [39, 42], "ht_to_rf_df": [39, 42], "get_features_import": [39, 42], "get_label": [39, 42], "test_model": [39, 42], "apply_rf_model": [39, 42], "save_model": [39, 42], "load_model": [39, 42], "train_rf": [39, 41, 42], "get_rf_run": [39, 42], "get_run_data": [39, 42], "pretty_print_run": [39, 42], "sample_training_exampl": [39, 43], "ref_genom": 40, "get_ld_matrix": 40, "get_ld_index": 40, "parse_locu": 40, "10146": 40, "10151": 40, "ta": 40, "01789767935482124": 40, "tlocu": 40, "tarrai": 40, "correl": 40, "quadrat": 40, "exercis": 40, "caution": 40, "get_gnomad_public_data": 40, "get_gene_interv": 40, "fast": 40, "add_substrat": 41, "meant": 41, "wrapper": [41, 42], "ac_raw": 41, 
"fam_stats_ht": 41, "easili": 41, "binned_ht": 41, "grouped_binned_ht": 41, "agg_ht": 41, "ac_qc_samples_unrelated_raw": 41, "fail_hard_filt": 41, "truth_ht": 41, "omni": 41, "kgp_phase1_hc": 41, "n_de_novos_adj": 41, "n_de_novos_raw": 41, "n_transmitted_raw": 41, "n_untransmitted_raw": 41, "min_scor": 41, "minimun": 41, "max_scor": 41, "maiximum": 41, "n_in": 41, "n_del": 41, "n_ti": 41, "n_tv": 41, "trnasvers": 41, "n_1bp_indel": 41, "n_mod3bp_indel": 41, "divis": 41, "n_singleton": 41, "n_vqsr_pos_train": 41, "n_vqsr_neg_train": 41, "n_clinvar": 41, "n_de_novos_singleton_adj": 41, "n_de_novo_singleton": 41, "unfilt": 41, "n_de_novo": 41, "n_trans_singleton": 41, "n_untrans_singleton": 41, "untransmit": 41, "n_omni": 41, "n_mill": 41, "n_hapmap": 41, "n_kgp_phase1_hc": 41, "rf_featur": 41, "tp_expr": [41, 43], "fp_expr": [41, 43], "fp_to_tp": [41, 43], "num_tre": [41, 42], "max_depth": [41, 42], "test_expr": [41, 43], "withheld": [41, 42], "rf_test": 41, "features_import": [41, 42], "test_result": [41, 42], "maxmimum": 41, "hold": 41, "pipelinemodel": [41, 42], "tmp": 42, "dummi": 42, "quantil": 42, "relative_error": 42, "approx_median": 42, "feature_imput": 42, "features_median": 42, "variants_by_strata": 42, "basic": 42, "back": 42, "rf_pipelin": 42, "rf_index": 42, "assembler_index": 42, "vectorassembl": 42, "stringindex": 42, "rf_model": 42, "prediction_col_nam": 42, "rf_predict": 42, "confus": 42, "accuraci": 42, "tstruct": 42, "probability_col_nam": 42, "rf_probabl": 42, "out_path": 42, "input_path": 42, "rf_json_fp": 42, "wasn": 42, "input_arg": 42, "test_interv": 42, "hash": 42, "tv": 43, "effect": [44, 47], "To": [45, 48], "gnomad": [45, 46, 48], "bill": [45, 48], "charg": [45, 48], "d": 45, "import_vcf": 45, "drop_sampl": 45, "drastic": 45, "speed": 45, "worst_consequence_term": 45, "instal": 46, "pip": 46, "jupyt": 46, "notebook": 46, "gnomad_v2_exom": 46, "exomes_ht": 46, "gnomad_v2_genom": 46, "genomes_ht": 46, "gnomad_v3_genom": 46, "shut": 46, "finish": 46, "translat": 47, "program": 48, "gc": 48, "gnomad_public_resource_configur": 48, "transfer": 48, "your": 48, "creation": 48, "my": 48}, "objects": {"gnomad.assessment": [[1, 0, 0, "-", "summary_stats"], [2, 0, 0, "-", "validity_checks"]], "gnomad.assessment.summary_stats": [[1, 1, 1, "", "default_generate_gene_lof_matrix"], [1, 1, 1, "", "default_generate_gene_lof_summary"], [1, 1, 1, "", "freq_bin_expr"], [1, 1, 1, "", "get_an_criteria"], [1, 1, 1, "", "get_het_hom_summary_dict"], [1, 1, 1, "", "get_summary_ac_dict"], [1, 1, 1, "", "get_summary_counts"], [1, 1, 1, "", "get_summary_counts_dict"], [1, 1, 1, "", "get_tx_expression_expr"]], "gnomad.assessment.validity_checks": [[2, 1, 1, "", "check_global_and_row_annot_lengths"], [2, 1, 1, "", "check_raw_and_adj_callstats"], [2, 1, 1, "", "check_sex_chr_metrics"], [2, 1, 1, "", "compare_row_counts"], [2, 1, 1, "", "compare_subset_freqs"], [2, 1, 1, "", "compute_missingness"], [2, 1, 1, "", "count_vep_annotated_variants_per_interval"], [2, 1, 1, "", "generic_field_check"], [2, 1, 1, "", "generic_field_check_loop"], [2, 1, 1, "", "make_filters_expr_dict"], [2, 1, 1, "", "make_group_sum_expr_dict"], [2, 1, 1, "", "pprint_global_anns"], [2, 1, 1, "", "sum_group_callstats"], [2, 1, 1, "", "summarize_variant_filters"], [2, 1, 1, "", "summarize_variants"], [2, 1, 1, "", "validate_release_t"], [2, 1, 1, "", "vcf_field_check"]], "gnomad.resources": [[4, 0, 0, "-", "config"], [12, 0, 0, "-", "import_resources"], [14, 0, 0, "-", "resource_utils"]], "gnomad.resources.config": [[4, 2, 
1, "", "GnomadPublicResourceSource"], [4, 1, 1, "", "get_default_public_resource_source"]], "gnomad.resources.config.GnomadPublicResourceSource": [[4, 3, 1, "", "AZURE_OPEN_DATASETS"], [4, 3, 1, "", "GNOMAD"], [4, 3, 1, "", "GOOGLE_CLOUD_PUBLIC_DATASETS"], [4, 3, 1, "", "REGISTRY_OF_OPEN_DATA_ON_AWS"]], "gnomad.resources.grch37": [[5, 0, 0, "-", "gnomad"], [6, 0, 0, "-", "gnomad_ld"], [8, 0, 0, "-", "reference_data"]], "gnomad.resources.grch37.gnomad": [[5, 1, 1, "", "coverage"], [5, 1, 1, "", "liftover"], [5, 1, 1, "", "public_pca_loadings"], [5, 1, 1, "", "public_release"], [5, 1, 1, "", "release_vcf_path"]], "gnomad.resources.grch37.gnomad_ld": [[6, 1, 1, "", "ld_index"], [6, 1, 1, "", "ld_matrix"], [6, 1, 1, "", "ld_scores"]], "gnomad.resources.grch37.reference_data": [[8, 1, 1, "", "get_truth_ht"]], "gnomad.resources.grch38": [[9, 0, 0, "-", "gnomad"], [11, 0, 0, "-", "reference_data"]], "gnomad.resources.grch38.gnomad": [[9, 4, 1, "", "COHORTS_WITH_POP_STORED_AS_SUBPOP"], [9, 4, 1, "", "DOWNSAMPLINGS"], [9, 4, 1, "", "GROUPS"], [9, 4, 1, "", "HGDP_POPS"], [9, 4, 1, "", "POPS"], [9, 4, 1, "", "POPS_TO_REMOVE_FOR_POPMAX"], [9, 4, 1, "", "SEXES"], [9, 4, 1, "", "SUBSETS"], [9, 4, 1, "", "TGP_POPS"], [9, 4, 1, "", "TGP_POP_NAMES"], [9, 1, 1, "", "add_grpMaxFAF95_v4"], [9, 1, 1, "", "all_sites_an"], [9, 1, 1, "", "coverage"], [9, 1, 1, "", "coverage_tsv_path"], [9, 1, 1, "", "gnomad_gks"], [9, 1, 1, "", "public_release"], [9, 1, 1, "", "release_vcf_path"]], "gnomad.resources.grch38.reference_data": [[11, 1, 1, "", "get_truth_ht"]], "gnomad.resources.import_resources": [[12, 1, 1, "", "get_module_importable_resources"], [12, 1, 1, "", "get_resources_descriptions"], [12, 1, 1, "", "main"]], "gnomad.resources.resource_utils": [[14, 2, 1, "", "BaseResource"], [14, 2, 1, "", "BaseVersionedResource"], [14, 2, 1, "", "BlockMatrixResource"], [14, 7, 1, "", "DataException"], [14, 2, 1, "", "ExpressionResource"], [14, 4, 1, "", "GNOMAD_PUBLIC_BUCKETS"], [14, 2, 1, "", "GnomadPublicBlockMatrixResource"], [14, 2, 1, "", "GnomadPublicMatrixTableResource"], [14, 2, 1, "", "GnomadPublicPedigreeResource"], [14, 2, 1, "", "GnomadPublicResource"], [14, 2, 1, "", "GnomadPublicTableResource"], [14, 2, 1, "", "MatrixTableResource"], [14, 2, 1, "", "PedigreeResource"], [14, 7, 1, "", "ResourceNotAvailable"], [14, 2, 1, "", "TableResource"], [14, 2, 1, "", "VariantDatasetResource"], [14, 2, 1, "", "VersionedBlockMatrixResource"], [14, 2, 1, "", "VersionedMatrixTableResource"], [14, 2, 1, "", "VersionedPedigreeResource"], [14, 2, 1, "", "VersionedTableResource"], [14, 2, 1, "", "VersionedVariantDatasetResource"], [14, 1, 1, "", "import_gencode"], [14, 1, 1, "", "import_sites_vcf"]], "gnomad.resources.resource_utils.BaseResource": [[14, 3, 1, "", "expected_file_extensions"], [14, 5, 1, "", "import_resource"], [14, 6, 1, "", "path"]], "gnomad.resources.resource_utils.BaseVersionedResource": [[14, 3, 1, "", "default_version"], [14, 3, 1, "", "resource_class"], [14, 3, 1, "", "versions"]], "gnomad.resources.resource_utils.BlockMatrixResource": [[14, 5, 1, "", "bm"], [14, 3, 1, "", "expected_file_extensions"], [14, 5, 1, "", "import_resource"]], "gnomad.resources.resource_utils.ExpressionResource": [[14, 3, 1, "", "expected_file_extensions"], [14, 5, 1, "", "he"], [14, 5, 1, "", "import_resource"]], "gnomad.resources.resource_utils.GnomadPublicBlockMatrixResource": [[14, 5, 1, "", "bm"]], "gnomad.resources.resource_utils.GnomadPublicMatrixTableResource": [[14, 5, 1, "", "mt"]], 
"gnomad.resources.resource_utils.GnomadPublicPedigreeResource": [[14, 5, 1, "", "ht"], [14, 5, 1, "", "pedigree"]], "gnomad.resources.resource_utils.GnomadPublicResource": [[14, 5, 1, "", "is_resource_available"]], "gnomad.resources.resource_utils.GnomadPublicTableResource": [[14, 5, 1, "", "ht"]], "gnomad.resources.resource_utils.MatrixTableResource": [[14, 3, 1, "", "expected_file_extensions"], [14, 5, 1, "", "import_resource"], [14, 5, 1, "", "mt"]], "gnomad.resources.resource_utils.PedigreeResource": [[14, 3, 1, "", "expected_file_extensions"], [14, 5, 1, "", "ht"], [14, 5, 1, "", "import_resource"], [14, 5, 1, "", "pedigree"]], "gnomad.resources.resource_utils.TableResource": [[14, 3, 1, "", "expected_file_extensions"], [14, 5, 1, "", "ht"], [14, 5, 1, "", "import_resource"]], "gnomad.resources.resource_utils.VariantDatasetResource": [[14, 3, 1, "", "expected_file_extensions"], [14, 5, 1, "", "import_resource"], [14, 5, 1, "", "vds"]], "gnomad.resources.resource_utils.VersionedBlockMatrixResource": [[14, 3, 1, "", "resource_class"]], "gnomad.resources.resource_utils.VersionedMatrixTableResource": [[14, 3, 1, "", "resource_class"]], "gnomad.resources.resource_utils.VersionedPedigreeResource": [[14, 3, 1, "", "resource_class"]], "gnomad.resources.resource_utils.VersionedTableResource": [[14, 3, 1, "", "resource_class"]], "gnomad.resources.resource_utils.VersionedVariantDatasetResource": [[14, 3, 1, "", "resource_class"]], "gnomad.sample_qc": [[15, 0, 0, "-", "ancestry"], [16, 0, 0, "-", "filtering"], [18, 0, 0, "-", "pipeline"], [19, 0, 0, "-", "platform"], [20, 0, 0, "-", "relatedness"], [21, 0, 0, "-", "sex"]], "gnomad.sample_qc.ancestry": [[15, 1, 1, "", "apply_onnx_classification_model"], [15, 1, 1, "", "apply_sklearn_classification_model"], [15, 1, 1, "", "assign_population_pcs"], [15, 1, 1, "", "convert_sklearn_rf_to_onnx"], [15, 1, 1, "", "pc_project"], [15, 1, 1, "", "run_pca_with_relateds"]], "gnomad.sample_qc.filtering": [[16, 1, 1, "", "compute_qc_metrics_residuals"], [16, 1, 1, "", "compute_stratified_metrics_filter"], [16, 1, 1, "", "compute_stratified_sample_qc"], [16, 1, 1, "", "determine_nearest_neighbors"], [16, 1, 1, "", "merge_sample_qc_expr"]], "gnomad.sample_qc.pipeline": [[18, 1, 1, "", "annotate_sex"], [18, 1, 1, "", "filter_rows_for_qc"], [18, 1, 1, "", "get_qc_mt"], [18, 1, 1, "", "infer_sex_karyotype"]], "gnomad.sample_qc.platform": [[19, 1, 1, "", "assign_platform_from_pcs"], [19, 1, 1, "", "compute_callrate_mt"], [19, 1, 1, "", "run_platform_pca"]], "gnomad.sample_qc.relatedness": [[20, 4, 1, "", "AMBIGUOUS_RELATIONSHIP"], [20, 4, 1, "", "DUPLICATE_OR_TWINS"], [20, 4, 1, "", "PARENT_CHILD"], [20, 4, 1, "", "SECOND_DEGREE_RELATIVES"], [20, 4, 1, "", "SIBLINGS"], [20, 4, 1, "", "UNRELATED"], [20, 1, 1, "", "compute_related_samples_to_drop"], [20, 1, 1, "", "create_fake_pedigree"], [20, 1, 1, "", "explode_duplicate_samples_ht"], [20, 1, 1, "", "filter_mt_to_trios"], [20, 1, 1, "", "generate_sib_stats_expr"], [20, 1, 1, "", "generate_trio_stats_expr"], [20, 1, 1, "", "get_duplicated_samples"], [20, 1, 1, "", "get_duplicated_samples_ht"], [20, 1, 1, "", "get_relationship_expr"], [20, 1, 1, "", "get_slope_int_relationship_expr"], [20, 1, 1, "", "infer_families"]], "gnomad.sample_qc.sex": [[21, 1, 1, "", "adjust_sex_ploidy"], [21, 1, 1, "", "adjusted_sex_ploidy_expr"], [21, 1, 1, "", "gaussian_mixture_model_karyotype_assignment"], [21, 1, 1, "", "get_chr_x_hom_alt_cutoffs"], [21, 1, 1, "", "get_ploidy_cutoffs"], [21, 1, 1, "", "get_sex_expr"]], "gnomad.utils": [[22, 
0, 0, "-", "annotations"], [23, 0, 0, "-", "constraint"], [24, 0, 0, "-", "file_utils"], [25, 0, 0, "-", "filtering"], [26, 0, 0, "-", "gen_stats"], [28, 0, 0, "-", "intervals"], [29, 0, 0, "-", "liftover"], [30, 0, 0, "-", "plotting"], [31, 0, 0, "-", "reference_genome"], [32, 0, 0, "-", "release"], [33, 0, 0, "-", "slack"], [34, 0, 0, "-", "sparse_mt"], [35, 0, 0, "-", "transcript_annotation"], [36, 0, 0, "-", "vcf"], [37, 0, 0, "-", "vep"]], "gnomad.utils.annotations": [[22, 1, 1, "", "add_gks_va"], [22, 1, 1, "", "add_gks_vrs"], [22, 1, 1, "", "add_variant_type"], [22, 1, 1, "", "age_hists_expr"], [22, 1, 1, "", "agg_by_strata"], [22, 1, 1, "", "annotate_adj"], [22, 1, 1, "", "annotate_allele_info"], [22, 1, 1, "", "annotate_and_index_source_mt_for_sex_ploidy"], [22, 1, 1, "", "annotate_downsamplings"], [22, 1, 1, "", "annotate_freq"], [22, 1, 1, "", "annotation_type_in_vcf_info"], [22, 1, 1, "", "annotation_type_is_numeric"], [22, 1, 1, "", "bi_allelic_expr"], [22, 1, 1, "", "bi_allelic_site_inbreeding_expr"], [22, 1, 1, "", "build_freq_stratification_list"], [22, 1, 1, "", "compute_freq_by_strata"], [22, 1, 1, "", "create_frequency_bins_expr"], [22, 1, 1, "", "faf_expr"], [22, 1, 1, "", "fs_from_sb"], [22, 1, 1, "", "gen_anc_faf_max_expr"], [22, 1, 1, "", "generate_freq_group_membership_array"], [22, 1, 1, "", "get_adj_expr"], [22, 1, 1, "", "get_annotations_hists"], [22, 1, 1, "", "get_gq_dp_adj_expr"], [22, 1, 1, "", "get_het_ab_adj_expr"], [22, 1, 1, "", "get_is_haploid_expr"], [22, 1, 1, "", "get_lowqual_expr"], [22, 1, 1, "", "hemi_expr"], [22, 1, 1, "", "merge_freq_arrays"], [22, 1, 1, "", "merge_histograms"], [22, 1, 1, "", "missing_callstats_expr"], [22, 1, 1, "", "pab_max_expr"], [22, 1, 1, "", "pop_max_expr"], [22, 1, 1, "", "project_max_expr"], [22, 1, 1, "", "qual_hist_expr"], [22, 1, 1, "", "region_flag_expr"], [22, 1, 1, "", "set_female_y_metrics_to_na_expr"], [22, 1, 1, "", "sor_from_sb"], [22, 1, 1, "", "unphase_call_expr"], [22, 1, 1, "", "update_structured_annotations"]], "gnomad.utils.constraint": [[23, 4, 1, "", "COVERAGE_CUTOFF"], [23, 1, 1, "", "add_gencode_transcript_annotations"], [23, 1, 1, "", "annotate_exploded_vep_for_constraint_groupings"], [23, 1, 1, "", "annotate_mutation_type"], [23, 1, 1, "", "annotate_with_mu"], [23, 1, 1, "", "build_coverage_model"], [23, 1, 1, "", "build_models"], [23, 1, 1, "", "build_plateau_models"], [23, 1, 1, "", "calculate_raw_z_score"], [23, 1, 1, "", "calculate_raw_z_score_sd"], [23, 1, 1, "", "collapse_strand"], [23, 1, 1, "", "compute_expected_variants"], [23, 1, 1, "", "compute_pli"], [23, 1, 1, "", "count_variants_by_group"], [23, 1, 1, "", "downsampling_counts_expr"], [23, 1, 1, "", "get_all_pop_lengths"], [23, 1, 1, "", "get_constraint_flags"], [23, 1, 1, "", "get_constraint_grouping_expr"], [23, 1, 1, "", "get_downsampling_freq_indices"], [23, 1, 1, "", "oe_aggregation_expr"], [23, 1, 1, "", "oe_confidence_interval"], [23, 1, 1, "", "trimer_from_heptamer"]], "gnomad.utils.file_utils": [[24, 1, 1, "", "check_file_exists_raise_error"], [24, 1, 1, "", "file_exists"], [24, 1, 1, "", "get_file_stats"], [24, 1, 1, "", "read_list_data"], [24, 1, 1, "", "repartition_for_join"], [24, 1, 1, "", "select_primitives_from_ht"], [24, 1, 1, "", "write_temp_gcs"]], "gnomad.utils.filtering": [[25, 1, 1, "", "add_filters_expr"], [25, 1, 1, "", "combine_functions"], [25, 1, 1, "", "filter_arrays_by_meta"], [25, 1, 1, "", "filter_by_frequency"], [25, 1, 1, "", "filter_by_numeric_expr_range"], [25, 1, 1, "", "filter_for_mu"], [25, 1, 1, 
"", "filter_low_conf_regions"], [25, 1, 1, "", "filter_to_adj"], [25, 1, 1, "", "filter_to_autosomes"], [25, 1, 1, "", "filter_to_clinvar_pathogenic"], [25, 1, 1, "", "filter_to_gencode_cds"], [25, 1, 1, "", "filter_x_nonpar"], [25, 1, 1, "", "filter_y_nonpar"], [25, 1, 1, "", "remove_fields_from_constant"], [25, 1, 1, "", "split_vds_by_strata"], [25, 1, 1, "", "subset_samples_and_variants"]], "gnomad.utils.gen_stats": [[26, 1, 1, "", "from_phred"], [26, 1, 1, "", "get_median_and_mad_expr"], [26, 1, 1, "", "merge_stats_counters_expr"], [26, 1, 1, "", "to_phred"]], "gnomad.utils.intervals": [[28, 1, 1, "", "interval_length"], [28, 1, 1, "", "sort_intervals"], [28, 1, 1, "", "union_intervals"]], "gnomad.utils.liftover": [[29, 4, 1, "", "GRCH37_to_GRCH38_CHAIN"], [29, 4, 1, "", "GRCH38_TO_GRCH37_CHAIN"], [29, 1, 1, "", "default_lift_data"], [29, 1, 1, "", "get_liftover_genome"], [29, 1, 1, "", "liftover_expr"], [29, 1, 1, "", "liftover_using_gnomad_map"]], "gnomad.utils.plotting": [[30, 1, 1, "", "get_rows_data"], [30, 1, 1, "", "linear_and_log_tabs"], [30, 1, 1, "", "new_show"], [30, 1, 1, "", "pair_plot"], [30, 1, 1, "", "plot_hail_file_metadata"], [30, 1, 1, "", "plot_hail_hist"], [30, 1, 1, "", "plot_hail_hist_both"], [30, 1, 1, "", "plot_hail_hist_cumulative"], [30, 1, 1, "", "plot_multi_hail_hist"], [30, 1, 1, "", "scale_file_sizes"], [30, 1, 1, "", "set_font_size"]], "gnomad.utils.reference_genome": [[31, 1, 1, "", "add_reference_sequence"], [31, 1, 1, "", "get_reference_genome"], [31, 1, 1, "", "get_reference_ht"]], "gnomad.utils.release": [[32, 1, 1, "", "make_faf_index_dict"], [32, 1, 1, "", "make_freq_index_dict"], [32, 1, 1, "", "make_freq_index_dict_from_meta"]], "gnomad.utils.slack": [[33, 2, 1, "", "SlackClient"], [33, 1, 1, "", "slack_notifications"]], "gnomad.utils.slack.SlackClient": [[33, 5, 1, "", "send_file"], [33, 5, 1, "", "send_message"]], "gnomad.utils.sparse_mt": [[34, 1, 1, "", "compute_allele_number_per_ref_site"], [34, 1, 1, "", "compute_coverage_stats"], [34, 1, 1, "", "compute_last_ref_block_end"], [34, 1, 1, "", "compute_stats_per_ref_site"], [34, 1, 1, "", "default_compute_info"], [34, 1, 1, "", "densify_all_reference_sites"], [34, 1, 1, "", "densify_sites"], [34, 1, 1, "", "filter_ref_blocks"], [34, 1, 1, "", "get_allele_number_agg_func"], [34, 1, 1, "", "get_as_info_expr"], [34, 1, 1, "", "get_site_info_expr"], [34, 1, 1, "", "impute_sex_ploidy"], [34, 1, 1, "", "split_info_annotation"], [34, 1, 1, "", "split_lowqual_annotation"]], "gnomad.utils.transcript_annotation": [[35, 1, 1, "", "filter_expression_ht_by_tissues"], [35, 1, 1, "", "get_expression_proportion"], [35, 1, 1, "", "perform_tx_annotation_pipeline"], [35, 1, 1, "", "summarize_transcript_expression"], [35, 1, 1, "", "tissue_expression_ht_to_array"], [35, 1, 1, "", "tx_aggregate_variants"], [35, 1, 1, "", "tx_annotate_variants"], [35, 1, 1, "", "tx_filter_variants_by_csqs"]], "gnomad.utils.vcf": [[36, 4, 1, "", "ALLELE_TYPE_FIELDS"], [36, 4, 1, "", "AS_FIELDS"], [36, 4, 1, "", "AS_VQSR_FIELDS"], [36, 4, 1, "", "ENTRIES"], [36, 4, 1, "", "FAF_POPS"], [36, 4, 1, "", "FORMAT_DICT"], [36, 4, 1, "", "GROUPS"], [36, 4, 1, "", "HISTS"], [36, 4, 1, "", "INFO_DICT"], [36, 4, 1, "", "IN_SILICO_ANNOTATIONS_INFO_DICT"], [36, 4, 1, "", "JOINT_REGION_FLAG_FIELDS"], [36, 4, 1, "", "REGION_FLAG_FIELDS"], [36, 4, 1, "", "RF_FIELDS"], [36, 4, 1, "", "SEXES"], [36, 4, 1, "", "SITE_FIELDS"], [36, 4, 1, "", "SORT_ORDER"], [36, 4, 1, "", "SPARSE_ENTRIES"], [36, 4, 1, "", "VQSR_FIELDS"], [36, 4, 1, "", 
"VRS_FIELDS_DICT"], [36, 1, 1, "", "add_as_info_dict"], [36, 1, 1, "", "adjust_vcf_incompatible_types"], [36, 1, 1, "", "build_vcf_export_reference"], [36, 1, 1, "", "create_label_groups"], [36, 1, 1, "", "index_globals"], [36, 1, 1, "", "make_combo_header_text"], [36, 1, 1, "", "make_hist_bin_edges_expr"], [36, 1, 1, "", "make_hist_dict"], [36, 1, 1, "", "make_info_dict"], [36, 1, 1, "", "make_label_combos"], [36, 1, 1, "", "make_vcf_filter_dict"], [36, 1, 1, "", "rekey_new_reference"], [36, 1, 1, "", "set_female_y_metrics_to_na"]], "gnomad.utils.vep": [[37, 4, 1, "", "CSQ_CODING"], [37, 4, 1, "", "CSQ_SPLICE"], [37, 4, 1, "", "CURRENT_VEP_VERSION"], [37, 4, 1, "", "LOFTEE_LABELS"], [37, 4, 1, "", "LOF_CSQ_SET"], [37, 4, 1, "", "POSSIBLE_REFS"], [37, 4, 1, "", "VEP_CONFIG_PATH"], [37, 4, 1, "", "VEP_CSQ_FIELDS"], [37, 4, 1, "", "VEP_CSQ_HEADER"], [37, 1, 1, "", "add_most_severe_consequence_to_consequence"], [37, 1, 1, "", "add_most_severe_csq_to_tc_within_vep_root"], [37, 1, 1, "", "explode_by_vep_annotation"], [37, 1, 1, "", "filter_vep_to_canonical_transcripts"], [37, 1, 1, "", "filter_vep_to_gene_list"], [37, 1, 1, "", "filter_vep_to_mane_select_transcripts"], [37, 1, 1, "", "filter_vep_to_synonymous_variants"], [37, 1, 1, "", "filter_vep_transcript_csqs"], [37, 1, 1, "", "get_most_severe_consequence_for_summary"], [37, 1, 1, "", "get_vep_context"], [37, 1, 1, "", "get_vep_help"], [37, 1, 1, "", "process_consequences"], [37, 1, 1, "", "vep_or_lookup_vep"], [37, 1, 1, "", "vep_struct_to_csq"]], "gnomad.variant_qc": [[38, 0, 0, "-", "evaluation"], [40, 0, 0, "-", "ld"], [41, 0, 0, "-", "pipeline"], [42, 0, 0, "-", "random_forest"], [43, 0, 0, "-", "training"]], "gnomad.variant_qc.evaluation": [[38, 1, 1, "", "add_rank"], [38, 1, 1, "", "compute_binned_truth_sample_concordance"], [38, 1, 1, "", "compute_grouped_binned_ht"], [38, 1, 1, "", "compute_ranked_bin"], [38, 1, 1, "", "create_truth_sample_ht"]], "gnomad.variant_qc.ld": [[40, 1, 1, "", "get_r_for_pair_of_variants"], [40, 1, 1, "", "get_r_human_readable"], [40, 1, 1, "", "get_r_within_gene"], [40, 1, 1, "", "get_r_within_gene_in_pop"]], "gnomad.variant_qc.pipeline": [[41, 1, 1, "", "create_binned_ht"], [41, 1, 1, "", "generate_sib_stats"], [41, 1, 1, "", "generate_trio_stats"], [41, 1, 1, "", "score_bin_agg"], [41, 1, 1, "", "train_rf_model"]], "gnomad.variant_qc.random_forest": [[42, 1, 1, "", "apply_rf_model"], [42, 1, 1, "", "check_ht_fields_for_spark"], [42, 1, 1, "", "get_columns_quantiles"], [42, 1, 1, "", "get_features_importance"], [42, 1, 1, "", "get_labels"], [42, 1, 1, "", "get_rf_runs"], [42, 1, 1, "", "get_run_data"], [42, 1, 1, "", "ht_to_rf_df"], [42, 1, 1, "", "load_model"], [42, 1, 1, "", "median_impute_features"], [42, 1, 1, "", "pretty_print_runs"], [42, 1, 1, "", "run_rf_test"], [42, 1, 1, "", "save_model"], [42, 1, 1, "", "test_model"], [42, 1, 1, "", "train_rf"]], "gnomad.variant_qc.training": [[43, 1, 1, "", "sample_training_examples"]]}, "objtypes": {"0": "py:module", "1": "py:function", "2": "py:class", "3": "py:attribute", "4": "py:data", "5": "py:method", "6": "py:property", "7": "py:exception"}, "objnames": {"0": ["py", "module", "Python module"], "1": ["py", "function", "Python function"], "2": ["py", "class", "Python class"], "3": ["py", "attribute", "Python attribute"], "4": ["py", "data", "Python data"], "5": ["py", "method", "Python method"], "6": ["py", "property", "Python property"], "7": ["py", "exception", "Python exception"]}, "titleterms": {"gnomad": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 
13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 47], "assess": [0, 1, 2], "summary_stat": 1, "validity_check": 2, "resourc": [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 48], "config": 4, "grch37": [5, 6, 7, 8], "gnomad_ld": 6, "reference_data": [8, 11], "grch38": [9, 10, 11], "import_resourc": 12, "resource_util": 14, "sample_qc": [15, 16, 17, 18, 19, 20, 21], "ancestri": 15, "filter": [16, 25], "pipelin": [18, 41], "platform": 19, "related": 20, "sex": 21, "util": [22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37], "annot": 22, "constraint": 23, "file_util": 24, "gen_stat": 26, "interv": 28, "liftov": 29, "plot": 30, "reference_genom": 31, "releas": 32, "slack": 33, "sparse_mt": 34, "transcript_annot": 35, "vcf": 36, "vep": [37, 45], "variant_qc": [38, 39, 40, 41, 42, 43], "evalu": 38, "ld": 40, "random_forest": 42, "train": 43, "exampl": 44, "variant": 45, "effect": 45, "predictor": 45, "get": 46, "start": 46, "content": 47, "sourc": 48, "custom": 48, "environ": 48, "configur": 48}, "envversion": {"sphinx.domains.c": 2, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 8, "sphinx.domains.index": 1, "sphinx.domains.javascript": 2, "sphinx.domains.math": 2, "sphinx.domains.python": 3, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "sphinx.ext.viewcode": 1, "sphinx": 57}, "alltitles": {"gnomad.assessment": [[0, "gnomad-assessment"]], "gnomad.assessment.summary_stats": [[1, "gnomad-assessment-summary-stats"]], "gnomad.assessment.validity_checks": [[2, "gnomad-assessment-validity-checks"]], "gnomad": [[3, "gnomad"], [47, "gnomad"]], "gnomad.resources.config": [[4, "gnomad-resources-config"]], "gnomad.resources.grch37.gnomad": [[5, "gnomad-resources-grch37-gnomad"]], "gnomad.resources.grch37.gnomad_ld": [[6, "gnomad-resources-grch37-gnomad-ld"]], "gnomad.resources.grch37": [[7, "gnomad-resources-grch37"]], "gnomad.resources.grch37.reference_data": [[8, "gnomad-resources-grch37-reference-data"]], "gnomad.resources.grch38.gnomad": [[9, "gnomad-resources-grch38-gnomad"]], "gnomad.resources.grch38": [[10, "gnomad-resources-grch38"]], "gnomad.resources.grch38.reference_data": [[11, "gnomad-resources-grch38-reference-data"]], "gnomad.resources.import_resources": [[12, "gnomad-resources-import-resources"]], "gnomad.resources": [[13, "gnomad-resources"]], "gnomad.resources.resource_utils": [[14, "gnomad-resources-resource-utils"]], "gnomad.sample_qc.ancestry": [[15, "gnomad-sample-qc-ancestry"]], "gnomad.sample_qc.filtering": [[16, "gnomad-sample-qc-filtering"]], "gnomad.sample_qc": [[17, "gnomad-sample-qc"]], "gnomad.sample_qc.pipeline": [[18, "gnomad-sample-qc-pipeline"]], "gnomad.sample_qc.platform": [[19, "gnomad-sample-qc-platform"]], "gnomad.sample_qc.relatedness": [[20, "gnomad-sample-qc-relatedness"]], "gnomad.sample_qc.sex": [[21, "gnomad-sample-qc-sex"]], "gnomad.utils.annotations": [[22, "gnomad-utils-annotations"]], "gnomad.utils.constraint": [[23, "gnomad-utils-constraint"]], "gnomad.utils.file_utils": [[24, "gnomad-utils-file-utils"]], "gnomad.utils.filtering": [[25, "gnomad-utils-filtering"]], "gnomad.utils.gen_stats": [[26, "gnomad-utils-gen-stats"]], "gnomad.utils": [[27, "gnomad-utils"]], "gnomad.utils.intervals": [[28, "gnomad-utils-intervals"]], "gnomad.utils.liftover": [[29, "gnomad-utils-liftover"]], "gnomad.utils.plotting": [[30, "gnomad-utils-plotting"]], "gnomad.utils.reference_genome": [[31, "gnomad-utils-reference-genome"]], 
"gnomad.utils.release": [[32, "gnomad-utils-release"]], "gnomad.utils.slack": [[33, "gnomad-utils-slack"]], "gnomad.utils.sparse_mt": [[34, "gnomad-utils-sparse-mt"]], "gnomad.utils.transcript_annotation": [[35, "gnomad-utils-transcript-annotation"]], "gnomad.utils.vcf": [[36, "gnomad-utils-vcf"]], "gnomad.utils.vep": [[37, "gnomad-utils-vep"]], "gnomad.variant_qc.evaluation": [[38, "gnomad-variant-qc-evaluation"]], "gnomad.variant_qc": [[39, "gnomad-variant-qc"]], "gnomad.variant_qc.ld": [[40, "gnomad-variant-qc-ld"]], "gnomad.variant_qc.pipeline": [[41, "gnomad-variant-qc-pipeline"]], "gnomad.variant_qc.random_forest": [[42, "gnomad-variant-qc-random-forest"]], "gnomad.variant_qc.training": [[43, "gnomad-variant-qc-training"]], "Examples": [[44, "examples"]], "Variant Effect Predictor (VEP)": [[45, "variant-effect-predictor-vep"]], "Getting Started": [[46, "getting-started"]], "Contents": [[47, "contents"]], "Resource Sources": [[48, "resource-sources"]], "Custom Sources": [[48, "custom-sources"]], "Environment Configuration": [[48, "environment-configuration"]]}, "indexentries": {"default_generate_gene_lof_matrix() (in module gnomad.assessment.summary_stats)": [[1, "gnomad.assessment.summary_stats.default_generate_gene_lof_matrix"]], "default_generate_gene_lof_summary() (in module gnomad.assessment.summary_stats)": [[1, "gnomad.assessment.summary_stats.default_generate_gene_lof_summary"]], "freq_bin_expr() (in module gnomad.assessment.summary_stats)": [[1, "gnomad.assessment.summary_stats.freq_bin_expr"]], "get_an_criteria() (in module gnomad.assessment.summary_stats)": [[1, "gnomad.assessment.summary_stats.get_an_criteria"]], "get_het_hom_summary_dict() (in module gnomad.assessment.summary_stats)": [[1, "gnomad.assessment.summary_stats.get_het_hom_summary_dict"]], "get_summary_ac_dict() (in module gnomad.assessment.summary_stats)": [[1, "gnomad.assessment.summary_stats.get_summary_ac_dict"]], "get_summary_counts() (in module gnomad.assessment.summary_stats)": [[1, "gnomad.assessment.summary_stats.get_summary_counts"]], "get_summary_counts_dict() (in module gnomad.assessment.summary_stats)": [[1, "gnomad.assessment.summary_stats.get_summary_counts_dict"]], "get_tx_expression_expr() (in module gnomad.assessment.summary_stats)": [[1, "gnomad.assessment.summary_stats.get_tx_expression_expr"]], "gnomad.assessment.summary_stats": [[1, "module-gnomad.assessment.summary_stats"]], "module": [[1, "module-gnomad.assessment.summary_stats"], [2, "module-gnomad.assessment.validity_checks"], [4, "module-gnomad.resources.config"], [5, "module-gnomad.resources.grch37.gnomad"], [6, "module-gnomad.resources.grch37.gnomad_ld"], [8, "module-gnomad.resources.grch37.reference_data"], [9, "module-gnomad.resources.grch38.gnomad"], [11, "module-gnomad.resources.grch38.reference_data"], [12, "module-gnomad.resources.import_resources"], [14, "module-gnomad.resources.resource_utils"], [15, "module-gnomad.sample_qc.ancestry"], [16, "module-gnomad.sample_qc.filtering"], [18, "module-gnomad.sample_qc.pipeline"], [19, "module-gnomad.sample_qc.platform"], [20, "module-gnomad.sample_qc.relatedness"], [21, "module-gnomad.sample_qc.sex"], [22, "module-gnomad.utils.annotations"], [23, "module-gnomad.utils.constraint"], [24, "module-gnomad.utils.file_utils"], [25, "module-gnomad.utils.filtering"], [26, "module-gnomad.utils.gen_stats"], [28, "module-gnomad.utils.intervals"], [29, "module-gnomad.utils.liftover"], [30, "module-gnomad.utils.plotting"], [31, "module-gnomad.utils.reference_genome"], [32, 
"module-gnomad.utils.release"], [33, "module-gnomad.utils.slack"], [34, "module-gnomad.utils.sparse_mt"], [35, "module-gnomad.utils.transcript_annotation"], [36, "module-gnomad.utils.vcf"], [37, "module-gnomad.utils.vep"], [38, "module-gnomad.variant_qc.evaluation"], [40, "module-gnomad.variant_qc.ld"], [41, "module-gnomad.variant_qc.pipeline"], [42, "module-gnomad.variant_qc.random_forest"], [43, "module-gnomad.variant_qc.training"]], "check_global_and_row_annot_lengths() (in module gnomad.assessment.validity_checks)": [[2, "gnomad.assessment.validity_checks.check_global_and_row_annot_lengths"]], "check_raw_and_adj_callstats() (in module gnomad.assessment.validity_checks)": [[2, "gnomad.assessment.validity_checks.check_raw_and_adj_callstats"]], "check_sex_chr_metrics() (in module gnomad.assessment.validity_checks)": [[2, "gnomad.assessment.validity_checks.check_sex_chr_metrics"]], "compare_row_counts() (in module gnomad.assessment.validity_checks)": [[2, "gnomad.assessment.validity_checks.compare_row_counts"]], "compare_subset_freqs() (in module gnomad.assessment.validity_checks)": [[2, "gnomad.assessment.validity_checks.compare_subset_freqs"]], "compute_missingness() (in module gnomad.assessment.validity_checks)": [[2, "gnomad.assessment.validity_checks.compute_missingness"]], "count_vep_annotated_variants_per_interval() (in module gnomad.assessment.validity_checks)": [[2, "gnomad.assessment.validity_checks.count_vep_annotated_variants_per_interval"]], "generic_field_check() (in module gnomad.assessment.validity_checks)": [[2, "gnomad.assessment.validity_checks.generic_field_check"]], "generic_field_check_loop() (in module gnomad.assessment.validity_checks)": [[2, "gnomad.assessment.validity_checks.generic_field_check_loop"]], "gnomad.assessment.validity_checks": [[2, "module-gnomad.assessment.validity_checks"]], "make_filters_expr_dict() (in module gnomad.assessment.validity_checks)": [[2, "gnomad.assessment.validity_checks.make_filters_expr_dict"]], "make_group_sum_expr_dict() (in module gnomad.assessment.validity_checks)": [[2, "gnomad.assessment.validity_checks.make_group_sum_expr_dict"]], "pprint_global_anns() (in module gnomad.assessment.validity_checks)": [[2, "gnomad.assessment.validity_checks.pprint_global_anns"]], "sum_group_callstats() (in module gnomad.assessment.validity_checks)": [[2, "gnomad.assessment.validity_checks.sum_group_callstats"]], "summarize_variant_filters() (in module gnomad.assessment.validity_checks)": [[2, "gnomad.assessment.validity_checks.summarize_variant_filters"]], "summarize_variants() (in module gnomad.assessment.validity_checks)": [[2, "gnomad.assessment.validity_checks.summarize_variants"]], "validate_release_t() (in module gnomad.assessment.validity_checks)": [[2, "gnomad.assessment.validity_checks.validate_release_t"]], "vcf_field_check() (in module gnomad.assessment.validity_checks)": [[2, "gnomad.assessment.validity_checks.vcf_field_check"]], "azure_open_datasets (gnomad.resources.config.gnomadpublicresourcesource attribute)": [[4, "gnomad.resources.config.GnomadPublicResourceSource.AZURE_OPEN_DATASETS"]], "gnomad (gnomad.resources.config.gnomadpublicresourcesource attribute)": [[4, "gnomad.resources.config.GnomadPublicResourceSource.GNOMAD"]], "google_cloud_public_datasets (gnomad.resources.config.gnomadpublicresourcesource attribute)": [[4, "gnomad.resources.config.GnomadPublicResourceSource.GOOGLE_CLOUD_PUBLIC_DATASETS"]], "gnomadpublicresourcesource (class in gnomad.resources.config)": [[4, 
"gnomad.resources.config.GnomadPublicResourceSource"]], "registry_of_open_data_on_aws (gnomad.resources.config.gnomadpublicresourcesource attribute)": [[4, "gnomad.resources.config.GnomadPublicResourceSource.REGISTRY_OF_OPEN_DATA_ON_AWS"]], "get_default_public_resource_source() (in module gnomad.resources.config)": [[4, "gnomad.resources.config.get_default_public_resource_source"]], "gnomad.resources.config": [[4, "module-gnomad.resources.config"]], "coverage() (in module gnomad.resources.grch37.gnomad)": [[5, "gnomad.resources.grch37.gnomad.coverage"]], "gnomad.resources.grch37.gnomad": [[5, "module-gnomad.resources.grch37.gnomad"]], "liftover() (in module gnomad.resources.grch37.gnomad)": [[5, "gnomad.resources.grch37.gnomad.liftover"]], "public_pca_loadings() (in module gnomad.resources.grch37.gnomad)": [[5, "gnomad.resources.grch37.gnomad.public_pca_loadings"]], "public_release() (in module gnomad.resources.grch37.gnomad)": [[5, "gnomad.resources.grch37.gnomad.public_release"]], "release_vcf_path() (in module gnomad.resources.grch37.gnomad)": [[5, "gnomad.resources.grch37.gnomad.release_vcf_path"]], "gnomad.resources.grch37.gnomad_ld": [[6, "module-gnomad.resources.grch37.gnomad_ld"]], "ld_index() (in module gnomad.resources.grch37.gnomad_ld)": [[6, "gnomad.resources.grch37.gnomad_ld.ld_index"]], "ld_matrix() (in module gnomad.resources.grch37.gnomad_ld)": [[6, "gnomad.resources.grch37.gnomad_ld.ld_matrix"]], "ld_scores() (in module gnomad.resources.grch37.gnomad_ld)": [[6, "gnomad.resources.grch37.gnomad_ld.ld_scores"]], "get_truth_ht() (in module gnomad.resources.grch37.reference_data)": [[8, "gnomad.resources.grch37.reference_data.get_truth_ht"]], "gnomad.resources.grch37.reference_data": [[8, "module-gnomad.resources.grch37.reference_data"]], "cohorts_with_pop_stored_as_subpop (in module gnomad.resources.grch38.gnomad)": [[9, "gnomad.resources.grch38.gnomad.COHORTS_WITH_POP_STORED_AS_SUBPOP"]], "downsamplings (in module gnomad.resources.grch38.gnomad)": [[9, "gnomad.resources.grch38.gnomad.DOWNSAMPLINGS"]], "groups (in module gnomad.resources.grch38.gnomad)": [[9, "gnomad.resources.grch38.gnomad.GROUPS"]], "hgdp_pops (in module gnomad.resources.grch38.gnomad)": [[9, "gnomad.resources.grch38.gnomad.HGDP_POPS"]], "pops (in module gnomad.resources.grch38.gnomad)": [[9, "gnomad.resources.grch38.gnomad.POPS"]], "pops_to_remove_for_popmax (in module gnomad.resources.grch38.gnomad)": [[9, "gnomad.resources.grch38.gnomad.POPS_TO_REMOVE_FOR_POPMAX"]], "sexes (in module gnomad.resources.grch38.gnomad)": [[9, "gnomad.resources.grch38.gnomad.SEXES"]], "subsets (in module gnomad.resources.grch38.gnomad)": [[9, "gnomad.resources.grch38.gnomad.SUBSETS"]], "tgp_pops (in module gnomad.resources.grch38.gnomad)": [[9, "gnomad.resources.grch38.gnomad.TGP_POPS"]], "tgp_pop_names (in module gnomad.resources.grch38.gnomad)": [[9, "gnomad.resources.grch38.gnomad.TGP_POP_NAMES"]], "add_grpmaxfaf95_v4() (in module gnomad.resources.grch38.gnomad)": [[9, "gnomad.resources.grch38.gnomad.add_grpMaxFAF95_v4"]], "all_sites_an() (in module gnomad.resources.grch38.gnomad)": [[9, "gnomad.resources.grch38.gnomad.all_sites_an"]], "coverage() (in module gnomad.resources.grch38.gnomad)": [[9, "gnomad.resources.grch38.gnomad.coverage"]], "coverage_tsv_path() (in module gnomad.resources.grch38.gnomad)": [[9, "gnomad.resources.grch38.gnomad.coverage_tsv_path"]], "gnomad.resources.grch38.gnomad": [[9, "module-gnomad.resources.grch38.gnomad"]], "gnomad_gks() (in module gnomad.resources.grch38.gnomad)": [[9, 
"gnomad.resources.grch38.gnomad.gnomad_gks"]], "public_release() (in module gnomad.resources.grch38.gnomad)": [[9, "gnomad.resources.grch38.gnomad.public_release"]], "release_vcf_path() (in module gnomad.resources.grch38.gnomad)": [[9, "gnomad.resources.grch38.gnomad.release_vcf_path"]], "get_truth_ht() (in module gnomad.resources.grch38.reference_data)": [[11, "gnomad.resources.grch38.reference_data.get_truth_ht"]], "gnomad.resources.grch38.reference_data": [[11, "module-gnomad.resources.grch38.reference_data"]], "get_module_importable_resources() (in module gnomad.resources.import_resources)": [[12, "gnomad.resources.import_resources.get_module_importable_resources"]], "get_resources_descriptions() (in module gnomad.resources.import_resources)": [[12, "gnomad.resources.import_resources.get_resources_descriptions"]], "gnomad.resources.import_resources": [[12, "module-gnomad.resources.import_resources"]], "main() (in module gnomad.resources.import_resources)": [[12, "gnomad.resources.import_resources.main"]], "baseresource (class in gnomad.resources.resource_utils)": [[14, "gnomad.resources.resource_utils.BaseResource"]], "baseversionedresource (class in gnomad.resources.resource_utils)": [[14, "gnomad.resources.resource_utils.BaseVersionedResource"]], "blockmatrixresource (class in gnomad.resources.resource_utils)": [[14, "gnomad.resources.resource_utils.BlockMatrixResource"]], "dataexception": [[14, "gnomad.resources.resource_utils.DataException"]], "expressionresource (class in gnomad.resources.resource_utils)": [[14, "gnomad.resources.resource_utils.ExpressionResource"]], "gnomad_public_buckets (in module gnomad.resources.resource_utils)": [[14, "gnomad.resources.resource_utils.GNOMAD_PUBLIC_BUCKETS"]], "gnomadpublicblockmatrixresource (class in gnomad.resources.resource_utils)": [[14, "gnomad.resources.resource_utils.GnomadPublicBlockMatrixResource"]], "gnomadpublicmatrixtableresource (class in gnomad.resources.resource_utils)": [[14, "gnomad.resources.resource_utils.GnomadPublicMatrixTableResource"]], "gnomadpublicpedigreeresource (class in gnomad.resources.resource_utils)": [[14, "gnomad.resources.resource_utils.GnomadPublicPedigreeResource"]], "gnomadpublicresource (class in gnomad.resources.resource_utils)": [[14, "gnomad.resources.resource_utils.GnomadPublicResource"]], "gnomadpublictableresource (class in gnomad.resources.resource_utils)": [[14, "gnomad.resources.resource_utils.GnomadPublicTableResource"]], "matrixtableresource (class in gnomad.resources.resource_utils)": [[14, "gnomad.resources.resource_utils.MatrixTableResource"]], "pedigreeresource (class in gnomad.resources.resource_utils)": [[14, "gnomad.resources.resource_utils.PedigreeResource"]], "resourcenotavailable": [[14, "gnomad.resources.resource_utils.ResourceNotAvailable"]], "tableresource (class in gnomad.resources.resource_utils)": [[14, "gnomad.resources.resource_utils.TableResource"]], "variantdatasetresource (class in gnomad.resources.resource_utils)": [[14, "gnomad.resources.resource_utils.VariantDatasetResource"]], "versionedblockmatrixresource (class in gnomad.resources.resource_utils)": [[14, "gnomad.resources.resource_utils.VersionedBlockMatrixResource"]], "versionedmatrixtableresource (class in gnomad.resources.resource_utils)": [[14, "gnomad.resources.resource_utils.VersionedMatrixTableResource"]], "versionedpedigreeresource (class in gnomad.resources.resource_utils)": [[14, "gnomad.resources.resource_utils.VersionedPedigreeResource"]], "versionedtableresource (class in 
gnomad.resources.resource_utils)": [[14, "gnomad.resources.resource_utils.VersionedTableResource"]], "versionedvariantdatasetresource (class in gnomad.resources.resource_utils)": [[14, "gnomad.resources.resource_utils.VersionedVariantDatasetResource"]], "bm() (gnomad.resources.resource_utils.blockmatrixresource method)": [[14, "gnomad.resources.resource_utils.BlockMatrixResource.bm"]], "bm() (gnomad.resources.resource_utils.gnomadpublicblockmatrixresource method)": [[14, "gnomad.resources.resource_utils.GnomadPublicBlockMatrixResource.bm"]], "default_version (gnomad.resources.resource_utils.baseversionedresource attribute)": [[14, "gnomad.resources.resource_utils.BaseVersionedResource.default_version"]], "expected_file_extensions (gnomad.resources.resource_utils.baseresource attribute)": [[14, "gnomad.resources.resource_utils.BaseResource.expected_file_extensions"]], "expected_file_extensions (gnomad.resources.resource_utils.blockmatrixresource attribute)": [[14, "gnomad.resources.resource_utils.BlockMatrixResource.expected_file_extensions"]], "expected_file_extensions (gnomad.resources.resource_utils.expressionresource attribute)": [[14, "gnomad.resources.resource_utils.ExpressionResource.expected_file_extensions"]], "expected_file_extensions (gnomad.resources.resource_utils.matrixtableresource attribute)": [[14, "gnomad.resources.resource_utils.MatrixTableResource.expected_file_extensions"]], "expected_file_extensions (gnomad.resources.resource_utils.pedigreeresource attribute)": [[14, "gnomad.resources.resource_utils.PedigreeResource.expected_file_extensions"]], "expected_file_extensions (gnomad.resources.resource_utils.tableresource attribute)": [[14, "gnomad.resources.resource_utils.TableResource.expected_file_extensions"]], "expected_file_extensions (gnomad.resources.resource_utils.variantdatasetresource attribute)": [[14, "gnomad.resources.resource_utils.VariantDatasetResource.expected_file_extensions"]], "gnomad.resources.resource_utils": [[14, "module-gnomad.resources.resource_utils"]], "he() (gnomad.resources.resource_utils.expressionresource method)": [[14, "gnomad.resources.resource_utils.ExpressionResource.he"]], "ht() (gnomad.resources.resource_utils.gnomadpublicpedigreeresource method)": [[14, "gnomad.resources.resource_utils.GnomadPublicPedigreeResource.ht"]], "ht() (gnomad.resources.resource_utils.gnomadpublictableresource method)": [[14, "gnomad.resources.resource_utils.GnomadPublicTableResource.ht"]], "ht() (gnomad.resources.resource_utils.pedigreeresource method)": [[14, "gnomad.resources.resource_utils.PedigreeResource.ht"]], "ht() (gnomad.resources.resource_utils.tableresource method)": [[14, "gnomad.resources.resource_utils.TableResource.ht"]], "import_gencode() (in module gnomad.resources.resource_utils)": [[14, "gnomad.resources.resource_utils.import_gencode"]], "import_resource() (gnomad.resources.resource_utils.baseresource method)": [[14, "gnomad.resources.resource_utils.BaseResource.import_resource"]], "import_resource() (gnomad.resources.resource_utils.blockmatrixresource method)": [[14, "gnomad.resources.resource_utils.BlockMatrixResource.import_resource"]], "import_resource() (gnomad.resources.resource_utils.expressionresource method)": [[14, "gnomad.resources.resource_utils.ExpressionResource.import_resource"]], "import_resource() (gnomad.resources.resource_utils.matrixtableresource method)": [[14, "gnomad.resources.resource_utils.MatrixTableResource.import_resource"]], "import_resource() (gnomad.resources.resource_utils.pedigreeresource method)": [[14, 
"gnomad.resources.resource_utils.PedigreeResource.import_resource"]], "import_resource() (gnomad.resources.resource_utils.tableresource method)": [[14, "gnomad.resources.resource_utils.TableResource.import_resource"]], "import_resource() (gnomad.resources.resource_utils.variantdatasetresource method)": [[14, "gnomad.resources.resource_utils.VariantDatasetResource.import_resource"]], "import_sites_vcf() (in module gnomad.resources.resource_utils)": [[14, "gnomad.resources.resource_utils.import_sites_vcf"]], "is_resource_available() (gnomad.resources.resource_utils.gnomadpublicresource method)": [[14, "gnomad.resources.resource_utils.GnomadPublicResource.is_resource_available"]], "mt() (gnomad.resources.resource_utils.gnomadpublicmatrixtableresource method)": [[14, "gnomad.resources.resource_utils.GnomadPublicMatrixTableResource.mt"]], "mt() (gnomad.resources.resource_utils.matrixtableresource method)": [[14, "gnomad.resources.resource_utils.MatrixTableResource.mt"]], "path (gnomad.resources.resource_utils.baseresource property)": [[14, "gnomad.resources.resource_utils.BaseResource.path"]], "pedigree() (gnomad.resources.resource_utils.gnomadpublicpedigreeresource method)": [[14, "gnomad.resources.resource_utils.GnomadPublicPedigreeResource.pedigree"]], "pedigree() (gnomad.resources.resource_utils.pedigreeresource method)": [[14, "gnomad.resources.resource_utils.PedigreeResource.pedigree"]], "resource_class (gnomad.resources.resource_utils.baseversionedresource attribute)": [[14, "gnomad.resources.resource_utils.BaseVersionedResource.resource_class"]], "resource_class (gnomad.resources.resource_utils.versionedblockmatrixresource attribute)": [[14, "gnomad.resources.resource_utils.VersionedBlockMatrixResource.resource_class"]], "resource_class (gnomad.resources.resource_utils.versionedmatrixtableresource attribute)": [[14, "gnomad.resources.resource_utils.VersionedMatrixTableResource.resource_class"]], "resource_class (gnomad.resources.resource_utils.versionedpedigreeresource attribute)": [[14, "gnomad.resources.resource_utils.VersionedPedigreeResource.resource_class"]], "resource_class (gnomad.resources.resource_utils.versionedtableresource attribute)": [[14, "gnomad.resources.resource_utils.VersionedTableResource.resource_class"]], "resource_class (gnomad.resources.resource_utils.versionedvariantdatasetresource attribute)": [[14, "gnomad.resources.resource_utils.VersionedVariantDatasetResource.resource_class"]], "vds() (gnomad.resources.resource_utils.variantdatasetresource method)": [[14, "gnomad.resources.resource_utils.VariantDatasetResource.vds"]], "versions (gnomad.resources.resource_utils.baseversionedresource attribute)": [[14, "gnomad.resources.resource_utils.BaseVersionedResource.versions"]], "apply_onnx_classification_model() (in module gnomad.sample_qc.ancestry)": [[15, "gnomad.sample_qc.ancestry.apply_onnx_classification_model"]], "apply_sklearn_classification_model() (in module gnomad.sample_qc.ancestry)": [[15, "gnomad.sample_qc.ancestry.apply_sklearn_classification_model"]], "assign_population_pcs() (in module gnomad.sample_qc.ancestry)": [[15, "gnomad.sample_qc.ancestry.assign_population_pcs"]], "convert_sklearn_rf_to_onnx() (in module gnomad.sample_qc.ancestry)": [[15, "gnomad.sample_qc.ancestry.convert_sklearn_rf_to_onnx"]], "gnomad.sample_qc.ancestry": [[15, "module-gnomad.sample_qc.ancestry"]], "pc_project() (in module gnomad.sample_qc.ancestry)": [[15, "gnomad.sample_qc.ancestry.pc_project"]], "run_pca_with_relateds() (in module gnomad.sample_qc.ancestry)": [[15, 
"gnomad.sample_qc.ancestry.run_pca_with_relateds"]], "compute_qc_metrics_residuals() (in module gnomad.sample_qc.filtering)": [[16, "gnomad.sample_qc.filtering.compute_qc_metrics_residuals"]], "compute_stratified_metrics_filter() (in module gnomad.sample_qc.filtering)": [[16, "gnomad.sample_qc.filtering.compute_stratified_metrics_filter"]], "compute_stratified_sample_qc() (in module gnomad.sample_qc.filtering)": [[16, "gnomad.sample_qc.filtering.compute_stratified_sample_qc"]], "determine_nearest_neighbors() (in module gnomad.sample_qc.filtering)": [[16, "gnomad.sample_qc.filtering.determine_nearest_neighbors"]], "gnomad.sample_qc.filtering": [[16, "module-gnomad.sample_qc.filtering"]], "merge_sample_qc_expr() (in module gnomad.sample_qc.filtering)": [[16, "gnomad.sample_qc.filtering.merge_sample_qc_expr"]], "annotate_sex() (in module gnomad.sample_qc.pipeline)": [[18, "gnomad.sample_qc.pipeline.annotate_sex"]], "filter_rows_for_qc() (in module gnomad.sample_qc.pipeline)": [[18, "gnomad.sample_qc.pipeline.filter_rows_for_qc"]], "get_qc_mt() (in module gnomad.sample_qc.pipeline)": [[18, "gnomad.sample_qc.pipeline.get_qc_mt"]], "gnomad.sample_qc.pipeline": [[18, "module-gnomad.sample_qc.pipeline"]], "infer_sex_karyotype() (in module gnomad.sample_qc.pipeline)": [[18, "gnomad.sample_qc.pipeline.infer_sex_karyotype"]], "assign_platform_from_pcs() (in module gnomad.sample_qc.platform)": [[19, "gnomad.sample_qc.platform.assign_platform_from_pcs"]], "compute_callrate_mt() (in module gnomad.sample_qc.platform)": [[19, "gnomad.sample_qc.platform.compute_callrate_mt"]], "gnomad.sample_qc.platform": [[19, "module-gnomad.sample_qc.platform"]], "run_platform_pca() (in module gnomad.sample_qc.platform)": [[19, "gnomad.sample_qc.platform.run_platform_pca"]], "ambiguous_relationship (in module gnomad.sample_qc.relatedness)": [[20, "gnomad.sample_qc.relatedness.AMBIGUOUS_RELATIONSHIP"]], "duplicate_or_twins (in module gnomad.sample_qc.relatedness)": [[20, "gnomad.sample_qc.relatedness.DUPLICATE_OR_TWINS"]], "parent_child (in module gnomad.sample_qc.relatedness)": [[20, "gnomad.sample_qc.relatedness.PARENT_CHILD"]], "second_degree_relatives (in module gnomad.sample_qc.relatedness)": [[20, "gnomad.sample_qc.relatedness.SECOND_DEGREE_RELATIVES"]], "siblings (in module gnomad.sample_qc.relatedness)": [[20, "gnomad.sample_qc.relatedness.SIBLINGS"]], "unrelated (in module gnomad.sample_qc.relatedness)": [[20, "gnomad.sample_qc.relatedness.UNRELATED"]], "compute_related_samples_to_drop() (in module gnomad.sample_qc.relatedness)": [[20, "gnomad.sample_qc.relatedness.compute_related_samples_to_drop"]], "create_fake_pedigree() (in module gnomad.sample_qc.relatedness)": [[20, "gnomad.sample_qc.relatedness.create_fake_pedigree"]], "explode_duplicate_samples_ht() (in module gnomad.sample_qc.relatedness)": [[20, "gnomad.sample_qc.relatedness.explode_duplicate_samples_ht"]], "filter_mt_to_trios() (in module gnomad.sample_qc.relatedness)": [[20, "gnomad.sample_qc.relatedness.filter_mt_to_trios"]], "generate_sib_stats_expr() (in module gnomad.sample_qc.relatedness)": [[20, "gnomad.sample_qc.relatedness.generate_sib_stats_expr"]], "generate_trio_stats_expr() (in module gnomad.sample_qc.relatedness)": [[20, "gnomad.sample_qc.relatedness.generate_trio_stats_expr"]], "get_duplicated_samples() (in module gnomad.sample_qc.relatedness)": [[20, "gnomad.sample_qc.relatedness.get_duplicated_samples"]], "get_duplicated_samples_ht() (in module gnomad.sample_qc.relatedness)": [[20, 
"gnomad.sample_qc.relatedness.get_duplicated_samples_ht"]], "get_relationship_expr() (in module gnomad.sample_qc.relatedness)": [[20, "gnomad.sample_qc.relatedness.get_relationship_expr"]], "get_slope_int_relationship_expr() (in module gnomad.sample_qc.relatedness)": [[20, "gnomad.sample_qc.relatedness.get_slope_int_relationship_expr"]], "gnomad.sample_qc.relatedness": [[20, "module-gnomad.sample_qc.relatedness"]], "infer_families() (in module gnomad.sample_qc.relatedness)": [[20, "gnomad.sample_qc.relatedness.infer_families"]], "adjust_sex_ploidy() (in module gnomad.sample_qc.sex)": [[21, "gnomad.sample_qc.sex.adjust_sex_ploidy"]], "adjusted_sex_ploidy_expr() (in module gnomad.sample_qc.sex)": [[21, "gnomad.sample_qc.sex.adjusted_sex_ploidy_expr"]], "gaussian_mixture_model_karyotype_assignment() (in module gnomad.sample_qc.sex)": [[21, "gnomad.sample_qc.sex.gaussian_mixture_model_karyotype_assignment"]], "get_chr_x_hom_alt_cutoffs() (in module gnomad.sample_qc.sex)": [[21, "gnomad.sample_qc.sex.get_chr_x_hom_alt_cutoffs"]], "get_ploidy_cutoffs() (in module gnomad.sample_qc.sex)": [[21, "gnomad.sample_qc.sex.get_ploidy_cutoffs"]], "get_sex_expr() (in module gnomad.sample_qc.sex)": [[21, "gnomad.sample_qc.sex.get_sex_expr"]], "gnomad.sample_qc.sex": [[21, "module-gnomad.sample_qc.sex"]], "add_gks_va() (in module gnomad.utils.annotations)": [[22, "gnomad.utils.annotations.add_gks_va"]], "add_gks_vrs() (in module gnomad.utils.annotations)": [[22, "gnomad.utils.annotations.add_gks_vrs"]], "add_variant_type() (in module gnomad.utils.annotations)": [[22, "gnomad.utils.annotations.add_variant_type"]], "age_hists_expr() (in module gnomad.utils.annotations)": [[22, "gnomad.utils.annotations.age_hists_expr"]], "agg_by_strata() (in module gnomad.utils.annotations)": [[22, "gnomad.utils.annotations.agg_by_strata"]], "annotate_adj() (in module gnomad.utils.annotations)": [[22, "gnomad.utils.annotations.annotate_adj"]], "annotate_allele_info() (in module gnomad.utils.annotations)": [[22, "gnomad.utils.annotations.annotate_allele_info"]], "annotate_and_index_source_mt_for_sex_ploidy() (in module gnomad.utils.annotations)": [[22, "gnomad.utils.annotations.annotate_and_index_source_mt_for_sex_ploidy"]], "annotate_downsamplings() (in module gnomad.utils.annotations)": [[22, "gnomad.utils.annotations.annotate_downsamplings"]], "annotate_freq() (in module gnomad.utils.annotations)": [[22, "gnomad.utils.annotations.annotate_freq"]], "annotation_type_in_vcf_info() (in module gnomad.utils.annotations)": [[22, "gnomad.utils.annotations.annotation_type_in_vcf_info"]], "annotation_type_is_numeric() (in module gnomad.utils.annotations)": [[22, "gnomad.utils.annotations.annotation_type_is_numeric"]], "bi_allelic_expr() (in module gnomad.utils.annotations)": [[22, "gnomad.utils.annotations.bi_allelic_expr"]], "bi_allelic_site_inbreeding_expr() (in module gnomad.utils.annotations)": [[22, "gnomad.utils.annotations.bi_allelic_site_inbreeding_expr"]], "build_freq_stratification_list() (in module gnomad.utils.annotations)": [[22, "gnomad.utils.annotations.build_freq_stratification_list"]], "compute_freq_by_strata() (in module gnomad.utils.annotations)": [[22, "gnomad.utils.annotations.compute_freq_by_strata"]], "create_frequency_bins_expr() (in module gnomad.utils.annotations)": [[22, "gnomad.utils.annotations.create_frequency_bins_expr"]], "faf_expr() (in module gnomad.utils.annotations)": [[22, "gnomad.utils.annotations.faf_expr"]], "fs_from_sb() (in module gnomad.utils.annotations)": [[22, 
"gnomad.utils.annotations.fs_from_sb"]], "gen_anc_faf_max_expr() (in module gnomad.utils.annotations)": [[22, "gnomad.utils.annotations.gen_anc_faf_max_expr"]], "generate_freq_group_membership_array() (in module gnomad.utils.annotations)": [[22, "gnomad.utils.annotations.generate_freq_group_membership_array"]], "get_adj_expr() (in module gnomad.utils.annotations)": [[22, "gnomad.utils.annotations.get_adj_expr"]], "get_annotations_hists() (in module gnomad.utils.annotations)": [[22, "gnomad.utils.annotations.get_annotations_hists"]], "get_gq_dp_adj_expr() (in module gnomad.utils.annotations)": [[22, "gnomad.utils.annotations.get_gq_dp_adj_expr"]], "get_het_ab_adj_expr() (in module gnomad.utils.annotations)": [[22, "gnomad.utils.annotations.get_het_ab_adj_expr"]], "get_is_haploid_expr() (in module gnomad.utils.annotations)": [[22, "gnomad.utils.annotations.get_is_haploid_expr"]], "get_lowqual_expr() (in module gnomad.utils.annotations)": [[22, "gnomad.utils.annotations.get_lowqual_expr"]], "gnomad.utils.annotations": [[22, "module-gnomad.utils.annotations"]], "hemi_expr() (in module gnomad.utils.annotations)": [[22, "gnomad.utils.annotations.hemi_expr"]], "merge_freq_arrays() (in module gnomad.utils.annotations)": [[22, "gnomad.utils.annotations.merge_freq_arrays"]], "merge_histograms() (in module gnomad.utils.annotations)": [[22, "gnomad.utils.annotations.merge_histograms"]], "missing_callstats_expr() (in module gnomad.utils.annotations)": [[22, "gnomad.utils.annotations.missing_callstats_expr"]], "pab_max_expr() (in module gnomad.utils.annotations)": [[22, "gnomad.utils.annotations.pab_max_expr"]], "pop_max_expr() (in module gnomad.utils.annotations)": [[22, "gnomad.utils.annotations.pop_max_expr"]], "project_max_expr() (in module gnomad.utils.annotations)": [[22, "gnomad.utils.annotations.project_max_expr"]], "qual_hist_expr() (in module gnomad.utils.annotations)": [[22, "gnomad.utils.annotations.qual_hist_expr"]], "region_flag_expr() (in module gnomad.utils.annotations)": [[22, "gnomad.utils.annotations.region_flag_expr"]], "set_female_y_metrics_to_na_expr() (in module gnomad.utils.annotations)": [[22, "gnomad.utils.annotations.set_female_y_metrics_to_na_expr"]], "sor_from_sb() (in module gnomad.utils.annotations)": [[22, "gnomad.utils.annotations.sor_from_sb"]], "unphase_call_expr() (in module gnomad.utils.annotations)": [[22, "gnomad.utils.annotations.unphase_call_expr"]], "update_structured_annotations() (in module gnomad.utils.annotations)": [[22, "gnomad.utils.annotations.update_structured_annotations"]], "coverage_cutoff (in module gnomad.utils.constraint)": [[23, "gnomad.utils.constraint.COVERAGE_CUTOFF"]], "add_gencode_transcript_annotations() (in module gnomad.utils.constraint)": [[23, "gnomad.utils.constraint.add_gencode_transcript_annotations"]], "annotate_exploded_vep_for_constraint_groupings() (in module gnomad.utils.constraint)": [[23, "gnomad.utils.constraint.annotate_exploded_vep_for_constraint_groupings"]], "annotate_mutation_type() (in module gnomad.utils.constraint)": [[23, "gnomad.utils.constraint.annotate_mutation_type"]], "annotate_with_mu() (in module gnomad.utils.constraint)": [[23, "gnomad.utils.constraint.annotate_with_mu"]], "build_coverage_model() (in module gnomad.utils.constraint)": [[23, "gnomad.utils.constraint.build_coverage_model"]], "build_models() (in module gnomad.utils.constraint)": [[23, "gnomad.utils.constraint.build_models"]], "build_plateau_models() (in module gnomad.utils.constraint)": [[23, "gnomad.utils.constraint.build_plateau_models"]], 
"calculate_raw_z_score() (in module gnomad.utils.constraint)": [[23, "gnomad.utils.constraint.calculate_raw_z_score"]], "calculate_raw_z_score_sd() (in module gnomad.utils.constraint)": [[23, "gnomad.utils.constraint.calculate_raw_z_score_sd"]], "collapse_strand() (in module gnomad.utils.constraint)": [[23, "gnomad.utils.constraint.collapse_strand"]], "compute_expected_variants() (in module gnomad.utils.constraint)": [[23, "gnomad.utils.constraint.compute_expected_variants"]], "compute_pli() (in module gnomad.utils.constraint)": [[23, "gnomad.utils.constraint.compute_pli"]], "count_variants_by_group() (in module gnomad.utils.constraint)": [[23, "gnomad.utils.constraint.count_variants_by_group"]], "downsampling_counts_expr() (in module gnomad.utils.constraint)": [[23, "gnomad.utils.constraint.downsampling_counts_expr"]], "get_all_pop_lengths() (in module gnomad.utils.constraint)": [[23, "gnomad.utils.constraint.get_all_pop_lengths"]], "get_constraint_flags() (in module gnomad.utils.constraint)": [[23, "gnomad.utils.constraint.get_constraint_flags"]], "get_constraint_grouping_expr() (in module gnomad.utils.constraint)": [[23, "gnomad.utils.constraint.get_constraint_grouping_expr"]], "get_downsampling_freq_indices() (in module gnomad.utils.constraint)": [[23, "gnomad.utils.constraint.get_downsampling_freq_indices"]], "gnomad.utils.constraint": [[23, "module-gnomad.utils.constraint"]], "oe_aggregation_expr() (in module gnomad.utils.constraint)": [[23, "gnomad.utils.constraint.oe_aggregation_expr"]], "oe_confidence_interval() (in module gnomad.utils.constraint)": [[23, "gnomad.utils.constraint.oe_confidence_interval"]], "trimer_from_heptamer() (in module gnomad.utils.constraint)": [[23, "gnomad.utils.constraint.trimer_from_heptamer"]], "check_file_exists_raise_error() (in module gnomad.utils.file_utils)": [[24, "gnomad.utils.file_utils.check_file_exists_raise_error"]], "file_exists() (in module gnomad.utils.file_utils)": [[24, "gnomad.utils.file_utils.file_exists"]], "get_file_stats() (in module gnomad.utils.file_utils)": [[24, "gnomad.utils.file_utils.get_file_stats"]], "gnomad.utils.file_utils": [[24, "module-gnomad.utils.file_utils"]], "read_list_data() (in module gnomad.utils.file_utils)": [[24, "gnomad.utils.file_utils.read_list_data"]], "repartition_for_join() (in module gnomad.utils.file_utils)": [[24, "gnomad.utils.file_utils.repartition_for_join"]], "select_primitives_from_ht() (in module gnomad.utils.file_utils)": [[24, "gnomad.utils.file_utils.select_primitives_from_ht"]], "write_temp_gcs() (in module gnomad.utils.file_utils)": [[24, "gnomad.utils.file_utils.write_temp_gcs"]], "add_filters_expr() (in module gnomad.utils.filtering)": [[25, "gnomad.utils.filtering.add_filters_expr"]], "combine_functions() (in module gnomad.utils.filtering)": [[25, "gnomad.utils.filtering.combine_functions"]], "filter_arrays_by_meta() (in module gnomad.utils.filtering)": [[25, "gnomad.utils.filtering.filter_arrays_by_meta"]], "filter_by_frequency() (in module gnomad.utils.filtering)": [[25, "gnomad.utils.filtering.filter_by_frequency"]], "filter_by_numeric_expr_range() (in module gnomad.utils.filtering)": [[25, "gnomad.utils.filtering.filter_by_numeric_expr_range"]], "filter_for_mu() (in module gnomad.utils.filtering)": [[25, "gnomad.utils.filtering.filter_for_mu"]], "filter_low_conf_regions() (in module gnomad.utils.filtering)": [[25, "gnomad.utils.filtering.filter_low_conf_regions"]], "filter_to_adj() (in module gnomad.utils.filtering)": [[25, "gnomad.utils.filtering.filter_to_adj"]], 
"filter_to_autosomes() (in module gnomad.utils.filtering)": [[25, "gnomad.utils.filtering.filter_to_autosomes"]], "filter_to_clinvar_pathogenic() (in module gnomad.utils.filtering)": [[25, "gnomad.utils.filtering.filter_to_clinvar_pathogenic"]], "filter_to_gencode_cds() (in module gnomad.utils.filtering)": [[25, "gnomad.utils.filtering.filter_to_gencode_cds"]], "filter_x_nonpar() (in module gnomad.utils.filtering)": [[25, "gnomad.utils.filtering.filter_x_nonpar"]], "filter_y_nonpar() (in module gnomad.utils.filtering)": [[25, "gnomad.utils.filtering.filter_y_nonpar"]], "gnomad.utils.filtering": [[25, "module-gnomad.utils.filtering"]], "remove_fields_from_constant() (in module gnomad.utils.filtering)": [[25, "gnomad.utils.filtering.remove_fields_from_constant"]], "split_vds_by_strata() (in module gnomad.utils.filtering)": [[25, "gnomad.utils.filtering.split_vds_by_strata"]], "subset_samples_and_variants() (in module gnomad.utils.filtering)": [[25, "gnomad.utils.filtering.subset_samples_and_variants"]], "from_phred() (in module gnomad.utils.gen_stats)": [[26, "gnomad.utils.gen_stats.from_phred"]], "get_median_and_mad_expr() (in module gnomad.utils.gen_stats)": [[26, "gnomad.utils.gen_stats.get_median_and_mad_expr"]], "gnomad.utils.gen_stats": [[26, "module-gnomad.utils.gen_stats"]], "merge_stats_counters_expr() (in module gnomad.utils.gen_stats)": [[26, "gnomad.utils.gen_stats.merge_stats_counters_expr"]], "to_phred() (in module gnomad.utils.gen_stats)": [[26, "gnomad.utils.gen_stats.to_phred"]], "gnomad.utils.intervals": [[28, "module-gnomad.utils.intervals"]], "interval_length() (in module gnomad.utils.intervals)": [[28, "gnomad.utils.intervals.interval_length"]], "sort_intervals() (in module gnomad.utils.intervals)": [[28, "gnomad.utils.intervals.sort_intervals"]], "union_intervals() (in module gnomad.utils.intervals)": [[28, "gnomad.utils.intervals.union_intervals"]], "grch37_to_grch38_chain (in module gnomad.utils.liftover)": [[29, "gnomad.utils.liftover.GRCH37_to_GRCH38_CHAIN"]], "grch38_to_grch37_chain (in module gnomad.utils.liftover)": [[29, "gnomad.utils.liftover.GRCH38_TO_GRCH37_CHAIN"]], "default_lift_data() (in module gnomad.utils.liftover)": [[29, "gnomad.utils.liftover.default_lift_data"]], "get_liftover_genome() (in module gnomad.utils.liftover)": [[29, "gnomad.utils.liftover.get_liftover_genome"]], "gnomad.utils.liftover": [[29, "module-gnomad.utils.liftover"]], "liftover_expr() (in module gnomad.utils.liftover)": [[29, "gnomad.utils.liftover.liftover_expr"]], "liftover_using_gnomad_map() (in module gnomad.utils.liftover)": [[29, "gnomad.utils.liftover.liftover_using_gnomad_map"]], "get_rows_data() (in module gnomad.utils.plotting)": [[30, "gnomad.utils.plotting.get_rows_data"]], "gnomad.utils.plotting": [[30, "module-gnomad.utils.plotting"]], "linear_and_log_tabs() (in module gnomad.utils.plotting)": [[30, "gnomad.utils.plotting.linear_and_log_tabs"]], "new_show() (in module gnomad.utils.plotting)": [[30, "gnomad.utils.plotting.new_show"]], "pair_plot() (in module gnomad.utils.plotting)": [[30, "gnomad.utils.plotting.pair_plot"]], "plot_hail_file_metadata() (in module gnomad.utils.plotting)": [[30, "gnomad.utils.plotting.plot_hail_file_metadata"]], "plot_hail_hist() (in module gnomad.utils.plotting)": [[30, "gnomad.utils.plotting.plot_hail_hist"]], "plot_hail_hist_both() (in module gnomad.utils.plotting)": [[30, "gnomad.utils.plotting.plot_hail_hist_both"]], "plot_hail_hist_cumulative() (in module gnomad.utils.plotting)": [[30, 
"gnomad.utils.plotting.plot_hail_hist_cumulative"]], "plot_multi_hail_hist() (in module gnomad.utils.plotting)": [[30, "gnomad.utils.plotting.plot_multi_hail_hist"]], "scale_file_sizes() (in module gnomad.utils.plotting)": [[30, "gnomad.utils.plotting.scale_file_sizes"]], "set_font_size() (in module gnomad.utils.plotting)": [[30, "gnomad.utils.plotting.set_font_size"]], "add_reference_sequence() (in module gnomad.utils.reference_genome)": [[31, "gnomad.utils.reference_genome.add_reference_sequence"]], "get_reference_genome() (in module gnomad.utils.reference_genome)": [[31, "gnomad.utils.reference_genome.get_reference_genome"]], "get_reference_ht() (in module gnomad.utils.reference_genome)": [[31, "gnomad.utils.reference_genome.get_reference_ht"]], "gnomad.utils.reference_genome": [[31, "module-gnomad.utils.reference_genome"]], "gnomad.utils.release": [[32, "module-gnomad.utils.release"]], "make_faf_index_dict() (in module gnomad.utils.release)": [[32, "gnomad.utils.release.make_faf_index_dict"]], "make_freq_index_dict() (in module gnomad.utils.release)": [[32, "gnomad.utils.release.make_freq_index_dict"]], "make_freq_index_dict_from_meta() (in module gnomad.utils.release)": [[32, "gnomad.utils.release.make_freq_index_dict_from_meta"]], "slackclient (class in gnomad.utils.slack)": [[33, "gnomad.utils.slack.SlackClient"]], "gnomad.utils.slack": [[33, "module-gnomad.utils.slack"]], "send_file() (gnomad.utils.slack.slackclient method)": [[33, "gnomad.utils.slack.SlackClient.send_file"]], "send_message() (gnomad.utils.slack.slackclient method)": [[33, "gnomad.utils.slack.SlackClient.send_message"]], "slack_notifications() (in module gnomad.utils.slack)": [[33, "gnomad.utils.slack.slack_notifications"]], "compute_allele_number_per_ref_site() (in module gnomad.utils.sparse_mt)": [[34, "gnomad.utils.sparse_mt.compute_allele_number_per_ref_site"]], "compute_coverage_stats() (in module gnomad.utils.sparse_mt)": [[34, "gnomad.utils.sparse_mt.compute_coverage_stats"]], "compute_last_ref_block_end() (in module gnomad.utils.sparse_mt)": [[34, "gnomad.utils.sparse_mt.compute_last_ref_block_end"]], "compute_stats_per_ref_site() (in module gnomad.utils.sparse_mt)": [[34, "gnomad.utils.sparse_mt.compute_stats_per_ref_site"]], "default_compute_info() (in module gnomad.utils.sparse_mt)": [[34, "gnomad.utils.sparse_mt.default_compute_info"]], "densify_all_reference_sites() (in module gnomad.utils.sparse_mt)": [[34, "gnomad.utils.sparse_mt.densify_all_reference_sites"]], "densify_sites() (in module gnomad.utils.sparse_mt)": [[34, "gnomad.utils.sparse_mt.densify_sites"]], "filter_ref_blocks() (in module gnomad.utils.sparse_mt)": [[34, "gnomad.utils.sparse_mt.filter_ref_blocks"]], "get_allele_number_agg_func() (in module gnomad.utils.sparse_mt)": [[34, "gnomad.utils.sparse_mt.get_allele_number_agg_func"]], "get_as_info_expr() (in module gnomad.utils.sparse_mt)": [[34, "gnomad.utils.sparse_mt.get_as_info_expr"]], "get_site_info_expr() (in module gnomad.utils.sparse_mt)": [[34, "gnomad.utils.sparse_mt.get_site_info_expr"]], "gnomad.utils.sparse_mt": [[34, "module-gnomad.utils.sparse_mt"]], "impute_sex_ploidy() (in module gnomad.utils.sparse_mt)": [[34, "gnomad.utils.sparse_mt.impute_sex_ploidy"]], "split_info_annotation() (in module gnomad.utils.sparse_mt)": [[34, "gnomad.utils.sparse_mt.split_info_annotation"]], "split_lowqual_annotation() (in module gnomad.utils.sparse_mt)": [[34, "gnomad.utils.sparse_mt.split_lowqual_annotation"]], "filter_expression_ht_by_tissues() (in module 
gnomad.utils.transcript_annotation)": [[35, "gnomad.utils.transcript_annotation.filter_expression_ht_by_tissues"]], "get_expression_proportion() (in module gnomad.utils.transcript_annotation)": [[35, "gnomad.utils.transcript_annotation.get_expression_proportion"]], "gnomad.utils.transcript_annotation": [[35, "module-gnomad.utils.transcript_annotation"]], "perform_tx_annotation_pipeline() (in module gnomad.utils.transcript_annotation)": [[35, "gnomad.utils.transcript_annotation.perform_tx_annotation_pipeline"]], "summarize_transcript_expression() (in module gnomad.utils.transcript_annotation)": [[35, "gnomad.utils.transcript_annotation.summarize_transcript_expression"]], "tissue_expression_ht_to_array() (in module gnomad.utils.transcript_annotation)": [[35, "gnomad.utils.transcript_annotation.tissue_expression_ht_to_array"]], "tx_aggregate_variants() (in module gnomad.utils.transcript_annotation)": [[35, "gnomad.utils.transcript_annotation.tx_aggregate_variants"]], "tx_annotate_variants() (in module gnomad.utils.transcript_annotation)": [[35, "gnomad.utils.transcript_annotation.tx_annotate_variants"]], "tx_filter_variants_by_csqs() (in module gnomad.utils.transcript_annotation)": [[35, "gnomad.utils.transcript_annotation.tx_filter_variants_by_csqs"]], "allele_type_fields (in module gnomad.utils.vcf)": [[36, "gnomad.utils.vcf.ALLELE_TYPE_FIELDS"]], "as_fields (in module gnomad.utils.vcf)": [[36, "gnomad.utils.vcf.AS_FIELDS"]], "as_vqsr_fields (in module gnomad.utils.vcf)": [[36, "gnomad.utils.vcf.AS_VQSR_FIELDS"]], "entries (in module gnomad.utils.vcf)": [[36, "gnomad.utils.vcf.ENTRIES"]], "faf_pops (in module gnomad.utils.vcf)": [[36, "gnomad.utils.vcf.FAF_POPS"]], "format_dict (in module gnomad.utils.vcf)": [[36, "gnomad.utils.vcf.FORMAT_DICT"]], "groups (in module gnomad.utils.vcf)": [[36, "gnomad.utils.vcf.GROUPS"]], "hists (in module gnomad.utils.vcf)": [[36, "gnomad.utils.vcf.HISTS"]], "info_dict (in module gnomad.utils.vcf)": [[36, "gnomad.utils.vcf.INFO_DICT"]], "in_silico_annotations_info_dict (in module gnomad.utils.vcf)": [[36, "gnomad.utils.vcf.IN_SILICO_ANNOTATIONS_INFO_DICT"]], "joint_region_flag_fields (in module gnomad.utils.vcf)": [[36, "gnomad.utils.vcf.JOINT_REGION_FLAG_FIELDS"]], "region_flag_fields (in module gnomad.utils.vcf)": [[36, "gnomad.utils.vcf.REGION_FLAG_FIELDS"]], "rf_fields (in module gnomad.utils.vcf)": [[36, "gnomad.utils.vcf.RF_FIELDS"]], "sexes (in module gnomad.utils.vcf)": [[36, "gnomad.utils.vcf.SEXES"]], "site_fields (in module gnomad.utils.vcf)": [[36, "gnomad.utils.vcf.SITE_FIELDS"]], "sort_order (in module gnomad.utils.vcf)": [[36, "gnomad.utils.vcf.SORT_ORDER"]], "sparse_entries (in module gnomad.utils.vcf)": [[36, "gnomad.utils.vcf.SPARSE_ENTRIES"]], "vqsr_fields (in module gnomad.utils.vcf)": [[36, "gnomad.utils.vcf.VQSR_FIELDS"]], "vrs_fields_dict (in module gnomad.utils.vcf)": [[36, "gnomad.utils.vcf.VRS_FIELDS_DICT"]], "add_as_info_dict() (in module gnomad.utils.vcf)": [[36, "gnomad.utils.vcf.add_as_info_dict"]], "adjust_vcf_incompatible_types() (in module gnomad.utils.vcf)": [[36, "gnomad.utils.vcf.adjust_vcf_incompatible_types"]], "build_vcf_export_reference() (in module gnomad.utils.vcf)": [[36, "gnomad.utils.vcf.build_vcf_export_reference"]], "create_label_groups() (in module gnomad.utils.vcf)": [[36, "gnomad.utils.vcf.create_label_groups"]], "gnomad.utils.vcf": [[36, "module-gnomad.utils.vcf"]], "index_globals() (in module gnomad.utils.vcf)": [[36, "gnomad.utils.vcf.index_globals"]], "make_combo_header_text() (in module 
gnomad.utils.vcf)": [[36, "gnomad.utils.vcf.make_combo_header_text"]], "make_hist_bin_edges_expr() (in module gnomad.utils.vcf)": [[36, "gnomad.utils.vcf.make_hist_bin_edges_expr"]], "make_hist_dict() (in module gnomad.utils.vcf)": [[36, "gnomad.utils.vcf.make_hist_dict"]], "make_info_dict() (in module gnomad.utils.vcf)": [[36, "gnomad.utils.vcf.make_info_dict"]], "make_label_combos() (in module gnomad.utils.vcf)": [[36, "gnomad.utils.vcf.make_label_combos"]], "make_vcf_filter_dict() (in module gnomad.utils.vcf)": [[36, "gnomad.utils.vcf.make_vcf_filter_dict"]], "rekey_new_reference() (in module gnomad.utils.vcf)": [[36, "gnomad.utils.vcf.rekey_new_reference"]], "set_female_y_metrics_to_na() (in module gnomad.utils.vcf)": [[36, "gnomad.utils.vcf.set_female_y_metrics_to_na"]], "csq_coding (in module gnomad.utils.vep)": [[37, "gnomad.utils.vep.CSQ_CODING"]], "csq_splice (in module gnomad.utils.vep)": [[37, "gnomad.utils.vep.CSQ_SPLICE"]], "current_vep_version (in module gnomad.utils.vep)": [[37, "gnomad.utils.vep.CURRENT_VEP_VERSION"]], "loftee_labels (in module gnomad.utils.vep)": [[37, "gnomad.utils.vep.LOFTEE_LABELS"]], "lof_csq_set (in module gnomad.utils.vep)": [[37, "gnomad.utils.vep.LOF_CSQ_SET"]], "possible_refs (in module gnomad.utils.vep)": [[37, "gnomad.utils.vep.POSSIBLE_REFS"]], "vep_config_path (in module gnomad.utils.vep)": [[37, "gnomad.utils.vep.VEP_CONFIG_PATH"]], "vep_csq_fields (in module gnomad.utils.vep)": [[37, "gnomad.utils.vep.VEP_CSQ_FIELDS"]], "vep_csq_header (in module gnomad.utils.vep)": [[37, "gnomad.utils.vep.VEP_CSQ_HEADER"]], "add_most_severe_consequence_to_consequence() (in module gnomad.utils.vep)": [[37, "gnomad.utils.vep.add_most_severe_consequence_to_consequence"]], "add_most_severe_csq_to_tc_within_vep_root() (in module gnomad.utils.vep)": [[37, "gnomad.utils.vep.add_most_severe_csq_to_tc_within_vep_root"]], "explode_by_vep_annotation() (in module gnomad.utils.vep)": [[37, "gnomad.utils.vep.explode_by_vep_annotation"]], "filter_vep_to_canonical_transcripts() (in module gnomad.utils.vep)": [[37, "gnomad.utils.vep.filter_vep_to_canonical_transcripts"]], "filter_vep_to_gene_list() (in module gnomad.utils.vep)": [[37, "gnomad.utils.vep.filter_vep_to_gene_list"]], "filter_vep_to_mane_select_transcripts() (in module gnomad.utils.vep)": [[37, "gnomad.utils.vep.filter_vep_to_mane_select_transcripts"]], "filter_vep_to_synonymous_variants() (in module gnomad.utils.vep)": [[37, "gnomad.utils.vep.filter_vep_to_synonymous_variants"]], "filter_vep_transcript_csqs() (in module gnomad.utils.vep)": [[37, "gnomad.utils.vep.filter_vep_transcript_csqs"]], "get_most_severe_consequence_for_summary() (in module gnomad.utils.vep)": [[37, "gnomad.utils.vep.get_most_severe_consequence_for_summary"]], "get_vep_context() (in module gnomad.utils.vep)": [[37, "gnomad.utils.vep.get_vep_context"]], "get_vep_help() (in module gnomad.utils.vep)": [[37, "gnomad.utils.vep.get_vep_help"]], "gnomad.utils.vep": [[37, "module-gnomad.utils.vep"]], "process_consequences() (in module gnomad.utils.vep)": [[37, "gnomad.utils.vep.process_consequences"]], "vep_or_lookup_vep() (in module gnomad.utils.vep)": [[37, "gnomad.utils.vep.vep_or_lookup_vep"]], "vep_struct_to_csq() (in module gnomad.utils.vep)": [[37, "gnomad.utils.vep.vep_struct_to_csq"]], "add_rank() (in module gnomad.variant_qc.evaluation)": [[38, "gnomad.variant_qc.evaluation.add_rank"]], "compute_binned_truth_sample_concordance() (in module gnomad.variant_qc.evaluation)": [[38, 
"gnomad.variant_qc.evaluation.compute_binned_truth_sample_concordance"]], "compute_grouped_binned_ht() (in module gnomad.variant_qc.evaluation)": [[38, "gnomad.variant_qc.evaluation.compute_grouped_binned_ht"]], "compute_ranked_bin() (in module gnomad.variant_qc.evaluation)": [[38, "gnomad.variant_qc.evaluation.compute_ranked_bin"]], "create_truth_sample_ht() (in module gnomad.variant_qc.evaluation)": [[38, "gnomad.variant_qc.evaluation.create_truth_sample_ht"]], "gnomad.variant_qc.evaluation": [[38, "module-gnomad.variant_qc.evaluation"]], "get_r_for_pair_of_variants() (in module gnomad.variant_qc.ld)": [[40, "gnomad.variant_qc.ld.get_r_for_pair_of_variants"]], "get_r_human_readable() (in module gnomad.variant_qc.ld)": [[40, "gnomad.variant_qc.ld.get_r_human_readable"]], "get_r_within_gene() (in module gnomad.variant_qc.ld)": [[40, "gnomad.variant_qc.ld.get_r_within_gene"]], "get_r_within_gene_in_pop() (in module gnomad.variant_qc.ld)": [[40, "gnomad.variant_qc.ld.get_r_within_gene_in_pop"]], "gnomad.variant_qc.ld": [[40, "module-gnomad.variant_qc.ld"]], "create_binned_ht() (in module gnomad.variant_qc.pipeline)": [[41, "gnomad.variant_qc.pipeline.create_binned_ht"]], "generate_sib_stats() (in module gnomad.variant_qc.pipeline)": [[41, "gnomad.variant_qc.pipeline.generate_sib_stats"]], "generate_trio_stats() (in module gnomad.variant_qc.pipeline)": [[41, "gnomad.variant_qc.pipeline.generate_trio_stats"]], "gnomad.variant_qc.pipeline": [[41, "module-gnomad.variant_qc.pipeline"]], "score_bin_agg() (in module gnomad.variant_qc.pipeline)": [[41, "gnomad.variant_qc.pipeline.score_bin_agg"]], "train_rf_model() (in module gnomad.variant_qc.pipeline)": [[41, "gnomad.variant_qc.pipeline.train_rf_model"]], "apply_rf_model() (in module gnomad.variant_qc.random_forest)": [[42, "gnomad.variant_qc.random_forest.apply_rf_model"]], "check_ht_fields_for_spark() (in module gnomad.variant_qc.random_forest)": [[42, "gnomad.variant_qc.random_forest.check_ht_fields_for_spark"]], "get_columns_quantiles() (in module gnomad.variant_qc.random_forest)": [[42, "gnomad.variant_qc.random_forest.get_columns_quantiles"]], "get_features_importance() (in module gnomad.variant_qc.random_forest)": [[42, "gnomad.variant_qc.random_forest.get_features_importance"]], "get_labels() (in module gnomad.variant_qc.random_forest)": [[42, "gnomad.variant_qc.random_forest.get_labels"]], "get_rf_runs() (in module gnomad.variant_qc.random_forest)": [[42, "gnomad.variant_qc.random_forest.get_rf_runs"]], "get_run_data() (in module gnomad.variant_qc.random_forest)": [[42, "gnomad.variant_qc.random_forest.get_run_data"]], "gnomad.variant_qc.random_forest": [[42, "module-gnomad.variant_qc.random_forest"]], "ht_to_rf_df() (in module gnomad.variant_qc.random_forest)": [[42, "gnomad.variant_qc.random_forest.ht_to_rf_df"]], "load_model() (in module gnomad.variant_qc.random_forest)": [[42, "gnomad.variant_qc.random_forest.load_model"]], "median_impute_features() (in module gnomad.variant_qc.random_forest)": [[42, "gnomad.variant_qc.random_forest.median_impute_features"]], "pretty_print_runs() (in module gnomad.variant_qc.random_forest)": [[42, "gnomad.variant_qc.random_forest.pretty_print_runs"]], "run_rf_test() (in module gnomad.variant_qc.random_forest)": [[42, "gnomad.variant_qc.random_forest.run_rf_test"]], "save_model() (in module gnomad.variant_qc.random_forest)": [[42, "gnomad.variant_qc.random_forest.save_model"]], "test_model() (in module gnomad.variant_qc.random_forest)": [[42, "gnomad.variant_qc.random_forest.test_model"]], "train_rf() 
(in module gnomad.variant_qc.random_forest)": [[42, "gnomad.variant_qc.random_forest.train_rf"]], "gnomad.variant_qc.training": [[43, "module-gnomad.variant_qc.training"]], "sample_training_examples() (in module gnomad.variant_qc.training)": [[43, "gnomad.variant_qc.training.sample_training_examples"]]}}) \ No newline at end of file