# config.yml

# This is the Snakemake configuration file that specifies paths and options
# for the pipeline. Anybody wishing to use
# the provided snakemake pipeline should first fill out this file with paths to
# their own data, as the Snakefile requires it.
# Every config option has reasonable defaults unless it is labeled as "required."
# Relative paths are interpreted relative to the directory that Snakemake is executed in.
# Note: this file is written in the YAML syntax (https://learnxinyminutes.com/docs/yaml/)


# Path to a multi-sample VCF containing STRs, with coordinates in hg19 (i.e. b37)
# The VCF must be bgzipped, and its .tbi index must exist in the same directory as the VCF
# You can use HipSTR (https://github.com/HipSTR-Tool/HipSTR) to generate the VCF
# TODO: If your VCF is split by chromosome, provide the Unix-style pattern for matching them
# This input is required!
str_vcf: data/str.vcf.gz

# Path to a multi-sample VCF containing SNPs, with coordinates in hg38
# The VCF must be bgzipped, and its .tbi index must exist in the same directory as the VCF
# The samples in this VCF must match those in the STR VCF (str_vcf) _exactly_,
# i.e. the same samples in the same order
# TODO: If your VCF is split by chromosome, provide the Unix-style pattern for matching them
# This input is required!
snp_vcf: data/snp.vcf.gz

# Which samples from the VCF should the pipeline be executed on?
# To use all of the samples in the VCF, comment out this line or set it to a
# falsey value (such as the empty list below)
# NOTE(review): presumably this is meant to hold a list of sample names -- confirm
# once the option is implemented
# TODO: this option hasn't been implemented yet, so anything here will be ignored
SAMP_NAMES: []

# Path to the b37 reference genome
# Must be properly indexed
# NOTE(review): this comment says b37, but the example file below is named
# hg19.chr2.fa and a separate ref_genome_hg19 option also exists -- confirm which
# build (and which index files) this option expects
# This input is required!
ref_genome: data/hg19.chr2.fa

# Path to the hg19 reference genome
# Must be properly indexed, with accompanying .fai and .dict files
# NOTE: the value below is an absolute, machine-specific path; replace it with the
# location of the genome on your own system
# This input is required!
ref_genome_hg19: /storage/resources/dbase/human/hg19/hg19.fa

# Path to our SNP+STR reference haplotype panel, which is used for phasing if the
# variants in the VCF are unphased
# This must be a directory containing all of the chr*.vcf.gz files from here:
# http://gymreklab.com/2018/03/05/snpstr_imputation.html
# The coordinates in the ref panel are assumed to come from the b37 reference genome
# If this option is a falsey value, the pipeline will assume all sample VCFs are phased
ref_panel: data/ref_panel

# Path to the hg38-to-hg19 lift-over chain file
# This input is required!
lift_over_chain: data/hg38ToHg19.over.chain.gz

# Path to the jar file for the most recent version of conform-gt (24May16)
# You can download this from https://faculty.washington.edu/browning/conform-gt.html
# NOTE: the value below is an absolute path inside one user's home directory;
# replace it with the location where you saved the jar
# This input is required!
conform_gt_jar: /home/massaraa/bin/installed/conform-gt.24May16.cee.jar

# Path to the jar file for the most recent version of BEAGLE (18May20)
# You can download this from https://faculty.washington.edu/browning/beagle/beagle.html
# NOTE: the value below is an absolute path inside one user's home directory;
# replace it with the location where you saved the jar
# This input is required!
beagle_jar: /home/massaraa/bin/installed/beagle.18May20.d20.jar

# Path to a directory containing the BEAGLE human genetic map files for GRCh37
# This input is required!
genetic_map: data/map

# Path to the WASP-corrected ASE counts for the SNPs
# This must be a path string in which the "{sample}" wildcard stands in for the
# sample name, so that it describes the path of each file.
# The sample names must match those in the VCF.
# Each file must be a gzipped TSV containing at least the following columns:
# CHR,POS,SAMPLE_ID,SUBJECT_ID,TISSUE_ID,REF_ALLELE,
# ALT_ALLELE,REF_COUNT,ALT_COUNT,TOTAL_COUNT,REF_RATIO,
# OTHER_ALLELE_COUNT,BINOM_P,BINOM_P_ADJUSTED,GENOTYPE,
# VARIANT_ANNOTATION,GENE_ID
snp_counts: "data/snp_counts/{sample}.v8.wasp_corrected.ase_table.tsv.gz"

# How many of the most promising STRs should be plotted?
# If this is set to a falsey value or commented out, 5 will be used
num_plots: 10