From cbf1b524852a1664ad515a4a0c91c80b85be5b1f Mon Sep 17 00:00:00 2001
From: tjstruck <tjstruck@email.arizona.edu>
Date: Wed, 10 Apr 2024 13:22:11 -0700
Subject: [PATCH] Readme update (#100)

* Started README section to supplement the publication
---
 dadi_cli/__main__.py                     |  4 +--
 docs/paper-resources/data-preperation.md | 43 ++++++++++++++++++++++++
 docs/userguide/demog.md                  |  2 +-
 3 files changed, 46 insertions(+), 3 deletions(-)
 create mode 100644 docs/paper-resources/data-preperation.md

diff --git a/dadi_cli/__main__.py b/dadi_cli/__main__.py
index 8fac1043..4e7a7554 100644
--- a/dadi_cli/__main__.py
+++ b/dadi_cli/__main__.py
@@ -1248,14 +1248,14 @@ def add_inference_argument(parser):
         default=0,
         type=_check_positive_int,
         dest="check_convergence",
-        help="Start checking for convergence after a chosen number of optimizations. Stop optimization runs when convergence criteria are reached. BestFit results file will be call <output_prefix>.InferDM.bestfits. Convergence not checked by default.",
+        help="Start checking for convergence after a chosen number of optimizations. Optimization runs will stop early if convergence criteria are reached. BestFit results file will be call <output_prefix>.InferDM.bestfits. Convergence not checked by default.",
     )
     parser.add_argument(
         "--force-convergence",
         default=0,
         type=_check_positive_int,
         dest="force_convergence",
-        help="Start checking for convergence after a chosen number of optimizations. Only stop optimization once convergence criteria is reached. BestFit results file will be call <output_prefix>.InferDM.bestfits. Convergence not checked by default.",
+        help="Start checking for convergence after a chosen number of optimizations. Optimization runs will continue until convergence criteria is reached (--optimizations flag will be ignored). BestFit results file will be call <output_prefix>.InferDM.bestfits. Convergence not checked by default.",
     )
     parser.add_argument(
         "--work-queue",
diff --git a/docs/paper-resources/data-preperation.md b/docs/paper-resources/data-preperation.md
new file mode 100644
index 00000000..06b8a2e0
--- /dev/null
+++ b/docs/paper-resources/data-preperation.md
@@ -0,0 +1,43 @@
+# Preparing Data Analysis
+
+## Note on the human data
+
+The human data reflects the Snakemake workflows found here:
+https://github.com/xin-huang/dadi-cli-analysis
+
+## Download the data
+
+The 1000 Genomes VCFs can be downloaded from the [FTP website](https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/).
+
+The mouse VCF (AllMouse.vcf_90_recalibrated_snps_raw_indels_reheader_PopSorted.PASS.vcf.gz) can be downloaded from https://wwwuser.gwdg.de/~evolbio/evolgen/wildmouse/vcf/.
+
+## Processing 1000 Genomes Data
+
+After downloading the 1000 Genomes Project data, we used [BCFtools](https://samtools.github.io/bcftools/) to extract biallelic single nucleotide polymorphisms (SNPs) and [ANNOVAR](https://annovar.openbioinformatics.org/en/latest/) to annotate these SNPs as synonymous and nonsynonymous mutations.
+Then we stored the synonymous and nonsynonymous SNPs in compressed VCF files: `1KG.YRI.CEU.syn.vcf.gz` and `1KG.YRI.CEU.non.vcf.gz`, respectively.
+To generate an unfolded AFS, we reintroduced what 1000 Genomes determined as the ancestral allele state (which is based on Ensembl multiple alignments using Ortheus) of each SNP to the INFO field of these input VCF files with the ID `AA`, using the `annotate` command in BCFtools.
+(See an example [Snakemake file](https://github.com/xin-huang/dadi-cli-analysis/blob/main/workflows/step3_dfes.smk)). 
+In addition, we created a text file containing the population information of each individual, as follows:
+```
+NA12718 CEU
+NA12748 CEU
+NA12775 CEU
+NA12777 CEU
+NA12778 CEU
+```
+The first column is the identifier of each individual and the second is the name of the population that the individual belongs to.
+All the input files used in this manuscript can be found in the [dadi-cli GitHub repository](https://github.com/xin-huang/dadi-cli/tree/master/examples/data/).
+The commands in the following sections for executing dadi-cli on a personal computer or on a computing node of a high-performance computing cluster can be found [here](https://github.com/xin-huang/dadi-cli/blob/master/examples/dfe.smk).
+<!-- TODO(tjs): Let's add an "Example Data Generate" section to the readthedocs to have these links/details? -->
+
+## Processing Mouse Data
+
+bla.
+
+## References for Tools Used
+
+Danecek P, Bonfield JK, Liddle J, Marshall J, Ohan V, Pollard MO, Whitwham A, Keane T, McCarthy SA, Davies RM,
+Li H (2021) Twelve years of SAMtools and BCFtools. GigaScience 10:giab008.
+
+Wang K, Li M, Hakonarson H (2010) ANNOVAR: functional annotation of genetic variants from high-throughput
+sequencing data. Nucleic Acids Research 38:e164.
\ No newline at end of file
diff --git a/docs/userguide/demog.md b/docs/userguide/demog.md
index c30a5c61..4c28f2a8 100644
--- a/docs/userguide/demog.md
+++ b/docs/userguide/demog.md
@@ -58,7 +58,7 @@ Because there is randomness built into `dadi-cli` for where the starting paramet
 
 When using `BestFit`, users can adjust the criteria for convergence. By default optimizations are considered convergent if there are two other optimizations with a log-likelihood within 0.01% units of the optimization with the best log-likelihood. This criteria can be adjusted using the `--delta-ll` option and passing in the percentage difference in decimal form (so the default is 0.0001, rather than 0.01). Generally a higher `--delta-ll` can result in a false positive convergence, but this is dependent on the data being used (for example, the sample size can have a big effect on the size of the log-likelihood). Optimizations in the bestfit file will be ordered by log-likelihood and should be examined closely for similarity of parameter values in convergent fits.
 
-Finally, if users have experience with the data they are using, they can use the `--check-convergence` or `--force-convergence` option in `InferDM`. The `--check-convergence` option will run `BestFit` after a specified number of optimization to check for convergence and stop running optimizations once convergence is reached. For example `--check-convergence 10` will run 10 optimizations and then start checking for convergence. The `--force-convergence` option will constantly add new optimization runs until convergence is reached. When using `--check-convergence` or `--force-convergence` users can also use `--delta-ll` to change the convergence criteria. Users can use the output files from `InferDM` or `BestFit` as a starting point for the inital parameters with the `--bestfit-p0-file` flag and passing in the file you want to use. The starting parameters will be randomly chosen from the top ten fits and perturbed.
+Finally, if users have experience with the data they are using, they can use the `--check-convergence` or `--force-convergence` option in `InferDM`. The `--check-convergence` option will run `BestFit` after a specified number of optimizations to check for convergence and stop running optimizations if convergence is reached. For example, `--check-convergence 10` will run 10 optimizations and then start checking for convergence. Optimization runs will stop before the requested number of `--optimizations` if convergence is met. The `--force-convergence` option will constantly add new optimization runs until convergence is reached, ignoring `--optimizations`. When using `--check-convergence` or `--force-convergence` users can also use `--delta-ll` to change the convergence criteria. Users can use the output files from `InferDM` or `BestFit` as a starting point for the initial parameters with the `--bestfit-p0-file` flag and passing in the file they want to use. The starting parameters will be randomly chosen from the top ten fits and perturbed.
 
 Sometimes parameters may be close to the boundaries. Users should be cautious and test increasing the boundaries to examine whether these boundaries would affect the results significantly. The best fit parameters are shown below mirroring the bestfits file. The first column is the log-likelihood, then the corresponding to these parameters, and the last column is the population-scaled mutation rate of the synonymous SNPs.