Updated heterozygosity, created differentiation, website testing

kfarleigh · Feb 2, 2024 · 55bf726 · 55bf726
1 parent 1460ebd
commit 55bf726
Show file tree

Hide file tree

Showing 32 changed files with 428 additions and 87 deletions.
diff --git a/.Rhistory b/.Rhistory
@@ -1,31 +1,3 @@
-idx <- i*2-1
-tmp[,idx:(idx+1)] <- Loc_split_df
-View(IR_dat)
-tmp <- Dat[,3:ncol(Dat)]
-tmp[tmp == "0"] <- "0/0"
-tmp[tmp == "1"] <- "0/2"
-tmp[tmp == "2"] <- "2/2"
-tmp2 <- tmp
-View(tmp2)
-rownames(tmp) <- Inds
-Loc_IR_formatted_tmp <- data.frame(matrix(NA, nrow = nrow(IR_dat), ncol = ncol(IR_dat)*2), row.names = rownames(tmp))
-for(i in 1:ncol(tmp)){
-# Get the locus name
-Loc_nam <- colnames(tmp)[i]
-# Split the genotype
-Loc_split <- strsplit(tmp[,i], "/", fixed = T)
-# Set the name of individuals
-names(Loc_split) <- Dat[,1]
-# Bind into a data frame
-Loc_split_df <- do.call(rbind, Loc_split)
-# Set an index to position the genotypes correctly
-idx <- i*2-1
-Loc_IR_formatted_tmp[,idx:(idx+1)] <- Loc_split_df
-colnames(tmp)[idx:(idx+1)] <- c(paste(Loc_nam, "a", sep = ""), paste(Loc_nam, "b", sep = ""))
-}
-View(Loc_IR_formatted_tmp)
-tmp <- Dat[,3:ncol(Dat)]
-tmp[tmp == "0"] <- "0/0"
 tmp[tmp == "1"] <- "0/2"
 tmp[tmp == "2"] <- "2/2"
 rownames(tmp) <- Inds
@@ -510,3 +482,31 @@ sum.Ej <- sum.Ej + E[j]
 res_tab[i] <- sum.Eh / (sum.Eh + sum.Ej)
 }
 View(res_tab)
+library(devtools)
+document()
+load_all()
+?heterozygosity)_
+?heterozygosity()
+build()
+load_all()
+?heterozygosity
+library(pkgdown)
+build_site()
+document()
+build_site()
+build_site()
+build_site()
+document()
+build_site()
+document()
+document()
+document()
+document()
+data(nancycat)
+data(nancycats)
+library(adegenet)
+data("nancycats")
+Test <- nancycats@tab
+View(Test)
+document()
+build_site()
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: PopGenHelpR
 Title: Streamline Population Genomic and Genetic Analyses
-Version: 1.2.2
+Version: 1.3.0
 Authors@R: 
     c(person("Keaka", "Farleigh", , "keakafarleigh@gmail.com", role = c("aut","cph", "cre"), comment = c(ORCID = "0000-0002-9195-121X")),
     person("Mason", "Murphy", , "momurph@bgsu.edu", role = c("aut","cph", "ctb"), comment = c(ORCID = "0000-0002-5317-1386")),

diff --git a/NAMESPACE b/NAMESPACE
@@ -2,6 +2,7 @@
 
 export(Dif_Stats_Map)
 export(Dif_stats)
+export(Differentiation)
 export(Div_Stats_Map)
 export(Div_stats)
 export(Heterozygosity)

diff --git a/NEWS.md b/NEWS.md
@@ -2,16 +2,13 @@
 - December 6th, 2023
 ** To Do for KF
 - Split `Dif_Stats` into `Fst` and `NeisD`
-- Rewrite all functions to allow 012/geno files
-- Rewrite all functions to auto detect vcf vs 012/geno
 - Rename mapping functions
 - Split `Plot_Ancestry` into `Piechart_Map` and `Ancestry_barchart`
 - Add `Nucleotide_diversity`
-- Add internal relatedness (IR) to `Heterozygosity`
-- Add homozygosity by locus (HL) to `Heterozygosity`
 - Update documentation
 - Update vignette 
 
+* `Differentiation` has been added to estimate Fst, Nei's D, and Jost's D. Please see the documentation for more details. 
 * `Heterozygosity` has been added to estimate 7 different measures of heterozygosity. Please see the documentation for more details. 
 * `Private.alleles` has been added to calculate the number of private alleles in each population. 
 * The `Div_Stats` function has been deprecated, please use the `Heterozygosity` function if you wish to estimate heterozygosity or the `Private.alleles` function if you wish to calculate the number of private alleles per population. Please use the `Point_Map` function if you wish to visualize the results on a map or plot.

diff --git a/R/Differentiation.R b/R/Differentiation.R
@@ -0,0 +1,150 @@
+#' A function to estimate seven measures of heterozygosity using geno files, vcf files, or vcfR objects. Data is assumed to be bi-allelic.
+#'
+#' @param data Character. String indicating the name of the vcf file, geno file or vcfR object to be used in the analysis.
+#' @param pops Character. String indicating the name of the population assignment file or dataframe containing the population assignment information for each individual in the data. This file must be in the same order as the vcf file and include columns specifying the individual and the population that individual belongs to. The first column should contain individual names and the second column should indicate the population assignment of each individual. Alternatively, you can indicate the column containing the individual and population information using the individual_col and population_col arguments.
+#' @param statistic Character. String or vector indicating the statistic to calculate. Options are any of: all; all of the statistics; Fst, Weir and Cockerham (1984) Fst; NeisD, Nei's D statistic; JostD, Jost's D; StHe, heterozygosity standardized by the average expected heterozygosity; StHo, heterozygosity standardized by the average observed heterozygosity; IR, internal relatedness; HL, homozygosity by locus.
+#' @param missing_value Character. String indicating missing data in the input data. It is assumed to be NA, but that may not be true (is likely not) in the case of geno files.
+#' @param write Boolean. Whether or not to write the output to files in the current working directory. There will be one or two files for each statistic. Files will be named based on their statistic such as Ho_perpop.csv or Ho_perloc.csv.
+#' @param prefix Character. Optional argument. String that will be appended to file output. Please provide a prefix if write is set to TRUE.
+#' @param population_col Numeric. Optional argument (a number) indicating the column that contains the population assignment information.
+#' @param individual_col Numeric. Optional argument (a number) indicating the column that contains the individuals (i.e., sample name) in the data.
+
+#' @return A list containing the estimated heterozygosity statistics. The per pop values are calculated by taking the average of the per locus estimates.
+#'
+#' @references
+#' \bold{For Fst:}
+#'
+#' \href{https://www.jstor.org/stable/2408641}{Weir, B. S., & Cockerham, C. C. (1984)}. Estimating F-statistics for the analysis of population structure. evolution, 1358-1370.
+#'
+#' \bold{For Nei's D:}
+#'
+#' \href{https://www.journals.uchicago.edu/doi/abs/10.1086/282771}{Nei, M. (1972)}. Genetic distance between populations. The American Naturalist, 106(949), 283-292.
+#'
+#'
+#' \bold{For Jost's D:}
+#'
+#' \href{https://doi.org/10.1111/j.1365-294X.2008.03887.x}{Jost L (2008)}. GST and its relatives do not measure differentiation. Molecular Ecology, 17, 4015–4026..
+#'
+#' @author \email{Keaka Farleigh (farleik@@miamioh.edu)}
+#' @export
+#'
+#' @examples
+#' \donttest{
+#' data("HornedLizard_Pop")
+#' data("HornedLizard_VCF")
+#' Test <- Differentiation(data = HornedLizard_VCF, pops = HornedLizard_Pop, write = FALSE)}
+Differentiation <- function(data, pops, statistic = 'all', missing_value = NA, write = FALSE, prefix = NULL, population_col = NULL, individual_col = NULL) {
+  Statistic <- NULL
+
+  # Read in population assignment data
+  if(is.data.frame(pops)){
+    Pops <- pops
+  } else if(is.character(pops)){
+      Pops <- utils::read.csv(pops)
+  } else{
+      stop("Please supply a csv file or data frame for population assignment")
+  }
+  # Get the list of individuals from population assignment file
+  if(is.null(individual_col)) {
+    Inds_popmap <- Pops[,1]
+  } else if(!is.null(individual_col)){
+    Inds_popmap <- Pops[,individual_col]
+  }
+
+  # Read in files and convert to necessary formats
+  if(missing(data)){
+    stop("Please supply a file or vcfR object for analysis")
+  }
+  if(methods::is(data,"vcfR")){
+    Dat <- data
+    print("vcfR object detected, proceeding to formatting.")
+    # Convert the vcf gt slot to a geno style table for calculations
+    gt <- vcfR::extract.gt(Dat)
+    gt[gt == "0/0"] <- 0
+    gt[gt == "0/1" | gt == "1/0"] <- 1
+    gt[gt == "1/1"] <- 2
+
+    # Transpose the numeric gt matrix
+    Dat <- as.data.frame(t(as.matrix(gt)))
+
+    # Preserve individual names
+    Inds <- rownames(Dat)
+    Dat <- sapply(Dat, as.numeric)
+  }
+  else if(tools::file_ext(data) == 'vcf') {
+    Dat <- vcfR::read.vcfR(data, verbose = FALSE)
+    print("VCF file detected, proceeding to formatting.")
+    # Convert the vcf gt slot to a geno style table for calculations
+    gt <- vcfR::extract.gt(Dat)
+    gt[gt == "0/0"] <- 0
+    gt[gt == "0/1" | gt == "1/0"] <- 1
+    gt[gt == "1/1"] <- 2
+
+    # Transpose the numeric gt matrix
+    Dat <- as.data.frame(t(as.matrix(gt)))
+
+    # Preserve individual names
+    Inds <- rownames(Dat)
+    Dat <- sapply(Dat, as.numeric)
+  }
+  else if(tools::file_ext(data) == 'geno'){
+    Dat <- utils::read.table(data)
+    print("Geno file detected, proceeding to formatting. Note, PopGenHelpR assumes that your individuals in the geno file and
+          popmap are in the same order, please check to avoid erroneous inferences.")
+  }
+  else {
+      stop("Please supply a geno file, vcf file, or vcfR object for analysis")
+  }
+
+  P <- Pops
+
+  ### Check to see if the individuals are in the same order in the vcf data and the population assignment file
+  # Make sure that the individuals in the vcf/genlight are in the same order as your popmap
+  if(methods::is(data,"vcfR") || tools::file_ext(data) == 'vcf') {
+    if(any(Inds != Inds_popmap)){
+      warning("Sample names in the VCF and Population file may not be in the same order or samples are missing,
+       The sample(s) in question are: ",
+              print(paste(Inds[(Inds != Inds_popmap)], collapse = ' ')))  }
+    if(is.null(population_col)){
+      Pops <- as.factor(Pops[,2])
+    }
+    else if(!is.null(population_col)){
+      Pops <- as.factor(Pops[,population_col])
+    }
+  } else if(tools::file_ext(data) == 'geno'){
+      if(is.null(population_col)){
+      Pops <- as.factor(Pops[,2])
+    } else if(!is.null(population_col)){
+        Pops <- as.factor(Pops[,population_col])
+    }
+  }
+
+  # Replace missing data value with NA
+  if(is.na(missing_value) == FALSE){
+    Dat[Dat == missing_value] <- NA
+  }
+
+  P <- Pops
+  Dat <- cbind.data.frame(Inds, P, Dat)
+
+  # Break into list with populations for each element
+  Dat_perpop <- list()
+  for(i in unique(P)){
+    Dat_perpop[[i]] <- Dat[which(Dat[,2] == i),]
+  }
+
+  message('Formatting has finished, moving onto calculations')
+  #######################################
+  ##### Differentiation calculations #####
+  #######################################
+
+ if(write == TRUE && !is.null(prefix)){
+    res_write <- which(Stat %in% statistic)
+    Output2write <- Output[which(Stat_idx %in% res_write)]
+    for (i in 1:length(Output2write)){
+      utils::write.csv(Output2write, file = paste(names(Output2write[i]), ".csv", sep = "_"))
+    }
+  } else if(write == TRUE && is.null(prefix)){
+      utils::write.csv(Output2write, file = paste(prefix, '_', names(Output2write[i]), ".csv", sep = "_"))
+    }
+}
diff --git a/R/Heterozygosity.R b/R/Heterozygosity.R
@@ -1,15 +1,34 @@
-#' A function to estimate heterozygosity.
+#' A function to estimate seven measures of heterozygosity using geno files, vcf files, or vcfR objects. Data is assumed to be bi-allelic.
 #'
-#' @param data Character. String indicating the name of the vcf file or vcfR object to be used in the analysis.
+#' @param data Character. String indicating the name of the vcf file, geno file or vcfR object to be used in the analysis.
 #' @param pops Character. String indicating the name of the population assignment file or dataframe containing the population assignment information for each individual in the data. This file must be in the same order as the vcf file and include columns specifying the individual and the population that individual belongs to. The first column should contain individual names and the second column should indicate the population assignment of each individual. Alternatively, you can indicate the column containing the individual and population information using the individual_col and population_col arguments.
-#' @param statistic Character. String or vector indicating the statistic to calculate. Options are any of: all; all of the statistics; Ho, observed heterozygosity; He, expected heterozygosity; PHt, proportion of heterozygous loci; StHe, heterozygosity standardized by the average expected heterozygosity; StHo, heterozygosity standardized by the average observed heterozygosity; IR, internal relatedness; HL, homozygosity by locus)
+#' @param statistic Character. String or vector indicating the statistic to calculate. Options are any of: all; all of the statistics; Ho, observed heterozygosity; He, expected heterozygosity; PHt, proportion of heterozygous loci; StHe, heterozygosity standardized by the average expected heterozygosity; StHo, heterozygosity standardized by the average observed heterozygosity; IR, internal relatedness; HL, homozygosity by locus.
 #' @param missing_value Character. String indicating missing data in the input data. It is assumed to be NA, but that may not be true (is likely not) in the case of geno files.
 #' @param write Boolean. Whether or not to write the output to files in the current working directory. There will be one or two files for each statistic. Files will be named based on their statistic such as Ho_perpop.csv or Ho_perloc.csv.
 #' @param prefix Character. Optional argument. String that will be appended to file output. Please provide a prefix if write is set to TRUE.
 #' @param population_col Numeric. Optional argument (a number) indicating the column that contains the population assignment information.
 #' @param individual_col Numeric. Optional argument (a number) indicating the column that contains the individuals (i.e., sample name) in the data.
 
 #' @return A list containing the estimated heterozygosity statistics. The per pop values are calculated by taking the average of the per locus estimates.
+#'
+#' @references
+#' \bold{For expected (He) and observed heterozygosity (Ho):}
+#'
+#' Nei, M. (1987) Molecular Evolutionary Genetics. Columbia University Press
+#'
+#' \bold{For homozygosity by locus (HL) and internal relatedness (IR):}
+#'
+#' \href{https://onlinelibrary.wiley.com/doi/full/10.1111/j.1755-0998.2010.02830.x?casa_token=QiNcMSJyunkAAAAA%3Agv-CK7GrUn1bHSgz4qZSOcB2nyHDeR8B1Wtm9bM7q7vZCAcJhNkhTWnpM0EfkSCb2EvkRrr2ArMzC7v7}{Alho, J. S., Välimäki, K., & Merilä, J. (2010)}. Rhh: an R extension for estimating multilocus heterozygosity and heterozygosity–heterozygosity correlation. Molecular ecology resources, 10(4), 720-722.
+#'
+#' \href{https://royalsocietypublishing.org/doi/abs/10.1098/rspb.2001.1751?casa_token=aXK15yYNjpEAAAAA%3ATR6IXyuIl03yygS_CWET4ymKFthc0dxwkpgCRGGZL250Am_Ssbxi9bnTDIQmpNnshM7H6vTZ1v83PD0}{Amos, W., Worthington Wilmer, J., Fullard, K., Burg, T. M., Croxall, J. P., Bloch, D., & Coulson, T. (2001)}. The influence of parental relatedness on reproductive success. Proceedings of the Royal Society of London. Series B: Biological Sciences, 268(1480), 2021-2027.
+#'
+#' \href{https://onlinelibrary.wiley.com/doi/10.1111/j.1365-294X.2006.03111.x}{Aparicio, J. M., Ortego, J., & Cordero, P. J. (2006)}. What should we weigh to estimate heterozygosity, alleles or loci?. Molecular Ecology, 15(14), 4659-4665.
+#'
+#' \bold{For heterozygosity standardized by expected (StHe) and observed heterozygosity (StHo):}
+#'
+#' \href{https://academic.oup.com/evolut/article/53/4/1259/6757148}{Coltman, D. W., Pilkington, J. G., Smith, J. A., & Pemberton, J. M. (1999)}. Parasite‐mediated selection against Inbred Soay sheep in a free‐living island populaton. Evolution, 53(4), 1259-1267.
+#'
+#' @author \email{Keaka Farleigh (farleik@@miamioh.edu)}
 #' @export
 #'
 #' @examples
@@ -264,6 +283,7 @@ Heterozygosity <- function(data, pops, statistic = 'all', missing_value = NA, wr
      }
 
      ### Calculate IR for each individual
+     # Formula from the archived Rhh package https://cran.r-project.org/web/packages/Rhh/index.html
      for(i in 1:Individuals){
 
        H <- 0
@@ -355,7 +375,7 @@ Heterozygosity <- function(data, pops, statistic = 'all', missing_value = NA, wr
       Counts[i] <- list(table(Loc_HL_mat[,idx:(idx+1)]))
       E[i] <- 1 - sum((Counts[[i]] / sum(Counts[[i]]))^2)
    }
-
+  # Formula from the archived Rhh package https://cran.r-project.org/web/packages/Rhh/index.html
    for (i in 1:Individuals) {
 
      sum.Eh <- 0
@@ -408,9 +428,9 @@ Heterozygosity <- function(data, pops, statistic = 'all', missing_value = NA, wr
     res_write <- which(Stat %in% statistic)
     Output2write <- Output[which(Stat_idx %in% res_write)]
     for (i in 1:length(Output2write)){
-      utils::write.csv(Output2write, file = paste(names(Output2write[i]), ".csv", sep = ""))
+      utils::write.csv(Output2write, file = paste(names(Output2write[i]), ".csv", sep = "_"))
     }
   } else if(write == TRUE && is.null(prefix)){
-      utils::write.csv(Output2write, file = paste(prefix, '_', names(Output2write[i]), ".csv", sep = ""))
+      utils::write.csv(Output2write, file = paste(prefix, '_', names(Output2write[i]), ".csv", sep = "_"))
     }
 }
diff --git a/R/Private.alleles.R b/R/Private.alleles.R
@@ -9,6 +9,8 @@
 #' @return A list containing the count of private alleles in each population and the metadata for those alleles. The metadata is a list that contains the private allele and locus name for each population.
 #' @export
 #'
+#' @author \email{Keaka Farleigh (farleik@@miamioh.edu)}
+#'
 #' @examples
 #' \donttest{
 #' data("HornedLizard_Pop")

diff --git a/docs/404.html b/docs/404.html
diff --git a/docs/LICENSE.html b/docs/LICENSE.html