-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Added PCA function, PCA article, rebuilt site
- Loading branch information
Showing
30 changed files
with
337 additions
and
604 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,111 @@ | ||
#' A function to perform principal component analysis (PCA) on genetic data. Loci with missing data will be removed prior to PCA. | ||
#' | ||
#' @param data Character. String indicating the name of the vcf file, geno file or vcfR object to be used in the analysis. | ||
#' @param center Boolean. Whether or not to center the data before principal component analysis. | ||
#' @param scale Boolean. Whether or not to scale the data before principal component analysis. | ||
#' @param missing_value Character. String indicating missing data in the input data. It is assumed to be NA, but that may not be true (is likely not) in the case of geno files. | ||
#' @param write Boolean. Whether or not to write the output to files in the current working directory. There will be two files, one for the individual loadings and the other for the percent variance explained by each axis. | ||
#' @param prefix Character. Optional argument. String that will be appended to file output. Please provide a prefix if write is set to TRUE. | ||
#' @return A list containing two elements: the loadings of individuals on each principal component and the variance explained by each principal component. | ||
#' | ||
#' @author Keaka Farleigh | ||
#' @export | ||
#' | ||
#' @examples | ||
#' \donttest{ | ||
#' data("HornedLizard_VCF") | ||
#' Test <- PCA(data = HornedLizard_VCF)} | ||
PCA <- function(data, center = TRUE, scale = FALSE, missing_value = NA, write = FALSE, prefix = NULL) { | ||
|
||
# Read in files and convert to necessary formats | ||
if(missing(data)){ | ||
stop("Please supply a file or vcfR object for analysis") | ||
} | ||
if(methods::is(data,"vcfR")){ | ||
Dat <- data | ||
print("vcfR object detected, proceeding to formatting.") | ||
# Convert the vcf gt slot to a geno style table for calculations | ||
gt <- vcfR::extract.gt(Dat) | ||
gt[gt == "0/0"] <- 0 | ||
gt[gt == "0/1" | gt == "1/0"] <- 1 | ||
gt[gt == "1/1"] <- 2 | ||
|
||
# Transpose the numeric gt matrix | ||
Dat <- as.data.frame(t(as.matrix(gt))) | ||
|
||
# Preserve individual names | ||
Inds <- rownames(Dat) | ||
Dat <- sapply(Dat, as.numeric) | ||
} | ||
else if(tools::file_ext(data) == 'vcf') { | ||
Dat <- vcfR::read.vcfR(data, verbose = FALSE) | ||
print("VCF file detected, proceeding to formatting.") | ||
# Convert the vcf gt slot to a geno style table for calculations | ||
gt <- vcfR::extract.gt(Dat) | ||
gt[gt == "0/0"] <- 0 | ||
gt[gt == "0/1" | gt == "1/0"] <- 1 | ||
gt[gt == "1/1"] <- 2 | ||
|
||
# Transpose the numeric gt matrix | ||
Dat <- as.data.frame(t(as.matrix(gt))) | ||
|
||
# Preserve individual names | ||
Inds <- rownames(Dat) | ||
Dat <- sapply(Dat, as.numeric) | ||
} | ||
else if(tools::file_ext(data) == 'geno'){ | ||
Dat <- utils::read.table(data) | ||
print("Geno file detected, proceeding to formatting. Note, PopGenHelpR assumes that your individuals in the geno file and | ||
popmap are in the same order, please check to avoid erroneous inferences.") | ||
} | ||
else { | ||
stop("Please supply a geno file, vcf file, or vcfR object for analysis") | ||
} | ||
|
||
# Replace missing data value with NA | ||
if(is.na(missing_value) == FALSE){ | ||
Dat[Dat == missing_value] <- NA | ||
} | ||
|
||
if(any(is.na(Dat))){ | ||
n_NA <- colSums(is.na(Dat)) | ||
Loc2rem <- which(n_NA > 0) | ||
|
||
print(paste(colnames(Dat)[Loc2rem], " contain NAs which cannot occur in PCA, they have been removed from subsequent analysis. Please impute these sites if you wish to include them. Alternatively, see the R packages LEA and adegenet.", sep = '')) | ||
|
||
Dat <- Dat[,-Loc2rem] | ||
} | ||
|
||
# Scale the data | ||
Dat_scaled <- scale(Dat, center = center, scale = scale) | ||
|
||
# Run the pca | ||
Dat_pca <- stats::prcomp(Dat_scaled) | ||
|
||
# Calculate variance explained by each principal component | ||
Dat_pc_var <- summary(Dat_pca)$importance[2:3,]*100 | ||
rownames(Dat_pc_var) <- c("Percent variance explained", "Cumulative percent variance explained") | ||
|
||
Dat_loadings <- Dat_pca$x | ||
rownames(Dat_loadings) <- Inds | ||
|
||
Output <- list(Dat_loadings, Dat_pc_var) | ||
|
||
names(Output) <- c("Individual Loadings", "Variance Explained") | ||
|
||
if(write & !is.null(prefix)){ | ||
|
||
utils::write.csv(t(Dat_pc_var), paste(prefix, "_pc_variance_explained.csv", sep = '_')) | ||
utils::write.csv(Dat_loadings, paste(prefix, "_pc_loadings.csv", sep = '_')) | ||
|
||
} else if(write & is.null(prefix)){ | ||
|
||
utils::write.csv(t(Dat_pc_var), "Pc_variance_explained.csv") | ||
utils::write.csv(Dat_loadings, "Pc_loadings.csv") | ||
|
||
} | ||
|
||
return(Output) | ||
|
||
} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
Oops, something went wrong.