From 00bd9f96f0e8090184e77ce1a7da3d20c485f879 Mon Sep 17 00:00:00 2001 From: kfarleigh Date: Fri, 2 Feb 2024 14:22:41 -0500 Subject: [PATCH] added Fst, updated website --- .Rhistory | 1020 ++++++++++++++++----------------- R/Dif_stats.R | 4 +- R/Differentiation.R | 161 +++++- R/Div_Stats.R | 2 +- R/Heterozygosity.R | 8 +- R/Private.alleles.R | 2 +- docs/news/index.html | 3 +- docs/pkgdown.yml | 2 +- docs/reference/Dif_stats.html | 6 +- docs/reference/Div_stats.html | 6 +- docs/reference/index.html | 4 +- docs/search.json | 2 +- man/Dif_stats.Rd | 4 +- man/Differentiation.Rd | 18 +- man/Div_stats.Rd | 4 +- man/Heterozygosity.Rd | 8 +- man/Private.alleles.Rd | 2 +- 17 files changed, 689 insertions(+), 567 deletions(-) diff --git a/.Rhistory b/.Rhistory index 239178b..6dd6a93 100644 --- a/.Rhistory +++ b/.Rhistory @@ -1,512 +1,512 @@ -tmp[tmp == "1"] <- "0/2" -tmp[tmp == "2"] <- "2/2" -rownames(tmp) <- Inds -Loc_IR_formatted_tmp <- data.frame(matrix(NA, nrow = nrow(IR_dat), ncol = ncol(IR_dat)*2), row.names = rownames(tmp)) -for(i in 1:ncol(tmp)){ -# Get the locus name -Loc_nam <- colnames(tmp)[i] -# Split the genotype -Loc_split <- strsplit(tmp[,i], "/", fixed = T) -# Set the name of individuals -names(Loc_split) <- Dat[,1] -# Bind into a data frame -Loc_split_df <- do.call(rbind, Loc_split) -# Set an index to position the genotypes correctly -idx <- i*2-1 -Loc_IR_formatted_tmp[,idx:(idx+1)] <- Loc_split_df -colnames(Loc_IR_formatted_tmp)[idx:(idx+1)] <- c(paste(Loc_nam, "a", sep = ""), paste(Loc_nam, "b", sep = "")) -} -View(Loc_IR_formatted_tmp) -# Convert to a matrix for calculations -Loc_IR_mat_tmp <- as.matrix(Loc_IR_formatted_tmp) -View(Loc_IR_mat_tmp) -View(Loc_IR_mat) -# Get the number of individuals -Individuals <- nrow(Loc_IR_mat_tmp) -# Get the number of loci -Nloc <- ncol(Loc_IR_mat_tmp)/2 -# Set up a results table -res_tab <- data.frame(IR = matrix(NA, nrow = Individuals, ncol = 1), row.names = Inds) -# Get the counts of alleles for each locus -Counts2 <- list() -# Count the occurrences of 0,1,2 at each locus -for(i in 1:Nloc) { -# Set the same index as above -idx <- i*2-1 -Counts2[[i]] <- table(Loc_IR_mat_tmp[,idx:(idx+1)]) -} -View(Counts2) -for(i in 1:ncol(IR_dat)){ -# Get the locus name -Loc_nam <- colnames(IR_dat)[i] -# Split the genotype -Loc_split <- strsplit(IR_dat[,i], "/", fixed = T) -# Set the name of individuals -names(Loc_split) <- rownames(IR_dat) -# Bind into a data frame -Loc_split_df <- do.call(rbind, Loc_split) -# Set an index to position the genotypes correctly -idx <- i*2-1 -Loc_IR_formatted[,idx:(idx+1)] <- Loc_split_df -colnames(Loc_IR_formatted)[idx:(idx+1)] <- c(paste(Loc_nam, "a", sep = ""), paste(Loc_nam, "b", sep = "")) -} -### Calculate IR for each individual -for(i in 1:Individuals){ -H <- 0 -N <- 0 -f <- 0 -for(j in 1:Nloc){ -# Set our index again -idx1 <- 2*j-1 -idx2 <- 2*j -if((!is.na(Loc_IR_mat[i,idx1])) && (!is.na(Loc_IR_mat[i,idx2]))){ -N <- N +1 -} -if(Loc_IR_mat[i,idx1] == Loc_IR_mat[i,idx2]){ -H <- H + 1 -# Which allele is the individual homozygous for -Hom_Allele <- as.character(Loc_IR_mat[i,idx1]) -f <- f + (2 * Counts[[j]][[Hom_Allele]] - 2)/(sum(Counts[[j]]) - 2) -} else if(Loc_IR_mat[i,idx1] != Loc_IR_mat[i,idx2]){ -# If they are heterozygous that means that they are contributing two alleles, using just a sum of 1 leads to NaN -Het_Allele1 <- as.character(Loc_IR_mat[i,idx1]) -f <- f + (Counts[[j]][[Het_Allele1]] - 1)/(sum(Counts[[j]]) - 2) -Het_Allele2 <- as.character(Loc_IR_mat[i,idx2]) -f <- f + (Counts[[j]][[Het_Allele2]] - 1)/(sum(Counts[[j]]) - 2) -} -} -# Calculate internal relatedness -res_tab[i,1] <- (2 * H - f) / (2 * N - f) -} -### Calculate IR for each individual -for(i in 1:Individuals){ -H <- 0 -N <- 0 -f <- 0 -for(j in 1:Nloc){ -# Set our index again -idx1 <- 2*j-1 -idx2 <- 2*j -if((!is.na(Loc_IR_mat_tmp[i,idx1])) && (!is.na(Loc_IR_mat_tmp[i,idx2]))){ -N <- N +1 -} -if(Loc_IR_mat_tmp[i,idx1] == Loc_IR_mat_tmp[i,idx2]){ -H <- H + 1 -# Which allele is the individual homozygous for -Hom_Allele <- as.character(Loc_IR_mat_tmp[i,idx1]) -f <- f + (2 * Counts2[[j]][[Hom_Allele]] - 2)/(sum(Counts2[[j]]) - 2) -} else if(Loc_IR_mat_tmp[i,idx1] != Loc_IR_mat_tmp[i,idx2]){ -# If they are heterozygous that means that they are contributing two alleles, using just a sum of 1 leads to NaN -Het_Allele1 <- as.character(Loc_IR_mat_tmp[i,idx1]) -f <- f + (Counts2[[j]][[Het_Allele1]] - 1)/(sum(Counts2[[j]]) - 2) -Het_Allele2 <- as.character(Loc_IR_mat_tmp[i,idx2]) -f <- f + (Counts2[[j]][[Het_Allele2]] - 1)/(sum(Counts2[[j]]) - 2) -} -} -# Calculate internal relatedness -res_tab[i,1] <- (2 * H - f) / (2 * N - f) -} -View(res_tab) -IR_perind <- IR(Dat) -View(IR_perind) -tmp <- Dat[,3:ncol(Dat)] -tmp[tmp == "0"] <- "0/0" -tmp[tmp == "1"] <- "0/2" -tmp[tmp == "2"] <- "2/2" -rownames(tmp) <- Inds -Loc_HL_formatted <- data.frame(matrix(NA, nrow = nrow(tmp), ncol = ncol(tmp)*2), row.names = rownames(tmp)) -for(i in 1:ncol(tmp)){ -# Get the locus name -Loc_nam <- colnames(tmp)[i] -# Split the genotype -Loc_split <- strsplit(tmp[,i], "/", fixed = T) -# Set the name of individuals -names(Loc_split) <- Dat[,1] -# Bind into a data frame -Loc_split_df <- do.call(rbind, Loc_split) -# Set an index to position the genotypes correctly -idx <- i*2-1 -Loc_HL_formatted[,idx:(idx+1)] <- Loc_split_df -colnames(Loc_HL_formatted)[idx:(idx+1)] <- c(paste(Loc_nam, "a", sep = ""), paste(Loc_nam, "b", sep = "")) -} -Loc_HL_mat <- as.matrix(Loc_HL_formatted) -View(Loc_HL_formatted) -View(Loc_HL_mat) -?array() -E <- array(Nloc) -E -frequencies <- array(Nloc) -for (l in 1:Nloc) { -E[l] <- 1 -g <- 2 * l - 1 -h <- 2 * l -frequencies[l] <- list(table(genotypes[, g:h])) -E[l] <- 1 - sum((frequencies[[l]] / sum(frequencies[[l]]))^2) -} -for (l in 1:Nloc) { -E[l] <- 1 -g <- 2 * l - 1 -h <- 2 * l -frequencies[l] <- list(table(Loc_HL_mat[, g:h])) -E[l] <- 1 - sum((frequencies[[l]] / sum(frequencies[[l]]))^2) -} -View(frequencies) -# Count the occurrences of each allele (0 and 2) at each locus -for(i in 1:Nloc) { -# Set the same index as above -idx <- i*2-1 -Counts[[i]] <- table(Loc_HL_mat[,idx:(idx+1)]) -E[i] <- 1 - sum((Counts[[i]] / sum(Counts[[i]]))^2) -} -E -# Estimate homozygosity by locus -HL <- function(Dat){ -# Extract genetic data and convert to count the frequency of allele s -tmp <- Dat[,3:ncol(Dat)] -tmp[tmp == "0"] <- "0/0" -tmp[tmp == "1"] <- "0/2" -tmp[tmp == "2"] <- "2/2" -rownames(tmp) <- Inds -Loc_HL_formatted <- data.frame(matrix(NA, nrow = nrow(tmp), ncol = ncol(tmp)*2), row.names = rownames(tmp)) -for(i in 1:ncol(tmp)){ -# Get the locus name -Loc_nam <- colnames(tmp)[i] -# Split the genotype -Loc_split <- strsplit(tmp[,i], "/", fixed = T) -# Set the name of individuals -names(Loc_split) <- Dat[,1] -# Bind into a data frame -Loc_split_df <- do.call(rbind, Loc_split) -# Set an index to position the genotypes correctly -idx <- i*2-1 -Loc_HL_formatted[,idx:(idx+1)] <- Loc_split_df -colnames(Loc_HL_formatted)[idx:(idx+1)] <- c(paste(Loc_nam, "a", sep = ""), paste(Loc_nam, "b", sep = "")) -} -Loc_HL_mat <- as.matrix(Loc_HL_formatted) -# Get the number of individuals -Individuals <- nrow(Loc_HL_mat) -# Get the number of loci -Nloc <- ncol(Loc_IR_mat)/2 -# Set up a results table -res_tab <- data.frame(HL = matrix(NA, nrow = Individuals, ncol = 1), row.names = Inds) -E <- array(Nloc) -frequencies <- array(Nloc) -# Count the occurrences of each allele (0 and 2) at each locus -for(i in 1:Nloc) { -# Set the same index as above -idx <- i*2-1 -Counts[[i]] <- table(Loc_HL_mat[,idx:(idx+1)]) -E[i] <- 1 - sum((Counts[[i]] / sum(Counts[[i]]))^2) -} -for (i in 1:Individuals) { -sum.Eh <- 0 -sum.Ej <- 0 -for (l in 1:loci) { -idx1 <- 2*j-1 -idx2 <- 2*j -if ((!is.na(Loc_HL_mat[i, idx1])) && (!is.na(Loc_HL_mat[i, idx2]))) { -if (Loc_HL_mat[i, idx1] == Loc_HL_mat[i, idx2]) { -sum.Eh <- sum.Eh + E[l] -} -else { -sum.Ej <- sum.Ej + E[l] -} -} -} -res_tab[i] <- sum.Eh / (sum.Eh + sum.Ej) -} -return(res_tab) -} -HL(Dat) -# Estimate homozygosity by locus -HL <- function(Dat){ -# Extract genetic data and convert to count the frequency of allele s -tmp <- Dat[,3:ncol(Dat)] -tmp[tmp == "0"] <- "0/0" -tmp[tmp == "1"] <- "0/2" -tmp[tmp == "2"] <- "2/2" -rownames(tmp) <- Inds -Loc_HL_formatted <- data.frame(matrix(NA, nrow = nrow(tmp), ncol = ncol(tmp)*2), row.names = rownames(tmp)) -for(i in 1:ncol(tmp)){ -# Get the locus name -Loc_nam <- colnames(tmp)[i] -# Split the genotype -Loc_split <- strsplit(tmp[,i], "/", fixed = T) -# Set the name of individuals -names(Loc_split) <- Dat[,1] -# Bind into a data frame -Loc_split_df <- do.call(rbind, Loc_split) -# Set an index to position the genotypes correctly -idx <- i*2-1 -Loc_HL_formatted[,idx:(idx+1)] <- Loc_split_df -colnames(Loc_HL_formatted)[idx:(idx+1)] <- c(paste(Loc_nam, "a", sep = ""), paste(Loc_nam, "b", sep = "")) -} -Loc_HL_mat <- as.matrix(Loc_HL_formatted) -# Get the number of individuals -Individuals <- nrow(Loc_HL_mat) -# Get the number of loci -Nloc <- ncol(Loc_IR_mat)/2 -# Set up a results table -res_tab <- data.frame(HL = matrix(NA, nrow = Individuals, ncol = 1), row.names = Inds) -E <- array(Nloc) -frequencies <- array(Nloc) -# Count the occurrences of each allele (0 and 2) at each locus -for(i in 1:Nloc) { -# Set the same index as above -idx <- i*2-1 -Counts[[i]] <- table(Loc_HL_mat[,idx:(idx+1)]) -E[i] <- 1 - sum((Counts[[i]] / sum(Counts[[i]]))^2) -} -for (i in 1:Individuals) { -sum.Eh <- 0 -sum.Ej <- 0 -for (l in 1:Nloc) { -idx1 <- 2*j-1 -idx2 <- 2*j -if ((!is.na(Loc_HL_mat[i, idx1])) && (!is.na(Loc_HL_mat[i, idx2]))) { -if (Loc_HL_mat[i, idx1] == Loc_HL_mat[i, idx2]) { -sum.Eh <- sum.Eh + E[l] -} -else { -sum.Ej <- sum.Ej + E[l] -} -} -} -res_tab[i] <- sum.Eh / (sum.Eh + sum.Ej) -} -return(res_tab) -} -HL(Dat) -hl_dc <- -function(genotypes) { -genotypes <- as.data.frame(genotypes) -genotypes <- as.matrix(genotypes) -individuals <- nrow(genotypes) -loci <- ncol(genotypes) / 2 -hl <- array(NA, dim=c(individuals, 1)) -E <- array(loci) -frequencies <- array(loci) -for (l in 1:loci) { -E[l] <- 1 -g <- 2 * l - 1 -h <- 2 * l -frequencies[l] <- list(table(genotypes[, g:h])) -E[l] <- 1 - sum((frequencies[[l]] / sum(frequencies[[l]]))^2) -} -for (i in 1:individuals) { -sum.Eh <- 0 -sum.Ej <- 0 -for (l in 1:loci) { -g <- 2 * l - 1 -h <- 2 * l -if ((!is.na(genotypes[i, g])) && (!is.na(genotypes[i, h]))) { -if (genotypes[i, g] == genotypes[i, h]) { -sum.Eh <- sum.Eh + E[l] -} -else { -sum.Ej <- sum.Ej + E[l] -} -} -} -hl[i] <- sum.Eh / (sum.Eh + sum.Ej) -} -hl -} -View(IR_dat) -hl_dc(IR_dat) -# Extract genetic data and convert to count the frequency of allele s -tmp <- Dat[,3:ncol(Dat)] -tmp[tmp == "0"] <- "0/0" -tmp[tmp == "1"] <- "0/2" -tmp[tmp == "2"] <- "2/2" -rownames(tmp) <- Inds -Loc_HL_formatted <- data.frame(matrix(NA, nrow = nrow(tmp), ncol = ncol(tmp)*2), row.names = rownames(tmp)) -View(Loc_HL_formatted) -for(i in 1:ncol(tmp)){ -# Get the locus name -Loc_nam <- colnames(tmp)[i] -# Split the genotype -Loc_split <- strsplit(tmp[,i], "/", fixed = T) -# Set the name of individuals -names(Loc_split) <- Dat[,1] -# Bind into a data frame -Loc_split_df <- do.call(rbind, Loc_split) -# Set an index to position the genotypes correctly -idx <- i*2-1 -Loc_HL_formatted[,idx:(idx+1)] <- Loc_split_df -colnames(Loc_HL_formatted)[idx:(idx+1)] <- c(paste(Loc_nam, "a", sep = ""), paste(Loc_nam, "b", sep = "")) -} -View(Loc_HL_formatted) -Loc_HL_mat <- as.matrix(Loc_HL_formatted) -View(Loc_HL_mat) -# Get the number of individuals -Individuals <- nrow(Loc_HL_mat) -# Get the number of loci -Nloc <- ncol(Loc_IR_mat)/2 -# Set up a results table -res_tab <- data.frame(HL = matrix(NA, nrow = Individuals, ncol = 1), row.names = Inds) -View(res_tab) -E <- array(Nloc) -frequencies <- array(Nloc) -# Count the occurrences of each allele (0 and 2) at each locus -for(i in 1:Nloc) { -# Set the same index as above -idx <- i*2-1 -Counts[[i]] <- table(Loc_HL_mat[,idx:(idx+1)]) -E[i] <- 1 - sum((Counts[[i]] / sum(Counts[[i]]))^2) -} -for (i in 1:Individuals) { -sum.Eh <- 0 -sum.Ej <- 0 -for (l in 1:Nloc) { -idx1 <- 2*j-1 -idx2 <- 2*j -if ((!is.na(Loc_HL_mat[i, idx1])) && (!is.na(Loc_HL_mat[i, idx2]))) { -if (Loc_HL_mat[i, idx1] == Loc_HL_mat[i, idx2]) { -sum.Eh <- sum.Eh + E[l] -} -else { -sum.Ej <- sum.Ej + E[l] -} -} -} -res_tab[i] <- sum.Eh / (sum.Eh + sum.Ej) -} -View(res_tab) -Counts <- array(Nloc) -# Count the occurrences of each allele (0 and 2) at each locus -for(i in 1:Nloc) { -# Set the same index as above -idx <- i*2-1 -Counts[[i]] <- table(Loc_HL_mat[,idx:(idx+1)]) -E[i] <- 1 - sum((Counts[[i]] / sum(Counts[[i]]))^2) -} -E <- array(Nloc) -Counts <- array(Nloc) -# Count the occurrences of each allele (0 and 2) at each locus -for(i in 1:Nloc) { -# Set the same index as above -idx <- i*2-1 -Counts[i] <- table(Loc_HL_mat[,idx:(idx+1)]) -E[i] <- 1 - sum((Counts[[i]] / sum(Counts[[i]]))^2) -} -# Count the occurrences of each allele (0 and 2) at each locus -for(i in 1:Nloc) { -# Set the same index as above -idx <- i*2-1 -Counts[i] <- list(table(Loc_HL_mat[,idx:(idx+1)])) -E[i] <- 1 - sum((Counts[[i]] / sum(Counts[[i]]))^2) -} -View(Counts) -for (i in 1:Individuals) { -sum.Eh <- 0 -sum.Ej <- 0 -for (l in 1:Nloc) { -idx1 <- 2*j-1 -idx2 <- 2*j -if ((!is.na(Loc_HL_mat[i, idx1])) && (!is.na(Loc_HL_mat[i, idx2]))) { -if (Loc_HL_mat[i, idx1] == Loc_HL_mat[i, idx2]) { -sum.Eh <- sum.Eh + E[l] -} -else { -sum.Ej <- sum.Ej + E[l] -} -} -} -res_tab[i] <- sum.Eh / (sum.Eh + sum.Ej) -} -View(res_tab) -E <- array(Nloc) -Counts <- array(Nloc) -# Count the occurrences of each allele (0 and 2) at each locus -for(i in 1:Nloc) { -E[i] <- 1 -# Set the same index as above -idx <- i*2-1 -Counts[i] <- list(table(Loc_HL_mat[,idx:(idx+1)])) -E[i] <- 1 - sum((Counts[[i]] / sum(Counts[[i]]))^2) -} -for (i in 1:Individuals) { -sum.Eh <- 0 -sum.Ej <- 0 -for (j in 1:Nloc) { -idx1 <- 2*j-1 -idx2 <- 2*j -if ((!is.na(Loc_HL_mat[i, idx1])) && (!is.na(Loc_HL_mat[i, idx2]))) { -if (Loc_HL_mat[i, idx1] == Loc_HL_mat[i, idx2]) { -sum.Eh <- sum.Eh + E[l] -} -else { -sum.Ej <- sum.Ej + E[l] -} -} -} -res_tab[i] <- sum.Eh / (sum.Eh + sum.Ej) -} -View(res_tab) -for (i in 1:Individuals) { -sum.Eh <- 0 -sum.Ej <- 0 -for (j in 1:Nloc) { -idx1 <- 2*j-1 -idx2 <- 2*j -if ((!is.na(Loc_HL_mat[i, idx1])) && (!is.na(Loc_HL_mat[i, idx2]))) { -if (Loc_HL_mat[i, idx1] == Loc_HL_mat[i, idx2]) { -sum.Eh <- sum.Eh + E[l] -} -else { -sum.Ej <- sum.Ej + E[l] -} -} -} -res_tab[i,1] <- sum.Eh / (sum.Eh + sum.Ej) -} -View(res_tab) -hl_dc(IR_dat) -View(Loc_HL_formatted) -for (i in 1:Individuals) { -sum.Eh <- 0 -sum.Ej <- 0 -for (j in 1:Nloc) { -idx1 <- 2*j-1 -idx2 <- 2*j -if ((!is.na(Loc_HL_mat[i, idx1])) && (!is.na(Loc_HL_mat[i, idx2]))) { -if (Loc_HL_mat[i, idx1] == Loc_HL_mat[i, idx2]) { -sum.Eh <- sum.Eh + E[j] -} -else { -sum.Ej <- sum.Ej + E[j] -} -} -} -res_tab[i] <- sum.Eh / (sum.Eh + sum.Ej) -} -View(res_tab) -library(devtools) -document() -load_all() -?heterozygosity)_ -?heterozygosity() -build() -load_all() -?heterozygosity -library(pkgdown) -build_site() -document() -build_site() -build_site() -build_site() -document() -build_site() -document() -document() -document() -document() -data(nancycat) -data(nancycats) -library(adegenet) -data("nancycats") -Test <- nancycats@tab -View(Test) +step2 <- 1 +for(i in step:npops){ +index1 <- c(index1, i:npops) +index2 <- c(index2, rep(step2, length(step:npops))) +step=step+1 +step2=step2+1 +} +combn(Pops) +combn(Pops, m = 2) +combn(levels(Pops), m = 2) +# Get combinations for pairwise comparisons +Comps <- combn(levels(Pops), m = 2) +View(Comps) +# Set r (number of pops) +r <- 2 +ninds.dup.1 <- ninds[index1,] +ninds.dup.2 <- ninds[index2,] +View(Dat_perpop) +comps +Comps +length(Comps) +ncol(Comps) +i = 1 +which(names(Dat_perpop) == Comps[1,i]) +hich(names(Dat_perpop) == Comps[2,i]) +which(names(Dat_perpop) == Comps[2,i]) +Pop1 <- which(names(Dat_perpop) == Comps[1,i]) +Pop2 <- which(names(Dat_perpop) == Comps[2,i]) +Pop1 <- Dat_perpop[[which(names(Dat_perpop) == Comps[1,i])]] +Pop2 <- Dat_perpop[[which(names(Dat_perpop) == Comps[2,i])]] +View(ninds.dup.1) +View(ninds) +p1 <- p[index1,] +p2 <- p[index2,] +ind.nomissing <- matrix(nrow = length(Dat_perpop), ncol = length(Dat_perpop[3:ncol(Dat_perpop[[1]])])) +for(i in 1:length(Dat_perpop)){ +tmp <- Dat_perpop[[i]] +# Get frequency of alternate alleles +q.freq[i,] <- (((colSums(tmp[3:ncol(tmp)] == 2, na.rm = T))*2) + colSums(tmp[3:ncol(tmp)] == 1, na.rm = T))/(2*colSums(tmp[3:ncol(tmp)] != "NA")) +# Get number of heterozygotes in each population +het.freq[i,] <- colSums(tmp[3:ncol(tmp)] == 1)/colSums(tmp[3:ncol(tmp)] != "NA") +ind.nomissing[i,] <- colSums(tmp[3:ncol(tmp)] != "NA") +} +View(ind.nomissing) +Comps[1,i] +row.names(q.freq) <- row.names(het.freq) <- row.names(ind.nomissing) <- names(Dat_perpop) +i = 1 +nPop1 <- ind.nomissing[Comps[1,i]] +nPop2 <- ind.nomissing[Comps[2,i]] +Comps[1,i] +#Pop1 <- Dat_perpop[[which(names(Dat_perpop) == Comps[1,i])]] +#Pop2 <- Dat_perpop[[which(names(Dat_perpop) == Comps[2,i])]] +nPop1 <- ind.nomissing[which(row.names(ind.nomissing == Comps[1,i]),] +nPop1 <- ind.nomissing[which(row.names(ind.nomissing) == Comps[1,i]),] +nPop2 <- ind.nomissing[which(row.names(ind.nomissing) == Comps[2,i]),] +ninds.dup.1 <- ninds[index1,] +ninds.dup.2 <- ninds[index2,] +View(ninds.dup.1) +View(ninds.dup.2) +index1 +index2 +q1 <- q.freq[which(row.names(q.freq) == Comps[1,i]),] +q2 <- q.freq[which(row.names(q.freq) == Comps[2,i]),] +p1 <- p[index1,] +p2 <- p[index2,] +View(p1) +oh1 <- oh[index1,] +oh2 <- oh[index2,] +h1 <- het.freq[which(row.names(het.freq) == Comps[1,i]),] +h2 <- het.freq[which(row.names(het.freq) == Comps[2,i]),] +n.bar <- (ninds.dup.1+ninds.dup.2)/r +nbar <- (nPop1+nPop2)/r +n.bar[1:10] +nbar[1:10] +nbar <- (nPop1+nPop1+nPop2)/r +nbar[1:10] +nbar <- (nPop1+nPop2+nPop2)/r +nbar[1:10] +nbar <- NULL +View(Pop1) +nPop1 +nPop2 +View(ninds.dup.1) +View(ninds.dup.2) +for(i in step:npops){ +index1 <- c(index1, i:npops) +index2 <- c(index2, rep(step2, length(step:npops))) +step=step+1 +step2=step2+1 +} +index2 <- index1 <- NULL +step <- 2 +step2 <- 1 +for(i in step:npops){ +index1 <- c(index1, i:npops) +index2 <- c(index2, rep(step2, length(step:npops))) +step=step+1 +step2=step2+1 +} +r=2 +ninds.dup.1 <- ninds[index1,] +ninds.dup.2 <- ninds[index2,] +View(n.bar) +View(ninds.dup.1) +(ninds.dup.1[,1]+ninds.dup.2[,1])/r +nPop1[,1] +nPop1[1] +nPop1[1] + nPop2[1] +index2 <- index1 <- NULL +step <- 2 +step2 <- 1 +for(i in step:npops){ +index1 <- c(index1, i:npops) +index2 <- c(index2, rep(step2, length(step:npops))) +step=step+1 +step2=step2+1 +} +ninds[index1,] +ninds[index1,1:5] +ind.nomissing[index1,1:5] +unique(Pops) +length(Pops) +length(unique(Pops)) +# Use STAMPP's indexing +index2_a <- index1_a <- NULL +step <- 2 +step2 <- 1 +for(i in step:length(unique(Pops))){ +index1_a <- c(index1_a, i:length(unique(Pops))) +index2_a <- c(index2, rep(step2, length(step:length(unique(Pops))))) +step=step+1 +step2=step2+1 +} +index2_a <- index1_a <- NULL +step <- 2 +step2 <- 1 +for(i in step:length(unique(Pops))){ +index1_a <- c(index1_a, i:length(unique(Pops))) +index2_a <- c(index2_a, rep(step2, length(step:length(unique(Pops))))) +step=step+1 +step2=step2+1 +} +# Set r (number of pops) +r <- 2 +#Pop1 <- Dat_perpop[[which(names(Dat_perpop) == Comps[1,i])]] +#Pop2 <- Dat_perpop[[which(names(Dat_perpop) == Comps[2,i])]] +# Extract the number of individuals, alternate allele frequency, and heterozygote frequency from the populations being compared +nPop1 <- ind.nomissing[index1_a,] +View(nPop1) +nPop2 <- ind.nomissing[index2_a,] +View(nPop2) +q1 <- q.freq[index1_a,] +q2 <- q.freq[index2_a,] +View(q1) +h1 <- het.freq[index1_a,] +h2 <- het.freq[index2_a,] +# Calcualte nbar +nbar <- (nPop1+nPop2)/r +View(n.bar) +View(nbar) +n.bar == nbar +nc <- (r*n.bar)-(((nPop1^2)+(nPop2^2)) / (r*n.bar)) +p.bar <- ((nPop1*q1)/(r*n.bar)) + ((nPop2*q2)/(r*n.bar)) +s.square <- ((nPop1*((q1-p.bar)^2))/n.bar) + ((nPop2*((q2-p.bar)^2))/n.bar) +h.bar <- ((nPop1*h1)/(r*n.bar)) + ((nPop2*h2)/(r*n.bar)) +a <- (n.bar/nc) * (s.square - (1/(n.bar-1)) * ( (p.bar*(1-p.bar)) - (((r-1)/r)*s.square) - ((1/4)*h.bar) )) +b <- (n.bar/(n.bar-1)) * ( (p.bar*(1-p.bar)) - (((r-1)/r)*s.square) - (((2*n.bar-1)/(4*n.bar))*h.bar)) +c <- (1/2)*h.bar +any(a) == !is.finite() +any(a == !is.finite()) +is.infinite(a) +which(is.infinite(a)) +is.finite(a) +any(is.finite(a) == FALSE +) +!is.finite(a) +Fst <- function(Dat){ +# Dat is the Dat_perpop object +# Create a couple of matrices to store data +ind.nomissing <- matrix(nrow = length(Dat), ncol = length(Dat[3:ncol(Dat[[1]])])) +q.freq <- matrix(nrow = length(Dat), ncol = length(Dat[3:ncol(Dat[[1]])])) +het.freq <- matrix(nrow = length(Dat), ncol = length(Dat[3:ncol(Dat[[1]])])) +# Calculate the alternate allele frequency and the frequency of heterozygotes per population per locus +for(i in 1:length(Dat)){ +tmp <- Dat[[i]] +# Get frequency of alternate alleles +q.freq[i,] <- (((colSums(tmp[3:ncol(tmp)] == 2, na.rm = T))*2) + colSums(tmp[3:ncol(tmp)] == 1, na.rm = T))/(2*colSums(tmp[3:ncol(tmp)] != "NA")) +# Get number of heterozygotes in each population +het.freq[i,] <- colSums(tmp[3:ncol(tmp)] == 1)/colSums(tmp[3:ncol(tmp)] != "NA") +# Get the number of individuals with data +ind.nomissing[i,] <- colSums(tmp[3:ncol(tmp)] != "NA") +} +# Set the names of the matrices +row.names(q.freq) <- row.names(het.freq) <- row.names(ind.nomissing) <- names(Dat) +# Use StamPP's indexing +index2_a <- index1_a <- NULL +step <- 2 +step2 <- 1 +for(i in step:length(unique(Pops))){ +index1_a <- c(index1_a, i:length(unique(Pops))) +index2_a <- c(index2_a, rep(step2, length(step:length(unique(Pops))))) +step=step+1 +step2=step2+1 +} +# Set r (number of pops) +r <- 2 +nPop1 <- ind.nomissing[index1_a,] +nPop2 <- ind.nomissing[index2_a,] +q1 <- q.freq[index1_a,] +q2 <- q.freq[index2_a,] +h1 <- het.freq[index1_a,] +h2 <- het.freq[index2_a,] +### WC Fst +# Calculate nbar +n.bar <- (nPop1+nPop2)/r +nc <- (r*n.bar)-(((nPop1^2)+(nPop2^2)) / (r*n.bar)) +p.bar <- ((nPop1*q1)/(r*n.bar)) + ((nPop2*q2)/(r*n.bar)) +s.square <- ((nPop1*((q1-p.bar)^2))/n.bar) + ((nPop2*((q2-p.bar)^2))/n.bar) +h.bar <- ((nPop1*h1)/(r*n.bar)) + ((nPop2*h2)/(r*n.bar)) +a <- (n.bar/nc) * (s.square - (1/(n.bar-1)) * ( (p.bar*(1-p.bar)) - (((r-1)/r)*s.square) - ((1/4)*h.bar) )) +b <- (n.bar/(n.bar-1)) * ( (p.bar*(1-p.bar)) - (((r-1)/r)*s.square) - (((2*n.bar-1)/(4*n.bar))*h.bar)) +c <- (1/2)*h.bar +# a, b, and c can be infinite, we will set these values to NA +if(!is.finite(a) | !is.finite(b) | !is.finite(c)){ +a[which(is.infinite(a))] <- NA +b[which(is.infinite(b))] <- NA +c[which(is.infinite(c))] <- NA +} +if(ncol(q.freq)>1){ #if there is more than 1 locus in the genotype dataset +if(length(unique(Pops))>2){ #if there are greater than 2 populations, ie. greater than 1 pairwise comparision +fst <- rowSums(a, na.rm=TRUE) / (rowSums(a, na.rm=TRUE) + rowSums(b, na.rm=TRUE) + rowSums(c, na.rm=TRUE)) #Fst results +}else{ #if there is only 2 populations, ie. 1 pairwise comparision +fst <- sum(a, na.rm=TRUE) / (sum(a, na.rm=TRUE) + sum(b, na.rm=TRUE) + sum(c, na.rm=TRUE)) #Fst results +} +}else{ # if there is only 1 locus in the genotype dataset +fst <- a/(a+b+c) #Fst results +} +} +Fst_wc <- Fst(Dat_perpop) +Fst <- function(Dat){ +# Dat is the Dat_perpop object +# Create a couple of matrices to store data +ind.nomissing <- matrix(nrow = length(Dat), ncol = length(Dat[3:ncol(Dat[[1]])])) +q.freq <- matrix(nrow = length(Dat), ncol = length(Dat[3:ncol(Dat[[1]])])) +het.freq <- matrix(nrow = length(Dat), ncol = length(Dat[3:ncol(Dat[[1]])])) +# Calculate the alternate allele frequency and the frequency of heterozygotes per population per locus +for(i in 1:length(Dat)){ +tmp <- Dat[[i]] +# Get frequency of alternate alleles +q.freq[i,] <- (((colSums(tmp[3:ncol(tmp)] == 2, na.rm = T))*2) + colSums(tmp[3:ncol(tmp)] == 1, na.rm = T))/(2*colSums(tmp[3:ncol(tmp)] != "NA")) +# Get number of heterozygotes in each population +het.freq[i,] <- colSums(tmp[3:ncol(tmp)] == 1)/colSums(tmp[3:ncol(tmp)] != "NA") +# Get the number of individuals with data +ind.nomissing[i,] <- colSums(tmp[3:ncol(tmp)] != "NA") +} +# Set the names of the matrices +row.names(q.freq) <- row.names(het.freq) <- row.names(ind.nomissing) <- names(Dat) +# Use StamPP's indexing +index2_a <- index1_a <- NULL +step <- 2 +step2 <- 1 +for(i in step:length(unique(Pops))){ +index1_a <- c(index1_a, i:length(unique(Pops))) +index2_a <- c(index2_a, rep(step2, length(step:length(unique(Pops))))) +step=step+1 +step2=step2+1 +} +# Set r (number of pops) +r <- 2 +nPop1 <- ind.nomissing[index1_a,] +nPop2 <- ind.nomissing[index2_a,] +q1 <- q.freq[index1_a,] +q2 <- q.freq[index2_a,] +h1 <- het.freq[index1_a,] +h2 <- het.freq[index2_a,] +### WC Fst +# Calculate nbar +n.bar <- (nPop1+nPop2)/r +nc <- (r*n.bar)-(((nPop1^2)+(nPop2^2)) / (r*n.bar)) +p.bar <- ((nPop1*q1)/(r*n.bar)) + ((nPop2*q2)/(r*n.bar)) +s.square <- ((nPop1*((q1-p.bar)^2))/n.bar) + ((nPop2*((q2-p.bar)^2))/n.bar) +h.bar <- ((nPop1*h1)/(r*n.bar)) + ((nPop2*h2)/(r*n.bar)) +a <- (n.bar/nc) * (s.square - (1/(n.bar-1)) * ( (p.bar*(1-p.bar)) - (((r-1)/r)*s.square) - ((1/4)*h.bar) )) +b <- (n.bar/(n.bar-1)) * ( (p.bar*(1-p.bar)) - (((r-1)/r)*s.square) - (((2*n.bar-1)/(4*n.bar))*h.bar)) +c <- (1/2)*h.bar +# a, b, and c can be infinite, we will set these values to NA +if(any(is.finite(a) == FALSE) | any(is.finite(b) == FALSE) | any(is.finite(c) == FALSE)){ +a[which(is.infinite(a))] <- NA +b[which(is.infinite(b))] <- NA +c[which(is.infinite(c))] <- NA +} +if(ncol(q.freq)>1){ #if there is more than 1 locus in the genotype dataset +if(length(unique(Pops))>2){ #if there are greater than 2 populations, ie. greater than 1 pairwise comparision +fst <- rowSums(a, na.rm=TRUE) / (rowSums(a, na.rm=TRUE) + rowSums(b, na.rm=TRUE) + rowSums(c, na.rm=TRUE)) #Fst results +}else{ #if there is only 2 populations, ie. 1 pairwise comparision +fst <- sum(a, na.rm=TRUE) / (sum(a, na.rm=TRUE) + sum(b, na.rm=TRUE) + sum(c, na.rm=TRUE)) #Fst results +} +}else{ # if there is only 1 locus in the genotype dataset +fst <- a/(a+b+c) #Fst results +} +return(fst) +} +Fst_wc <- Fst(Dat_perpop) +Fst_wc +Fst <- function(Dat){ +# Dat is the Dat_perpop object +# Create a couple of matrices to store data +ind.nomissing <- matrix(nrow = length(Dat), ncol = length(Dat[3:ncol(Dat[[1]])])) +q.freq <- matrix(nrow = length(Dat), ncol = length(Dat[3:ncol(Dat[[1]])])) +het.freq <- matrix(nrow = length(Dat), ncol = length(Dat[3:ncol(Dat[[1]])])) +# Calculate the alternate allele frequency and the frequency of heterozygotes per population per locus +for(i in 1:length(Dat)){ +tmp <- Dat[[i]] +# Get frequency of alternate alleles +q.freq[i,] <- (((colSums(tmp[3:ncol(tmp)] == 2, na.rm = T))*2) + colSums(tmp[3:ncol(tmp)] == 1, na.rm = T))/(2*colSums(tmp[3:ncol(tmp)] != "NA")) +# Get number of heterozygotes in each population +het.freq[i,] <- colSums(tmp[3:ncol(tmp)] == 1)/colSums(tmp[3:ncol(tmp)] != "NA") +# Get the number of individuals with data +ind.nomissing[i,] <- colSums(tmp[3:ncol(tmp)] != "NA") +} +# Set the names of the matrices +row.names(q.freq) <- row.names(het.freq) <- row.names(ind.nomissing) <- names(Dat) +# Use StamPP's indexing +index2_a <- index1_a <- NULL +step <- 2 +step2 <- 1 +for(i in step:length(unique(Pops))){ +index1_a <- c(index1_a, i:length(unique(Pops))) +index2_a <- c(index2_a, rep(step2, length(step:length(unique(Pops))))) +step=step+1 +step2=step2+1 +} +# Set r (number of pops) +r <- 2 +nPop1 <- ind.nomissing[index1_a,] +nPop2 <- ind.nomissing[index2_a,] +q1 <- q.freq[index1_a,] +q2 <- q.freq[index2_a,] +h1 <- het.freq[index1_a,] +h2 <- het.freq[index2_a,] +### WC Fst +# Calculate nbar +n.bar <- (nPop1+nPop2)/r +nc <- (r*n.bar)-(((nPop1^2)+(nPop2^2)) / (r*n.bar)) +p.bar <- ((nPop1*q1)/(r*n.bar)) + ((nPop2*q2)/(r*n.bar)) +s.square <- ((nPop1*((q1-p.bar)^2))/n.bar) + ((nPop2*((q2-p.bar)^2))/n.bar) +h.bar <- ((nPop1*h1)/(r*n.bar)) + ((nPop2*h2)/(r*n.bar)) +a <- (n.bar/nc) * (s.square - (1/(n.bar-1)) * ( (p.bar*(1-p.bar)) - (((r-1)/r)*s.square) - ((1/4)*h.bar) )) +b <- (n.bar/(n.bar-1)) * ( (p.bar*(1-p.bar)) - (((r-1)/r)*s.square) - (((2*n.bar-1)/(4*n.bar))*h.bar)) +c <- (1/2)*h.bar +# a, b, and c can be infinite, we will set these values to NA +if(any(is.finite(a) == FALSE) | any(is.finite(b) == FALSE) | any(is.finite(c) == FALSE)){ +a[which(is.infinite(a))] <- NA +b[which(is.infinite(b))] <- NA +c[which(is.infinite(c))] <- NA +} +if(ncol(q.freq)>1){ #if there is more than 1 locus in the genotype dataset +if(length(unique(Pops))>2){ #if there are greater than 2 populations, ie. greater than 1 pairwise comparision +fst <- rowSums(a, na.rm=TRUE) / (rowSums(a, na.rm=TRUE) + rowSums(b, na.rm=TRUE) + rowSums(c, na.rm=TRUE)) #Fst results +} else{ #if there is only 2 populations, ie. 1 pairwise comparision +fst <- sum(a, na.rm=TRUE) / (sum(a, na.rm=TRUE) + sum(b, na.rm=TRUE) + sum(c, na.rm=TRUE)) #Fst results +} +} else{ # if there is only 1 locus in the genotype dataset +fst <- a/(a+b+c) #Fst results +} +fstmat <- matrix(NA, nrow=length(unique(Pops)), ncol=length(unique(Pops)), dimnames=names(Dat)) +fstmat[lower.tri(fstmat)]=fst +return(fstmat) +} +Fst_wc +Fst_wc <- Fst(Dat_perpop) +Fst <- function(Dat){ +# Dat is the Dat_perpop object +# Create a couple of matrices to store data +ind.nomissing <- matrix(nrow = length(Dat), ncol = length(Dat[3:ncol(Dat[[1]])])) +q.freq <- matrix(nrow = length(Dat), ncol = length(Dat[3:ncol(Dat[[1]])])) +het.freq <- matrix(nrow = length(Dat), ncol = length(Dat[3:ncol(Dat[[1]])])) +# Calculate the alternate allele frequency and the frequency of heterozygotes per population per locus +for(i in 1:length(Dat)){ +tmp <- Dat[[i]] +# Get frequency of alternate alleles +q.freq[i,] <- (((colSums(tmp[3:ncol(tmp)] == 2, na.rm = T))*2) + colSums(tmp[3:ncol(tmp)] == 1, na.rm = T))/(2*colSums(tmp[3:ncol(tmp)] != "NA")) +# Get number of heterozygotes in each population +het.freq[i,] <- colSums(tmp[3:ncol(tmp)] == 1)/colSums(tmp[3:ncol(tmp)] != "NA") +# Get the number of individuals with data +ind.nomissing[i,] <- colSums(tmp[3:ncol(tmp)] != "NA") +} +# Set the names of the matrices +row.names(q.freq) <- row.names(het.freq) <- row.names(ind.nomissing) <- names(Dat) +# Use StamPP's indexing +index2_a <- index1_a <- NULL +step <- 2 +step2 <- 1 +for(i in step:length(unique(Pops))){ +index1_a <- c(index1_a, i:length(unique(Pops))) +index2_a <- c(index2_a, rep(step2, length(step:length(unique(Pops))))) +step=step+1 +step2=step2+1 +} +# Set r (number of pops) +r <- 2 +nPop1 <- ind.nomissing[index1_a,] +nPop2 <- ind.nomissing[index2_a,] +q1 <- q.freq[index1_a,] +q2 <- q.freq[index2_a,] +h1 <- het.freq[index1_a,] +h2 <- het.freq[index2_a,] +### WC Fst +# Calculate nbar +n.bar <- (nPop1+nPop2)/r +nc <- (r*n.bar)-(((nPop1^2)+(nPop2^2)) / (r*n.bar)) +p.bar <- ((nPop1*q1)/(r*n.bar)) + ((nPop2*q2)/(r*n.bar)) +s.square <- ((nPop1*((q1-p.bar)^2))/n.bar) + ((nPop2*((q2-p.bar)^2))/n.bar) +h.bar <- ((nPop1*h1)/(r*n.bar)) + ((nPop2*h2)/(r*n.bar)) +a <- (n.bar/nc) * (s.square - (1/(n.bar-1)) * ( (p.bar*(1-p.bar)) - (((r-1)/r)*s.square) - ((1/4)*h.bar) )) +b <- (n.bar/(n.bar-1)) * ( (p.bar*(1-p.bar)) - (((r-1)/r)*s.square) - (((2*n.bar-1)/(4*n.bar))*h.bar)) +c <- (1/2)*h.bar +# a, b, and c can be infinite, we will set these values to NA +if(any(is.finite(a) == FALSE) | any(is.finite(b) == FALSE) | any(is.finite(c) == FALSE)){ +a[which(is.infinite(a))] <- NA +b[which(is.infinite(b))] <- NA +c[which(is.infinite(c))] <- NA +} +if(ncol(q.freq)>1){ #if there is more than 1 locus in the genotype dataset +if(length(unique(Pops))>2){ #if there are greater than 2 populations, ie. greater than 1 pairwise comparision +fst <- rowSums(a, na.rm=TRUE) / (rowSums(a, na.rm=TRUE) + rowSums(b, na.rm=TRUE) + rowSums(c, na.rm=TRUE)) #Fst results +} else{ #if there is only 2 populations, ie. 1 pairwise comparision +fst <- sum(a, na.rm=TRUE) / (sum(a, na.rm=TRUE) + sum(b, na.rm=TRUE) + sum(c, na.rm=TRUE)) #Fst results +} +} else{ # if there is only 1 locus in the genotype dataset +fst <- a/(a+b+c) #Fst results +} +fstmat <- matrix(NA, nrow=length(unique(Pops)), ncol=length(unique(Pops)), dimnames=list(names(Dat), names(Dat)) +fstmat[lower.tri(fstmat)]=fst +Fst <- function(Dat){ +# Dat is the Dat_perpop object +# Create a couple of matrices to store data +ind.nomissing <- matrix(nrow = length(Dat), ncol = length(Dat[3:ncol(Dat[[1]])])) +q.freq <- matrix(nrow = length(Dat), ncol = length(Dat[3:ncol(Dat[[1]])])) +het.freq <- matrix(nrow = length(Dat), ncol = length(Dat[3:ncol(Dat[[1]])])) +# Calculate the alternate allele frequency and the frequency of heterozygotes per population per locus +for(i in 1:length(Dat)){ +tmp <- Dat[[i]] +# Get frequency of alternate alleles +q.freq[i,] <- (((colSums(tmp[3:ncol(tmp)] == 2, na.rm = T))*2) + colSums(tmp[3:ncol(tmp)] == 1, na.rm = T))/(2*colSums(tmp[3:ncol(tmp)] != "NA")) +# Get number of heterozygotes in each population +het.freq[i,] <- colSums(tmp[3:ncol(tmp)] == 1)/colSums(tmp[3:ncol(tmp)] != "NA") +# Get the number of individuals with data +ind.nomissing[i,] <- colSums(tmp[3:ncol(tmp)] != "NA") +} +# Set the names of the matrices +row.names(q.freq) <- row.names(het.freq) <- row.names(ind.nomissing) <- names(Dat) +# Use StamPP's indexing +index2_a <- index1_a <- NULL +step <- 2 +step2 <- 1 +for(i in step:length(unique(Pops))){ +index1_a <- c(index1_a, i:length(unique(Pops))) +index2_a <- c(index2_a, rep(step2, length(step:length(unique(Pops))))) +step=step+1 +step2=step2+1 +} +# Set r (number of pops) +r <- 2 +nPop1 <- ind.nomissing[index1_a,] +nPop2 <- ind.nomissing[index2_a,] +q1 <- q.freq[index1_a,] +q2 <- q.freq[index2_a,] +h1 <- het.freq[index1_a,] +h2 <- het.freq[index2_a,] +### WC Fst +# Calculate nbar +n.bar <- (nPop1+nPop2)/r +nc <- (r*n.bar)-(((nPop1^2)+(nPop2^2)) / (r*n.bar)) +p.bar <- ((nPop1*q1)/(r*n.bar)) + ((nPop2*q2)/(r*n.bar)) +s.square <- ((nPop1*((q1-p.bar)^2))/n.bar) + ((nPop2*((q2-p.bar)^2))/n.bar) +h.bar <- ((nPop1*h1)/(r*n.bar)) + ((nPop2*h2)/(r*n.bar)) +a <- (n.bar/nc) * (s.square - (1/(n.bar-1)) * ( (p.bar*(1-p.bar)) - (((r-1)/r)*s.square) - ((1/4)*h.bar) )) +b <- (n.bar/(n.bar-1)) * ( (p.bar*(1-p.bar)) - (((r-1)/r)*s.square) - (((2*n.bar-1)/(4*n.bar))*h.bar)) +c <- (1/2)*h.bar +# a, b, and c can be infinite, we will set these values to NA +if(any(is.finite(a) == FALSE) | any(is.finite(b) == FALSE) | any(is.finite(c) == FALSE)){ +a[which(is.infinite(a))] <- NA +b[which(is.infinite(b))] <- NA +c[which(is.infinite(c))] <- NA +} +if(ncol(q.freq)>1){ #if there is more than 1 locus in the genotype dataset +if(length(unique(Pops))>2){ #if there are greater than 2 populations, ie. greater than 1 pairwise comparision +fst <- rowSums(a, na.rm=TRUE) / (rowSums(a, na.rm=TRUE) + rowSums(b, na.rm=TRUE) + rowSums(c, na.rm=TRUE)) #Fst results +} else{ #if there is only 2 populations, ie. 1 pairwise comparision +fst <- sum(a, na.rm=TRUE) / (sum(a, na.rm=TRUE) + sum(b, na.rm=TRUE) + sum(c, na.rm=TRUE)) #Fst results +} +} else{ # if there is only 1 locus in the genotype dataset +fst <- a/(a+b+c) #Fst results +} +fstmat <- matrix(NA, nrow=length(unique(Pops)), ncol=length(unique(Pops)), dimnames=list(names(Dat), names(Dat))) +fstmat[lower.tri(fstmat)]=fst +return(fstmat) +} +Fst_wc <- Fst(Dat_perpop) +View(Fst_wc) +Test_fst <- StAMPP::stamppFst(Glight, nboots = 1) +View(Test_fst) +View(Fst_wc) +Test_fst <- StAMPP::stamppFst(Glight, nboots = 0) +View(Test_fst) document() build_site() +pkgdown::build_site() +check() +check() +check() diff --git a/R/Dif_stats.R b/R/Dif_stats.R index 88feb8a..174e67f 100644 --- a/R/Dif_stats.R +++ b/R/Dif_stats.R @@ -1,4 +1,4 @@ -#' A function to calculate differentiation statistics and perform significance testing with a vcf file. +#' \bold{WARNING! This function has been deprecated and is no longer supported. Please use the Differentiation function.} A function to calculate differentiation statistics and perform significance testing with a vcf file. #' #' @param VCF Character string indicating the name of the vcf file to be used in analysis. #' @param pops Character string indicating the name of the population assignment file. This file should have four columns and be in the same order as your vcf file. The first column named Sample indicates the sample name. The second column named Population indicates the population assignment of each individual. The third column named Long indicates the longitude of the sample. The fourth column named Lat indicates the latitude of the sample. @@ -18,6 +18,8 @@ #' Test <- Dif_stats(VCF = HornedLizard_VCF, pops = HornedLizard_Pop, #' ploidy = 2, statistic = "both", boots = 10, write = FALSE)} Dif_stats <- function(VCF, pops, ploidy, statistic = "both", boots, write = FALSE, prefix = NULL) { + + .Deprecated("Heterozygosity", msg = "The Div_Stats function has been deprecated as of PopGenHelpR v1.3.0 and will dissappear in v2.0.0. Please use the Differentiation function if you wish to estimate Fst, Nei's D, or Jost's D.") # Read in files and convert to necessary formats if(missing(VCF)){ stop("Please supply a vcf file for analysis") diff --git a/R/Differentiation.R b/R/Differentiation.R index 80ae805..7381515 100644 --- a/R/Differentiation.R +++ b/R/Differentiation.R @@ -2,9 +2,9 @@ #' #' @param data Character. String indicating the name of the vcf file, geno file or vcfR object to be used in the analysis. #' @param pops Character. String indicating the name of the population assignment file or dataframe containing the population assignment information for each individual in the data. This file must be in the same order as the vcf file and include columns specifying the individual and the population that individual belongs to. The first column should contain individual names and the second column should indicate the population assignment of each individual. Alternatively, you can indicate the column containing the individual and population information using the individual_col and population_col arguments. -#' @param statistic Character. String or vector indicating the statistic to calculate. Options are any of: all; all of the statistics; Fst, Weir and Cockerham (1984) Fst; NeisD, Nei's D statistic; JostD, Jost's D; StHe, heterozygosity standardized by the average expected heterozygosity; StHo, heterozygosity standardized by the average observed heterozygosity; IR, internal relatedness; HL, homozygosity by locus. +#' @param statistic Character. String or vector indicating the statistic to calculate. Options are any of: all; all of the statistics; Fst, Weir and Cockerham (1984) Fst; NeisD, Nei's D statistic; JostsD, Jost's D. #' @param missing_value Character. String indicating missing data in the input data. It is assumed to be NA, but that may not be true (is likely not) in the case of geno files. -#' @param write Boolean. Whether or not to write the output to files in the current working directory. There will be one or two files for each statistic. Files will be named based on their statistic such as Ho_perpop.csv or Ho_perloc.csv. +#' @param write Boolean. Whether or not to write the output to files in the current working directory. There will be one or two files for each statistic. Files will be named based on their statistic such as Fst_perpop.csv. #' @param prefix Character. Optional argument. String that will be appended to file output. Please provide a prefix if write is set to TRUE. #' @param population_col Numeric. Optional argument (a number) indicating the column that contains the population assignment information. #' @param individual_col Numeric. Optional argument (a number) indicating the column that contains the individuals (i.e., sample name) in the data. @@ -12,20 +12,23 @@ #' @return A list containing the estimated heterozygosity statistics. The per pop values are calculated by taking the average of the per locus estimates. #' #' @references -#' \bold{For Fst:} +#' \bold{Fst:} +#' +#' \href{https://doi.org/10.1111/1755-0998.12129}{Pembleton, L. W., Cogan, N. O., & Forster, J. W. (2013)}. StAMPP: An R package for calculation of genetic differentiation and structure of mixed‐ploidy level populations. Molecular ecology resources, 13(5), 946-952. #' #' \href{https://www.jstor.org/stable/2408641}{Weir, B. S., & Cockerham, C. C. (1984)}. Estimating F-statistics for the analysis of population structure. evolution, 1358-1370. #' -#' \bold{For Nei's D:} +#' \bold{Nei's D:} #' #' \href{https://www.journals.uchicago.edu/doi/abs/10.1086/282771}{Nei, M. (1972)}. Genetic distance between populations. The American Naturalist, 106(949), 283-292. #' +#' \href{https://doi.org/10.1111/1755-0998.12129}{Pembleton, L. W., Cogan, N. O., & Forster, J. W. (2013)}. StAMPP: An R package for calculation of genetic differentiation and structure of mixed‐ploidy level populations. Molecular ecology resources, 13(5), 946-952. #' -#' \bold{For Jost's D:} +#' \bold{Jost's D:} #' -#' \href{https://doi.org/10.1111/j.1365-294X.2008.03887.x}{Jost L (2008)}. GST and its relatives do not measure differentiation. Molecular Ecology, 17, 4015–4026.. +#' \href{https://doi.org/10.1111/j.1365-294X.2008.03887.x}{Jost L (2008)}. GST and its relatives do not measure differentiation. Molecular Ecology, 17, 4015–4026. #' -#' @author \email{Keaka Farleigh (farleik@@miamioh.edu)} +#' @author Keaka Farleigh #' @export #' #' @examples @@ -70,8 +73,7 @@ Differentiation <- function(data, pops, statistic = 'all', missing_value = NA, w # Preserve individual names Inds <- rownames(Dat) Dat <- sapply(Dat, as.numeric) - } - else if(tools::file_ext(data) == 'vcf') { + } else if(tools::file_ext(data) == 'vcf') { Dat <- vcfR::read.vcfR(data, verbose = FALSE) print("VCF file detected, proceeding to formatting.") # Convert the vcf gt slot to a geno style table for calculations @@ -86,13 +88,11 @@ Differentiation <- function(data, pops, statistic = 'all', missing_value = NA, w # Preserve individual names Inds <- rownames(Dat) Dat <- sapply(Dat, as.numeric) - } - else if(tools::file_ext(data) == 'geno'){ + } else if(tools::file_ext(data) == 'geno'){ Dat <- utils::read.table(data) print("Geno file detected, proceeding to formatting. Note, PopGenHelpR assumes that your individuals in the geno file and popmap are in the same order, please check to avoid erroneous inferences.") - } - else { + } else { stop("Please supply a geno file, vcf file, or vcfR object for analysis") } @@ -134,17 +134,134 @@ Differentiation <- function(data, pops, statistic = 'all', missing_value = NA, w } message('Formatting has finished, moving onto calculations') - ####################################### + ######################################## ##### Differentiation calculations ##### - ####################################### + ######################################## + + ### Fst + # We will use the Dat_perpop object + + + Fst <- function(Dat){ + # Dat is the Dat_perpop object + # Create a couple of matrices to store data + ind.nomissing <- matrix(nrow = length(Dat), ncol = length(Dat[3:ncol(Dat[[1]])])) + q.freq <- matrix(nrow = length(Dat), ncol = length(Dat[3:ncol(Dat[[1]])])) + het.freq <- matrix(nrow = length(Dat), ncol = length(Dat[3:ncol(Dat[[1]])])) + + # Calculate the alternate allele frequency and the frequency of heterozygotes per population per locus + for(i in 1:length(Dat)){ + tmp <- Dat[[i]] + # Get frequency of alternate alleles + q.freq[i,] <- (((colSums(tmp[3:ncol(tmp)] == 2, na.rm = T))*2) + colSums(tmp[3:ncol(tmp)] == 1, na.rm = T))/(2*colSums(tmp[3:ncol(tmp)] != "NA")) + # Get number of heterozygotes in each population + het.freq[i,] <- colSums(tmp[3:ncol(tmp)] == 1)/colSums(tmp[3:ncol(tmp)] != "NA") + # Get the number of individuals with data + ind.nomissing[i,] <- colSums(tmp[3:ncol(tmp)] != "NA") + } + # Set the names of the matrices + row.names(q.freq) <- row.names(het.freq) <- row.names(ind.nomissing) <- names(Dat) + + + # Use StamPP's indexing + index2_a <- index1_a <- NULL + step <- 2 + step2 <- 1 + for(i in step:length(unique(Pops))){ + index1_a <- c(index1_a, i:length(unique(Pops))) + index2_a <- c(index2_a, rep(step2, length(step:length(unique(Pops))))) + step=step+1 + step2=step2+1 + } + + # Set r (number of pops) + r <- 2 + + + nPop1 <- ind.nomissing[index1_a,] + nPop2 <- ind.nomissing[index2_a,] + + q1 <- q.freq[index1_a,] + q2 <- q.freq[index2_a,] + + h1 <- het.freq[index1_a,] + h2 <- het.freq[index2_a,] + + ### WC Fst + + # Calculate nbar + n.bar <- (nPop1+nPop2)/r + + nc <- (r*n.bar)-(((nPop1^2)+(nPop2^2)) / (r*n.bar)) + p.bar <- ((nPop1*q1)/(r*n.bar)) + ((nPop2*q2)/(r*n.bar)) + s.square <- ((nPop1*((q1-p.bar)^2))/n.bar) + ((nPop2*((q2-p.bar)^2))/n.bar) + h.bar <- ((nPop1*h1)/(r*n.bar)) + ((nPop2*h2)/(r*n.bar)) + + a <- (n.bar/nc) * (s.square - (1/(n.bar-1)) * ( (p.bar*(1-p.bar)) - (((r-1)/r)*s.square) - ((1/4)*h.bar) )) + b <- (n.bar/(n.bar-1)) * ( (p.bar*(1-p.bar)) - (((r-1)/r)*s.square) - (((2*n.bar-1)/(4*n.bar))*h.bar)) + c <- (1/2)*h.bar - if(write == TRUE && !is.null(prefix)){ - res_write <- which(Stat %in% statistic) - Output2write <- Output[which(Stat_idx %in% res_write)] - for (i in 1:length(Output2write)){ - utils::write.csv(Output2write, file = paste(names(Output2write[i]), ".csv", sep = "_")) + # a, b, and c can be infinite, we will set these values to NA + if(any(is.finite(a) == FALSE) | any(is.finite(b) == FALSE) | any(is.finite(c) == FALSE)){ + a[which(is.infinite(a))] <- NA + b[which(is.infinite(b))] <- NA + c[which(is.infinite(c))] <- NA } - } else if(write == TRUE && is.null(prefix)){ - utils::write.csv(Output2write, file = paste(prefix, '_', names(Output2write[i]), ".csv", sep = "_")) + + if(ncol(q.freq)>1){ #if there is more than 1 locus in the genotype dataset + + if(length(unique(Pops))>2){ #if there are greater than 2 populations, ie. greater than 1 pairwise comparision + + fst <- rowSums(a, na.rm=TRUE) / (rowSums(a, na.rm=TRUE) + rowSums(b, na.rm=TRUE) + rowSums(c, na.rm=TRUE)) #Fst results + + } else{ #if there is only 2 populations, ie. 1 pairwise comparision + + fst <- sum(a, na.rm=TRUE) / (sum(a, na.rm=TRUE) + sum(b, na.rm=TRUE) + sum(c, na.rm=TRUE)) #Fst results + + } + + } else{ # if there is only 1 locus in the genotype dataset + + fst <- a/(a+b+c) #Fst results + } + + fstmat <- matrix(NA, nrow=length(unique(Pops)), ncol=length(unique(Pops)), dimnames=list(names(Dat), names(Dat))) + fstmat[lower.tri(fstmat)]=fst + return(fstmat) + } + + Fst_wc <- Fst(Dat_perpop) + + + ### Nei's D + + ### Jost's D + + + + Output <- list(Fst_wc, ND, JD) + + names(Output) <- c("Fst", "NeisD", "JostsD") + ### Write output + Stat <- c("Fst", "NeisD", "JostsD") + Stat_idx <- c(1,2,3) + + if(length(statistic) == 1 && statistic == "all"){ + return(Output) + } else { + res <- which(Stat %in% statistic) + Output_final<- Output[which(Stat_idx %in% res)] + return(Output_final) + } + + if(write == TRUE && !is.null(prefix)){ + res_write <- which(Stat %in% statistic) + Output2write <- Output[which(Stat_idx %in% res_write)] + for (i in 1:length(Output2write)){ + utils::write.csv(Output2write, file = paste(names(Output2write[i]), ".csv", sep = "_")) + } + } else if(write == TRUE && is.null(prefix)){ + utils::write.csv(Output2write, file = paste(prefix, '_', names(Output2write[i]), ".csv", sep = "_")) + } } diff --git a/R/Div_Stats.R b/R/Div_Stats.R index dce0233..e43a5b4 100644 --- a/R/Div_Stats.R +++ b/R/Div_Stats.R @@ -1,4 +1,4 @@ -#' A function to estimate heterozygosity and the number of private alleles from a vcf file. +#' \bold{WARNING! This function has been deprecated and is no longer supported. Please use the Heterozygosity and Private.alleles functions.} A function to estimate heterozygosity and the number of private alleles from a vcf file. #' #' @param VCF Character string indicating the name of the vcf file to be used in analysis. #' @param pops Character string indicating the name of the population assignment file. This file should have four columns and be in the same order as your vcf file. The first column named Sample indicates the sample name. The second column named Population indicates the population assignment of each individual. The third column named Longitude indicates the longitude of the sample. The fourth column named Latitude indicates the latitude of the sample. diff --git a/R/Heterozygosity.R b/R/Heterozygosity.R index bcbed29..487eef7 100644 --- a/R/Heterozygosity.R +++ b/R/Heterozygosity.R @@ -12,11 +12,11 @@ #' @return A list containing the estimated heterozygosity statistics. The per pop values are calculated by taking the average of the per locus estimates. #' #' @references -#' \bold{For expected (He) and observed heterozygosity (Ho):} +#' \bold{Expected (He) and observed heterozygosity (Ho):} #' #' Nei, M. (1987) Molecular Evolutionary Genetics. Columbia University Press #' -#' \bold{For homozygosity by locus (HL) and internal relatedness (IR):} +#' \bold{Homozygosity by locus (HL) and internal relatedness (IR):} #' #' \href{https://onlinelibrary.wiley.com/doi/full/10.1111/j.1755-0998.2010.02830.x?casa_token=QiNcMSJyunkAAAAA%3Agv-CK7GrUn1bHSgz4qZSOcB2nyHDeR8B1Wtm9bM7q7vZCAcJhNkhTWnpM0EfkSCb2EvkRrr2ArMzC7v7}{Alho, J. S., Välimäki, K., & Merilä, J. (2010)}. Rhh: an R extension for estimating multilocus heterozygosity and heterozygosity–heterozygosity correlation. Molecular ecology resources, 10(4), 720-722. #' @@ -24,11 +24,11 @@ #' #' \href{https://onlinelibrary.wiley.com/doi/10.1111/j.1365-294X.2006.03111.x}{Aparicio, J. M., Ortego, J., & Cordero, P. J. (2006)}. What should we weigh to estimate heterozygosity, alleles or loci?. Molecular Ecology, 15(14), 4659-4665. #' -#' \bold{For heterozygosity standardized by expected (StHe) and observed heterozygosity (StHo):} +#' \bold{Heterozygosity standardized by expected (StHe) and observed heterozygosity (StHo):} #' #' \href{https://academic.oup.com/evolut/article/53/4/1259/6757148}{Coltman, D. W., Pilkington, J. G., Smith, J. A., & Pemberton, J. M. (1999)}. Parasite‐mediated selection against Inbred Soay sheep in a free‐living island populaton. Evolution, 53(4), 1259-1267. #' -#' @author \email{Keaka Farleigh (farleik@@miamioh.edu)} +#' @author Keaka Farleigh #' @export #' #' @examples diff --git a/R/Private.alleles.R b/R/Private.alleles.R index 25d321c..ba19688 100644 --- a/R/Private.alleles.R +++ b/R/Private.alleles.R @@ -9,7 +9,7 @@ #' @return A list containing the count of private alleles in each population and the metadata for those alleles. The metadata is a list that contains the private allele and locus name for each population. #' @export #' -#' @author \email{Keaka Farleigh (farleik@@miamioh.edu)} +#' @author Keaka Farleigh #' #' @examples #' \donttest{ diff --git a/docs/news/index.html b/docs/news/index.html index 66cd259..12ae312 100644 --- a/docs/news/index.html +++ b/docs/news/index.html @@ -51,13 +51,12 @@

PopGenHelpR 1.3.0