diff --git a/DESCRIPTION b/DESCRIPTION index ec18e7b..ebbc658 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: IgGeneUsage Type: Package Title: Differential gene usage in immune repertoires -Version: 1.17.23 +Version: 1.17.24 Authors@R: person(given = "Simo", family = "Kitanovski", diff --git a/R/utils_ppc.R b/R/utils_ppc.R index 8e60e6e..6f68654 100644 --- a/R/utils_ppc.R +++ b/R/utils_ppc.R @@ -104,8 +104,8 @@ get_ppc_condition <- function(glm, } # condition map - condition_map <- data.frame(condition_name = ud$condition_names, - condition_id = ud$condition_id) + condition_map <- data.frame(condition_name = ud$condition_name_of_sample, + condition_id = ud$condition_id_of_sample) condition_map <- condition_map[duplicated(condition_map)==FALSE,] rownames(condition_map) <- condition_map$condition_id yhat$gene_name <- ud$gene_names[yhat$gene_id] diff --git a/R/utils_usage.R b/R/utils_usage.R index 631b8c3..0147a60 100644 --- a/R/utils_usage.R +++ b/R/utils_usage.R @@ -34,8 +34,8 @@ get_usage <- function(u) { check_paired <- function(u, has_balanced_replicates, has_replicates, - has_condition) { - if(has_condition==FALSE) { + has_conditions) { + if(has_conditions==FALSE) { return(FALSE) } if(has_balanced_replicates==FALSE) { @@ -43,18 +43,19 @@ get_usage <- function(u) { } if(has_replicates) { - q <- u[duplicated(u[,c("individual_id","condition","replicate")])==FALSE,] + q <- u[duplicated(u[,c("individual_id","condition", + "replicate_id")]) == FALSE,] q$f <- 1 - q <- aggregate(f~individual_id+condition+replicate, data = q, - FUN = sum, simplify = FALSE, drop = FALSE) + q <- aggregate(f~individual_id+condition+replicate_id, + data = q, FUN = sum, drop = FALSE) q$f[is.null(q$f)|is.na(q$f)] <- 0 return(ifelse(test = any(q$f!=1), yes = FALSE, no = TRUE)) } else { q <- u[duplicated(u[,c("individual_id", "condition")])==FALSE,] q$f <- 1 - q <- aggregate(f~individual_id+condition, data = q, FUN = sum, - simplify = FALSE, drop = FALSE) + q <- aggregate(f~individual_id+condition, + data = q, FUN = sum, drop = FALSE) q$f[is.null(q$f)|is.na(q$f)] <- 0 return(ifelse(test = any(q$f!=1), yes = FALSE, no = TRUE)) } @@ -158,7 +159,7 @@ get_usage <- function(u) { u = u, has_balanced_replicates = has_balanced_replicates, has_replicates = has_replicates, - has_condition = has_condition) + has_conditions = has_conditions) return(list(Y = Y, N = as.numeric(N), diff --git a/data/d_zibb_3.RData b/data/d_zibb_3.RData index e692354..bbb556f 100644 Binary files a/data/d_zibb_3.RData and b/data/d_zibb_3.RData differ diff --git a/inst/scripts/d_zibb_3.R b/inst/scripts/d_zibb_3.R index a34a666..8749498 100644 --- a/inst/scripts/d_zibb_3.R +++ b/inst/scripts/d_zibb_3.R @@ -21,17 +21,24 @@ data { array [N_individual] int condition_id; // id of conditions real phi; real kappa; - array [N_condition] vector [N_gene] beta_condition; vector [N_condition] sigma_individual; + vector [N_condition] sigma_condition; } generated quantities { vector [N_gene] alpha; array [N_individual] vector [N_gene] theta; array [N_individual] vector [N_gene] beta_individual; + array [N_condition] vector [N_gene] beta_condition; // generate usage array [N_gene, N_individual] int Y; + for(i in 1:N_condition) { + for(j in 1:N_gene) { + beta_condition[i][j] = normal_rng(0, sigma_condition[i]); + } + } + for(i in 1:N_gene) { alpha[i] = normal_rng(-3.0, 1.0); @@ -54,10 +61,10 @@ N_condition <- 3 N_individual <- 5 N_gene <- 8 N <- 10^3 -sigma_individual <- runif(n = N_condition, min = 0.1, max = 0.6) -beta_condition <- t(replicate(n = N_condition, expr = rnorm(n = N_gene, mean = 0, sd = 1))) +sigma_individual <- runif(n = N_condition, min = 0.1, max = 0.2) +sigma_condition <- runif(n = N_condition, min = 0.2, max = 0.6) phi <- 200 -kappa <- 0.03 +kappa <- 0.015 condition_id <- rep(x = 1:N_condition, each = N_individual) @@ -67,7 +74,7 @@ l <- list(N_individual = N_individual*N_condition, N = N, condition_id = condition_id, sigma_individual = sigma_individual, - beta_condition = beta_condition, + sigma_condition = sigma_condition, phi = phi, kappa = kappa) @@ -77,7 +84,7 @@ sim <- rstan::sampling(object = m, iter = 1, chains = 1, algorithm = "Fixed_param", - seed = 123456) + seed = 12346) # extract simulation and convert into data frame which can diff --git a/vignettes/User_Manual.Rmd b/vignettes/User_Manual.Rmd index 03e1b54..3ec1aab 100644 --- a/vignettes/User_Manual.Rmd +++ b/vignettes/User_Manual.Rmd @@ -88,19 +88,19 @@ on the posterior distribution of $\gamma$, and are thus related. `r Biocpkg("IgGeneUsage")` has a couple of built-in Ig gene usage datasets. Some were obtained from studies and others were simulated. -Lets look into the simulated dataset `d_zibb_2`. This dataset was generated +Lets look into the simulated dataset `d_zibb_3`. This dataset was generated by a zero-inflated beta-binomial (ZIBB) model, and `r Biocpkg("IgGeneUsage")` was designed to fit ZIBB-distributed data. ```{r} -data("d_zibb_2", package = "IgGeneUsage") -knitr::kable(head(d_zibb_2)) +data("d_zibb_3", package = "IgGeneUsage") +knitr::kable(head(d_zibb_3)) ``` -We can also visualize `d_zibb_2` with `r CRANpkg("ggplot")`: +We can also visualize `d_zibb_3` with `r CRANpkg("ggplot")`: ```{r, fig.width=6, fig.height=3.25} -ggplot(data = d_zibb_2)+ +ggplot(data = d_zibb_3)+ geom_point(aes(x = gene_name, y = gene_usage_count, col = condition), position = position_dodge(width = .7), shape = 21)+ theme_bw(base_size = 11)+ @@ -113,10 +113,10 @@ ggplot(data = d_zibb_2)+ ## DGU analysis As main input `r Biocpkg("IgGeneUsage")` uses a data.frame formatted as e.g. -`d_zibb_2`. Other input parameters allow you to configure specific settings +`d_zibb_3`. Other input parameters allow you to configure specific settings of the `r CRANpkg("rstan")` sampler. -In this example, we analyze `d_zibb_2` with 3 MCMC chains, 1500 iterations +In this example, we analyze `d_zibb_3` with 3 MCMC chains, 1500 iterations each including 500 warm-ups using a single CPU core (Hint: for parallel chain execution set parameter `mcmc_cores` = 3). We report for each model parameter its mean and 95% highest density interval (HDIs). @@ -129,8 +129,8 @@ issue with a reproducible script at the Bioconductor support site or on Github[^3]. ```{r} -M <- DGU(ud = d_zibb_2, # input data - mcmc_warmup = 500, # how many MCMC warm-ups per chain (default: 500) +M <- DGU(ud = d_zibb_3, # input data + mcmc_warmup = 300, # how many MCMC warm-ups per chain (default: 500) mcmc_steps = 1500, # how many MCMC steps per chain (default: 1,500) mcmc_chains = 3, # how many MCMC chain to run (default: 4) mcmc_cores = 1, # how many PC cores to use? (e.g. parallel chains) @@ -182,7 +182,7 @@ summary(M) rstan::check_hmc_diagnostics(M$fit) ``` - * rhat < 1.03 and n_eff > 0 + * rhat < 1.05 and n_eff > 0 ```{r, fig.height = 3, fig.width = 6} @@ -197,7 +197,7 @@ Error bars show 95% HDI of mean posterior prediction. The predictions can be compared with the observed data (x-axis). For points near the diagonal $\rightarrow$ accurate prediction. -```{r, fig.height = 3.25, fig.width = 7} +```{r, fig.height = 4, fig.width = 7} ggplot(data = M$ppc$ppc_rep)+ facet_wrap(facets = ~individual_id, ncol = 5)+ geom_abline(intercept = 0, slope = 1, linetype = "dashed", col = "darkgray")+ @@ -366,7 +366,7 @@ by evaluating their variability for a specific gene. This analysis can be computationally demanding. ```{r} -L <- LOO(ud = d_zibb_2, # input data +L <- LOO(ud = d_zibb_3, # input data mcmc_warmup = 500, # how many MCMC warm-ups per chain (default: 500) mcmc_steps = 1000, # how many MCMC steps per chain (default: 1,500) mcmc_chains = 1, # how many MCMC chain to run (default: 4) @@ -376,6 +376,7 @@ L <- LOO(ud = d_zibb_2, # input data max_treedepth = 10) # tree depth evaluated at each step (default: 12) ``` + Next, we collected the results (GU and DGU) from each LOO iteration: ```{r} @@ -388,32 +389,32 @@ L_dgu <- do.call(rbind, lapply(X = L, FUN = function(x){return(x$dgu)})) ## LOO-DGU: variability of effect size $\gamma$ -```{r, fig.width=6.5, fig.height=4} +```{r, fig.width=6, fig.height=5} ggplot(data = L_dgu)+ + facet_wrap(facets = ~contrast, ncol = 1)+ geom_hline(yintercept = 0, linetype = "dashed", col = "gray")+ geom_errorbar(aes(x = gene_name, y = es_mean, ymin = es_L, ymax = es_H, col = contrast, group = loo_id), - width = 0.1, position = position_dodge(width = 0.5))+ + width = 0.1, position = position_dodge(width = 0.75))+ geom_point(aes(x = gene_name, y = es_mean, col = contrast, group = loo_id), size = 1, - position = position_dodge(width = 0.5))+ + position = position_dodge(width = 0.75))+ theme_bw(base_size = 11)+ - theme(legend.position = "top")+ - ylab(expression(gamma))+ - theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.4)) + theme(legend.position = "none")+ + ylab(expression(gamma)) ``` ## LOO-DGU: variability of $\pi$ -```{r, fig.width=6.5, fig.height=4} +```{r, fig.width=6, fig.height=5} ggplot(data = L_dgu)+ + facet_wrap(facets = ~contrast, ncol = 1)+ geom_point(aes(x = gene_name, y = pmax, col = contrast, group = loo_id), size = 1, position = position_dodge(width = 0.5))+ theme_bw(base_size = 11)+ - theme(legend.position = "top")+ - ylab(expression(pi))+ - theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.4)) + theme(legend.position = "none")+ + ylab(expression(pi)) ``` @@ -425,24 +426,16 @@ ggplot(data = L_gu)+ geom_errorbar(aes(x = gene_name, y = prob_mean, ymin = prob_L, ymax = prob_H, col = condition, group = interaction(loo_id, condition)), - width = 0.1, position = position_dodge(width = 0.5))+ + width = 0.1, position = position_dodge(width = 1))+ geom_point(aes(x = gene_name, y = prob_mean, col = condition, group = interaction(loo_id, condition)), size = 1, - position = position_dodge(width = 0.5))+ + position = position_dodge(width = 1))+ theme_bw(base_size = 11)+ theme(legend.position = "top")+ ylab("GU [probability]")+ theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.4)) ``` -# Hierarchical clustering analaysis - -```{r, fig.width=6, fig.height=4} -# x <- M$theta -x <- acast(individual_id~gene_name, data = M$theta, value.var = "theta_mean") - -plot(hclust(dist(x, method = "euclidean"), method = "average")) -``` # Case Study B: analyzing IRRs containing biological replicates