-
Notifications
You must be signed in to change notification settings - Fork 13
/
Copy pathPC_Covariates.R
269 lines (248 loc) · 13.6 KB
/
PC_Covariates.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
#From expression matrix (exp) and meta data (meta), compute covariate vs PC significance, create results.
#The significance is computed using correlation method for numeric covariates, and ANOVA for categorical covariates.
#Output files (Excel file and PDF plots) will be saved in output directory with prefix as specified by the user (out_prefix). If out_prefix=NULL, will only return results as R list object.
#exp is expression matrix (logTPM, logCPM, etc), meta is meta data. Column names (samples) of exp must match row names of meta.
#PC_cutoff: select which principle components to be used in analysis. Default 5 will select components that explain more than 5% of variance in the data.
#FDR_cutoff will choose the covariate-PC pairs that pass this cutoff.
#N_col: number of columns for facet plots
#Besides the output files, return R list object including the following:
#data.all (PC scores and meta data combined), selVar_All (significant covariate-PC pairs)
#sel_dataN (Numeric covariate results), sel_dataC (Categorical covariate results)
Covariate_PC_Analysis<-function(exp, meta, out_prefix, PC_cutoff=5, FDR_cutoff=0.1, N_col=3, PCA_plots="ranked") {
require(tidyverse); require(cowplot); require(openxlsx)
if (!is.null(out_prefix) ) {
out_dir=dirname(out_prefix)
if (!dir.exists(out_dir)) {dir.create(out_dir)}
}
res1=Compute_Corr_Anova(exp, meta, PC_cutoff=PC_cutoff)
sel_dataN<-get_PC_meta_plot(res1, 'numeric', FDR_cutoff, N_col=N_col)
selVarN=sel_dataN$selVar
if (!is.null(selVarN) && !is.null(out_prefix)) { #output correlation results
graphH=ceiling(nrow(selVarN)/3)*4
graphW=min(nrow(selVarN)*4+1, 12)
graphH=min(50, graphH); graphW=min(50, graphW)
ggsave(str_c(out_prefix, "_Significant_Numeric_Covariates.pdf"), sel_dataN$plot, width=graphW, height=graphH,limitsize = FALSE)
}
sel_dataC<-get_PC_meta_plot(res1, 'categorical', FDR_cutoff, N_col=N_col)
selVarC=sel_dataC$selVar
if (!is.null(selVarC) && !is.null(out_prefix)) { #output anova results
graphH=ceiling(nrow(selVarC)/3)*4
graphW=min(nrow(selVarC)*4+1, 12)
graphH=min(50, graphH); graphW=min(50, graphW)
ggsave(str_c(out_prefix, "_Significant_Categorical_Covariates.pdf"), sel_dataC$plot, width=graphW, height=graphH,limitsize = FALSE)
}
if (!is.null(selVarC)) {selVarC<-selVarC%>%arrange(fdr)}
if (!is.null(selVarN)) {selVarN<-selVarN%>%arrange(fdr)}
PC_info<-data.frame(PC=colnames(res1$PC_scores), Per=res1$percentVar[1:ncol(res1$PC_scores)])%>%mutate(PC_new=str_c(PC, " (", Per, "%)"))
PC_Scores=res1$PC_scores; PC_Scores=data.frame(Sample=rownames(PC_Scores), PC_Scores)
MetaData=res1$meta; MetaData=data.frame(Sample=rownames(MetaData), MetaData)
if (!is.null(out_prefix) ) {
dat=list(Sig_Cat_Anova=selVarC, Sig_Num_corr=selVarN, All_Cat_Anova=res1$anova, All_Num_Corr=res1$corr, PC_Scores=PC_Scores, MetaData=MetaData,
PC_Perentage=PC_info[, 1:2])
write.xlsx(dat, str_c(out_prefix, file='_Covariate_PC_Results.xlsx'))
}
#now create PCA plots
data.all<-cbind(res1$PC_scores, res1$meta)
if (is.null(selVarN)) {selVarN1=NULL
} else { selVarN1<-selVarN%>%mutate(NewText=str_c(covar, " vs. ", PC, ": ", text ))%>%
mutate(Type="Numeric")%>%dplyr::select(PC, covar, Type, NewText, pvalue, fdr) }
if (is.null(selVarC)) {selVarC1=NULL
} else { selVarC1<-selVarC%>%mutate(NewText=str_c(covar, " vs. ", PC, " ", text ))%>%
mutate(Type="Categorical")%>%dplyr::select(PC, covar, Type,NewText, pvalue, fdr) }
selVar_All<-rbind(selVarC1, selVarN1)
if (!is.null(selVar_All)) {
selVar_All<-selVar_All%>%arrange(fdr) #sort by FDR
if (nrow(selVar_All)>0 && !is.null(out_prefix) ) {
pdf(str_c(out_prefix, "_PCA_Plots.pdf"), width=8, height=9)
if (PCA_plots=="ranked") { #plot PCA plots in order of FDR
for (i in 1:nrow(selVar_All)) {
x0=as.character(selVar_All$PC[i])
if (x0=="PC1") {y0="PC2"} else {y0=x0; x0="PC1"}
x=sym(x0); y=sym(y0); color_by=sym(as.character(selVar_All$covar[i]))
p<-ggplot(data.all, aes(x=!!x, y=!!y, col=!!color_by))+geom_point()+
labs(x=PC_info$PC_new[PC_info$PC==x0], y=PC_info$PC_new[PC_info$PC==y0])+ theme_half_open()
#additional text to add
p<-add_sub(p, selVar_All$NewText[i], x=0.2, hjust=0)
print(ggdraw(p))
}
} else { #plot PCA plots for each variate, try to minimize plots
var_list=as.character(sort(unique(selVar_All$covar)))
for (i in 1:length(var_list) ) {
var=as.character(var_list[i])
PCs<-selVar_All%>%dplyr::filter(covar==var)%>%dplyr::select(PC)%>%unlist%>%as.character()
if (length(PCs)==1) {
PCs=sort(c(PCs, ifelse(PCs=="PC1", "PC2", "PC1")))
}
for (j in 2:length(PCs)) {
x=sym(PCs[1]); y=sym(PCs[j]); color_by=sym(var)
p<-ggplot(data.all, aes(x=!!x, y=!!y, col=!!color_by))+geom_point()+
labs(x=PC_info$PC_new[PC_info$PC==PCs[1]], y=PC_info$PC_new[PC_info$PC==PCs[j]])+ theme_half_open()
#additional text to add
more_text<-selVar_All%>%dplyr::filter(covar==var)%>%dplyr::select(NewText)%>%unlist()%>%paste(collapse="\n")
p<-add_sub(p, str_c(more_text, "\n(This plot shows ", PCs[1], " in X and ", PCs[j], " in Y)"), x=0.2, hjust=0)
print(ggdraw(p))
}
}
}
dev.off()
}
}
if (!is.null(selVar_All)) {
names(selVar_All)[c(2, 4, 5, 6)]=c("Covariate", "Significance", "P-value", "FDR")
}
res=list(data.all=data.all, selVar_All=selVar_All, sel_dataN=sel_dataN, sel_dataC=sel_dataC, ncol=N_col, PC_info=PC_info)
if (!is.null(out_prefix) ) {
saveRDS(res, str_c(out_prefix, "_Covariate_PC_Results.rds"))
cat("Please check output files at:", out_dir, "\n")
}
return(res)
}
#Plot one covariate vs. one principle component. Can be useful when there are too many categories for the default faceted plot.
#Input: res_file is RDS file from function Covariate_PC_Analysis. add_text will add a line below the plot to show if the covariate vs PC is significant.
plot_covariate_PC<-function(res_file, pc, var, out_file, width=10, height=8, add_text=TRUE) {
res<-readRDS(res_file)
data.all=res$data.all
selVar=res$selVar_All
if (!(var %in% names(data.all))) {cat("Covariate ", var, " not in MetaData. Please check the spelling of covariate.\n", sep=""); return(NULL)}
if (!(pc %in% names(data.all))) {cat(pc, " not in principle component scores. Please check the spelling.\n", sep=""); return(NULL)}
Num_names=names(dplyr::select_if(data.all, is.numeric))
if (var %in% Num_names) { #numeric covariate
p<-ggplot(data.all, aes(x=!!sym(var), y=!!sym(pc)) )+geom_point()+
stat_summary(fun.data= mean_cl_normal) + geom_smooth(method='lm')+theme_half_open()
} else {
p<-ggplot(data.all, aes(x=!!sym(var), y=!!sym(pc)) )+geom_boxplot()+geom_jitter(alpha=0.7, width=0.1)+theme_half_open() +
theme(axis.text.x = element_text(angle = 90, hjust=1, vjust=0.5))
}
if (add_text) {
info<-selVar%>%dplyr::filter(PC==pc, Covariate==var)
if ( nrow(info)>0) {text_info=info$Significance[1]} else {text_info=str_c(var, " vs. ", pc, " not significant.")}
p<-add_sub(p, text_info, x=0.2, hjust=0)
}
pdf(out_file, width=width, height=height)
print(ggdraw(p))
dev.off()
cat("Please check output at: ", out_file)
}
#compute covariate vs PC significance. For numerical covariates, use correlation, for categorical covariates, use ANOVA.
#exp is expression matrix (logTPM, logCPM, etc), meta is meta data. Column names (samples) of exp must match row names of meta.
#PC_cutoff: select which principle components to be used in analysis. Default 5 will select components that explain more than 5% of variance in the data.
Compute_Corr_Anova<-function(exp, meta, PC_cutoff=5) {
require(tidyverse); require(psych); require(broom)
meta=data.frame(meta)
sel=match(colnames(exp), rownames(meta))
meta=meta[sel,,drop=F ]
stopifnot(identical(colnames(exp), rownames(meta)))
exp[is.na(exp)]=0
pca <- prcomp(t(exp),rank. = 10, scale = FALSE)
percentVar <- round((pca$sdev)^2/sum(pca$sdev^2), 3) * 100
scores <- as.data.frame(pca$x)
Npc=max(2,min(sum(percentVar>PC_cutoff), 10))
cor_mat=NULL
anova_results=NULL
meta_num=dplyr::select_if(meta, is.numeric)
if (ncol(meta_num)>0) {
cov_cor <- psych::corr.test(scores[, 1:Npc, drop=F],
data.matrix(meta_num),
use = 'pairwise.complete.obs',
method = "kendall",
adjust = "none")
all_cor_vals <- cov_cor[["r"]]
all_cor_p <- cov_cor[["p"]]
cor_mat <- reshape2::melt(all_cor_p, varnames = c("PC", "covar"))
colnames(cor_mat)[colnames(cor_mat) == "value"] <- "pvalue"
cor_mat$r <- reshape2::melt(all_cor_vals)[["value"]]
cor_mat$fdr <- p.adjust(cor_mat$pvalue, method = "fdr")
}
sel_col=which(!(colnames(meta) %in% colnames(meta_num)))
meta_cat=meta[, sel_col, drop=F]
if (ncol(meta_cat)>0) {
for (i in 1:ncol(meta_cat)) {
if (!is.factor(meta_cat[1, i])) {meta_cat[, i]=as.factor(meta_cat[, i])} #turn all into factors, even logical
}
nelem<-function(x) nlevels(factor(x))
( nL<-apply(meta_cat, 2, nelem) )
meta_cat=meta_cat[, nL>1 & nL<nrow(meta_cat), drop=F]
if (ncol(meta_cat)>0) {
data.df=cbind(scores[, 1:Npc, drop=F], meta_cat)
anova_results=NULL
for (i in 1:ncol(meta_cat)) {
for (j in 1:Npc) {
test_formula=str_c(colnames(scores)[j], "~", colnames(meta_cat)[i])
res.aov <- aov(as.formula(test_formula), data = data.df)
if ("p.value" %in% names(tidy(res.aov))) {
pvalue=tidy(res.aov)$p.value[1]
} else {pvalue=1}
PWinfo=NA
if (pvalue<0.05) { #run pairwise
PW<-tidy(TukeyHSD(res.aov))
PW.sig<-PW%>%dplyr::filter(adj.p.value<0.05)
if (nrow(PW.sig)>0) {
names(PW.sig)[names(PW.sig)=="contrast"]="comparison" #older version comparison, new version contrast
PWinfo<-paste(paste(PW.sig$comparison, format.pval(PW.sig$adj.p.value, digits=2)), collapse="; ")
}
}
result1=data.frame(PC=colnames(scores)[j], covar=colnames(meta_cat)[i], pvalue, PairWise=PWinfo)
anova_results<-rbind(anova_results, result1)
}
}
anova_results$fdr=p.adjust(anova_results$pvalue, method = "fdr")
}
}
meta=cbind(meta_num, meta_cat)
return(list(PC_scores=scores, meta=meta, percentVar=percentVar, corr=cor_mat, anova=anova_results))
}
#select significant covariate vs PC pairs, and create plot.
#var_type can be "numeric" or "categorical"
#N_col: number of columns for facet plots
get_PC_meta_plot<-function(res, var_type, FDR_cutoff=0.1, N_col=3) {
require(tidyverse);require(cowplot)
if (var_type=="numeric") {
if (is.null(res$corr)) {return(NULL)}
selVar=res$corr%>%dplyr::filter(fdr<FDR_cutoff)
} else {
if (is.null(res$anova)) {return(NULL)}
selVar<-res$anova%>%dplyr::filter(fdr<FDR_cutoff)
}
if (nrow(selVar)==0) {return(NULL)}
meta=res$meta
scores=res$PC_scores
data.df=NULL
all_levels=NULL
for (i in 1:nrow(selVar)) {
N1=match(selVar$PC[i], colnames(scores) )
N2=match(selVar$covar[i], colnames(meta))
values=meta[, N2]
df1=data.frame(PC=selVar$PC[i], covar=selVar$covar[i], Value=values, Score=scores[, N1])
#get levels for categorical covariates
if (var_type!="numeric") {
if (is.factor(values)) {levels=as.character(levels(values))} else {levels=unique(values)}
all_levels=unique(c(all_levels, levels))
}
data.df=rbind(data.df, df1)
}
if (var_type!="numeric") {data.df$Value=factor(data.df$Value, levels=all_levels)}
##make plot
selVar<-selVar%>%arrange(fdr)%>%mutate(wrap=str_c(covar, "\n", PC))
selVar$wrap<-factor(selVar$wrap, levels=selVar$wrap)
data.df<-data.df%>%mutate(wrap=str_c(covar, "\n", PC))
data.df$wrap<-factor(data.df$wrap, levels=selVar$wrap)
if (var_type=="numeric") {
p<-ggplot(data.df, aes(x=Value, y=Score, color=covar) )+geom_point()+
stat_summary(fun.data= mean_cl_normal) + geom_smooth(method='lm')+
facet_wrap(c("wrap"), ncol=N_col,scales = "free")+theme_half_open() +background_grid()+panel_border() +
labs(x="Covariate Value", y="PC Scores", color="numeric\ncovariates")
selVar$text=str_c("r=", round(selVar$r*1000)/1000, "; fdr=",format(selVar$fdr, scientific = T, digits=2))
p<-p+ geom_text( data = selVar, color="black",
mapping = aes(x = -Inf, y = Inf, label = text),
hjust = -0.1, vjust = 1.5)
} else {
p<-ggplot(data.df, aes(x=Value, y=Score, color=covar) )+geom_boxplot()+geom_jitter(alpha=0.7, width=0.1)+
facet_wrap(c("wrap"), ncol=N_col,scales = "free")+theme_half_open() +background_grid()+panel_border() +
theme(axis.text.x = element_text(angle = 90, hjust=1, vjust=0.5)) + labs(x="Covariate Category", y="PC Scores", color="categorical\ncovariates")
selVar$text=str_c("ANOVA fdr: ", format(selVar$fdr, scientific = T, digits=2))
p<-p+ geom_text( data = selVar, color="black",
mapping = aes(x = -Inf, y = Inf, label = text),
hjust = -0.1, vjust = 1.5)
}
return(list(plot=p, data.df=data.df%>%dplyr::select(-wrap), selVar=selVar%>%dplyr::select(-wrap)))
}