Added protein and peptide coverage per single cell calculations

SlavovLab · Dec 17, 2020 · 1483b8e · 1483b8e
1 parent 899a844
commit 1483b8e
Show file tree

Hide file tree

Showing 3 changed files with 4,367 additions and 2 deletions.
diff --git a/.DS_Store b/.DS_Store
diff --git a/code/SCoPE2_analysis.R b/code/SCoPE2_analysis.R
@@ -208,8 +208,8 @@ ev[ev$Raw.file%in%sc.runs, ri.index] <- ev[ev$Raw.file%in%sc.runs, ri.index] / e
 # Organize data into a more convenient data structure:
 
 # Create empty data frame
-ev.melt<-melt(ev[0, c("Raw.file","modseq","Leading.razor.protein","lcbatch","sortday","digest", colnames(ev)[ri.index]) ],
-              id.vars = c("Raw.file","modseq","Leading.razor.protein","lcbatch","sortday","digest"))
+ev.melt<-melt(ev[0, c("Raw.file","modseq","Leading.razor.protein","lcbatch","sortday","digest","scan", colnames(ev)[ri.index]) ],
+              id.vars = c("Raw.file","modseq","Leading.razor.protein","lcbatch","sortday","digest","scan"))
 
 colnames(ev.melt)<-c("Raw.file","sequence","protein","lcbatch","sortday","digest","scan","celltype","quantitation")
 
@@ -4028,3 +4028,338 @@ colnames(s2)<-colnames(hd)
 write.table(s2,file= "dat/sdrf_scope2.tsv",sep="\t", row.names = F)
 
 s2$`source name`
+
+# Peptides and protein quantified per single cell ------------------------------------
+
+# Stringently filtered peptides or proteins quantified / single cell
+pep_stringent<-ncol(t(t3))-na.count(t(t3))
+prot_stringent<-ncol(t(ev.matrix.sc.f.n))-na.count(t(ev.matrix.sc.f.n))
+
+# Stringently filtered peptides or proteins quantified / control well
+ev.matrix.sc.e<-ev.matrix.sc[,colnames(ev.matrix.sc)%in%ev.melt$id[ev.melt$celltype%in%"sc_0"]]
+t3m<-data.frame(ev.matrix.sc.e)
+t3m$pep<-rownames(ev.matrix.sc.e)
+t3m$prot <- ev.melt.pep$protein[match(t3m$pep, ev.melt.pep$sequence)]
+t3m<-melt(t3m, variable.names = c("pep", "prot"))
+colnames(t3m) <-c("pep","prot","id","quantitation")
+t3m2<- t3m %>% group_by(prot, id) %>% summarize(qp = median(quantitation, na.rm=T))
+t4m<-dcast(t3m2, prot ~ id, value.var = "qp", fill=NA)
+t4e<-as.matrix(t4m[,-1]);
+hist(c(t4e), breaks=b.t, xlim=xlim.t)
+
+pep_control<-ncol(t(ev.matrix.sc.e))-na.count(t(ev.matrix.sc.e))
+prot_control<-ncol(t(t4e))-na.count(t(t4e))
+
+range(pep_control); range(prot_control)
+mean(pep_control); mean(prot_control)
+
+# 1% FDR filtered peptides or proteins quantified / single cell
+
+# # Load raw data and annotations
+# #ev<-read.csv("dat/evidence_unfiltered_v2.csv")
+# ev11<-read.delim("dat/dart/dart11/ev_updated.txt")
+# ev16<-read.delim("dat/dart/dart16/ev_updated.txt")
+# 
+# ev_dart<-rbind(ev11, ev16)
+# 
+# tmt11<-read.delim("/Volumes/GoogleDrive/My Drive/MS/SCoPE/public_uploads/massive.quant_20200604/tmt11/combined/txt/evidence.txt")#, delim="\t")
+# tmt16<-read.delim("/Volumes/GoogleDrive/My Drive/MS/SCoPE/public_uploads/massive.quant_20200604/tmt16/combined/txt/evidence.txt")#, delim="\t")
+# 
+# tmt11[,setdiff(colnames(tmt16), colnames(tmt11))]<-NA
+# ev<-rbind(tmt11[,colnames(tmt16)], tmt16)
+# 
+# ev_dart$uid<-paste(ev_dart$Modified.sequence, ev_dart$Charge, ev_dart$PEP,ev_dart$Raw.file)
+# ev$uid<-paste(ev$Modified.sequence, ev$Charge, ev$PEP,ev$Raw.file)
+# 
+# ev<-merge(ev,ev_dart, by="uid")
+# 
+# colnames(ev)[grep(".x", colnames(ev), fixed=T)]<-gsub(".x","",colnames(ev)[grep(".x", colnames(ev), fixed=T)])
+# 
+# design<-read.csv("dat/annotation.csv")
+# batch<-read.csv("dat/batch.csv")
+# design$Set<-paste0("X",design$Set)
+# batch$set<-paste0("X",batch$set)
+# ev$Raw.file<-paste0("X", ev$Raw.file)
+# # Parse protein names
+# parse_row<-grep("|",ev$Leading.razor.protein, fixed=T)
+# split_prot<-str_split(ev$Leading.razor.protein[parse_row], pattern = fixed("|"))
+# split_prot2<-unlist(split_prot)[seq(2,3*length(split_prot),3)]
+# ev$Leading.razor.protein[parse_row]<-split_prot2
+# 
+# # Attach batch data to protein data
+# ev[,colnames(batch)[-1]]<-NA
+# for(X in batch$set){
+# 
+#   ev$lcbatch[ev$Raw.file==X] <- as.character(batch$lcbatch[batch$set%in%X])
+#   ev$sortday[ev$Raw.file==X] <- as.character(batch$sortday[batch$set%in%X])
+#   ev$digest[ev$Raw.file==X] <- as.character(batch$digest[batch$set%in%X])
+# 
+# }
+# 
+# # Create unique peptide+charge column:
+# ev$modseq<-paste0(ev$Modified.sequence,ev$Charge)
+# 
+# 
+# 
+# save(ev,design,batch,file="dat/raw.RData")
+load("dat/raw.RData")
+
+# Add X in front of experiment names because R doesn't like column names starting with numbers
+
+
+# Group the experimental runs by type: single cell, 10 cell, 100 cell, 1000 cell, or ladder
+all.runs<-unique(ev$Raw.file); length(all.runs)
+c10.runs<-c( all.runs[grep("col19", all.runs)] , all.runs[grep("col20", all.runs)] ); length(unique(c10.runs))
+c100.runs<-c( all.runs[grep("col21", all.runs)] , all.runs[grep("col22", all.runs)] ); length(unique(c100.runs))
+c1000.runs<-c( all.runs[grep("col23", all.runs)] ); length(unique(c1000.runs))
+ladder.runs<-c( all.runs[grep("col24", all.runs)] ); length(unique(ladder.runs))
+carrier.runs<-c( all.runs[grep("arrier", all.runs)] ); length(unique(carrier.runs))
+other.runs<-c( all.runs[c(grep("Ref", all.runs), grep("Master", all.runs), grep("SQC", all.runs), grep("blank", all.runs) )] ); length(unique(other.runs))
+sc.runs<-all.runs[!all.runs%in%c(c10.runs,c1000.runs,c100.runs,ladder.runs, carrier.runs, other.runs)]; length(unique(sc.runs))
+
+# Remove experimental sets concurrent with low mass spec performance
+i1<-grep("FP103", sc.runs)
+i2<-grep("FP95", sc.runs)
+i3<-grep("FP96", sc.runs)
+
+if(length(c(i1,i2,i3))>0){
+
+  remove.sc.runs<-sc.runs[sc.runs%in%sc.runs[c(i1,i2,i3)]]
+
+  ev<-ev[!ev$Raw.file%in%remove.sc.runs, ]
+
+}
+
+filter_step<-c()
+cutoff<-c()
+exps_left<-c()
+sc_left<-c()
+prot_left<-c()
+pep_left<-c()
+
+# # Counting 
+# filter_step<-c(filter_step, "Baseline")
+# cutoff<-c(cutoff, "None")
+# exps_left<-c(exps_left, length(unique(ev$Raw.file[ev$Raw.file%in%sc.runs])) )
+# runs_remaining<-unique(ev$Raw.file[ev$Raw.file%in%sc.runs])
+# temp1<- length(which( unlist(design[design$Set%in%runs_remaining, -1])%in%c("sc_m0","sc_u") ) )
+# sc_left<-c(sc_left, temp1 )
+# prot_left<-c(prot_left, length(unique(ev$Leading.razor.protein[ev$Raw.file%in%runs_remaining])))
+# pep_left<-c( pep_left,length(unique(ev$Modified.sequence[ev$Raw.file%in%runs_remaining])))
+
+# Remove sets with less than X peptides from 10, 100 cell runs:
+ev<-ev[ !( (ev$Raw.file%in%names(table(ev$Raw.file))[table(ev$Raw.file)<thres_ids_c10c100])&(ev$Raw.file%in%c(c10.runs, c100.runs)) ) , ]
+
+# Remove sets with less than X peptides from single cell runs:
+ev<-ev[ !( (ev$Raw.file%in%names(table(ev$Raw.file))[table(ev$Raw.file)<thres_ids_sc])&(ev$Raw.file%in%sc.runs) ), ]
+
+# # Counting 
+# filter_step<-c(filter_step, "Low ID rate")
+# cutoff<-c(cutoff, "500")
+# exps_left<-c(exps_left, length(unique(ev$Raw.file[ev$Raw.file%in%sc.runs])) )
+# runs_remaining<-unique(ev$Raw.file[ev$Raw.file%in%sc.runs])
+# temp1<- length(which( unlist(design[design$Set%in%runs_remaining, -1])%in%c("sc_m0","sc_u") ) )
+# sc_left<-c(sc_left, temp1 )
+# prot_left<-c(prot_left, length(unique(ev$Leading.razor.protein[ev$Raw.file%in%runs_remaining])))
+# pep_left<-c( pep_left,length(unique(ev$Modified.sequence[ev$Raw.file%in%runs_remaining])))
+
+
+# Which columns hold the TMT Reporter ion (RI) data
+ri.index<-which(colnames(ev)%in%paste0("Reporter.intensity.",1:16))
+
+# Make sure all runs are described in design, if not, print and remove them:
+not.described<-unique(ev$Raw.file)[ !unique(ev$Raw.file) %in% paste0(design$Set) ]
+ev<-ev[!ev$Raw.file%in%not.described,]
+
+# Calculate FDR - single cell runs
+ev.sc<-ev[ev$Raw.file%in%sc.runs, ]
+ev.sc$fdr<-calc_fdr(ev.sc$dart_PEP)
+fdr_prots<-aggregate(dart_PEP ~ Leading.razor.protein, data=ev.sc, FUN=min.na)
+fdr_prots$fdr<-calc_fdr(fdr_prots$dart_PEP)
+ev.sc<-ev.sc[ev.sc$Leading.razor.protein%in%fdr_prots$Leading.razor.protein[fdr_prots$fdr<0.01], ]
+ev.sc<-ev.sc[ev.sc$fdr<0.01, ]
+
+# Calculate FDR - single cell runs
+ev.bulk<-ev[ev$Raw.file%in%c(c10.runs, c100.runs), ]
+ev.bulk$fdr<-calc_fdr(ev.bulk$dart_PEP)
+fdr_prots<-aggregate(dart_PEP ~ Leading.razor.protein, data=ev.bulk, FUN=min.na)
+fdr_prots$fdr<-calc_fdr(fdr_prots$dart_PEP)
+ev.bulk<-ev.bulk[ev.bulk$Leading.razor.protein%in%fdr_prots$Leading.razor.protein[fdr_prots$fdr<0.01], ]
+ev.bulk<-ev.bulk[ev.bulk$fdr<0.01, ]
+
+# Recombine filtered data
+ev<-rbind(ev.sc, ev.bulk)
+
+# # Counting 
+# filter_step<-c(filter_step, "FDR")
+# cutoff<-c(cutoff, "xx")
+# exps_left<-c(exps_left, length(unique(ev$Raw.file[ev$Raw.file%in%sc.runs])) )
+# runs_remaining<-unique(ev$Raw.file[ev$Raw.file%in%sc.runs])
+# temp1<- length(which( unlist(design[design$Set%in%runs_remaining, -1])%in%c("sc_m0","sc_u") ) )
+# sc_left<-c(sc_left, temp1 )
+# prot_left<-c(prot_left, length(unique(ev$Leading.razor.protein[ev$Raw.file%in%runs_remaining])))
+# pep_left<-c( pep_left,length(unique(ev$Modified.sequence[ev$Raw.file%in%runs_remaining])))
+
+# Filter out reverse hits, contaminants, and contaminated spectra...
+if(length(grep("REV", ev$Leading.razor.protein))>0){ ev<-ev[-grep("REV", ev$Leading.razor.protein),] }
+
+if(length(grep("CON", ev$Leading.razor.protein))>0){ ev<-ev[-grep("CON", ev$Leading.razor.protein),] }
+
+# Record the minimally-filtered evidence information for later: 
+ev_standard_filtering<-ev[,c("Raw.file","dart_PEP","Leading.razor.protein","PEP","Modified.sequence")]
+
+# # Counting 
+# filter_step<-c(filter_step, "REVCON")
+# cutoff<-c(cutoff, "xx")
+# exps_left<-c(exps_left, length(unique(ev$Raw.file[ev$Raw.file%in%sc.runs])) )
+# runs_remaining<-unique(ev$Raw.file[ev$Raw.file%in%sc.runs])
+# temp1<- length(which( unlist(design[design$Set%in%runs_remaining, -1])%in%c("sc_m0","sc_u") ) )
+# sc_left<-c(sc_left, temp1 )
+# prot_left<-c(prot_left, length(unique(ev$Leading.razor.protein[ev$Raw.file%in%runs_remaining])))
+# pep_left<-c( pep_left,length(unique(ev$Modified.sequence[ev$Raw.file%in%runs_remaining])))
+
+
+
+# ev<-ev[!is.na(ev$PIF),]
+# 
+# ev<-ev[ev$PIF>0.8,]
+
+# # Counting 
+# filter_step<-c(filter_step, "PIF")
+# cutoff<-c(cutoff, "xx")
+# exps_left<-c(exps_left, length(unique(ev$Raw.file[ev$Raw.file%in%sc.runs])) )
+# runs_remaining<-unique(ev$Raw.file[ev$Raw.file%in%sc.runs])
+# temp1<- length(which( unlist(design[design$Set%in%runs_remaining, -1])%in%c("sc_m0","sc_u") ) )
+# sc_left<-c(sc_left, temp1 )
+# prot_left<-c(prot_left, length(unique(ev$Leading.razor.protein[ev$Raw.file%in%runs_remaining])))
+# pep_left<-c( pep_left,length(unique(ev$Modified.sequence[ev$Raw.file%in%runs_remaining])))
+
+# Remove peptides that are more the 10% the intensity of the carrier in the single cell runs (only)
+ev<-as.data.frame(ev)
+ev$mrri<-0
+# ev$mrri[ev$Raw.file%in%sc.runs] <- rowMeans(ev[ev$Raw.file%in%sc.runs, ri.index[4:16]] / ev[ev$Raw.file%in%sc.runs, ri.index[1]], na.rm = T)
+# ev<-ev[ev$mrri < 0.1, ]
+
+# # Counting 
+# filter_step<-c(filter_step, "High int peps")
+# cutoff<-c(cutoff, "xx")
+# exps_left<-c(exps_left, length(unique(ev$Raw.file[ev$Raw.file%in%sc.runs])) )
+# runs_remaining<-unique(ev$Raw.file[ev$Raw.file%in%sc.runs])
+# temp1<- length(which( unlist(design[design$Set%in%runs_remaining, -1])%in%c("sc_m0","sc_u") ) )
+# sc_left<-c(sc_left, temp1 )
+# prot_left<-c(prot_left, length(unique(ev$Leading.razor.protein[ev$Raw.file%in%runs_remaining])))
+# pep_left<-c( pep_left,length(unique(ev$Modified.sequence[ev$Raw.file%in%runs_remaining])))
+
+# Scan number
+ev$scan<-paste0(ev$Raw.file,"_",ev$MS.MS.scan.number)
+
+# Normalize single cell runs to normalization channel
+ev<-as.data.frame(ev)
+ev[ev$Raw.file%in%sc.runs, ri.index] <- ev[ev$Raw.file%in%sc.runs, ri.index] / ev[ev$Raw.file%in%sc.runs, ri.index[2]]
+
+# Organize data into a more convenient data structure:
+
+# Create empty data frame
+ev.melt<-melt(ev[0, c("Raw.file","modseq","Leading.razor.protein","lcbatch","sortday","digest","scan", colnames(ev)[ri.index]) ],
+              id.vars = c("Raw.file","modseq","Leading.razor.protein","lcbatch","sortday","digest","scan"))
+
+colnames(ev.melt)<-c("Raw.file","sequence","protein","lcbatch","sortday","digest","scan","celltype","quantitation")
+
+# Record mapping of cell type to Channel:
+ct.v<-c()
+qt.v<-c()
+
+# Create a unique ID string
+unique.id.numeric<-1:16
+unique.id<-paste0("i",unique.id.numeric)
+
+# Give each sample a unique identifier
+for(X in unique(ev$Raw.file)){
+
+  # Subset data by X'th experiment
+  ev.t<-ev[ev$Raw.file%in%X, ]
+
+  if(is.na(X)){next}
+
+  # Name the RI columns by what sample type they are: carrier, single cell, unused, etc...
+  colnames(ev.t)[ri.index]<-paste0(as.character(unlist(design[design$Set==X,-1])),"-", unique.id)
+
+  if(length(ri.index)>0){
+
+    if( X%in%c(c10.runs, c100.runs) ){
+
+      ev.t[,ri.index]<-ev.t[,ri.index] / apply(ev.t[, ri.index], 1, median.na)
+
+    }
+
+    # Melt it! and combine with other experimental sets
+    ev.t.melt<-melt(ev.t[, c("Raw.file","modseq","Leading.razor.protein","lcbatch","sortday","digest","scan", colnames(ev.t)[ri.index]) ],
+                    id.vars = c("Raw.file","modseq","Leading.razor.protein","lcbatch","sortday","digest","scan"));
+
+    # Record mapping of cell type to Channel:
+    ct.v<-c(ct.v, unique.id[which(ri.index%in%ri.index)] )
+    qt.v<-c(qt.v, colnames(ev)[ri.index] )
+
+    colnames(ev.t.melt)<-c("Raw.file","sequence","protein","lcbatch","sortday","digest","scan","celltype","quantitation")
+
+    ev.melt<-rbind(ev.melt, ev.t.melt)
+
+  }
+
+  # Update unique ID string
+  unique.id.numeric<-unique.id.numeric + 16
+  unique.id<-paste0("i", unique.id.numeric)
+
+}
+
+c2q<-data.frame(ct.v, qt.v); colnames(c2q)<-c("celltype","channel")
+
+# Grab the unique number associate to each and every cell, carrier channel, and empty channel
+ev.melt$id<-unlist(strsplit(as.character(ev.melt$celltype),"-"))[seq(2,length(unlist(strsplit(as.character(ev.melt$celltype),"-"))),2)]
+ev.melt$celltype<-unlist(strsplit(as.character(ev.melt$celltype),"-"))[seq(1,length(unlist(strsplit(as.character(ev.melt$celltype),"-"))),2)]
+ev.melt$id<-as.factor(ev.melt$id)
+
+# Remove duplicate observations of peptides from a single experiment
+ev.melt<-remove.duplicates(ev.melt,c("sequence","id") )
+ev.melt<-ev.melt[!is.na(ev.melt$protein), ]
+
+# Create additional meta data matrices
+ev.melt.uniqueID<-remove.duplicates(ev.melt,"id")
+ev.melt.pep<-remove.duplicates(ev.melt, c("sequence","protein") )
+
+# Create data frame of peptides x cells, populated by quantitation
+ev.unmelt<-dcast(ev.melt, sequence ~ id, value.var = "quantitation", fill=NA)
+
+# Also create matrix of same shape
+ev.matrix<-as.matrix(ev.unmelt[,-1]); row.names(ev.matrix)<-ev.unmelt$sequence
+
+# Replace all 0s with NA
+ev.matrix[ev.matrix==0]<-NA
+ev.matrix[ev.matrix==Inf]<-NA
+ev.matrix[ev.matrix==-Inf]<-NA
+
+# Divide matrix into single cells (including intentional blanks) and carriers
+sc_cols<-unique(ev.melt$id[(ev.melt$celltype%in%c("sc_u","sc_m0"))&(ev.melt$Raw.file%in%sc.runs)])
+ev.matrix.sc<-ev.matrix[, sc_cols]
+
+
+ev.matrix.sc.e<-ev.matrix.sc
+t3m<-data.frame(ev.matrix.sc.e)
+t3m$pep<-rownames(ev.matrix.sc.e)
+t3m$prot <- ev.melt.pep$protein[match(t3m$pep, ev.melt.pep$sequence)]
+t3m<-melt(t3m, variable.names = c("pep", "prot"))
+colnames(t3m) <-c("pep","prot","id","quantitation")
+t3m2<- t3m %>% group_by(prot, id) %>% summarize(qp = median(quantitation, na.rm=T))
+t4m<-dcast(t3m2, prot ~ id, value.var = "qp", fill=NA)
+t4m<-as.matrix(t4m[,-1]);
+
+pep_1pctFDR<-ncol(t(ev.matrix.sc))-na.count(t(ev.matrix.sc))
+prot_1pctFDR<-ncol(t(t4m))-na.count(t(t4m))
+
+# Equalize lengths before putting together in df: 
+pep_stringent<-c(pep_stringent, rep(NA, length(pep_1pctFDR)-length(pep_stringent)))
+prot_stringent<-c(prot_stringent, rep(NA, length(prot_1pctFDR)-length(prot_stringent)))
+
+coverage_sc<-data.frame(prot_1pctFDR, prot_stringent, pep_1pctFDR, pep_stringent)
+
+write.csv(coverage_sc, file="dat/coverage_per_sc.csv", row.names = F)