Skip to content

Commit

Permalink
Added protein and peptide coverage per single cell calculations
Browse files Browse the repository at this point in the history
  • Loading branch information
hspekt committed Dec 17, 2020
1 parent 899a844 commit 1483b8e
Show file tree
Hide file tree
Showing 3 changed files with 4,367 additions and 2 deletions.
Binary file modified .DS_Store
Binary file not shown.
339 changes: 337 additions & 2 deletions code/SCoPE2_analysis.R
Original file line number Diff line number Diff line change
Expand Up @@ -208,8 +208,8 @@ ev[ev$Raw.file%in%sc.runs, ri.index] <- ev[ev$Raw.file%in%sc.runs, ri.index] / e
# Organize data into a more convenient data structure:

# Create empty data frame
ev.melt<-melt(ev[0, c("Raw.file","modseq","Leading.razor.protein","lcbatch","sortday","digest", colnames(ev)[ri.index]) ],
id.vars = c("Raw.file","modseq","Leading.razor.protein","lcbatch","sortday","digest"))
ev.melt<-melt(ev[0, c("Raw.file","modseq","Leading.razor.protein","lcbatch","sortday","digest","scan", colnames(ev)[ri.index]) ],
id.vars = c("Raw.file","modseq","Leading.razor.protein","lcbatch","sortday","digest","scan"))

colnames(ev.melt)<-c("Raw.file","sequence","protein","lcbatch","sortday","digest","scan","celltype","quantitation")

Expand Down Expand Up @@ -4028,3 +4028,338 @@ colnames(s2)<-colnames(hd)
write.table(s2,file= "dat/sdrf_scope2.tsv",sep="\t", row.names = F)

s2$`source name`

# Peptides and protein quantified per single cell ------------------------------------

# Stringently filtered peptides or proteins quantified / single cell
pep_stringent<-ncol(t(t3))-na.count(t(t3))
prot_stringent<-ncol(t(ev.matrix.sc.f.n))-na.count(t(ev.matrix.sc.f.n))

# Stringently filtered peptides or proteins quantified / control well
ev.matrix.sc.e<-ev.matrix.sc[,colnames(ev.matrix.sc)%in%ev.melt$id[ev.melt$celltype%in%"sc_0"]]
t3m<-data.frame(ev.matrix.sc.e)
t3m$pep<-rownames(ev.matrix.sc.e)
t3m$prot <- ev.melt.pep$protein[match(t3m$pep, ev.melt.pep$sequence)]
t3m<-melt(t3m, variable.names = c("pep", "prot"))
colnames(t3m) <-c("pep","prot","id","quantitation")
t3m2<- t3m %>% group_by(prot, id) %>% summarize(qp = median(quantitation, na.rm=T))
t4m<-dcast(t3m2, prot ~ id, value.var = "qp", fill=NA)
t4e<-as.matrix(t4m[,-1]);
hist(c(t4e), breaks=b.t, xlim=xlim.t)

pep_control<-ncol(t(ev.matrix.sc.e))-na.count(t(ev.matrix.sc.e))
prot_control<-ncol(t(t4e))-na.count(t(t4e))

range(pep_control); range(prot_control)
mean(pep_control); mean(prot_control)

# 1% FDR filtered peptides or proteins quantified / single cell

# # Load raw data and annotations
# #ev<-read.csv("dat/evidence_unfiltered_v2.csv")
# ev11<-read.delim("dat/dart/dart11/ev_updated.txt")
# ev16<-read.delim("dat/dart/dart16/ev_updated.txt")
#
# ev_dart<-rbind(ev11, ev16)
#
# tmt11<-read.delim("/Volumes/GoogleDrive/My Drive/MS/SCoPE/public_uploads/massive.quant_20200604/tmt11/combined/txt/evidence.txt")#, delim="\t")
# tmt16<-read.delim("/Volumes/GoogleDrive/My Drive/MS/SCoPE/public_uploads/massive.quant_20200604/tmt16/combined/txt/evidence.txt")#, delim="\t")
#
# tmt11[,setdiff(colnames(tmt16), colnames(tmt11))]<-NA
# ev<-rbind(tmt11[,colnames(tmt16)], tmt16)
#
# ev_dart$uid<-paste(ev_dart$Modified.sequence, ev_dart$Charge, ev_dart$PEP,ev_dart$Raw.file)
# ev$uid<-paste(ev$Modified.sequence, ev$Charge, ev$PEP,ev$Raw.file)
#
# ev<-merge(ev,ev_dart, by="uid")
#
# colnames(ev)[grep(".x", colnames(ev), fixed=T)]<-gsub(".x","",colnames(ev)[grep(".x", colnames(ev), fixed=T)])
#
# design<-read.csv("dat/annotation.csv")
# batch<-read.csv("dat/batch.csv")
# design$Set<-paste0("X",design$Set)
# batch$set<-paste0("X",batch$set)
# ev$Raw.file<-paste0("X", ev$Raw.file)
# # Parse protein names
# parse_row<-grep("|",ev$Leading.razor.protein, fixed=T)
# split_prot<-str_split(ev$Leading.razor.protein[parse_row], pattern = fixed("|"))
# split_prot2<-unlist(split_prot)[seq(2,3*length(split_prot),3)]
# ev$Leading.razor.protein[parse_row]<-split_prot2
#
# # Attach batch data to protein data
# ev[,colnames(batch)[-1]]<-NA
# for(X in batch$set){
#
# ev$lcbatch[ev$Raw.file==X] <- as.character(batch$lcbatch[batch$set%in%X])
# ev$sortday[ev$Raw.file==X] <- as.character(batch$sortday[batch$set%in%X])
# ev$digest[ev$Raw.file==X] <- as.character(batch$digest[batch$set%in%X])
#
# }
#
# # Create unique peptide+charge column:
# ev$modseq<-paste0(ev$Modified.sequence,ev$Charge)
#
#
#
# save(ev,design,batch,file="dat/raw.RData")
load("dat/raw.RData")

# Add X in front of experiment names because R doesn't like column names starting with numbers


# Group the experimental runs by type: single cell, 10 cell, 100 cell, 1000 cell, or ladder
all.runs<-unique(ev$Raw.file); length(all.runs)
c10.runs<-c( all.runs[grep("col19", all.runs)] , all.runs[grep("col20", all.runs)] ); length(unique(c10.runs))
c100.runs<-c( all.runs[grep("col21", all.runs)] , all.runs[grep("col22", all.runs)] ); length(unique(c100.runs))
c1000.runs<-c( all.runs[grep("col23", all.runs)] ); length(unique(c1000.runs))
ladder.runs<-c( all.runs[grep("col24", all.runs)] ); length(unique(ladder.runs))
carrier.runs<-c( all.runs[grep("arrier", all.runs)] ); length(unique(carrier.runs))
other.runs<-c( all.runs[c(grep("Ref", all.runs), grep("Master", all.runs), grep("SQC", all.runs), grep("blank", all.runs) )] ); length(unique(other.runs))
sc.runs<-all.runs[!all.runs%in%c(c10.runs,c1000.runs,c100.runs,ladder.runs, carrier.runs, other.runs)]; length(unique(sc.runs))

# Remove experimental sets concurrent with low mass spec performance
i1<-grep("FP103", sc.runs)
i2<-grep("FP95", sc.runs)
i3<-grep("FP96", sc.runs)

if(length(c(i1,i2,i3))>0){

remove.sc.runs<-sc.runs[sc.runs%in%sc.runs[c(i1,i2,i3)]]

ev<-ev[!ev$Raw.file%in%remove.sc.runs, ]

}

filter_step<-c()
cutoff<-c()
exps_left<-c()
sc_left<-c()
prot_left<-c()
pep_left<-c()

# # Counting
# filter_step<-c(filter_step, "Baseline")
# cutoff<-c(cutoff, "None")
# exps_left<-c(exps_left, length(unique(ev$Raw.file[ev$Raw.file%in%sc.runs])) )
# runs_remaining<-unique(ev$Raw.file[ev$Raw.file%in%sc.runs])
# temp1<- length(which( unlist(design[design$Set%in%runs_remaining, -1])%in%c("sc_m0","sc_u") ) )
# sc_left<-c(sc_left, temp1 )
# prot_left<-c(prot_left, length(unique(ev$Leading.razor.protein[ev$Raw.file%in%runs_remaining])))
# pep_left<-c( pep_left,length(unique(ev$Modified.sequence[ev$Raw.file%in%runs_remaining])))

# Remove sets with less than X peptides from 10, 100 cell runs:
ev<-ev[ !( (ev$Raw.file%in%names(table(ev$Raw.file))[table(ev$Raw.file)<thres_ids_c10c100])&(ev$Raw.file%in%c(c10.runs, c100.runs)) ) , ]

# Remove sets with less than X peptides from single cell runs:
ev<-ev[ !( (ev$Raw.file%in%names(table(ev$Raw.file))[table(ev$Raw.file)<thres_ids_sc])&(ev$Raw.file%in%sc.runs) ), ]

# # Counting
# filter_step<-c(filter_step, "Low ID rate")
# cutoff<-c(cutoff, "500")
# exps_left<-c(exps_left, length(unique(ev$Raw.file[ev$Raw.file%in%sc.runs])) )
# runs_remaining<-unique(ev$Raw.file[ev$Raw.file%in%sc.runs])
# temp1<- length(which( unlist(design[design$Set%in%runs_remaining, -1])%in%c("sc_m0","sc_u") ) )
# sc_left<-c(sc_left, temp1 )
# prot_left<-c(prot_left, length(unique(ev$Leading.razor.protein[ev$Raw.file%in%runs_remaining])))
# pep_left<-c( pep_left,length(unique(ev$Modified.sequence[ev$Raw.file%in%runs_remaining])))


# Which columns hold the TMT Reporter ion (RI) data
ri.index<-which(colnames(ev)%in%paste0("Reporter.intensity.",1:16))

# Make sure all runs are described in design, if not, print and remove them:
not.described<-unique(ev$Raw.file)[ !unique(ev$Raw.file) %in% paste0(design$Set) ]
ev<-ev[!ev$Raw.file%in%not.described,]

# Calculate FDR - single cell runs
ev.sc<-ev[ev$Raw.file%in%sc.runs, ]
ev.sc$fdr<-calc_fdr(ev.sc$dart_PEP)
fdr_prots<-aggregate(dart_PEP ~ Leading.razor.protein, data=ev.sc, FUN=min.na)
fdr_prots$fdr<-calc_fdr(fdr_prots$dart_PEP)
ev.sc<-ev.sc[ev.sc$Leading.razor.protein%in%fdr_prots$Leading.razor.protein[fdr_prots$fdr<0.01], ]
ev.sc<-ev.sc[ev.sc$fdr<0.01, ]

# Calculate FDR - single cell runs
ev.bulk<-ev[ev$Raw.file%in%c(c10.runs, c100.runs), ]
ev.bulk$fdr<-calc_fdr(ev.bulk$dart_PEP)
fdr_prots<-aggregate(dart_PEP ~ Leading.razor.protein, data=ev.bulk, FUN=min.na)
fdr_prots$fdr<-calc_fdr(fdr_prots$dart_PEP)
ev.bulk<-ev.bulk[ev.bulk$Leading.razor.protein%in%fdr_prots$Leading.razor.protein[fdr_prots$fdr<0.01], ]
ev.bulk<-ev.bulk[ev.bulk$fdr<0.01, ]

# Recombine filtered data
ev<-rbind(ev.sc, ev.bulk)

# # Counting
# filter_step<-c(filter_step, "FDR")
# cutoff<-c(cutoff, "xx")
# exps_left<-c(exps_left, length(unique(ev$Raw.file[ev$Raw.file%in%sc.runs])) )
# runs_remaining<-unique(ev$Raw.file[ev$Raw.file%in%sc.runs])
# temp1<- length(which( unlist(design[design$Set%in%runs_remaining, -1])%in%c("sc_m0","sc_u") ) )
# sc_left<-c(sc_left, temp1 )
# prot_left<-c(prot_left, length(unique(ev$Leading.razor.protein[ev$Raw.file%in%runs_remaining])))
# pep_left<-c( pep_left,length(unique(ev$Modified.sequence[ev$Raw.file%in%runs_remaining])))

# Filter out reverse hits, contaminants, and contaminated spectra...
if(length(grep("REV", ev$Leading.razor.protein))>0){ ev<-ev[-grep("REV", ev$Leading.razor.protein),] }

if(length(grep("CON", ev$Leading.razor.protein))>0){ ev<-ev[-grep("CON", ev$Leading.razor.protein),] }

# Record the minimally-filtered evidence information for later:
ev_standard_filtering<-ev[,c("Raw.file","dart_PEP","Leading.razor.protein","PEP","Modified.sequence")]

# # Counting
# filter_step<-c(filter_step, "REVCON")
# cutoff<-c(cutoff, "xx")
# exps_left<-c(exps_left, length(unique(ev$Raw.file[ev$Raw.file%in%sc.runs])) )
# runs_remaining<-unique(ev$Raw.file[ev$Raw.file%in%sc.runs])
# temp1<- length(which( unlist(design[design$Set%in%runs_remaining, -1])%in%c("sc_m0","sc_u") ) )
# sc_left<-c(sc_left, temp1 )
# prot_left<-c(prot_left, length(unique(ev$Leading.razor.protein[ev$Raw.file%in%runs_remaining])))
# pep_left<-c( pep_left,length(unique(ev$Modified.sequence[ev$Raw.file%in%runs_remaining])))



# ev<-ev[!is.na(ev$PIF),]
#
# ev<-ev[ev$PIF>0.8,]

# # Counting
# filter_step<-c(filter_step, "PIF")
# cutoff<-c(cutoff, "xx")
# exps_left<-c(exps_left, length(unique(ev$Raw.file[ev$Raw.file%in%sc.runs])) )
# runs_remaining<-unique(ev$Raw.file[ev$Raw.file%in%sc.runs])
# temp1<- length(which( unlist(design[design$Set%in%runs_remaining, -1])%in%c("sc_m0","sc_u") ) )
# sc_left<-c(sc_left, temp1 )
# prot_left<-c(prot_left, length(unique(ev$Leading.razor.protein[ev$Raw.file%in%runs_remaining])))
# pep_left<-c( pep_left,length(unique(ev$Modified.sequence[ev$Raw.file%in%runs_remaining])))

# Remove peptides that are more the 10% the intensity of the carrier in the single cell runs (only)
ev<-as.data.frame(ev)
ev$mrri<-0
# ev$mrri[ev$Raw.file%in%sc.runs] <- rowMeans(ev[ev$Raw.file%in%sc.runs, ri.index[4:16]] / ev[ev$Raw.file%in%sc.runs, ri.index[1]], na.rm = T)
# ev<-ev[ev$mrri < 0.1, ]

# # Counting
# filter_step<-c(filter_step, "High int peps")
# cutoff<-c(cutoff, "xx")
# exps_left<-c(exps_left, length(unique(ev$Raw.file[ev$Raw.file%in%sc.runs])) )
# runs_remaining<-unique(ev$Raw.file[ev$Raw.file%in%sc.runs])
# temp1<- length(which( unlist(design[design$Set%in%runs_remaining, -1])%in%c("sc_m0","sc_u") ) )
# sc_left<-c(sc_left, temp1 )
# prot_left<-c(prot_left, length(unique(ev$Leading.razor.protein[ev$Raw.file%in%runs_remaining])))
# pep_left<-c( pep_left,length(unique(ev$Modified.sequence[ev$Raw.file%in%runs_remaining])))

# Scan number
ev$scan<-paste0(ev$Raw.file,"_",ev$MS.MS.scan.number)

# Normalize single cell runs to normalization channel
ev<-as.data.frame(ev)
ev[ev$Raw.file%in%sc.runs, ri.index] <- ev[ev$Raw.file%in%sc.runs, ri.index] / ev[ev$Raw.file%in%sc.runs, ri.index[2]]

# Organize data into a more convenient data structure:

# Create empty data frame
ev.melt<-melt(ev[0, c("Raw.file","modseq","Leading.razor.protein","lcbatch","sortday","digest","scan", colnames(ev)[ri.index]) ],
id.vars = c("Raw.file","modseq","Leading.razor.protein","lcbatch","sortday","digest","scan"))

colnames(ev.melt)<-c("Raw.file","sequence","protein","lcbatch","sortday","digest","scan","celltype","quantitation")

# Record mapping of cell type to Channel:
ct.v<-c()
qt.v<-c()

# Create a unique ID string
unique.id.numeric<-1:16
unique.id<-paste0("i",unique.id.numeric)

# Give each sample a unique identifier
for(X in unique(ev$Raw.file)){

# Subset data by X'th experiment
ev.t<-ev[ev$Raw.file%in%X, ]

if(is.na(X)){next}

# Name the RI columns by what sample type they are: carrier, single cell, unused, etc...
colnames(ev.t)[ri.index]<-paste0(as.character(unlist(design[design$Set==X,-1])),"-", unique.id)

if(length(ri.index)>0){

if( X%in%c(c10.runs, c100.runs) ){

ev.t[,ri.index]<-ev.t[,ri.index] / apply(ev.t[, ri.index], 1, median.na)

}

# Melt it! and combine with other experimental sets
ev.t.melt<-melt(ev.t[, c("Raw.file","modseq","Leading.razor.protein","lcbatch","sortday","digest","scan", colnames(ev.t)[ri.index]) ],
id.vars = c("Raw.file","modseq","Leading.razor.protein","lcbatch","sortday","digest","scan"));

# Record mapping of cell type to Channel:
ct.v<-c(ct.v, unique.id[which(ri.index%in%ri.index)] )
qt.v<-c(qt.v, colnames(ev)[ri.index] )

colnames(ev.t.melt)<-c("Raw.file","sequence","protein","lcbatch","sortday","digest","scan","celltype","quantitation")

ev.melt<-rbind(ev.melt, ev.t.melt)

}

# Update unique ID string
unique.id.numeric<-unique.id.numeric + 16
unique.id<-paste0("i", unique.id.numeric)

}

c2q<-data.frame(ct.v, qt.v); colnames(c2q)<-c("celltype","channel")

# Grab the unique number associate to each and every cell, carrier channel, and empty channel
ev.melt$id<-unlist(strsplit(as.character(ev.melt$celltype),"-"))[seq(2,length(unlist(strsplit(as.character(ev.melt$celltype),"-"))),2)]
ev.melt$celltype<-unlist(strsplit(as.character(ev.melt$celltype),"-"))[seq(1,length(unlist(strsplit(as.character(ev.melt$celltype),"-"))),2)]
ev.melt$id<-as.factor(ev.melt$id)

# Remove duplicate observations of peptides from a single experiment
ev.melt<-remove.duplicates(ev.melt,c("sequence","id") )
ev.melt<-ev.melt[!is.na(ev.melt$protein), ]

# Create additional meta data matrices
ev.melt.uniqueID<-remove.duplicates(ev.melt,"id")
ev.melt.pep<-remove.duplicates(ev.melt, c("sequence","protein") )

# Create data frame of peptides x cells, populated by quantitation
ev.unmelt<-dcast(ev.melt, sequence ~ id, value.var = "quantitation", fill=NA)

# Also create matrix of same shape
ev.matrix<-as.matrix(ev.unmelt[,-1]); row.names(ev.matrix)<-ev.unmelt$sequence

# Replace all 0s with NA
ev.matrix[ev.matrix==0]<-NA
ev.matrix[ev.matrix==Inf]<-NA
ev.matrix[ev.matrix==-Inf]<-NA

# Divide matrix into single cells (including intentional blanks) and carriers
sc_cols<-unique(ev.melt$id[(ev.melt$celltype%in%c("sc_u","sc_m0"))&(ev.melt$Raw.file%in%sc.runs)])
ev.matrix.sc<-ev.matrix[, sc_cols]


ev.matrix.sc.e<-ev.matrix.sc
t3m<-data.frame(ev.matrix.sc.e)
t3m$pep<-rownames(ev.matrix.sc.e)
t3m$prot <- ev.melt.pep$protein[match(t3m$pep, ev.melt.pep$sequence)]
t3m<-melt(t3m, variable.names = c("pep", "prot"))
colnames(t3m) <-c("pep","prot","id","quantitation")
t3m2<- t3m %>% group_by(prot, id) %>% summarize(qp = median(quantitation, na.rm=T))
t4m<-dcast(t3m2, prot ~ id, value.var = "qp", fill=NA)
t4m<-as.matrix(t4m[,-1]);

pep_1pctFDR<-ncol(t(ev.matrix.sc))-na.count(t(ev.matrix.sc))
prot_1pctFDR<-ncol(t(t4m))-na.count(t(t4m))

# Equalize lengths before putting together in df:
pep_stringent<-c(pep_stringent, rep(NA, length(pep_1pctFDR)-length(pep_stringent)))
prot_stringent<-c(prot_stringent, rep(NA, length(prot_1pctFDR)-length(prot_stringent)))

coverage_sc<-data.frame(prot_1pctFDR, prot_stringent, pep_1pctFDR, pep_stringent)

write.csv(coverage_sc, file="dat/coverage_per_sc.csv", row.names = F)
Loading

0 comments on commit 1483b8e

Please sign in to comment.