-
Notifications
You must be signed in to change notification settings - Fork 2
cnvRcode.R
mdavy86 edited this page Oct 20, 2014
·
5 revisions
#!/bin/env Rscript rm(list=ls()) ## Specify URL for BAM file url <-'ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data/NA18603/alignment/NA18603.mapped.ILLUMINA.bwa.CHB.low_coverage.20130415.bam' ## Old url bam is 2011 version ##url <-'ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data/NA18603/alignment/NA18603.mapped.ILLUMINA.bwa.CHB.low_coverage.20111114.bam' # specify chromosome and position of interest ch<-1 st<-161100000 en<-162100000 library(Rsamtools) library(ShortRead) # use Rsamtools to retrieve data hd <- scanBamHeader(url) ## List of chromosomes in header... names(hd[[1]][["targets"]]) ## Either GRanges which <- GRanges("1", IRanges(st, en)) ## or RangesList which <- RangesList("1"=IRanges(st,en)) what <- scanBamWhat() pp<-ScanBamParam(which=which, what=what) sb<-scanBam(url,param=pp) ## the system calls that are comment out below are now effectively done by scanBam # If not able to retrieve data, download and load from local file: # Extract data for FCGR region and save as local BAM file ## system('samtools view -b ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data/NA18603/alignment/NA18603.mapped.ILLUMINA.bwa.CHB.low_coverage.20111114.bam 1:161,100,000-162,100,000 > NA18603.chrom1.ILLUMINA.bwa.CHB.low_coverage.fcgr.bam') # Index local BAM file ## system('samtools index NA18603.chrom1.ILLUMINA.bwa.CHB.low_coverage.fcgr.bam') # Load data into R #file<-"NA18603.chrom1.ILLUMINA.bwa.CHB.low_coverage.fcgr.bam" #sb<-scanBam(file) # Extract quality scores qual<-FastqQuality(sb[[1]]$qual) fq <- ShortReadQ(sb[[1]][["seq"]], sb[[1]][["qual"]], BStringSet(sb[[1]][["qname"]])) qaSummary <- qa(fq, lane="cnv") perCycle <- qaSummary[["perCycle"]] head(perCycle[["baseCall"]]) head(perCycle[["quality"]]) ## Quality plot of bam region ShortRead:::.plotCycleQuality(perCycle$quality, main=names(sb)[1]) # PROBLEM - can't deal with ragged array (variable length reads): # Extract subset of equal length reads (kludge). qualx<-qual[width(qual)==min(width(qual))] # convert scores to matrix readM <- as(qualx, "matrix") # Create boxplot for each column boxplot(as.data.frame((readM)), outline = FALSE, main="Per Cycle Read Quality", xlab="Cycle", ylab="Phred Quality") # walk across region in 1000 base steps count number of reads with start position in each interval. cnt<-c() for(i in 1:1000){ cnt[i]<-sum(sb[[1]]$pos>(st+1000*(i-1)+1) & sb[[1]]$pos<=(st+1000*i),na.rm=T) } # Number of reads in first 10 intervals (bins) cnt[1:10] # specify gene positions genes<-data.frame(fcgr3b=c(161592990,161601158), fcgr2b=c(161632940,161647951), fcgr2c=c(161551129,161570031), fcgr2a =c(161475205,161489359),fcgr3a=c(161511553,161520413)) # set up plot (blank plot - colour set to `white') plot(seq(st,en,l=1000),cnt,type='l',col='white', xlab='Chromosome 1 Position (HG19/GRCh37 assembly)',ylab='Read Count') # add gene positions and names to plot yl<- c(25,max(cnt)*1.1) for(k in 1:ncol(genes)) rect(genes[1,k],yl[1],genes[2,k],yl[2],col=gray(0.8),border=NA) for(k in 1:ncol(genes)) text(mean(genes[,k]),0.9*yl[2],gsub("fcgr","",names(genes)[k]),cex=0.75) # add count data on top of other plot objects lines(seq(st,en,l=1000),cnt,type='l') # load library and standardize counts library(DNAcopy) cnt.std<-(cnt-median(cnt))/sd(cnt) # create "CNA" object for use with DNAcopy cna.obj<-CNA(cnt.std,chrom=rep(1,length(cnt.std)),seq(st,en,l=length(cnt.std)),data.type="logratio",sampleid='NA18603') # perform segmentation analysis & extract output cna.seg<-segment(cna.obj) cna.out<-cna.seg$output # Filter object to capture "substantial" deviations (>1 SD) filt.cna.out<-cna.out[abs(cna.out$seg.mean)>1,] cna.out filt.cna.out # Set up plot for standardized data plot(seq(st,en,l=1000),cnt.std,type='l',col='white', xlab='Chromosome 1 Position (HG19/GRCh37 assembly)',ylab='Read Count') # Add gene info yl<-range(cnt.std) for(k in 1:ncol(genes)) rect(genes[1,k],yl[1],genes[2,k],yl[2],col=gray(0.8),border=NA) for(k in 1:ncol(genes)) text(mean(genes[,k]),0.9*yl[2],gsub("fcgr","",names(genes)[k]),cex=0.75) # add standardized count data on top of other plot objects lines(seq(st,en,l=1000),cnt.std,type='l') # Add segmentation info for(i in 1:nrow(cna.out)) lines(cna.out[i,3:4],rep(cna.out[i,6],2),col='red',lwd=3) # Redo plot using filtered segmentation info plot(seq(st,en,l=1000),cnt.std,type='l',col='white', xlab='Chromosome 1 Position (HG19/GRCh37 assembly)',ylab='Read Count') yl<-range(cnt.std) for(k in 1:ncol(genes)) rect(genes[1,k],yl[1],genes[2,k],yl[2],col=gray(0.8),border=NA) for(k in 1:ncol(genes)) text(mean(genes[,k]),0.9*yl[2],gsub("fcgr","",names(genes)[k]),cex=0.75) lines(seq(st,en,l=1000),cnt.std,type='l') for(i in 1:nrow(cna.out)) lines(filt.cna.out[i,3:4],rep(filt.cna.out[i,6],2),col='red',lwd=3)