cnvRcode.R

#!/bin/env Rscript

rm(list=ls())

## Specify URL for BAM file
url <-'ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data/NA18603/alignment/NA18603.mapped.ILLUMINA.bwa.CHB.low_coverage.20130415.bam'

## Old url bam is 2011 version
##url <-'ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data/NA18603/alignment/NA18603.mapped.ILLUMINA.bwa.CHB.low_coverage.20111114.bam'

# specify chromosome and position of interest
ch<-1
st<-161100000
en<-162100000

library(Rsamtools) 
library(ShortRead)

# use Rsamtools to retrieve data
hd <- scanBamHeader(url)

## List of chromosomes in header...
names(hd[[1]][["targets"]])
              
## Either GRanges
which <- GRanges("1", IRanges(st, en))
## or RangesList
which <- RangesList("1"=IRanges(st,en))

what  <- scanBamWhat()
pp<-ScanBamParam(which=which, what=what)
sb<-scanBam(url,param=pp)


## the system calls that are comment out below are now effectively done by scanBam

# If not able to retrieve data, download and load from local file:
# Extract data for FCGR region and save as local BAM file
## system('samtools view -b ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data/NA18603/alignment/NA18603.mapped.ILLUMINA.bwa.CHB.low_coverage.20111114.bam 1:161,100,000-162,100,000 > NA18603.chrom1.ILLUMINA.bwa.CHB.low_coverage.fcgr.bam')

# Index local BAM file
## system('samtools index NA18603.chrom1.ILLUMINA.bwa.CHB.low_coverage.fcgr.bam')

# Load data into R
#file<-"NA18603.chrom1.ILLUMINA.bwa.CHB.low_coverage.fcgr.bam"
#sb<-scanBam(file)

# Extract quality scores
qual<-FastqQuality(sb[[1]]$qual)

fq <- ShortReadQ(sb[[1]][["seq"]], sb[[1]][["qual"]], BStringSet(sb[[1]][["qname"]]))
qaSummary <- qa(fq, lane="cnv")

perCycle  <- qaSummary[["perCycle"]]
head(perCycle[["baseCall"]])
head(perCycle[["quality"]])

## Quality plot of bam region
ShortRead:::.plotCycleQuality(perCycle$quality, main=names(sb)[1])

# PROBLEM - can't deal with ragged array (variable length reads):
# Extract subset of equal length reads (kludge).
qualx<-qual[width(qual)==min(width(qual))]


# convert scores to matrix 
readM <- as(qualx, "matrix")

# Create boxplot for each column 
boxplot(as.data.frame((readM)), outline = FALSE, main="Per Cycle Read Quality", xlab="Cycle", ylab="Phred Quality") 

# walk across region in 1000 base steps count number of reads with start position in each interval.
cnt<-c()
for(i in 1:1000){
  cnt[i]<-sum(sb[[1]]$pos>(st+1000*(i-1)+1) & sb[[1]]$pos<=(st+1000*i),na.rm=T)
}

# Number of reads in first 10 intervals (bins)
cnt[1:10]

# specify gene positions
genes<-data.frame(fcgr3b=c(161592990,161601158), fcgr2b=c(161632940,161647951), fcgr2c=c(161551129,161570031), fcgr2a =c(161475205,161489359),fcgr3a=c(161511553,161520413))

# set up plot (blank plot - colour set to `white')
plot(seq(st,en,l=1000),cnt,type='l',col='white', xlab='Chromosome 1 Position (HG19/GRCh37 assembly)',ylab='Read Count')

# add gene positions and names to plot
yl<- c(25,max(cnt)*1.1)
for(k in 1:ncol(genes)) rect(genes[1,k],yl[1],genes[2,k],yl[2],col=gray(0.8),border=NA)
for(k in 1:ncol(genes)) text(mean(genes[,k]),0.9*yl[2],gsub("fcgr","",names(genes)[k]),cex=0.75)

# add count data on top of other plot objects
lines(seq(st,en,l=1000),cnt,type='l')

# load library and standardize counts
library(DNAcopy)
cnt.std<-(cnt-median(cnt))/sd(cnt)

# create "CNA" object for use with DNAcopy
cna.obj<-CNA(cnt.std,chrom=rep(1,length(cnt.std)),seq(st,en,l=length(cnt.std)),data.type="logratio",sampleid='NA18603')

# perform segmentation analysis & extract output
cna.seg<-segment(cna.obj)
cna.out<-cna.seg$output

# Filter object to capture "substantial" deviations (>1 SD)
filt.cna.out<-cna.out[abs(cna.out$seg.mean)>1,]

cna.out

filt.cna.out

# Set up plot for standardized data
plot(seq(st,en,l=1000),cnt.std,type='l',col='white', xlab='Chromosome 1 Position (HG19/GRCh37 assembly)',ylab='Read Count')

# Add gene info
yl<-range(cnt.std)
for(k in 1:ncol(genes)) rect(genes[1,k],yl[1],genes[2,k],yl[2],col=gray(0.8),border=NA)
for(k in 1:ncol(genes)) text(mean(genes[,k]),0.9*yl[2],gsub("fcgr","",names(genes)[k]),cex=0.75)

# add standardized count data on top of other plot objects
lines(seq(st,en,l=1000),cnt.std,type='l')

# Add segmentation info
for(i in 1:nrow(cna.out))
   lines(cna.out[i,3:4],rep(cna.out[i,6],2),col='red',lwd=3)

# Redo plot using filtered segmentation info
plot(seq(st,en,l=1000),cnt.std,type='l',col='white', xlab='Chromosome 1 Position (HG19/GRCh37 assembly)',ylab='Read Count')
yl<-range(cnt.std)
for(k in 1:ncol(genes)) rect(genes[1,k],yl[1],genes[2,k],yl[2],col=gray(0.8),border=NA)
for(k in 1:ncol(genes)) text(mean(genes[,k]),0.9*yl[2],gsub("fcgr","",names(genes)[k]),cex=0.75)
lines(seq(st,en,l=1000),cnt.std,type='l')

for(i in 1:nrow(cna.out))
   lines(filt.cna.out[i,3:4],rep(filt.cna.out[i,6],2),col='red',lwd=3)

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

cnvRcode.R

Clone this wiki locally