-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathEDAintroduction.R
814 lines (590 loc) · 27.1 KB
/
EDAintroduction.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
# EDAintroduction.R
#
# Purpose:
#
# Version: 3.0
#
# Date: 2019 05
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# V 3.0 Reconceptualized 2019
# V 2.0 Restructured for 2018 workshop
# V 1.1 2017 updates
# V 1.0 First code 2016
#
# TODO:
#
#
# == HOW TO WORK WITH THIS FILE ================================================
#
# This file contains scenarios and tasks, we will discuss them in detail in
# class. If this file is named "myEDAintroduction.R", then edit it profusely,
# write code, experiment with options, or just play.
# Especially play.
#
# If there is anything you don't understand, use R's help system,
# Google for an answer, or ask. Especially ask. Don't continue if you don't
# understand what's going on. That's not how it works ...
#
# ==============================================================================
#
# I N T R O D U C T I O N T O E D A C O N C E P T S
#
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> -------------------------------------------------------------------
#TOC> 1 SCENARIO AND SETUP 75
#TOC> 1.1 Data: 95
#TOC> 1.2 Optional tasks: 110
#TOC> 2 LOAD DATA 128
#TOC> 2.1 Task: Getting text data into R 145
#TOC> 2.2 Task: Read an Excel table 211
#TOC> 2.3 Digression: Factors in dataframes 280
#TOC> 3 SUBSETTING 344
#TOC> 3.1 Subsetting data - Review of the principles 346
#TOC> 3.1.1 A sample dataset 355
#TOC> 3.1.2 Subsetting by index 398
#TOC> 3.1.2.1 Negative indices 442
#TOC> 3.1.3 Subsetting by logical 453
#TOC> 3.1.3.1 Filtering by string matching expressions 506
#TOC> 3.1.4 Subsetting by name 523
#TOC> 3.1.5 The "$" operator 557
#TOC> 3.2 Subsetting Task 577
#TOC> 4 DESCRIPTIVE STATISTICS AND SIMPLE PLOTS 602
#TOC> 4.1 Quantiles 619
#TOC> 4.1.1 Boxplot 638
#TOC> 4.1.2 Violin plot 664
#TOC> 4.2 Plotting principles 703
#TOC> 4.3 QQ plots 723
#TOC> 5 EXPLORING SCATTERPLOTS 746
#TOC> 5.1 Explore scatter plots 785
#TOC> 5.2 Trellis plots: all against all 844
#TOC>
#TOC> ==========================================================================
updateTOC() # <<< Execute to update the TOC
# = 1 SCENARIO AND SETUP ==================================================
# EDA is exploration. The goal of EDA is to
# - uncover underlying structure;
# - define important variables;
# - detect outliers and anomalies;
# - detect trends;
# - develop statistical models;
# - test underlying assumptions.
# A cornerstone of EDA is graphics: producing plots that visualize aspects of
# your data: the observations that make up your data - viewed directly, under
# transformation, in comparison to expectations, and integrated with other data.
# Don't think of graphics as an afterthought: graphics are central to the story
# about biology that you are telling, based on the facts you discover. Make
# your story explicit, consider how good graphics supports it, get good at this.
# == 1.1 Data: =============================================================
# There are three datasets we will be working with in the next two days:
#
# - The "crabs" dataset is a staple of R data samples. It is part of the MASS
# package and conatains morphological measurements of Australian crabs
#
# - The "single cell" dataset is supplemental data from one of the
# earliest biological single-cell RNA seq studies, on immune cells, in
# Ido Amit's lab at the Weizman Institute.
#
# - The "cell cycle" dataset is a high resolution RNA seq study of expression
# changes in yeast, published in 2006 by the Breeden lab at the FHCRC.
# == 1.2 Optional tasks: ===================================================
# While you are waiting for others to finish a "Checkpoint"
# here are suggestions:
#
# - Read the paper by Weissgerber et al. (2015) "Beyond Barcharts ..."
# in your "./assets" folder. How could you implement the suggestions
# in the paper in an R function?
#
# - Study how the testthat package can be used to write tests for
# code correctness as you develop code. Work through the script in
# "./scripts/unitTesting.R"
#
# - Learn about regular expressions at
# http://steipe.biochemistry.utoronto.ca/abc/index.php/Regular_Expressions
#
# and work through the script in "./scripts/RPR-RegEx.R"
# = 2 LOAD DATA ===========================================================
# We will explore gene expression data in this unit:
source("./sampleSolutions/EDAintroductionSampleSolutions-ShowPlot.R")
# Before we can do any kind of exploratory analysis, we first need to load data.
# There are many sources - the Web, text files, Excel spread sheets. Here, we'll
# explore data from the supplementary material published with a recent paper on
# single-cell RNAseq analysis.
# TASK: Unzip and browse the Jaitin et al. paper.
# Often the data we need can be copied and pasted from simple
# text files.
# == 2.1 Task: Getting text data into R ====================================
# TASK: open the text file for "./data/Fig_3-CharacteristicGenes.txt".
# I have prepared this file from the text in a figure of
# the actual paper.
# How do we get this data into a vector in R?
# First think of a way to do this by hand.
# Then, figure out if there is an R function that does this:
# - Open a textfile given its filename
# - Return a vector of strings
# Assign the resulting vector to the variable name "charGenes"
insertSnip("gave.fact") # <<< Execute (if needed) to insert sample solution code
# == 2.2 Task: Read an Excel table =========================================
# I have included a dataset with this project, a .csv file taken from
# suplementary data of a paper on tissue definition by single cell RNA seq, by
# Jaitin et al. (2014).
# http://www.ncbi.nlm.nih.gov/pubmed/24531970
# This data set contains log2 fold changes of gene expression enrichment in
# different cd11c+ cell populations, and their response to lipopolysaccharide
# stimulation. It was posted as an Excel file on the Science Website.
# A word on Excel: it's a very good spreadsheet program,
# but it is miserable and often wrong on statistics,
# and it makes horrible, horrible plots.
# To elaborate - see the link below:
# http://www.burns-stat.com/documents/tutorials/spreadsheet-addiction/
# ... these are not merely cosmetic problems!
# Therefore: It's oK to keep data in Excel spreadsheets
# if you must - but read it into R for any analysis!
# But be cautious: one of the problems of Excel is that
# it truncates numeric precision. Protip: convert all
# cells to "text" before export.
# There are many other "read" functions.
# Refer to the R Data Import / Export manual
# http://cran.r-project.org/doc/manuals/R-data.html
# See:
?read.table # ... includes read.csv and read.delim
?read.fwf # ... for "fixed width format"
?readLines # ... for reading in text-files line by line
# Excel spreadsheets should be converted to csv or tsv format. Alternatively the
# readxl package is available via CRAN, see
# https://cran.r-project.org/package=readxl ... but be very careful and double
# and triple check your data.
# TASK:
# 1 - load the supplementary data from the Jaitin et al. single cell study
# ./data/table_S3.xls into Excel, and save it as
# a .csv (comma separated values) file.
# 2 - Examine the file (a plain text file) in a
# text-editor (such as the RStudio editor).
# 3 - Read the table into R, assigning it to a variable.
# I usually give the first input of data the variable
# name "rawDat" or "tmp" since it will be processed before
# it becomes meaningful for analysis.
# 4 - Use head() to look at the beginning of the object.
# 5 - Remove any unneeded header rows.
# 6 - Fix the rownames so they start at 1
# 7 - Give the columns names that reflect the cell type (cf.
# Figure 2c), and the stimulus status. Use cell abbreviations B, MF,
# NK, Mo, pDC, DC1 and DC2 and distinguish ctrl and LPS conditions.
# Call the last column "cluster".
# 8 - Change all expression values that were read as strings to "numeric"
# 9 - Use str() to validate that the data frame you have created
# conforms to specifications. The first column is a character column,
# the other columns are numeric.
# 10 - Assign the final object to the variable name "LPSdat"
# 11 - What tests can you devise to validate the correctness of the data?
insertSnip("file.dome") # <<< Execute (if needed) to insert sample solution code
# == 2.3 Digression: Factors in dataframes =================================
# Many of R's dataframe methods convert all strings
# into factors by default. Factors are special types:
# they are nominally strings - (m, f) or (agree, neutral,
# disagree) or such. But underlyingly they are coded as integers
# that identify the "levels" of the factor.
# To illustrate.
sex <- factor(c("m", "f", "f", "m", "f"))
sex
str(sex)
is.ordered(sex)
# We can define ordered factors by adding some details to
# our factor() command - e.g. tumor grades:
sampleGrades <- factor(c("G1", "G3", "G3", "G1", "G2", "G1"),
levels = c("G1", "G2", "G3", "G4"),
ordered = TRUE)
sampleGrades # Note that G4 is a level although it
# was not observed in the data
is.ordered(sampleGrades)
# Factors are useful since they support a number of analysis
# methods such as ordering boxplots, or calculating regression on categorical
# data.
# For more on factors, have a look at this factor tutorial
# by Jenny Bryan:
# http://www.stat.ubc.ca/~jenny/STAT545A/block08_bossYourFactors.html
# and this discussion on their use:
# http://stackoverflow.com/questions/3445316/factors-in-r-more-than-an-annoyance
#
# But for our purposes, the default behavior of R, to
# treat all strings as factors is usually unwanted
# and needs to be turned off. Always use the parameter
# as.is = TRUE to achieve this. If you don't
# you are in for some bad surprises if e.g. if there is
# a character "contaminant" such as "NA" in a numeric column.
# Note: you can change R's default behaviour and set a global
# option to "turn factors off". Some advocate this,
# I don't think this is a good idea, since you may
# encounter packages or functions that make
# assumptions about R's default behaviour and will thus
# fail.
# But consider the following carefully: this illustrates how work with factors
# can go VERY wrong...
(myDF <- data.frame(data = c("N/A", 1, 1, 2, 3, 5, 8)))
str(myDF)
(myDF <- myDF[-1, ])
(myDF2 <- as.numeric(myDF)) # Whoa! what just happened ?
(myDF3 <- as.numeric(as.character(myDF))) # :-)
# = 3 SUBSETTING ==========================================================
# == 3.1 Subsetting data - Review of the principles ========================
# A significant portion of your efforts in any project will be spent on
# preparing data for analysis. This includes reading data from various sources,
# preprocessing it, and extracting subsets of interest. R has powerful functions
# that support these tasks.
# === 3.1.1 A sample dataset
# Let's start with a small dataframe of synthetic data to go through the main
# principles of subsetting. The same principles apply to matrices and vectors -
# however, data frames are more flexible because their columns can contain data
# of different types (character, numeric, logical ...). Values in vectors and
# matrices must always have the same type.
# Imagine you are a naturalist who has collected some living things and keeps
# observations in a table ...
set.seed(112358)
N <- 10
dat <- data.frame(name = sample(LETTERS, N, replace = TRUE),
legs = sample(c(2 * (0:5), 100), N, replace = TRUE),
type = character(N),
matrix(rnorm(5 * N), ncol = 5),
stringsAsFactors=FALSE)
# Some auxiliary data ...
dict <- c("fish", "bird", "beast", "bug", "spider", "crab", "centipede")
names(dict) <- c(2 * (0:5), 100)
#... to populate the >>type<< column:
dat$type <- dict[as.character(dat$legs)]
# If you already understand the expression above, you're doing pretty well with
# the topic of this tutorial. If you don't, don't worry - by the end of the
# tutorial you will.
# Now let's see what we have:
head(dat)
str(dat)
# Note that we have given names to some columns, but R made names for the five
# columns of random values that were created as a matrix. Let us look at the
# basic ways to subset such objects. Basically, all these methods work with the
# subsetting operator "[".
?"["
# === 3.1.2 Subsetting by index
# Elements can be uniquely identified by indices in the range of their length
# (for vectors), or their rows and columns (in dataframes and matrices). The
# order is row, column.
dat[2,3] # one element
dat[2, ] # empty columns: use all of them
dat[ , 3] # empty rows, use all of them
# If you want a particular set of row and columns, pass a vector of positive
# integers.
dat[c(2, 3), c(1, 2, 3)]
# Any function that returns a vector of integers can be used. Most frequently we
# use the range operator ":" . Retrieving ranges of rows and/or columns from a
# matrix or data frame is also called "slicing".
dat[1:4, 1:3]
dat[4:1, 1:3] # same in reverse order
dat[seq(2, N, by=2), ] # even rows
# But we can do more interesting things, since the indices don't have to be
# unique, or in any order:
dat[c(1, 1, 1, 2, 2, 3), 1:3]
# In particular we can select random subsets...
dat[sample(1:N, 3), 1:3]
dat[sample(1:N, 3), 1:3]
dat[sample(1:N, 3), 1:3]
# ... or sort the dataframe. Sorting requires the order() function, not sort().
sort(dat[ , 2]) # ... gives us the sorted values
order(dat[ , 2]) # ... tells us in which row the sorted values are
dat[order(dat[ , 2]), 1:3] # ordered by number of legs
dat[order(dat[ , 1]), 1:3] # ordered by lexical order of names
# Note: I am indenting expressions so you can first evaluate the expressions
# individually, then see how they fit into the brackets to subset the data.
# ==== 3.1.2.1 Negative indices
# If you specify a negative index, that element is excluded.
dat[-1, 1:3] # not the first row
dat[-N, 1:3] # not the last row
dat[-1:-3, 1:3]
dat[-(1:3), 1:3] # same effect
# === 3.1.3 Subsetting by logical
# Instead of indices, we can specify sets of rows or columns by boolean values
# (type: logical): TRUE or FALSE. If we place a vector of logicals into the
# square brackets, only the rows resp. columns for which the expression is TRUE
# are returned.
dat[1:3, c(TRUE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE)]
# You need to take care that the number of elements exactly matches the number
# of rows or columns, otherwise "vector recycling" will apply and this is
# usually unintended. Thus explicitly specifying a boolean selection like above
# is not all that useful. However, many R functions are "vectorized" and
# applying a logical expression or function to an entire column gives a vector
# of TRUE and FALSE values of the same length. If we place this vector into the
# square brackets, only the rows resp. columns for which the expression is TRUE
# are returned.
dat[ , 2]
dat[ , 2] > 4 # See how this creates a vector
dat[dat[ , 2] > 4, 1:3]
# Expressions can be combined with the "&" (and) and the "|" (or) operators.
dat[ , 4] > 0
dat[ , 5] < 0
dat[ , 4] > 0 & dat[ , 5] < 0
dat[dat[ , 4] > 0 & dat[ , 5] < 0, ]
# In this context, the any() and all() functions may be useful. But take care -
# you can't simply apply them to a range of columns: that would apply the
# condition to all elements of a selection at once. You need to use the apply()
# function to first return a vector. apply()'s second argument switches between
# row-wise and column-wise evaluation. Here, 1 means operate on rows."
apply(dat[ , 4:8], 1, max) # row-wise, fetch the maximum
apply(dat[ , 4:8], 1, max) > 1 # max() > 1 ?
dat[apply(dat[ , 4:8], 1, max) > 1, ]
# To use any() and all(), we define our own function."
myF <- function(x){ any(x > 1.5) }
myF(dat[3, 4:8])
apply(dat[ , 4:8], 1, myF)
dat[apply(dat[ , 4:8], 1, myF), ]
# But we can also write the definition "in place"...
apply(dat[ , 4:8], 1, function(x){ all(x < 0.5) })
dat[apply(dat[ , 4:8], 1, function(x){ all(x < 0.5)} ), ]
# ==== 3.1.3.1 Filtering by string matching expressions
# The function grep(), and the %in% operator can be used to subset via string
# matching:
grep("r", dat[ , 3]) # types that contain "r"
dat[grep("r", dat[ , 3]), 1:3]
grep("^c", dat[ , 3]) # types that begin with "c"
dat[grep("^c", dat[ , 3]), 1:3]
scary <- c("spider", "centipede")
dat[ , 3] %in% scary
dat[dat[ , 3] %in% scary, 1:3]
# === 3.1.4 Subsetting by name
# If rownames and/or columnnames have been defined, we can use these for
# selection. If not defined, they default to the row/column numbers as character
# strings(!).
rownames(dat) # the row numbers, but note that they are strings!
colnames(dat) # the ones we have defined
# If we place a string or a vector of strings into the brackets, R matches the
# corresponding row/ column names:"
dat[1:5, "name"]
dat[1:5, c("name", "legs")]
dat[1:5, "eyes"] # error, that name does not exist
# We can combine the techniques e.g. to flexibly select columns. Here we select
# the X1 to X5 columns:
colnames(dat)
grep("^X", colnames(dat))
colnames(dat)[grep("^X", colnames(dat))]
dat[1:3, colnames(dat)[grep("^X", colnames(dat))]]
# This is very useful when the exact position of columns may have changed during
# the analysis. Actually, rows and columns should really never be selected by
# number even though we have done so above. Such numbers are "magic numbers" and
# code that relies on such magic numbers is heard to read and very hard to
# maintain. It is always better to expose the logic with which your columns are
# selected and to make the selection explicit and robust. An exception may be
# when you need a slice of the data for testing purposes, but even then it may
# be preferrable to use the head() or tail() functions.
# === 3.1.5 The "$" operator
# The "$" operator returns a single column as a vector. It is not strictly
# necessary - the column can just as well be named in quotation marks within the
# brackets - but I think it makes for more readable code.
#
dat[1:3, "legs"]
dat$legs[1:3] # same result. This is the preferred version.
dat$"legs"[1:3] # works, but isn't necessary
dat[1:3, legs] # this returns an error - hopefully; but if for any
# reason the object DOES exist, you'll get an un-
# expected result. Know when to quote!
# Three more functions that I use all the time for data manipulation:
?which
?unique
?duplicated
# == 3.2 Subsetting Task ===================================================
# TASK:
# Write R expressions that get the following data from LPSdat:
# - rows 1:10 of the first three columns in reverse order
# - gene names and the expression values for Mo.LPS
# for the top ten expression values.
# ( hint: use order() )
# - All genes for which B-cells are stimulated by LPS by
# more than 2 log units.
# - Expression values for all genes whose gene-names appear in Figure 3B.
# (hint: use the %in% operator)
insertSnip("blot.pipe") # <<< Execute (if needed) to insert sample solution code
# With these simple subsetting and filtering operations, we are already
# "Exploring data".
# = 4 DESCRIPTIVE STATISTICS AND SIMPLE PLOTS =============================
set.seed(100)
x <- rnorm(100, mean=0, sd=1)
mean(x)
median(x)
IQR(x)
var(x)
sd(x)
summary(x)
# TASK:
# 1 - Explore interesting characterizations of LPSdat.
# == 4.1 Quantiles =========================================================
# What is the threshold that has a given fraction of values above/below it?
x <- seq(-4, 4, 0.1)
f <- dnorm(x, mean=0, sd=1)
q90 <- qnorm(0.90, mean = 0, sd = 1)
plot(x, f, xlab="x", ylab="density", type="l", lwd=5)
abline(v=q90, col="red", lwd=5)
# empirical quantiles
set.seed(100)
x <- rnorm(100, mean=0, sd=1)
quantile(x)
quantile(x, probs=c(0.1, 0.2, 0.9))
abline(v=quantile(x, probs=c(0.9)), col="green", lwd=3)
# === 4.1.1 Boxplot
set.seed(100)
x <- rnorm(1000, mean=5, sd=2.5)
boxplot(x)
boxplot(x*x)
# Careful - a boxplot per se can obscure
# important structure in your data. Consider
# for example this bimodal distribution
x <- rnorm(100, mean=-2, sd=1)
x <- c(x, rnorm(100, mean=2, sd=1))
hist(x)
boxplot(x)
# you can use jitter() to show the actual observations
plot(jitter(rep(0.5, length(x))), x, xlim = c(0, 1))
plot(jitter(rep(0.5, length(x))), x,
xlim=c(0, 1),
pch=21,
col="#00228855",
bg="#0077FF44",
cex = 1.4)
# === 4.1.2 Violin plot
#
if (!requireNamespace("ggplot2", quietly=TRUE)) {
install.packages("ggplot2")
}
library(ggplot2)
X <- as.data.frame(x)
p <- ggplot(X, aes(1,x))
p + geom_violin()
# See ggplot2 introductory tutorial at
# http://www.r-bloggers.com/basic-introduction-to-ggplot2/
# and violin plot documentation at
# http://docs.ggplot2.org/current/geom_violin.html
# Plotting more than one column with a boxplot places the plots side by side.
colnames(LPSdat)
boxplot(LPSdat[ ,c("NK.ctrl", "NK.LPS")])
boxplot(LPSdat[ , c(seq(2, 14, by = 2), seq(3, 15, by = 2))])
abline(v=7.5)
# TODO colours !
# TASK:
# 1 - What would be interesting quantiles
# and boxplots in LPSdat?
# Imagine, explore and share.
#
# Interpret the results !
# TODO: quantile normalization
# ==============================================================================
# == 4.2 Plotting principles ===============================================
# Explore plot types (Section 1 of ./scripts/PlottingReference.R)
# Explore lines (Section 3 of ./scripts/PlottingReference.R)
# =============================================
# Overlay a histogram with a line plot
myStim <- LPSdat$B.LPS - LPSdat$B.ctrl
hist(myStim,
freq = FALSE,
breaks = 40,
col="#0066AA44",
main = "B.LPS - B.ctrl",
xlab = "stimulation (delta(log(expression))")
xVals <- seq(min(myStim),max(myStim),0.1)
lines(xVals, dnorm(xVals, sd = sd(myStim)), col="#CC000055")
# == 4.3 QQ plots ==========================================================
qqnorm(myStim, pch = 21, bg ="#0066AA22")
qqline(myStim, col="#CC000066")
# add a legend
legend("topleft",
cex = 0.8,
c("LPS effect", "normal distribution"),
col = c("#0066AA22", "#CC000066"),
lwd=5)
# =============================================
# QQ- plot: sample against sample
myStim <- LPSdat$B.LPS - LPSdat$B.ctrl
myBase <- LPSdat$Mo.ctrl - LPSdat$B.ctrl
qqplot(myStim, myBase, pch = 21, bg = "#00AA6622")
# TASK: What other columns of LPSdat could be compared with
# a qqplot? Explore this. Interpret the result.
# = 5 EXPLORING SCATTERPLOTS ==============================================
# TODO Change this to to one of the three datasets - e.g.
# GvHD flow cytometry data is a sample dataset provided in the project.
gvhd <- read.table("./data/GvHD.txt", header=TRUE)
head(gvhd)
# Only extract the CD3 positive cells
hist(gvhd[,5])
gvhdCD3p <- as.data.frame(gvhd[gvhd[, 5]>280, 3:6])
plot(gvhdCD3p[, 1:2])
stimDC2 <- LPSdat$DC2.LPS - LPSdat$DC2.ctrl
stimNK <- LPSdat$NK.LPS - LPSdat$NK.ctrl
# A dataframe of differences:
stims <- data.frame(genes = LPSdat$genes,
dB = LPSdat$B.LPS - LPSdat$B.ctrl,
dMF = LPSdat$MF.LPS - LPSdat$MF.ctrl,
dNK = LPSdat$NK.LPS - LPSdat$NK.ctrl,
dMo = LPSdat$Mo.LPS - LPSdat$Mo.ctrl,
dpDC = LPSdat$pDC.LPS - LPSdat$pDC.ctrl,
dDC1 = LPSdat$DC1.LPS - LPSdat$DC1.ctrl,
dDC2 = LPSdat$DC2.LPS - LPSdat$DC2.ctrl,
stringsAsFactors = FALSE)
plot(stims$dMF, stims$dpDC,
pch=21,
col="#00228855",
bg="#0077FF44",
cex = 1.4)
# == 5.1 Explore scatter plots =============================================
# Topics:
# Section 6 - Plotting symbols and characters
# 2 - Colors
# 4 - Coordinates
# ... of ./scripts/PlottingReference.R)
# ==============================================================================
# Scatter plots are extremely useful, but learning
# a bit about R's plotting capabilities will help
# tremendously to create INFORMATIVE plots.
# Some special packages
# The overlap in the GvHD data can obscure
# data relationships. Here are some alternatives
# for dense plots:
# load "hexbin" package from CRAN
if (!requireNamespace(hexbin, quietly=TRUE)) {
install.packages("hexbin")
}
library(hexbin)
# variant one: hexbin
hb <- hexbin::hexbin(stims$dMF, stims$dpDC, xbins = 20)
plot(hb)
plot(hb, colramp = colorRampPalette(c("#FFFFDD",
"#77EE99",
"#3377AA",
"#0033BB")))
# === variant two: smoothScatter
smoothScatter(stims$dMF, stims$dpDC,
nrpoints = 0,
pch=20,
cex=0.5,
col="#6633BB55")
# === variant three: colors vary by density
plot(stims$dMF, stims$dpDC,
col=densCols(stims$dMF, stims$dpDC),
pch=16,
cex=2)
# === variant four: analysis with specialized package
# load "prada" package from BioConductor
if (!requireNamespace("BiocManager", quietly=TRUE)) {
install.packages("BiocManager")
}
if (!requireNamespace("prada", quietly=TRUE)) {
BiocManager::install("prada")
}
# using functions ?fitNorm2 and plotNorm2 from the prada package
nfit <- prada::fitNorm2(stims$dMF, stims$dpDC)
prada::plotNorm2(nfit, selection=TRUE, ellipse=TRUE)
# == 5.2 Trellis plots: all against all ====================================
plot(stims[ , 2:8], pch=".", col = "#44BB6633")
boxplot(stims[ , 2:8])
oPar <- par(mfrow=c(2, 2))
hist(stims$dB, breaks=40, main="dB", freq=FALSE, ylim=c(0,1.5), col="#cffff1")
hist(stims$dNK, breaks=40, main="dNK", freq=FALSE, ylim=c(0,1.5), col="#cbe5ff")
hist(stims$dMo, breaks=40, main="dMo", freq=FALSE, ylim=c(0,1.5), col="#ecddff")
hist(stims$dMF, breaks=40, main="dMF", freq=FALSE, ylim=c(0,1.5), col="#c9f5ff")
par(oPar)
# [END]