-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path04_tidyr_data_tidying.R
135 lines (93 loc) · 3.82 KB
/
04_tidyr_data_tidying.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
## Data tidying with tidyr
## Adam Stone
## NTID Data Science Workshop, March 2019
# Gathering fNIRS Data ----------------------------------------------------

library(tidyverse)

# Load the fNIRS recording and drop the bookkeeping columns we won't use.
# Look at the result: is this data long or wide? Is it tidy?
fnirs <- read_csv("data/AT273_individualdata_wavelet.csv") %>%
  select(-c(group, onset, mark, valid, rep, type, prune))

# Goal: plot the HbO signal in channel 1 during the first trial.
# Start by cutting the data down to just those rows.
ch1 <- fnirs %>%
  filter(
    chnum == 1,    # channel 1
    trialnum == 1, # first trial
    hbx == 1       # just HbO data
  )

# The `...` below are placeholders for you to fill in, not runnable code.
# With the samples spread over the columns time1 to time160, what would
# you map to x and y?
ch1 %>%
  ggplot(aes(x = ..., y = ...))

# Changing wide to long format is called "gathering".
#   key   = time:      a new column holding the old column names
#   value = hb_value:  a new column holding the values from those columns
#   time1:time160:     the range of columns to gather
# The column names arrive as text ("time1", "time2", ...), so strip the
# "time" prefix and convert what is left to a number.
ch1_long <- ch1 %>%
  gather(key = time, value = hb_value, time1:time160) %>%
  mutate(time = as.double(str_remove(time, "time")))

# Now let's try plotting again: one column per axis this time.
ch1_long %>%
  ggplot(aes(x = time, y = hb_value)) +
  geom_line()
# Spreading Marvel Data ---------------------------------------------------

# Load a "tall" version of the Marvel character data made for this workshop.
marvel <- read_csv("data/Marvel_Characters_tall.csv")

# Print it to see what it looks like.
marvel

# Wouldn't it be nice if we could compare genders side-by-side,
# one year per row? That is what spread() is for:
#   key   = sex:         its values become the new column names
#   value = characters:  its values fill in the new columns
marvel %>%
  spread(key = sex, value = characters)

# Same reshaping, opened in the data viewer instead of printed.
marvel %>%
  spread(key = sex, value = characters) %>%
  View()
# Gathering & Spreading Visitor Data --------------------------------------

visitors <- read_csv("data/Tourism_Visitor_Arrivals.csv")

# How big is this table? What's the size of it?
visitors
glimpse(visitors)

# Exercise:
# Try gathering this dataset, so all years are in one column.
# gather(x, y, z)
#   x is the new name of the column that will contain the column names
#   y is the new name of the column that will contain the values in those columns
#   z is the range of column names, but notice how the column names look like `1995`.
#     So you will need to use backticks to refer to column names. (Hey, it's not tidy data!)
# Save it to visitors_long
# The visitors_long table will be 1284 x 3.

# Try plotting visitors_long! Remember:
# 1. Start with data %>%
# 2. Define aesthetics, ggplot(aes(x..., y...)) +
# 3. Pick a geometric layer, geom_line()

# Let's spread visitors_long, so you can compare year by year changes easily.
# This only runs once you have created visitors_long in the exercise above.
# You may have to change the names in spread() depending on
# how you named the new columns in gather()
visitors_long %>%
  spread(year, visitors)

# Same data, right? You're just reshaping it between long and wide!
# Simon Data --------------------------------------------------------------

# Do you know the Simon task? https://en.wikipedia.org/wiki/Simon_effect
# This dataset has 35 participants doing the Simon task, 16 trials each.
# Every value is a reaction time in milliseconds.
simon <- read_csv("data/Simon_data.csv")
simon %>%
  glimpse()

# First attempt: the average reaction time per participant, computed
# straight from the wide table. The summarise() call is deliberately left
# unfinished (it will not run as written) to show how tedious this gets.
# NOTE: even typed out in full it would not average across the trials,
# because mean() only averages its first argument.
simon %>%
  group_by(subject) %>%
  summarise(mean_rt = mean(trial_1, trial_10, trial_11, trial_12, trial_.....))

# I'm tired already. I can't type out all of those.
# This is one good reason to tidy the data and make it tall.
# trial_1:trial_9 picks every column from trial_1 through trial_9 by
# position; presumably the trial columns are stored in alphabetical order
# (trial_1, trial_10, ..., trial_9) so this covers all 16. Check the
# glimpse() output above to confirm.
simon_long <- simon %>%
  gather(key = trial, value = reaction_time, trial_1:trial_9)

# Much easier, right? One reaction_time column to summarise per subject.
simon_means <- simon_long %>%
  group_by(subject) %>%
  summarise(
    mean_rt = mean(reaction_time),
    sd_rt = sd(reaction_time)
  )

simon_means

# And it's easier to plot the mean values too.
# One bar per participant:
simon_means %>%
  ggplot(aes(x = subject, y = mean_rt)) +
  geom_col()

# The distribution of the participant means as a single boxplot:
simon_means %>%
  ggplot(aes(y = mean_rt)) +
  geom_boxplot()