-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathworkshop-script.qmd
153 lines (120 loc) · 5.28 KB
/
workshop-script.qmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
---
title: "Workshop script"
toc: true
df-print: paged
execute:
  warning: false
---
Below is the final version of the script we worked on together during the workshop session.
### In-class exercises
```{r}
# Attach the tidyverse; the dplyr verbs used below come from it.
library(tidyverse)

# Data source: the TidyTuesday 2024-01-30 Groundhog Day files, read straight
# from GitHub. `groundhogs` comes from groundhogs.csv and `predictions` from
# predictions.csv.
groundhogs <- readr::read_csv(
  "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2024/2024-01-30/groundhogs.csv"
)
predictions <- readr::read_csv(
  "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2024/2024-01-30/predictions.csv"
)

# Peek at the first few rows of the predictions table.
predictions |>
  head()
# Predictions made in 2020.
predictions |>
  filter(year == 2020)

# Predictions made in 2020 or 2021, written two ways: an explicit OR of two
# comparisons, then set membership with %in%.
predictions |>
  filter(year == 2020 | year == 2021)
predictions |>
  filter(year %in% c(2020, 2021))

# Predictions from 1900 through 2000, both endpoints included. Conditions
# separated by commas inside filter() are combined with AND.
filter(predictions, year >= 1900, year <= 2000)
# Build the cleaned table used from here on (this overwrites `predictions`):
#   1. keep rows whose `shadow` is TRUE or FALSE, i.e. drop missing outcomes
#      (assumes `shadow` was parsed as a logical column, so the only other
#      value is NA);
#   2. keep the first row for each (year, id) pair so no groundhog has two
#      rows in the same year, retaining all other columns;
#   3. sort from the most recent year to the oldest.
predictions <- predictions |>
  filter(!is.na(shadow)) |>
  distinct(year, id, .keep_all = TRUE) |>
  arrange(desc(year))
# Group the predictions by year. The rows are returned unchanged; the grouping
# only matters once a grouped verb such as summarize() is applied.
group_by(predictions, year)

# Number of predictions made in each year (one row per year).
predictions |>
  count(year, name = "n_predictions")

# Number of different groundhogs that made a prediction in each year, with the
# busiest years first: reduce to unique (year, id) pairs, then count per year.
predictions |>
  distinct(year, id) |>
  count(year, name = "n_groundhogs") |>
  arrange(desc(n_groundhogs))

# Earliest year in which each groundhog (id) made a prediction.
summarize(
  group_by(predictions, id),
  first_prediction = min(year)
)
# Back to the per-year table of prediction counts, now with a second column
# for how many of those predictions were "saw its shadow". `shadow` is
# logical, so summing it counts the TRUE values.
predictions |>
  group_by(year) |>
  summarize(
    n_predictions = n(),
    n_shadows = sum(shadow)
  )

# The same summary per groundhog, giving a table with three variables:
#   id            - groundhog id
#   n_predictions - total number of predictions the groundhog has made
#   n_shadows     - number of times the groundhog has seen its shadow
predictions |>
  group_by(id) |>
  summarize(
    n_predictions = n(),
    n_shadows = sum(shadow)
  )
# Count the characters in the `details` field, then move the new
# `details_length` column so that it sits directly after `id`.
predictions |>
  mutate(details_length = nchar(details)) |>
  relocate(details_length, .after = id)
# Flag predictions made by Punxsutawney Phil (the exercise treats id 1 as
# Phil). The comparison already yields a logical TRUE/FALSE column, so it is
# stored directly rather than converted to the strings 'TRUE'/'FALSE', which
# could not be used in filter(), sum(), or mean() without parsing them back.
predictions |>
  mutate(phil = id == 1)
# Label each prediction with its century. case_when() takes the first
# condition that is TRUE, so each later branch only sees years that the
# earlier branches did not match:
#   before 1900   -> 19
#   1900 to 1999  -> 20
#   2000 onwards  -> 21
# A missing year matches no branch and stays NA.
predictions |>
  mutate(
    century = case_when(
      year < 1900 ~ 19,
      year < 2000 ~ 20,
      year >= 2000 ~ 21
    )
  )
# Starting from the per-groundhog table of prediction and shadow counts, let's:
#   - keep only groundhogs with more than 5 predictions
#   - add shadow_percent, the share of predictions in which the groundhog saw
#     its shadow (stored as a proportion between 0 and 1, not multiplied
#     by 100)
#   - keep just id and shadow_percent, renaming id to groundhog_id
#   - store the result as groundhog_predictions
groundhog_predictions <- predictions |>
  group_by(id) |>
  summarize(
    n_predictions = n(),
    n_shadows = sum(shadow)
  ) |>
  filter(n_predictions > 5) |>
  mutate(shadow_percent = n_shadows / n_predictions) |>
  select(groundhog_id = id, shadow_percent)
# Left join: keep every row of groundhog_predictions and attach the matching
# groundhogs columns.
groundhog_predictions |>
  left_join(groundhogs, by = join_by(groundhog_id == id))

# Attach the groundhog_predictions columns to every row of groundhogs, written
# two ways: a right join from groundhog_predictions, or a left join starting
# from groundhogs with the key pair flipped.
groundhog_predictions |>
  right_join(groundhogs, by = join_by(groundhog_id == id))
groundhogs |>
  left_join(groundhog_predictions, by = join_by(id == groundhog_id))

# Inner join: keep only groundhogs whose key appears in both tables.
groundhog_predictions |>
  inner_join(groundhogs, by = join_by(groundhog_id == id))

# Full join: keep every row from both tables, including groundhogs that have
# no row in groundhog_predictions.
groundhog_predictions |>
  full_join(groundhogs, by = join_by(groundhog_id == id))
# Bonus exercises
# Write code to calculate the column predictions_count in groundhogs
# Write code to calculate the column is_groundhog in groundhogs
# Calculate the proportion of groundhogs from each country that make predictions each year
# Add a column to groundhogs indicating the first year each groundhog saw its shadow
```
### Bonus Exercises
```{r}
#### BONUS EXERCISE ANSWERS ####
# Write code to calculate the column predictions_count in groundhogs.
#
# The prompt implies `groundhogs` already carries a predictions_count column
# (TODO confirm against the data dictionary). A join with no `by` would then
# match on both id and predictions_count and add nothing new, so:
#   - any existing predictions_count is dropped first (any_of() makes this a
#     no-op if the column is absent), and
#   - the join key is stated explicitly as id.
# The count is taken from `predictions` as it stands at this point in the
# script, so it reflects any filtering or de-duplication applied earlier.
# Groundhogs with no rows in `predictions` get NA.
groundhogs |>
  select(-any_of("predictions_count")) |>
  left_join(
    count(predictions, id, name = "predictions_count"),
    by = join_by(id)
  )
# Write code to calculate the column is_groundhog in groundhogs.
# The comparison is already a logical vector (TRUE, FALSE, or NA when `type`
# is missing), so wrapping it in if_else(..., TRUE, FALSE) adds nothing.
groundhogs |>
  mutate(is_groundhog = type == "Groundhog")
# Calculate the proportion of groundhogs from each country that make
# predictions each year.
#   1. attach each prediction's groundhog details, joining explicitly on id
#      so other shared column names can never become join keys;
#   2. count predictions per (year, country);
#   3. within each year, divide each country's count by that year's total.
# Grouping by year is requested explicitly with `.by`, rather than relying on
# summarize() leaving the result grouped by year; the output is ungrouped.
# `percent` is a proportion between 0 and 1.
predictions |>
  left_join(groundhogs, by = join_by(id)) |>
  count(year, country) |>
  mutate(percent = n / sum(n), .by = year)
# Add a column to groundhogs indicating the first year each groundhog saw its
# shadow.
# Keep only the predictions where a shadow was seen, take the earliest year
# per groundhog, and join that onto groundhogs by id. The key is given
# explicitly so other shared column names cannot change the match. Groundhogs
# that never saw their shadow, or have no predictions, get NA.
groundhogs |>
  left_join(
    predictions |>
      filter(shadow) |>
      summarize(first_shadow = min(year), .by = id),
    by = join_by(id)
  )
```