-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path99_data_cleaning.R
128 lines (121 loc) · 5.32 KB
/
99_data_cleaning.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
library(tidyverse)
# Internet Usage ----
# Read the raw internet-usage table, skipping its first two lines and
# supplying our own column names, then write a cleaned copy. write_csv()
# returns its input invisibly, so `df` ends up holding the cleaned tibble.
df <- read_csv(
  "data/originals/SYB61_T29_Internet Usage.csv",
  skip = 2,
  col_names = c("id", "area", "year", "series", "value", "footnotes", "source")
) %>%
  # The cleaned file exposes `area` under the name `country`.
  rename(country = area) %>%
  # Scale `value` down by 100 (presumably percent -> proportion; confirm
  # against the source table).
  mutate(value = value / 100) %>%
  write_csv("data/Internet_Usage_by_Country.csv")
# Marvel Characters ----
# Clean the Marvel wiki export: snake_case the column names, strip clutter
# from `name`, and remove the literal word "Characters" (plus surrounding
# whitespace) from `sex` and `align`.
df <- read_csv("data/originals/marvel_wiki_data.csv") %>%
  janitor::clean_names() %>%
  mutate(
    # Three removals are applied to `name`, in this order:
    #   1. '\\(.+\\)^'     NOTE(review): `^` comes after the parenthesised
    #                      group, so this pattern looks like it can never
    #                      match and the step is a no-op. Kept as-is because
    #                      switching to `$` would change the output data.
    #   2. '\\\\.+\\\\"'   from a literal backslash through the last
    #                      backslash-plus-double-quote.
    #   3. '\\(Earth-616\\)'  the literal "(Earth-616)" tag.
    name = name %>%
      str_remove('\\(.+\\)^') %>%
      str_remove('\\\\.+\\\\"') %>%
      str_remove('\\(Earth-616\\)'),
    sex = str_trim(str_remove(sex, "Characters")),
    align = str_trim(str_remove(align, "Characters"))
  ) %>%
  write_csv("data/Marvel_Characters.csv")
# Marvel Characters Tall ----
# Build a long table with one row per (year, sex) pair and the number of
# characters in that pair, read from data/Marvel_Characters.csv.
df <- read_csv("data/Marvel_Characters.csv") %>%
  # Drop rows missing either key before aggregating, so NA groups are never
  # formed. The surviving groups and their counts are unchanged.
  filter(!is.na(year), !is.na(sex)) %>%
  group_by(year, sex) %>%
  summarise(characters = n()) %>%
  # summarise() only removes the last grouping level. Ungroup explicitly so
  # `df` is a plain tibble rather than one still grouped by `year`.
  ungroup() %>%
  write_csv("data/Marvel_Characters_tall.csv")
# Fingerspelling Fluency ----
# Convert the SPSS data set to CSV: snake_case the column names, drop
# `raw_b_span_man`, and give the remaining raw-score columns shorter names.
df <- haven::read_sav(
  "data/originals/Fingerspelling and Fluency PLoS Data Set.sav"
) %>%
  janitor::clean_names() %>%
  select(-raw_b_span_man) %>%
  rename(
    id                 = vl2id,
    piat               = raw_piatr,
    wj_reading_fluency = raw_rf_wj,
    kbit               = raw_kb_matrices,
    asl_srt            = raw_aslsrt,
    fingerspelling     = raw_tc_fst
  ) %>%
  write_csv("data/Stone_etal_PLoS_2015_Fingerspelling.csv")
# GoT Data (not checked into Github) ----
# Read the first sheet of the GoT workbook, replace coded values with text
# labels, rename the death/exposure columns, drop unused columns, move the
# key columns to the front, and write the result as CSV.
df <- readxl::read_xlsx("data/originals/GoTdata_FINAL.xlsx", sheet = 1) %>%
  # Each column is recoded from its own codes only, so all the relabelling
  # lives in a single mutate().
  # NOTE: location, continent, time_of_day and occupation are relabelled
  # here but removed by the select() further down, so those labels never
  # reach the CSV.
  mutate(
    sex = recode(
      sex,
      "1" = "Male",
      "2" = "Female",
      "9" = "Unknown/Unclear"
    ),
    social_status = recode(
      social_status,
      "1" = "Highborn",
      "2" = "Lowborn",
      "9" = "Unknown/Unclear/Other"
    ),
    allegiance_last = recode(
      allegiance_last,
      "1" = "Stark",
      "2" = "Targaryen",
      "3" = "Night's Watch",
      "4" = "Lannister",
      "5" = "Greyjoy",
      "6" = "Bolton",
      "7" = "Frey",
      "8" = "Other",
      "9" = "Unknown/Unclear"
    ),
    allegiance_switched = recode(
      allegiance_switched,
      "1" = "No",
      "2" = "Yes",
      "9" = "Unknown/Unclear"
    ),
    location = recode(
      location,
      "1" = "Indoors",
      "2" = "Outdoors",
      "9" = "Unknown/Unclear"
    ),
    continent = recode(
      continent,
      "1" = "Westeros",
      "2" = "Essos",
      "9" = "Unknown/Unclear"
    ),
    time_of_day = recode(
      time_of_day,
      "1" = "Day",
      "2" = "Night",
      "9" = "Unknown/Unclear"
    ),
    prominence_cat = recode(
      prominence_cat,
      "1" = "Low",
      "2" = "Medium",
      "3" = "High"
    ),
    dth_flag = recode(
      dth_flag,
      "0" = "Alive",
      "1" = "Dead"
    ),
    religion = recode(
      religion,
      "1" = "Great Stallion",
      "2" = "Lord of Light",
      "3" = "Faith of the Seven",
      "4" = "Old Gods",
      "5" = "Drowned God",
      "6" = "Many Faced God",
      "7" = "Great Shepard",
      "8" = "White Walkers",
      "9" = "Unknown",
      "10" = "Ghiscari",
      "11" = "None"
    ),
    occupation = recode(
      occupation,
      "1" = "Silk collar",
      "2" = "Boiled leather collar",
      "9" = "Unknown/unclear"
    )
  ) %>%
  # First round of renames; `status` must exist before the reordering
  # select() below refers to it.
  rename(
    house_last       = allegiance_last,
    intro_sec        = intro_time_sec,
    death_season     = dth_season,
    death_episode    = dth_episode,
    death_sec        = dth_time_sec,
    status           = dth_flag,
    lifespan_season  = exp_season,
    lifespan_episode = exp_episode,
    lifespan_sec     = exp_time_sec
  ) %>%
  # Drop columns that are not exported. Removing `prominence` and
  # `diagnosis` here frees those names for the final rename().
  select(-c(
    censor_time_sec, prominence, occupation, COD, COD_text, continent,
    place, place_text, time_of_day, featured_episode_count, diagnosis,
    location
  )) %>%
  # Move the identifying columns to the front, keeping the rest in order.
  select(ID, name, status, prominence_cat, everything()) %>%
  rename(
    prominence = prominence_cat,
    death_how  = dth_description,
    diagnosis  = diagnosis_text
  ) %>%
  write_csv("data/GoT_data.csv")
# ASL-LEX ----
# Re-export the ASL-LEX sign data with snake_case column names and without
# the `x45` column (presumably an unnamed trailing column in the raw file;
# confirm against SignData.csv).
df <- read_csv("data/originals/SignData.csv") %>%
  janitor::clean_names() %>%
  select(-c(x45)) %>%
  write_csv("data/ASL-LEX_Sign_Data.csv")
# Tourism Data ----
# Build a wide table of tourist/visitor arrivals: one row per country and
# one column per year. The first line of the raw file is skipped.
df <- read_csv(
  "data/originals/SYB61_T30_Tourist-Visitors Arrival and Expenditure.csv",
  skip = 1
) %>%
  janitor::clean_names() %>%
  # Keep only the arrivals series; the file holds other series too.
  filter(series == "Tourist/visitor arrivals (thousands)") %>%
  # Keep the three columns needed, renaming `x2` to `country` on the way.
  select(country = x2, year, value) %>%
  spread(year, value) %>%
  write_csv("data/Tourism_Visitor_Arrivals.csv")