-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlife-expectancy-by-usa-state-2016.R
118 lines (95 loc) · 3.09 KB
/
life-expectancy-by-usa-state-2016.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
# generates 6 maps of Overall, White, Black, Latino, Asian & Native American life
# expectancies by state in 2016 using a single mapply at end of script;
# explores statistical relationship between the life expectancies
#==============
# LOAD PACKAGES
#==============
library(tidyverse)
library(sf)
library(rvest)
library(stringr)
library(scales)
library(viridis)
#============
# SCRAPE DATA
#============
# Download the Wikipedia page and collect every <table> node on it.
le_url <- "https://en.wikipedia.org/wiki/List_of_U.S._states_and_territories_by_life_expectancy"
le_tables <- read_html(le_url) %>%
  html_nodes("table")

# The rest of the script uses the second table on the page. Fail with a clear
# message (instead of a bare subscript error) if the page layout has changed.
if (length(le_tables) < 2) {
  stop("Expected at least 2 tables at ", le_url, " but found ",
       length(le_tables), call. = FALSE)
}

# Parse the second table into a data frame (TRUE spelled out: `T` can be
# reassigned).
df.le <- html_table(le_tables[[2]], fill = TRUE)

# inspect
glimpse(df.le)
str(df.le)
#============
# CLEAN DATA
#============
# delete superfluous first 3 rows
df.le <- df.le[-(1:3), ]
head(df.le)
str(df.le)

# delete rank columns (the first two columns)
df.le <- df.le[, -(1:2)]
head(df.le)
str(df.le)

# only keep cols that matter (the first seven remaining columns)
df.le <- df.le[, 1:7]
head(df.le)
str(df.le)

# change col names
colnames(df.le) <- c("state", "le", "le_white", "le_black", "le_asian", "le_latino", "le_native")
head(df.le)
str(df.le)

# convert all cols except state name to numeric.
# lapply() always returns one vector per column, so the assignment is stable
# for both data.frames and tibbles (sapply() returns a matrix, or a plain
# vector for a single row). Cells that are not numbers become NA, with a
# coercion warning from as.numeric().
le_cols <- setdiff(colnames(df.le), "state")
df.le[le_cols] <- lapply(df.le[le_cols], as.numeric)
head(df.le)
str(df.le)
#============
# EXPLORE DATA
#============
# Pairwise correlation tests between the state-level life expectancy columns,
# using cor.test() with its default settings. The number after each call is the
# estimate the author recorded when the script was written; it will differ if
# the scraped table has changed since then.
cor.test(df.le$le_white, df.le$le_black) # 0.387
cor.test(df.le$le_white, df.le$le_latino) # -0.0664
cor.test(df.le$le_white, df.le$le_asian) # 0.00836
cor.test(df.le$le_white, df.le$le_native) # -0.296
# NOTE(review): the author labels the next result "autocorrelation"; presumably
# the point is that the overall figure already includes the white population,
# so a high correlation is expected - confirm.
cor.test(df.le$le_white, df.le$le) # 0.755 # autocorrelation
cor.test(df.le$le_black, df.le$le_latino) # -0.184
#========
# GET MAP
#========
# Polygon outlines for the "state" map, one row per polygon vertex.
map.states <- map_data("state")
str(map.states)

# transform all state names in df.le to lowercase to match statenames in map.states
df.le$region <- tolower(df.le$state)
head(df.le)
str(df.le)

# Report map regions that will get no life expectancy data, instead of letting
# them turn into NA silently in the merge below.
unmatched <- setdiff(unique(map.states$region), df.le$region)
if (length(unmatched) > 0) {
  message("No life expectancy data for: ", paste(unmatched, collapse = ", "))
}

# merge the datasets, keeping every map vertex even when a state has no data
states <- merge(map.states, df.le, by = "region", all.x = TRUE)

# merge() does not promise to keep the original row order, and geom_polygon()
# connects vertices in row order, so restore the drawing sequence stored in
# the `order` column.
states <- states[order(states$order), ]
head(states)
str(states)
#=====
# PLOT
#=====
# plot multiple maps by race
# Build one choropleth of the polygons in the global `states` data frame,
# shaded by a single life expectancy series.
#
# Args:
#   race:  vector used for the fill aesthetic; it is evaluated alongside the
#          rows of `states`.
#   title: text placed in front of "Life Expectancy, 2016" in the plot title.
#
# Returns: the ggplot object (nothing is printed inside the function).
le_by_race <- function(race, title) {
  heading <- paste(title, "Life Expectancy, 2016", " ")
  source_note <- " source: https://en.wikipedia.org/wiki/\n\tList_of_U.S._states_and_territories_by_life_expectancy"

  # pale-to-dark green gradient; missing values are drawn in light grey
  fill_scale <- scale_fill_gradient(
    name = "years",
    low = "#cefaf2",
    high = "#095b4b",
    guide = "colorbar",
    na.value = "#eeeeee"
  )

  map_theme <- theme(
    panel.background = element_rect(fill = "white"), # white background for map
    text = element_text(color = "#464646", family = "American Typewriter"),
    plot.background = element_blank(),
    axis.text = element_blank(),   # no axis labels
    axis.ticks = element_blank(),  # no axis tick marks
    plot.title = element_text(size = 16),
    plot.subtitle = element_text(size = 10)
  )

  ggplot(data = states, aes(x = long, y = lat, group = group, fill = race)) +
    geom_polygon(color = "white") +
    fill_scale +
    labs(x = NULL, y = NULL, title = heading, subtitle = source_note) +
    map_theme
}
# "" will represent the overall graph
# Map each life expectancy column of `states` to the title shown on its map.
# Pairing by column NAME keeps the title and the data in step: the previous
# positional slice (columns 8:13) follows the order le, white, black, asian,
# latino, native, which put the "Latino" title on the le_asian map and the
# "Asian" title on the le_latino map.
# NOTE(review): this relies on the column names assigned during cleaning
# matching the source table's real column order - confirm against the page.
titles <- c(
  le = "",
  le_white = "White",
  le_black = "Black",
  le_latino = "Latino",
  le_asian = "Asian",
  le_native = "Native American"
)
head(states)

# extract life expectancy columns, in the same order as `titles`
s <- states[, names(titles)]

# one map per column; SIMPLIFY = FALSE keeps the result a list of ggplot objects
mapply(le_by_race, s, titles, SIMPLIFY = FALSE)