-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwvsAll.R
284 lines (209 loc) · 10.8 KB
/
wvsAll.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
library(plyr)
library(dplyr)
library(stringr)
library(psych)
library(lavaan)
library(MplusAutomation)
# Wave 5
# Load the wave 5 release and keep it under the short name `wvs5`; the
# long-named object created by load() is then removed from the workspace.
load("WV5_Data_R_v_2015_04_18.rdata")
wvs5 <- WV5_Data_r_v_2015_04_18
rm(WV5_Data_r_v_2015_04_18)

# Keep only the items used below, giving each questionnaire code its analysis
# name in the same step (new_name = questionnaire_code).
wvs5 <- select(
  wvs5,
  country       = V2,
  year          = V260,
  female        = V235,
  race          = V256,
  age           = V237,
  incdecile     = V253,
  religious     = V187,
  religserv     = V186,
  education     = V238,
  leftright     = V114,
  postmater     = Y001,
  control       = V46,
  trust         = V47,
  scibetlife    = V91,
  scinextgen    = V92,
  scichangefast = V93,
  scinotenfaith = V94,
  scibetoff     = V123
)

# Country lookup table. Each entry in the first column of the CSV is split on
# "##": the first piece is stored as the numeric code, the second as the
# country name. `cc` is used again for wave 6.
cc <- read.csv(file = "Country Codes.csv", header = FALSE)
cc$code <- vapply(
  cc[, 1],
  function(entry) as.numeric(str_split(entry, "##")[[1]][1]),
  numeric(1)
)
cc$country <- vapply(
  cc[, 1],
  function(entry) str_split(entry, "##")[[1]][2],
  character(1)
)

# Replace the numeric country code by the country name (codes that are not in
# the lookup table become NA).
wvs5$country <- cc$country[match(wvs5$country, cc$code)]

# Values -5 through -1 are recoded to NA in every column. The number of such
# values is printed before and after the recode; the second count is 0.
sum(vapply(wvs5, function(col) sum(col %in% -5:-1), integer(1)))
wvs5[] <- lapply(wvs5, function(col) {
  col[col %in% -5:-1] <- NA
  col
})
sum(vapply(wvs5, function(col) sum(col %in% -5:-1), integer(1)))

# female: 1 -> 0, 2 -> 1, stored as a factor.
wvs5$female <- as.factor(
  mapvalues(wvs5$female, from = c(1, 2), to = c(0, 1))
)

# religious: 1 stays 1, 2 and 3 both become 0, stored as a factor.
wvs5$religious <- as.factor(
  mapvalues(wvs5$religious, from = c(1, 2, 3), to = c(1, 0, 0))
)

# Reverse the 1-7 scale of religserv and the 1-10 scale of two science items.
# NOTE(review): presumably so that higher values point in the same direction
# as the remaining items -- confirm against the codebook.
wvs5$religserv <- mapvalues(
  wvs5$religserv,
  from = as.numeric(1:7),
  to   = as.numeric(7:1)
)
wvs5$scichangefast <- mapvalues(
  wvs5$scichangefast,
  from = as.numeric(1:10),
  to   = as.numeric(10:1)
)
wvs5$scinotenfaith <- mapvalues(
  wvs5$scinotenfaith,
  from = as.numeric(1:10),
  to   = as.numeric(10:1)
)

# Tag every row with the wave it comes from.
wvs5$wave <- "five"
# Wave 6
# Load the wave 6 release and keep it under the short name `wvs6`; the object
# created by load() is then removed from the workspace.
load("WV6_Data_R_v_2016_01_01.rdata")
wvs6 <- WV6_Data_R
rm(WV6_Data_R)

# Keep only the items used below, giving each questionnaire code its analysis
# name in the same step (new_name = questionnaire_code).
wvs6 <- select(
  wvs6,
  country       = V2,
  year          = V262,
  female        = V240,
  race          = V254,
  age           = V242,
  incdecile     = V239,
  religious     = V147,
  religserv     = V145,
  education     = V248,
  leftright     = V95,
  postmater     = Y001,
  control       = V55,
  trust         = V56,
  scibetlife    = V192,
  scinextgen    = V193,
  scinotenfaith = V194,
  scirightwrong = V195,
  sciimportant  = V196,
  scibetoff     = V197
)

# Replace the numeric country code by the country name, using the `cc` lookup
# table built in the wave 5 section (codes not in the table become NA).
wvs6$country <- cc$country[match(wvs6$country, cc$code)]

# Values -5 through -1 are recoded to NA in every column. The number of such
# values is printed before and after the recode; the second count is 0.
sum(vapply(wvs6, function(col) sum(col %in% -5:-1), integer(1)))
wvs6[] <- lapply(wvs6, function(col) {
  col[col %in% -5:-1] <- NA
  col
})
sum(vapply(wvs6, function(col) sum(col %in% -5:-1), integer(1)))

# female: 1 -> 0, 2 -> 1, stored as a factor.
wvs6$female <- as.factor(
  mapvalues(wvs6$female, from = c(1, 2), to = c(0, 1))
)

# religious: 1 stays 1, 2 and 3 both become 0, stored as a factor.
wvs6$religious <- as.factor(
  mapvalues(wvs6$religious, from = c(1, 2, 3), to = c(1, 0, 0))
)

# Reverse the 1-7 scale of religserv and the 1-10 scale of three science items.
# NOTE(review): presumably so that higher values point in the same direction
# as the remaining items -- confirm against the codebook.
wvs6$religserv <- mapvalues(
  wvs6$religserv,
  from = as.numeric(1:7),
  to   = as.numeric(7:1)
)
wvs6$scinotenfaith <- mapvalues(
  wvs6$scinotenfaith,
  from = as.numeric(1:10),
  to   = as.numeric(10:1)
)
wvs6$scirightwrong <- mapvalues(
  wvs6$scirightwrong,
  from = as.numeric(1:10),
  to   = as.numeric(10:1)
)
wvs6$sciimportant <- mapvalues(
  wvs6$sciimportant,
  from = as.numeric(1:10),
  to   = as.numeric(10:1)
)

# Tag every row with the wave it comes from.
wvs6$wave <- "six"
# We start by using as many countries as possible. To do that, we drop the
# countries that have no values on the science questions.
c5.1 <- unique(wvs5$country)
c6.1 <- unique(wvs6$country)
length(c5.1)
length(c6.1)
length(intersect(c5.1, c6.1))
nrow(wvs5)
nrow(wvs6)
# At this point we have 58 countries in wave 5 and 60 in wave 6, of which 38
# are common to both waves. In total, we have 83975 observations in wave 5 and
# 90350 in wave 6. We now keep only the observations that have a value on
# every science question asked in their wave.
wvs5.sci <- wvs5 %>%
  filter(
    !is.na(scibetlife),
    !is.na(scinextgen),
    !is.na(scichangefast),
    !is.na(scinotenfaith),
    !is.na(scibetoff)
  )
wvs6.sci <- wvs6 %>%
  filter(
    !is.na(scibetlife),
    !is.na(scinextgen),
    !is.na(scirightwrong),
    !is.na(scinotenfaith),
    !is.na(scibetoff),
    !is.na(sciimportant)
  )
c5.2 <- unique(wvs5.sci$country)
c6.2 <- unique(wvs6.sci$country)
length(c5.2)
length(c6.2)
length(intersect(c5.2, c6.2))
nrow(wvs5.sci)
nrow(wvs6.sci)
# We lost 11 countries in wave 5, and none in wave 6. We now have 59580
# observations in wave 5, and 81279 in wave 6, and only 32 countries remain
# in both waves. The countries we lost because they don't ask the science
# questions are:
setdiff(c5.1, c5.2)
# "Argentina" "France" "Guatemala" "Hong Kong" "Iran"
# "Iraq" "Netherlands" "New Zealand" "Russia" "Switzerland"
# "Great Britain"

# Next, keep only the observations that have a value on every one of our main
# covariates.
wvs5.sci.no.na <- wvs5.sci %>%
  filter(
    !is.na(female),
    !is.na(age),
    !is.na(incdecile),
    !is.na(religious),
    !is.na(education),
    !is.na(leftright),
    !is.na(trust)
  )
nrow(wvs5.sci.no.na)
wvs6.sci.no.na <- wvs6.sci %>%
  filter(
    !is.na(female),
    !is.na(age),
    !is.na(incdecile),
    !is.na(religious),
    !is.na(education),
    !is.na(leftright),
    !is.na(trust)
  )
nrow(wvs6.sci.no.na)
# This leaves us with 41644 observations in wave 5 and 59692 in wave 6.
# Since we've lost plenty of observations, let's see whether this implies the
# loss of any specific countries, or whether the losses are spread across the
# data set.
c5.3 <- unique(wvs5.sci.no.na$country)
length(c5.3)
c6.3 <- unique(wvs6.sci.no.na$country)
length(c6.3)
# In wave 5, we went from 47 countries to 44, and in wave 6 we went from 60 to
# 53 countries, so we lost 3 and 7 respectively. Let's see which they are:
setdiff(c5.2, c5.3)
# The three that we lost in wave 5 are China, India, and Malaysia. To see why,
# count the non-missing values of every variable within each of them.
vapply(
  wvs5.sci,
  function(v) sum(!is.na(v[wvs5.sci$country == "China"])),
  integer(1)
)
vapply(
  wvs5.sci,
  function(v) sum(!is.na(v[wvs5.sci$country == "India"])),
  integer(1)
)
vapply(
  wvs5.sci,
  function(v) sum(!is.na(v[wvs5.sci$country == "Malaysia"])),
  integer(1)
)
# We see that China is missing the 'leftright' variable, India is missing the
# 'trust' (and 'race') variable, and Malaysia is also missing the 'leftright'
# variable.
setdiff(c6.2, c6.3)
# In wave 6, we go from 60 to 53 countries, so we lost 7. These are: China,
# Egypt, Jordan, Kuwait, New Zealand, Qatar, Singapore. Again, count the
# non-missing values of every variable within each of them to see why.
vapply(
  wvs6.sci,
  function(v) sum(!is.na(v[wvs6.sci$country == "China"])),
  integer(1)
)
vapply(
  wvs6.sci,
  function(v) sum(!is.na(v[wvs6.sci$country == "Egypt"])),
  integer(1)
)
vapply(
  wvs6.sci,
  function(v) sum(!is.na(v[wvs6.sci$country == "Jordan"])),
  integer(1)
)
vapply(
  wvs6.sci,
  function(v) sum(!is.na(v[wvs6.sci$country == "Kuwait"])),
  integer(1)
)
vapply(
  wvs6.sci,
  function(v) sum(!is.na(v[wvs6.sci$country == "New Zealand"])),
  integer(1)
)
vapply(
  wvs6.sci,
  function(v) sum(!is.na(v[wvs6.sci$country == "Qatar"])),
  integer(1)
)
vapply(
  wvs6.sci,
  function(v) sum(!is.na(v[wvs6.sci$country == "Singapore"])),
  integer(1)
)
# We lost China, Jordan, Kuwait (also missing 'race', 'religserv' and
# 'postmater'), Qatar (also missing 'race' and 'religserv') and Singapore
# because they are missing the 'leftright' variable.
# Egypt is missing 'race' and 'religious' (but it does have 'religserv'!).
# New Zealand is missing 'trust'.
length(intersect(c5.3, c6.3))
# So, to sum up, we are left with 44 countries in wave 5 and 53 in wave 6, of
# which 27 are common to both. In total, then, we have...
44 + 53 - 27
# ... 70 countries
# Now we bind together waves 5 and 6 into a single dataset. rbind.fill() is
# used because the two waves do not carry the same set of science items.
wvsAll <- rbind.fill(wvs5.sci.no.na, wvs6.sci.no.na)
wvsAll$wave <- as.factor(wvsAll$wave)

# Squared age term.
wvsAll$age2 <- with(wvsAll, age * age)

# Replace three country labels by the shorter names used from here on; the
# replacements are applied one after another.
country_fixes <- c(
  "Cyprus (G)"            = "Cyprus",
  "Viet Nam"              = "Vietnam",
  "Serbia and Montenegro" = "Serbia"
)
for (old_label in names(country_fixes)) {
  wvsAll$country[wvsAll$country == old_label] <- country_fixes[[old_label]]
}

# Indicator for the United States (1 = United States, 0 = any other country),
# stored as a factor; country itself is turned into a factor afterwards.
wvsAll$US <- as.factor(ifelse(wvsAll$country == "United States", 1, 0))
wvsAll$country <- as.factor(wvsAll$country)

# Snapshot of the individual-level data, before country variables are added.
save(wvsAll, file = "wvsAll-no.country.vars.Rda")
# Now we need to bring in the country-level variables. They are matched to the
# individual-level rows on country and year.
# NOTE(review): `wvsAll$country` has been converted to a factor earlier in the
# script; if `countryvariables$country` is stored as character, the join key
# types differ -- confirm the keys match as intended.
load("countryvariables.Rda")
wvsAll <- left_join(wvsAll, countryvariables, by = c("country", "year"))

# Before we can scale the variables, we need to know what the final analytical
# frame is, so that we actually scale based on the cases that are used in the
# analysis
# ALL: we start by using all the data we have
scale_vars <- c(
  # individual-level covariates
  "age", "age2", "incdecile", "education", "leftright", "religserv",
  "postmater", "control", "trust",
  # science items
  "scibetlife", "scinextgen", "scichangefast", "scinotenfaith", "scibetoff",
  "scirightwrong", "sciimportant",
  # country-level variables brought in by the join above
  "gdp", "gini", "governance", "polstab", "mleftright", "mleft", "mright"
)

# Fail early, with the offending names, if an expected column is absent
# (scale() on a missing column would fail anyway, with a less helpful message).
absent <- setdiff(scale_vars, names(wvsAll))
if (length(absent) > 0) {
  stop(
    "Columns to standardise not found in wvsAll: ",
    paste(absent, collapse = ", "),
    call. = FALSE
  )
}

# Standardise each variable (centre on its mean, divide by its standard
# deviation) over all rows of wvsAll.
for (v in scale_vars) {
  wvsAll[[v]] <- scale(wvsAll[[v]])
}

# scale() returns a one-column matrix carrying "scaled:center" and
# "scaled:scale" attributes; strip them so every column is a plain vector
# again. is.matrix() is used instead of `class(x) == "matrix"` because, since
# R 4.0.0, class() of a matrix is c("matrix", "array"): the comparison then has
# length 2, which if() rejects with an error from R 4.2.0 onwards.
for (i in seq_along(wvsAll)) {
  if (is.matrix(wvsAll[[i]])) {
    attributes(wvsAll[[i]]) <- NULL
  }
}

# Write the Mplus data file (and print the matching Mplus syntax), keeping only
# the columns listed here.
prepareMplusData(
  wvsAll,
  "mplusAll",
  keepCols = c(
    "country", "year", "female", "age", "incdecile", "religious",
    "religserv", "education", "leftright", "trust", "scibetlife",
    "scinextgen", "scichangefast", "scinotenfaith", "scibetoff", "wave",
    "scirightwrong", "sciimportant", "age2", "US", "gdp", "gini",
    "mleftright", "mleft", "mright"
  )
)