-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcountryvariablesAll.R
453 lines (348 loc) · 17.6 KB
/
countryvariablesAll.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
library(WDI)
library(stringr)
library(dplyr)
### Country-level variables
# OBSOLETE NOTE (apparently superseded; the script no longer works this way):
# We start by reading from disk the 'countryvariables' data frame. It already
# contains, from when I first created it, the columns for indicators for gdp per
# capita, gini, and governance (and some of the data for them, some of it added
# by hand because it was not readily accessible), but we need to add those for
# political stability and polarization
# END OF OBSOLETE NOTE
# Let's first load the wvsAll data set without the country variables, which we
# will use to create our countryvariables data frame by extracting all the
# country-year combinations
# Build the skeleton of 'countryvariables': one row for every country-year
# combination present in the survey data, sorted by country.
load("wvsAll-no.country.vars.Rda")
countryvariables <- unique(wvsAll[, c("country", "year")])
rownames(countryvariables) <- NULL # renumber the rows consecutively
countryvariables <- countryvariables %>% arrange(country)

# Placeholder columns, in the exact order the rest of the script relies on
# (later sections address them by position). For every indicator we keep the
# value, the year the value refers to (we can't always find data for the exact
# survey year) and the source of the data. Indicators: GDP, governance, gini
# coefficient, political stability, and the left-right ideology scores.
placeholder_columns <- list(
  gdp = NA, gdpyear = NA, gdpsource = "",
  governance = NA, govyear = NA, govsource = "",
  gini = NA, giniyear = NA, ginisource = "",
  polstab = NA, polstabyear = NA, polstabsource = "",
  mleftright = NA, mleft = NA, mright = NA, lryear = NA, lrsource = ""
)
for (column_name in names(placeholder_columns)) {
  countryvariables[[column_name]] <- placeholder_columns[[column_name]]
}
rm(placeholder_columns, column_name)
### GDP per capita and GINI coefficient, both from the World Bank's World
### Development Indicators (WDI)

# WDI country names that differ from the names used in our data set, mapped to
# the spelling used in 'countryvariables'
wdi_country_names <- c(
  "Russian Federation" = "Russia",
  "Korea, Rep." = "South Korea",
  "Iran, Islamic Rep." = "Iran",
  "Egypt, Arab Rep." = "Egypt",
  "Hong Kong SAR, China" = "Hong Kong",
  "Kyrgyz Republic" = "Kyrgyzstan",
  "Yemen, Rep." = "Yemen"
)

# Download one WDI indicator for the years start..end and rename its countries
# so that they can be matched to the countries in our data set.
download_wdi <- function(indicator, start, end) {
  wdi <- WDI(indicator = indicator, start = start, end = end)
  renamed <- wdi$country %in% names(wdi_country_names)
  wdi$country[renamed] <-
    unname(wdi_country_names[as.character(wdi$country[renamed])])
  wdi
}

# Fill one indicator of 'countryvariables' from a WDI download.
#
# For every row whose value_col is still NA, the non-missing observation of
# 'indicator' closest in time to the survey year is stored (divided by
# 'divisor'), together with the year it refers to and the source "WDI". An
# observation for the survey year itself is at distance 0 and therefore always
# wins; ties are broken in favour of the first row of the download. Rows whose
# country has no usable observation get NA for value and year.
#
# Returns a list with
#   data:      the updated data frame
#   not_found: names of the countries (one entry per row) that do not appear
#              in the WDI download at all
fill_from_wdi <- function(countryvariables, wdi, indicator,
                          value_col, year_col, source_col, divisor = 1) {
  not_found <- character(0)
  for (row in seq_len(nrow(countryvariables))) {
    # as.character() works whether 'country' is stored as factor or character
    country <- as.character(countryvariables$country[row])
    year <- countryvariables$year[row]
    in_country <- wdi$country %in% country
    if (!any(in_country)) {
      not_found <- c(not_found, country)
      next
    }
    if (!is.na(countryvariables[[value_col]][row])) {
      next # never overwrite a value that is already filled in
    }
    values <- wdi[[indicator]][in_country]
    years <- wdi$year[in_country]
    observed <- !is.na(values)
    if (!any(observed)) {
      countryvariables[row, value_col] <- NA
      countryvariables[row, year_col] <- NA
      next
    }
    nearest <- which.min(abs(years[observed] - year))
    countryvariables[row, value_col] <- values[observed][nearest] / divisor
    countryvariables[row, year_col] <- years[observed][nearest]
    countryvariables[row, source_col] <- "WDI"
  }
  list(data = countryvariables, not_found = not_found)
}

## GDP per capita
gdp <- download_wdi("NY.GDP.PCAP.PP.KD", start = 2005, end = 2015)
gdp_fill <- fill_from_wdi(
  countryvariables, gdp, "NY.GDP.PCAP.PP.KD",
  value_col = "gdp", year_col = "gdpyear", source_col = "gdpsource"
)
countryvariables <- gdp_fill$data
nodatagdp <- gdp_fill$not_found
# Countries not available in the WDIs for GDP:
nodatagdp # Palestine and Taiwan (for both waves)
# Countries without GDP data:
countryvariables$country[is.na(countryvariables$gdp)]
# Andorra, Palestine and Taiwan (for both waves)

## GINI coefficient (WDI reports it on a 0-100 scale; we store it as 0-1)
gini <- download_wdi("SI.POV.GINI", start = 1990, end = 2015)
gini_fill <- fill_from_wdi(
  countryvariables, gini, "SI.POV.GINI",
  value_col = "gini", year_col = "giniyear", source_col = "ginisource",
  divisor = 100
)
countryvariables <- gini_fill$data
nodatagini <- gini_fill$not_found
# Countries not available in the WDIs for GINI:
nodatagini # Palestine and Taiwan (for both waves)
# Countries without GINI data:
countryvariables$country[is.na(countryvariables$gini)]
# Algeria, Andorra, Bahrain, Egypt, Hong Kong, Iraq, Jordan, Lebanon, Libya,
# Palestine, South Korea (for both waves), Taiwan (for both waves), and Yemen
## GOVERNANCE and POLITICAL STABILITY: government effectiveness and political
# stability both come from the Worldwide Governance Indicators (WGI).
# http://info.worldbank.org/governance/wgi/index.aspx#home

# WGI country names (after conversion to title case) that differ from the names
# used in our data set, mapped to the spelling used in 'countryvariables'
wgi_country_names <- c(
  "Russian Federation" = "Russia",
  "Korea, Rep." = "South Korea",
  "Iran, Islamic Rep." = "Iran",
  "Egypt, Arab Rep." = "Egypt",
  "Hong Kong Sar, China" = "Hong Kong",
  "Kyrgyz Republic" = "Kyrgyzstan",
  "Taiwan, China" = "Taiwan",
  "Trinidad And Tobago" = "Trinidad and Tobago",
  "Yemen, Rep." = "Yemen"
)

# Read a WGI csv file ("#N/A" marks missing values), change the country names
# to title case and rename them to match the countries in our data set.
read_wgi <- function(path) {
  wgi <- read.csv(path, na.strings = "#N/A")
  wgi$country <- str_to_title(wgi$country)
  renamed <- wgi$country %in% names(wgi_country_names)
  wgi$country[renamed] <- unname(wgi_country_names[wgi$country[renamed]])
  wgi
}

# Fill one indicator of 'countryvariables' from a WGI table.
#
# The first row of the table holds the year each column refers to. For every
# row of 'countryvariables' whose value_col is still NA, the value is read from
# the fourth column labelled with the survey year, in the (first) table row of
# that country. If that value exists it is stored with the survey year and the
# source "WGI"; otherwise value and year are set to NA. A survey year with
# fewer than four matching columns is treated as "no data" instead of failing.
# NOTE(review): which statistic the fourth column of a year holds depends on
# the layout of the csv files, which is not visible here - confirm.
#
# Returns a list with
#   data:      the updated data frame
#   not_found: names of the countries (one entry per row) that do not appear
#              in the WGI table at all
fill_from_wgi <- function(countryvariables, wgi,
                          value_col, year_col, source_col) {
  not_found <- character(0)
  for (row in seq_len(nrow(countryvariables))) {
    # as.character() works whether 'country' is stored as factor or character
    country <- as.character(countryvariables$country[row])
    year <- countryvariables$year[row]
    wgi_row <- match(country, wgi$country)
    if (is.na(wgi_row)) {
      not_found <- c(not_found, country)
      next
    }
    if (!is.na(countryvariables[[value_col]][row])) {
      next # never overwrite a value that is already filled in
    }
    year_cols <- which(wgi[1, ] == year)
    value <- if (length(year_cols) >= 4) wgi[wgi_row, year_cols[4]] else NA
    if (!is.na(value)) {
      countryvariables[row, value_col] <- value
      countryvariables[row, year_col] <- year
      countryvariables[row, source_col] <- "WGI"
    } else {
      countryvariables[row, value_col] <- NA
      countryvariables[row, year_col] <- NA
    }
  }
  list(data = countryvariables, not_found = not_found)
}

## Government effectiveness
gov <- read_wgi("governance.csv")
gov_fill <- fill_from_wgi(
  countryvariables, gov,
  value_col = "governance", year_col = "govyear", source_col = "govsource"
)
countryvariables <- gov_fill$data
nodatagov <- gov_fill$not_found
# Countries not available in the WGIs for governance:
nodatagov # Palestine
# Countries without governance data:
countryvariables$country[is.na(countryvariables$governance)] # Palestine

## Political stability
polstab <- read_wgi("politicalstability.csv")
polstab_fill <- fill_from_wgi(
  countryvariables, polstab,
  value_col = "polstab", year_col = "polstabyear",
  source_col = "polstabsource"
)
countryvariables <- polstab_fill$data
nodatapol <- polstab_fill$not_found
# Countries not available in the WGIs for POLITICAL STABILITY:
nodatapol # Palestine
# Countries without POLITICAL STABILITY data:
countryvariables$country[is.na(countryvariables$polstab)] # Palestine
## POLITICAL POLARIZATION: ideology scores from the Manifesto Project, which
# codes the programs of political parties to place them ideologically ('rile',
# described by the author as running from -100, extreme left, to 100, extreme
# right). For every country/election this section computes three vote-weighted
# sums over the parties in the data:
#   mleftright: rile * pervote / 100 over all parties
#   mleft:      the same, counting only the (sign-flipped) scores left of 0
#   mright:     the same, counting only the scores right of 0
# Note that no dispersion measure (distance of the parties from the election
# mean) is computed here; only these three scores are.
# https://manifestoproject.wzb.eu/

# Read one Manifesto Project file, parse the election date, drop everything
# from before 2005 and keep only the five variables needed for the analysis.
# (Author's note for the main file: from 3924 entries to 734, and from 169
# variables to 5.)
read_manifesto <- function(path) {
  manifesto <- read.csv(path, stringsAsFactors = FALSE)
  manifesto$edate <- as.Date(manifesto$edate, format = "%d/%m/%Y")
  manifesto <- manifesto[manifesto$edate >= "2005-01-01", ]
  manifesto %>% select(countryname, edate, party, pervote, rile)
}

# Key identifying the country/election a row belongs to
election_key <- function(elections) {
  paste(elections$countryname, elections$edate)
}

# The main data set and the South America data set (a separate file) are
# stacked into one table, sorted by country and election
pol <- rbind(
  read_manifesto("MPDataset_MPDS2016a.csv"),
  read_manifesto("MPDataset_MPDSSA2015a.csv")
) %>%
  arrange(countryname, edate)

# Show the rows with missing data, since they could affect the size of the
# final data set used in the analysis
pol[is.na(pol$pervote), ]
pol[is.na(pol$rile), ]
# Author's note: 7 cases lack the percentage of the vote (Brazil and Hungary)
# and 12 lack the party ideology. These entries are deleted.
pol <- pol[complete.cases(pol), ]

# Check how much of the vote each election accounts for. If it is far from
# 100%, the score for that country and election is computed without a large
# part of the votes and might be very inaccurate/biased.
percVotes <- pol %>%
  group_by(countryname, edate) %>%
  dplyr::summarise(totalPerc = sum(pervote))
# Elections where less than 75% of the vote is accounted for
problemElections <- percVotes[percVotes$totalPerc < 75, ]
# Author's notes: three cases are very close to 75% (Croatia 2007, Macedonia
# 2006 and South Korea 2008) and are kept. With a threshold of 85% the number
# of cases would climb to 24, so the threshold deserves some thought. The
# other three cases are Argentina 2013, recorded on three consecutive days
# (10/27, 28 & 29); even counted as one election they only add up to 54% of
# the vote, so every Argentinian election after 2012 is removed.
pol <- pol[
  pol$countryname != "Argentina" |
    as.numeric(format(pol$edate, "%Y")) <= 2012,
]

# 'pol' has one row per party and election. The vote-weighted ideology score
# of each country/election is computed in a summary table and then copied back
# onto every party row of that election, so that further per-party variables
# (left/right) can be derived next to it.
meanlr <- pol %>%
  group_by(countryname, edate) %>%
  dplyr::summarise(meanlr = sum(rile * pervote / 100, na.rm = TRUE))
pol$meanlr <- meanlr$meanlr[match(election_key(pol), election_key(meanlr))]

# The overall mean hides how far apart the two sides of the political system
# are. To keep some of that information, split each party's score into a left
# part (positive magnitude, 0 for parties not left of 0) and a right part (0
# for parties not right of 0).
pol$left <- -ifelse(pol$rile < 0, pol$rile, 0)
pol$right <- ifelse(pol$rile > 0, pol$rile, 0)

### THINK ABOUT: Should I use the percentage of total vote, or should I
# transform this to be the percentage of all votes on the left & right?
mleft <- pol %>%
  group_by(countryname, edate) %>%
  dplyr::summarise(mleft = sum(left * pervote / 100, na.rm = TRUE))
pol$mleft <- mleft$mleft[match(election_key(pol), election_key(mleft))]

mright <- pol %>%
  group_by(countryname, edate) %>%
  dplyr::summarise(mright = sum(right * pervote / 100, na.rm = TRUE))
pol$mright <- mright$mright[match(election_key(pol), election_key(mright))]

# One row per country/election with the three scores (they are constant within
# an election, so max() just picks that value) and the election year
mleftright <- pol %>%
  group_by(countryname, edate) %>%
  dplyr::summarise(
    mleftright = max(meanlr),
    mleft = max(mleft),
    mright = max(mright)
  )
mleftright$year <- as.numeric(format(mleftright$edate, "%Y"))
# Drop specific elections by date. The two Argentinian dates can no longer
# occur at this point (Argentina after 2012 was removed above); the conditions
# are kept unchanged. Presumably the Greek one is dropped so that 2012 is left
# with a single election - TODO confirm.
mleftright <- filter(
  mleftright,
  !(countryname == "Greece" & edate == "2012-05-06"),
  !(countryname == "Argentina" & edate == "2013-10-27"),
  !(countryname == "Argentina" & edate == "2013-10-28")
)
# Add the ideology scores to countryvariables. For every row whose mleftright
# is still NA we take the election of that country closest in time to the
# survey year among those with a non-missing mleftright score. An election in
# the survey year itself is at distance 0 and therefore always wins; if several
# elections are equally close, the first one listed in 'mleftright' is used.
# Countries that do not appear in 'mleftright' at all are collected in
# 'nodataideology' (one entry per row).
nodataideology <- character(0)
for (row in seq_len(nrow(countryvariables))) {
  # as.character() works whether 'country' is stored as factor or character
  country <- as.character(countryvariables$country[row])
  year <- countryvariables$year[row]
  in_country <- mleftright$countryname %in% country
  if (!any(in_country)) {
    nodataideology <- c(nodataideology, country)
    next
  }
  if (!is.na(countryvariables$mleftright[row])) {
    next # never overwrite a value that is already filled in
  }
  usable <- which(in_country & !is.na(mleftright$mleftright))
  if (length(usable) == 0) {
    countryvariables[row, "mleftright"] <- NA
    countryvariables[row, "mleft"] <- NA
    countryvariables[row, "mright"] <- NA
    countryvariables[row, "lryear"] <- NA
    next
  }
  nearest <- usable[which.min(abs(mleftright$year[usable] - year))]
  countryvariables[row, "mleftright"] <- mleftright$mleftright[nearest]
  countryvariables[row, "mleft"] <- mleftright$mleft[nearest]
  countryvariables[row, "mright"] <- mleftright$mright[nearest]
  countryvariables[row, "lryear"] <- mleftright$year[nearest]
  countryvariables[row, "lrsource"] <- "MP"
}
# Finally, we save countryvariables so that it can be loaded in the main
# script and used to add the values to the main data frame with the World
# Values Survey data
save(countryvariables, file = "countryvariables.Rda")