Skip to content

Commit 83ab7a2

Browse files
committed
drop NAs, reduce sample size of gss
1 parent 8041866 commit 83ab7a2

File tree

4 files changed

+20
-23
lines changed

4 files changed

+20
-23
lines changed

R/gss.R

+6-6
Original file line numberDiff line numberDiff line change
@@ -2,16 +2,16 @@
22
#'
33
#' The General Social Survey is a high-quality survey which gathers data on
44
#' American society and opinions, conducted since 1972. This data set is a
5-
#' sample of 3,000 entries from the GSS, including demographic markers and some
5+
#' sample of 500 entries from the GSS, including demographic markers and some
66
#' economic variables. Note that this data is included for demonstration only,
7-
#' and will not provide accurate estimates relating to GSS variables unless
8-
#' properly weighted. However, due to the high quality of the GSS, the
9-
#' unweighted data will approximate the weighted data in some analyses.
10-
#' @format A tibble with 3000 rows and 11 variables:
7+
#' and should not be assumed to provide accurate estimates relating to the GSS.
8+
#' However, due to the high quality of the GSS, the unweighted data will
9+
#' approximate the weighted data in some analyses.
10+
#' @format A tibble with 500 rows and 11 variables:
1111
#' \describe{
1212
#' \item{year}{year respondent was surveyed}
1313
#' \item{age}{age at time of survey, truncated at 89}
14-
#' \item{sex}{respondent's sex}
14+
#' \item{sex}{respondent's sex (self-identified)}
1515
#' \item{college}{whether or not respondent has a college degree, including
1616
#' junior/community college}
1717
#' \item{partyid}{political party affiliation}

data-raw/save_gss.R

+8-11
Original file line numberDiff line numberDiff line change
@@ -4,15 +4,16 @@ library(srvyr)
44
library(ggplot2)
55

66
# pull gss data
7-
87
temp <- tempfile()
98
download.file("https://gss.norc.org/documents/stata/GSS_stata.zip",temp)
10-
gss_orig <- haven::read_dta(unz(temp, filename = "GSS7218_R1.DTA")) %>%
9+
10+
# if this next line errors with "No such file or directory", try
11+
# incrementing the number after "_R"
12+
gss_orig <- haven::read_dta(unz(temp, filename = "GSS7218_R2.DTA")) %>%
1113
haven::as_factor()
1214
unlink(temp)
1315

1416
# select relevant columns
15-
1617
gss_small <- gss_orig %>%
1718
filter(!stringr::str_detect(sample, "blk oversamp")) %>% # this is for weighting
1819
select(year, age, sex, college = degree, partyid, hompop,
@@ -49,14 +50,13 @@ gss_small <- gss_orig %>%
4950
)
5051
)
5152

52-
# sample 3k of the full data set
53-
54-
set.seed(20191105)
53+
# sample 500 rows, first dropping NAs
54+
set.seed(20200201)
5555
gss <- gss_small %>%
56-
sample_n(3000)
56+
drop_na() %>%
57+
sample_n(500)
5758

5859
# check that the sample is similar unweighted to weighted
59-
6060
gss_wt <- srvyr::as_survey_design(gss, weights = weight)
6161

6262
unweighted <- gss %>%
@@ -70,10 +70,7 @@ weighted <- gss_wt %>%
7070
group_by(year, sex, partyid) %>%
7171
summarize(prop = srvyr::survey_mean())
7272

73-
# ehhhh close enough until you really drill down, we'll put a disclaimer
74-
7573
# save data into package
76-
7774
usethis::use_data(gss, overwrite = TRUE)
7875

7976
devtools::document()

data/gss.rda

-13.9 KB
Binary file not shown.

man/gss.Rd

+6-6
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)