tidymodels
diff --git a/‎R/gss.R
+6-6 b/‎R/gss.R
+6-6
diff --git a/‎data-raw/save_gss.R
+8-11 b/‎data-raw/save_gss.R
+8-11
diff --git a/‎data/gss.rda
-13.9 KB b/‎data/gss.rda
-13.9 KB
diff --git a/‎man/gss.Rd
+6-6 b/‎man/gss.Rd
+6-6
@@ -2,16 +2,16 @@
 #' 
 #' The General Social Survey is a high-quality survey which gathers data on 
 #' American society and opinions, conducted since 1972. This data set is a 
-#' sample of 3,000 entries from the GSS, including demographic markers and some 
+#' sample of 500 entries from the GSS, including demographic markers and some 
 #' economic variables. Note that this data is included for demonstration only, 
-#' and will not provide accurate estimates relating to GSS variables unless 
-#' properly weighted. However, due to the high quality of the GSS, the 
-#' unweighted data will approximate the weighted data in some analyses.
-#' @format A tibble with 3000 rows and 11 variables:
+#' and should not be assumed to provide accurate estimates relating to the GSS. 
+#' However, due to the high quality of the GSS, the unweighted data will 
+#' approximate the weighted data in some analyses.
+#' @format A tibble with 500 rows and 11 variables:
 #' \describe{
 #'   \item{year}{year respondent was surveyed}
 #'   \item{age}{age at time of survey, truncated at 89}
-#'   \item{sex}{respondent's sex}
+#'   \item{sex}{respondent's sex (self-identified)}
 #'   \item{college}{whether or not respondent has a college degree, including 
 #'   junior/community college}
 #'   \item{partyid}{political party affiliation}
 
@@ -4,15 +4,16 @@ library(srvyr)
 library(ggplot2)
 
 # pull gss data
-
 temp <- tempfile()
 download.file("https://gss.norc.org/documents/stata/GSS_stata.zip",temp)
-gss_orig <- haven::read_dta(unz(temp, filename = "GSS7218_R1.DTA")) %>%
+
+# if this next line errors with "No such file or directory", try
+# incrementing the number after "_R"
+gss_orig <- haven::read_dta(unz(temp, filename = "GSS7218_R2.DTA")) %>%
   haven::as_factor()
 unlink(temp)
 
 # select relevant columns
-
 gss_small <- gss_orig %>%
   filter(!stringr::str_detect(sample, "blk oversamp")) %>% # this is for weighting
   select(year, age, sex, college = degree, partyid, hompop, 
@@ -49,14 +50,13 @@ gss_small <- gss_orig %>%
          )
          )
 
-# sample 3k of the full data set
-
-set.seed(20191105)
+# sample 500 rows, first dropping NAs
+set.seed(20200201)
 gss <- gss_small %>%
-  sample_n(3000)
+  drop_na() %>%
+  sample_n(500)
 
 # check that the sample is similar unweighted to weighted
-
 gss_wt <- srvyr::as_survey_design(gss, weights = weight)
 
 unweighted <- gss %>%
@@ -70,10 +70,7 @@ weighted <- gss_wt %>%
   group_by(year, sex, partyid) %>%
   summarize(prop = srvyr::survey_mean())
 
-# ehhhh close enough until you really drill down, we'll put a disclaimer
-
 # save data into package
-
 usethis::use_data(gss, overwrite = TRUE)
 
 devtools::document()