commit DGP1

oddish3 · Aug 15, 2024 · 0a7663e · 0a7663e
1 parent 470cbb9
commit 0a7663e
Show file tree

Hide file tree

Showing 4 changed files with 161 additions and 154 deletions.
diff --git a/.Rhistory b/.Rhistory
@@ -1,149 +1,3 @@
-y2 <- y1 + treat_noise
-# Calculate change in outcomes
-dy <- y2 - y1
-# Calculate true ATT
-pop_att <- dgp_params$E_func
-samp_att <- mean(treatment_effect[treatment == 1])
-# Calculate observed ATE (difference-in-differences)
-diff_treated <- mean(dy[treatment == 1])
-diff_control <- mean(dy[treatment == 0])
-observed_ate <- diff_treated - diff_control
-# Calculate true ACR
-samp_acr <- mean(dgp_params$func_deriv(dose[dose > 0]))
-pop_acr <- dgp_params$E_deriv
-browser()
-x_grid <- seq(0, 1, length.out = 1000)
-func_values <- dgp_params$func(x_grid)
-func_deriv_values <- dgp_params$func_deriv(x_grid)
-df <- data.frame(
-y1 = y1,
-y2 = y2,
-dy = dy,
-treatment = treatment,
-dose = dose
-)
-return(list(
-data = df,
-info = list(
-pop_att = pop_att,
-obs_ate = observed_ate,
-pop_acr = pop_acr,
-func_values = func_values,
-func_deriv_values = func_deriv_values,
-noisy_treatment_effect = noisy_treatment_effect
-)
-))
-}
-# Generate data
-# set.seed(1234567)
-dat <- gdata(n = n, dgp = dgp)
-func_values <- dgp_params$func(x_grid)
-func_deriv_values <- dgp_params$func_deriv(x_grid)
-<-
-x_grid <- if (length(x) == 1) rep(x, 1000) else seq(0, 1, length.out = 1000)
-x_grid <- if (length(x) == 1) rep(x, 1000) else seq(0, 1, length.out = 1000)
-if (length(x) == 1) rep(x, 1000) else seq(0, 1, length.out = 1000)
-func_values <- dgp_params$func(x_grid)
-func_deriv_values <- dgp_params$func_deriv(x_grid)
-func_values <- dgp_params$func(x_grid)
-if (length(x) == 1) {
-x_grid <- c(x_grid, rep(x, length(x_grid)))
-}
-func_deriv_values <- if (is.numeric(dgp_params$func_deriv) && length(dgp_params$func_deriv) == 1) {
-rep(dgp_params$func_deriv, length(x_grid))
-} else if (is.function(dgp_params$func_deriv)) {
-dgp_params$func_deriv(x_grid)
-} else {
-stop("dgp_params$func_deriv must be either a scalar or a function")
-}
-is.numeric(dgp_params$func_deriv)
-length(dgp_params$func_deriv) == 1
-func_deriv_values <- if (length(dgp_params$func_deriv) == 1) {
-rep(dgp_params$func_deriv, length(x_grid))
-} else if (is.function(dgp_params$func_deriv)) {
-dgp_params$func_deriv(x_grid)
-} else {
-stop("dgp_params$func_deriv must be either a scalar or a function")
-}
-# Generate data
-# set.seed(1234567)
-dat <- gdata(n = n, dgp = dgp)
-# browser()
-tryCatch({
-if (any(data$dose < 0 | data$dose > 1, na.rm = TRUE)) {
-knots <- c(0.4, 0.5, 0.6)
-} else {
-quantiles <- quantile(data$dose, probs = c(0.25, 0.5, 0.75), na.rm = TRUE)
-if (length(unique(quantiles)) == 3) {
-margin <- 0.05
-knots <- pmax(margin, pmin(1 - margin, quantiles))
-} else {
-knots <- c(0.4, 0.5, 0.6)
-}
-}
-formula <- bquote(
-dy ~ splines2::bSpline(dose, knots = .(knots), degree = 3, intercept = TRUE) - 1
-)
-model <- suppressWarnings(
-fixest::feols(
-eval(formula),
-data = data,
-# cluster = ~ individual
-)
-)
-att_SR <- predict(model)
-mean_att_SR <- mean(att_SR)
-spline_dosage_SR <- bSpline(data$dose,
-knots = knots,
-degree = 3,
-intercept = TRUE)
-n_treated_SR <- sum(data$dose > 0)
-infl_reg_SR <- model$residuals * spline_dosage_SR %*%
-(MASS::ginv(t(spline_dosage_SR) %*% spline_dosage_SR / n_treated_SR))
-infl_att_SR <- infl_reg_SR %*% t(spline_dosage_SR)
-se_att_SR <- sqrt(mean(colMeans(infl_att_SR^2)) / n_treated_SR)
-derivative_spline_dosage_SR <- dbs(data$dose,
-knots = knots,
-degree = 3,
-intercept = TRUE)
-coefficients_SR <- coef(model)
-acrt_SR <- as.vector(derivative_spline_dosage_SR %*% coefficients_SR)
-acrt_SR_filtered <- acrt_SR[data$dose > 0]
-ACR_hat_0 <- mean(acrt_SR_filtered)
-list(
-att_estimate = mean_att_SR,
-att_se = se_att_SR,
-acr_estimate = ACR_hat_0
-)
-}, error = function(e) {
-return(list(
-att_estimate = NA_real_,
-att_se = NA_real_,
-acr_estimate = NA_real_,
-))
-})
-source("~/Documents/uni/master-dissertation/contdid/simulation/DGP1.R")
-source("~/Documents/uni/master-dissertation/contdid/R/ucb_cvge.R")
-devtools::document()
-# # Run estimators and calculate metrics
-# NPIV
-# debugonce(npiv_regression)
-npiv_result <- tryCatch({
-result <- npiv_regression(
-data = data,
-treatment_col = "dose",
-outcome_col = "dy",
-)
-# write_debug_info("npiv_regression completed successfully")
-result
-}, error = function(e) {
-# write_debug_info(paste("Error in npiv_regression:", e$message))
-return(NULL)
-})
-npiv_result <- tryCatch({
-result <- npiv_regression(
-data = data,
-treatment_col = "dose",
 outcome_col = "dy",
 )
 # write_debug_info("npiv_regression completed successfully")
@@ -510,3 +364,149 @@ stopCluster(cl)
 toc()
 View(results_list)
 devtools::test()
+devtools::load_all()
+devtools::test()
+devtools::load_all()
+devtools::test()
+devtools::load_all()
+devtools::test()
+debugonce(npiv_regression)
+devtools::load_all()
+devtools::test()
+library(testthat)
+library(readr)
+library(dplyr)
+library(purrr)
+library(contdid)
+set.seed(1234567)
+# Helper function to load CSV files
+load_csv_files <- function(directory) {
+tryCatch({
+csv_files <- list.files(directory, pattern = "\\.csv$", full.names = TRUE)
+if (length(csv_files) == 0) {
+stop("No CSV files found in the specified directory")
+}
+message(paste("Found", length(csv_files), "CSV files"))
+data_list <- map(csv_files, ~{
+message(paste("Processing file:", .x))
+var_name <- tools::file_path_sans_ext(basename(.x))
+var_name <- sub("^123_", "", var_name)  # Remove "123_" prefix
+data <- read_csv(.x, col_types = cols(), col_names = FALSE)
+if (ncol(data) == 1) {
+setNames(list(as.numeric(data[[1]])), var_name)
+} else {
+setNames(list(data), var_name)
+}
+}) %>% flatten()
+message("CSV files loaded successfully")
+return(data_list)
+}, error = function(e) {
+message(paste("Error in load_csv_files:", e$message))
+return(NULL)
+})
+}
+# Test function
+test_matlab_equality <- function(r_result, matlab_data, tolerance = 1e-2) {
+tryCatch({
+common_vars <- intersect(names(r_result), names(matlab_data))
+message(paste("Testing", length(common_vars), "common variables"))
+for (var in common_vars) {
+message(paste("Testing variable:", var))
+r_value <- r_result[[var]]
+matlab_value <- matlab_data[[var]]
+# Convert to vector if it's a single-column data frame
+restructure_variable <- function(var) {
+if (is.data.frame(var) && ncol(var) == 1) {
+# If the variable is a data frame with 1 column, extract the vector
+var <- var[[1]]
+}
+if (is.matrix(var) && ncol(var) == 1) {
+# If the variable is a matrix with 1 column, convert to a vector
+var <- as.numeric(var)
+}
+if (is.array(var) && length(dim(var)) == 2 && dim(var)[2] == 1) {
+# If the variable is a 2D array with 1 column, convert to a vector
+var <- as.numeric(var)
+}
+return(var)
+}
+# Restructure r_value and matlab_value
+r_value <- restructure_variable(r_value)
+matlab_value <- restructure_variable(matlab_value)
+# Check if the lengths are the same
+expect_equal(length(r_value), length(matlab_value),
+info = paste("Length mismatch for variable:", var))
+# Check if the values are approximately equal
+expect_equal(r_value, matlab_value, tolerance = tolerance,
+info = paste("Value mismatch for variable:", var))
+}
+message("All variables tested successfully")
+}, error = function(e) {
+message(paste("Error in test_matlab_equality:", e$message))
+})
+}
+tryCatch({
+skip_if_not(dir.exists(test_path("data", "matlab_results")),
+"MATLAB data directory not found")
+skip_if_not(file.exists(test_path("data", "medicare1.csv")),
+"Medicare data file not found")
+message("Loading MATLAB CSV files")
+matlab_data <- load_csv_files(test_path("data", "matlab_results"))
+if (is.null(matlab_data)) {
+skip("Failed to load MATLAB data")
+}
+message("Loading R data")
+data <- read.csv(test_path("data", "medicare1.csv"))
+message(paste("R data dimensions:", nrow(data), "x", ncol(data)))
+message("Running npiv_regression")
+r_result <- npiv_regression(data, "medicare_share_1983", "d_capital_labor_ratio")
+message("Comparing results")
+test_matlab_equality(r_result, matlab_data)
+message("Test completed successfully")
+}, error = function(e) {
+message(paste("Error in main test:", e$message))
+stop(e)
+})
+skip_if_not(dir.exists(test_path("data", "matlab_results")),
+"MATLAB data directory not found")
+skip_if_not(file.exists(test_path("data", "medicare1.csv")),
+"Medicare data file not found")
+message("Loading MATLAB CSV files")
+matlab_data <- load_csv_files(test_path("data", "matlab_results"))
+if (is.null(matlab_data)) {
+skip("Failed to load MATLAB data")
+}
+message("Loading R data")
+data <- read.csv(test_path("data", "medicare1.csv"))
+message(paste("R data dimensions:", nrow(data), "x", ncol(data)))
+message("Running npiv_regression")
+r_result <- npiv_regression(data, "medicare_share_1983", "d_capital_labor_ratio")
+debugonce(npiv_regression)
+r_result <- npiv_regression(data, "medicare_share_1983", "d_capital_labor_ratio")
+devtools::load_all()
+devtools::test()
+results_list <- readRDS("~/Documents/uni/master-dissertation/contdid/simulation/results_list.rds")
+# saveRDS(results_list, file = "results_list.rds")
+# Combine all results into a single data frame
+all_results <- do.call(rbind, results_list)
+# Calculate mean results
+mean_results <- all_results %>%
+group_by(n, dgp) %>%
+summarise(across(where(is.numeric), ~ mean(.x, na.rm = TRUE)), .groups = "drop")
+library(tictoc)
+library(parallel)
+library(doParallel)
+library(foreach)
+library(doRNG)
+library(tidyverse)
+library(fixest)
+library(splines2)
+library(triangle)
+# saveRDS(results_list, file = "results_list.rds")
+# Combine all results into a single data frame
+all_results <- do.call(rbind, results_list)
+# Calculate mean results
+mean_results <- all_results %>%
+group_by(n, dgp) %>%
+summarise(across(where(is.numeric), ~ mean(.x, na.rm = TRUE)), .groups = "drop")
+View(mean_results)
diff --git a/R/npiv_regression.R b/R/npiv_regression.R
@@ -62,13 +62,19 @@ npiv_regression <- function(data,
   CJ <- c(0, cumsum(TJ))
   # log_dimensions("TJ", TJ)
   # log_dimensions("CJ", CJ)
-  # Create grid for x
-  # Xx <- seq(0, 1, length.out = nx)
-  # Xx_sub <- Xx[Xx>0.01 & Xx<=0.99]
+
+   # Create grid for x
+  # x_min <- min(x)
+  # x_max <- max(x)
+  # Xx <- seq(x_min, x_max, length.out = nx + 1)
+  # buffer <- 0.01 * (x_max - x_min)
+  # Xx_sub <- Xx[Xx > x_min + buffer & Xx <= x_max - buffer]
+
   Xx <- seq(0, 1, length.out = nx + 1)  # +0 because MATLAB's 0:1/nx:1 includes both endpoints removed + 1
   Xx_sub <- Xx[Xx > 0.01 & Xx <= 0.99]
   # log_dimensions("Xx", Xx)
   # log_dimensions("Xx_sub", Xx_sub)
+
   # Compute basis functions
   compute_basis_functions <- function(x, nL, r, CJ, derivative = FALSE) {
     result <- matrix(0, nrow = length(x), ncol = CJ[length(CJ)])

diff --git a/simulation/working-sim.R b/simulation/working-sim.R
@@ -36,7 +36,7 @@ clusterExport(cl, c("dgp_function", "gdata", "run_twfe", "run_feols_bspline", "c
 
 results_list <- list()
 tic()
-for (dgp in 2:2) {
+for (dgp in 1:4) {
   # cat("Processing DGP:", dgp, "\n")
   dgp_results <- list()
   for (sample_size in n) {
@@ -57,11 +57,11 @@ stopCluster(cl)
 toc()
 # saveRDS(results_list, file = "results_list.rds")
 # Combine all results into a single data frame
-# all_results <- do.call(rbind, results_list)
+all_results <- do.call(rbind, results_list)
 
 # Calculate mean results
-# mean_results <- all_results %>%
-#   group_by(n, dgp) %>%
-#   summarise(across(where(is.numeric), ~ mean(.x, na.rm = TRUE)), .groups = "drop")
+mean_results <- all_results %>%
+  group_by(n, dgp) %>%
+  summarise(across(where(is.numeric), ~ mean(.x, na.rm = TRUE)), .groups = "drop")
 
 
diff --git a/tests/testthat/test-matlab-equality.R b/tests/testthat/test-matlab-equality.R
@@ -92,6 +92,7 @@ test_that("Short Test: R results match MATLAB results", {
     message(paste("R data dimensions:", nrow(data), "x", ncol(data)))
 
     message("Running npiv_regression")
+    # debugonce(npiv_regression)
     r_result <- npiv_regression(data, "medicare_share_1983", "d_capital_labor_ratio")
 
     message("Comparing results")