From 0876fa8261760c8cae7efb3f99605df28295f343 Mon Sep 17 00:00:00 2001 From: Jim Thorson <50178738+James-Thorson-NOAA@users.noreply.github.com> Date: Thu, 23 Jan 2025 11:13:12 -0800 Subject: [PATCH] Test merge and release (#30) * small updates * Update vignette.Rmd * attempt at residuals for family="fixed" * fix docs * update to allow DHARMa using predictive samples * update citation * fix bug in loo_residuals * fix CI * add track_progress argument to loo_residuals * Update vignette.Rmd * add upper and lower bounds for optimizer * fix residuals demo * Update vignette.Rmd * update plots * fix last commit * Update dsem.R * fit error in DHARMa vignette example * add `stepwise_selection(.)` * fix notes and warnings * add `equations_to_text` * update vignette to show `convert_equations` * update DFA analysis using lag-and-equations syntax * fix bug in docs * update make_dsem_ram to use additive-kronecker * try to fix CI error * fix attempt-2 * attempt-3 * change DFA vignette back * next attempt * another fix attempt * update citation and pkgdown yaml * adding cAIC ... ... which only works when using models with measurement errors * merging add_priors to dev branch * fix merge * add test-priors CI check * test fix for R-CMD-check error * add dsemRTMB * add reduced-rank GMRF to dsemRTMB * adding log_prior argument to dsemRTMB * Update test_dsemRTMB.R * finish initial dsemRTMB * get dsemRTMB working as package * allow vector-valued priors * more simulator scratch-scripts * adding obj$simulator(.) slot * fix dsemRTMB to work without loading locally * add run_time slot (to compare TMB and RTMB) * fix error in sparseMatrix stuff * updates for merge * try making CI happy * another attempt at R_CMD_check * another try * another try to R_CMD_check * cautious update * make check_win_devel(.) happy * add CI for fixed and mapped parameters and fix issue in dsemRTMB with these * make check_win_devel(.) 
happy * Update make_matrices.R --------- Co-authored-by: Jim Thorson --- .github/workflows/pkgdown.yaml | 4 +- .github/workflows/test-coverage.yaml | 4 +- DESCRIPTION | 14 +- NAMESPACE | 38 ++++ NEWS.md | 13 ++ R/cAIC.R | 122 +++++++++++ R/compute_nll.R | 255 +++++++++++++++++++++++ R/convert_equations.R | 188 +++++++++++++++++ R/data.R | 2 +- R/dsem.R | 139 +++++++++++-- R/dsemRTMB.R | 285 ++++++++++++++++++++++++++ R/make_dsem_ram.R | 56 +++-- R/make_matrices.R | 65 ++++++ R/read_model.R | 137 +++++++++++++ R/rgmrf.R | 15 ++ R/stepwise_selection.R | 68 ++++++ R/utility.R | 128 ++++++++++++ inst/CITATION | 5 +- man/cAIC.Rd | 55 +++++ man/convert_equations.Rd | 44 ++++ man/dsem.Rd | 16 +- man/dsemRTMB.Rd | 91 ++++++++ man/dsem_control.Rd | 12 +- man/isle_royale.Rd | 2 +- man/logLik.dsem.Rd | 2 +- man/loo_residuals.Rd | 46 +++++ man/plot.dsem.Rd | 15 +- man/read_model.Rd | 27 +++ man/stepwise_selection.Rd | 37 ++++ scratch/README.R | 5 + scratch/debug_simulator.R | 44 ++++ scratch/example.R | 24 +++ scratch/example.RDS | Bin 0 -> 1118 bytes scratch/example2.R | 81 ++++++++ scratch/example2_tmp.R | 102 +++++++++ scratch/show_simulate_errors.R | 53 +++++ scratch/test_cAIC.R | 132 ++++++++++++ scratch/test_dsemRTMB.R | 225 ++++++++++++++++++++ src/dsem.cpp | 2 +- tests/testthat/test-platform.R | 130 +++++++----- tests/testthat/test-priors.R | 65 ++++++ vignettes/dynamic_factor_analysis.Rmd | 31 ++- vignettes/vignette.Rmd | 102 ++++++++- 43 files changed, 2765 insertions(+), 116 deletions(-) create mode 100644 R/cAIC.R create mode 100644 R/compute_nll.R create mode 100644 R/convert_equations.R create mode 100644 R/dsemRTMB.R create mode 100644 R/make_matrices.R create mode 100644 R/read_model.R create mode 100644 R/rgmrf.R create mode 100644 R/stepwise_selection.R create mode 100644 R/utility.R create mode 100644 man/cAIC.Rd create mode 100644 man/convert_equations.Rd create mode 100644 man/dsemRTMB.Rd create mode 100644 man/loo_residuals.Rd create mode 100644 
man/read_model.Rd create mode 100644 man/stepwise_selection.Rd create mode 100644 scratch/README.R create mode 100644 scratch/debug_simulator.R create mode 100644 scratch/example.R create mode 100644 scratch/example.RDS create mode 100644 scratch/example2.R create mode 100644 scratch/example2_tmp.R create mode 100644 scratch/show_simulate_errors.R create mode 100644 scratch/test_cAIC.R create mode 100644 scratch/test_dsemRTMB.R create mode 100644 tests/testthat/test-priors.R diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml index a578894..6543e39 100644 --- a/.github/workflows/pkgdown.yaml +++ b/.github/workflows/pkgdown.yaml @@ -2,9 +2,9 @@ # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help on: push: - branches: [main, dev] + branches: [main] pull_request: - branches: [main, dev] + branches: [main] release: types: [published] workflow_dispatch: diff --git a/.github/workflows/test-coverage.yaml b/.github/workflows/test-coverage.yaml index 35a8cf7..01ee697 100644 --- a/.github/workflows/test-coverage.yaml +++ b/.github/workflows/test-coverage.yaml @@ -15,7 +15,7 @@ jobs: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: r-lib/actions/setup-r@v2 with: @@ -56,7 +56,7 @@ jobs: - name: Upload test results if: failure() - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: coverage-test-failures path: ${{ runner.temp }}/package diff --git a/DESCRIPTION b/DESCRIPTION index 4035e31..40ad697 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: dsem Type: Package Title: Fit Dynamic Structural Equation Models -Version: 1.3.0 -Date: 2024-07-18 +Version: 1.4.0 +Date: 2025-01-10 Authors@R: c(person(given = "James", family = "Thorson", @@ -14,6 +14,10 @@ Imports: Matrix, sem, igraph, + RTMB (>= 1.7.0), + ggraph, + ggplot2, + grid, methods Depends: R (>= 4.0.0), @@ -26,12 +30,10 @@ Suggests: 
gridExtra, dynlm, MARSS, - ggplot2, ggpubr, - ggraph, - grid, vars, - testthat + testthat, + DHARMa Enhances: rstan, tmbstan diff --git a/NAMESPACE b/NAMESPACE index aedd77b..be10967 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -11,32 +11,70 @@ S3method(vcov,dsem) export(TMBAIC) export(as_fitted_DAG) export(as_sem) +export(cAIC) export(classify_variables) +export(convert_equations) export(dsem) +export(dsemRTMB) export(dsem_control) export(list_parameters) +export(loo_residuals) export(make_dfa) export(make_dsem_ram) export(parse_path) +export(stepwise_selection) importFrom(Matrix,Cholesky) +importFrom(Matrix,Diagonal) +importFrom(Matrix,crossprod) +importFrom(Matrix,diag) +importFrom(Matrix,drop0) +importFrom(Matrix,kronecker) +importFrom(Matrix,mat2triplet) importFrom(Matrix,solve) +importFrom(Matrix,sparseMatrix) +importFrom(Matrix,t) +importFrom(Matrix,tcrossprod) +importFrom(RTMB,AD) +importFrom(RTMB,ADREPORT) +importFrom(RTMB,ADoverload) +importFrom(RTMB,REPORT) +importFrom(RTMB,dbinom) +importFrom(RTMB,dgamma) +importFrom(RTMB,dgmrf) +importFrom(RTMB,dnorm) +importFrom(RTMB,dpois) importFrom(TMB,MakeADFun) importFrom(TMB,compile) importFrom(TMB,dynlib) importFrom(TMB,sdreport) importFrom(TMB,summary.sdreport) +importFrom(ggplot2,aes) +importFrom(ggraph,create_layout) +importFrom(ggraph,geom_edge_arc) +importFrom(ggraph,geom_node_text) +importFrom(ggraph,ggraph) +importFrom(ggraph,rectangle) +importFrom(ggraph,theme_graph) +importFrom(grid,arrow) importFrom(igraph,graph_from_data_frame) +importFrom(igraph,layout_) importFrom(igraph,plot.igraph) +importFrom(igraph,with_sugiyama) importFrom(methods,is) importFrom(sem,sem) importFrom(stats,"tsp<-") importFrom(stats,.preformat.ts) +importFrom(stats,AIC) importFrom(stats,logLik) importFrom(stats,na.omit) importFrom(stats,nlminb) importFrom(stats,optimHess) +importFrom(stats,plogis) importFrom(stats,pnorm) +importFrom(stats,rbinom) +importFrom(stats,rgamma) importFrom(stats,rnorm) +importFrom(stats,rpois) 
importFrom(stats,sd) importFrom(stats,simulate) importFrom(stats,time) diff --git a/NEWS.md b/NEWS.md index 681fc75..77a6820 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,16 @@ +# dsem 1.4.0 + +* Adding option for lower and upper bounds +* Adding `stepwise_selection` for automated stepwise model selection +* Adding `plot` option to use `ggraph` as alternative to previous `igraph` + option +* Adding `convert_equations` to extend `sem::specifyEquations` and simplify + specification for large models +* Adding argument `prior_negloglike` as interface to specify Bayesian priors + and/or likelihood penalties in `dsem` +* Adding `dsemRTMB` using RTMB as interchangeable alternative to `dsem` using + TMB + # dsem 1.3.0 * Adding option to specify constant marginal variance, as alternative to existing diff --git a/R/cAIC.R b/R/cAIC.R new file mode 100644 index 0000000..d594e3f --- /dev/null +++ b/R/cAIC.R @@ -0,0 +1,122 @@ + +#' @title Calculate conditional AIC +#' +#' @description +#' Calculates the conditional Akaike Information criterion (cAIC). +#' +#' @param object Output from \code{\link{dsem}} +#' @param what Whether to return the cAIC or the effective degrees of freedom +#' (EDF) for each group of random effects. +#' +#' @details +#' cAIC is designed to optimize the expected out-of-sample predictive +#' performance for new data that share the same random effects as the +#' in-sample (fitted) data, e.g., spatial interpolation. In this sense, +#' it should be a fast approximation to optimizing the model structure +#' based on k-fold crossvalidation. +#' By contrast, \code{AIC} calculates the +#' marginal Akaike Information Criterion, which is designed to optimize +#' expected predictive performance for new data that have new random effects, +#' e.g., extrapolation, or inference about generative parameters. 
+#' +#' cAIC also calculates as a byproduct the effective degrees of freedom, +#' i.e., the number of fixed effects that would have an equivalent impact on +#' model flexibility as a given random effect. +#' +#' Both cAIC and EDF are calculated using Eq. 6 of Zheng Cadigan Thorson 2024. +#' +#' Note that, for models that include profiled fixed effects, these profiles +#' are turned off. +#' +#' @return +#' Either the cAIC, or the effective degrees of freedom (EDF) by group +#' of random effects +#' +#' @references +#' +#' **Deriving the general approximation to cAIC used here** +#' +#' Zheng, N., Cadigan, N., & Thorson, J. T. (2024). +#' A note on numerical evaluation of conditional Akaike information for +#' nonlinear mixed-effects models (arXiv:2411.14185). arXiv. +#' \doi{10.48550/arXiv.2411.14185} +#' +#' **The utility of EDF to diagnose hierarchical model behavior** +#' +#' Thorson, J. T. (2024). Measuring complexity for hierarchical +#' models using effective degrees of freedom. Ecology, +#' 105(7), e4327 \doi{10.1002/ecy.4327} +#' +#' @export +cAIC <- +function( object, + what = c("cAIC","EDF") ){ + + what = match.arg(what) + data = object$tmb_inputs$data + + # Error checks + if(any(is.na(object$tmb_inputs$map$x_tj))){ + stop("cAIC is not implemented when fixing states at data using family=`fixed`") + } + + # Turn on all GMRF parameters + map = object$tmb_inputs$map + map$x_tj = factor(seq_len(prod(dim(data$y_tj)))) + + # Make sure profile = NULL + #if( is.null(object$internal$control$profile) ){ + obj = object$obj + #}else{ + obj = TMB::MakeADFun( data = data, + parameters = object$internal$parhat, + random = object$tmb_inputs$random, + map = map, + profile = NULL, + DLL="dsem", + silent = TRUE ) + #} + + # Weights = 0 is equivalent to data = NA + data$y_tj[] = NA + # Make obj_new + obj_new = TMB::MakeADFun( data = data, + parameters = object$internal$parhat, + map = map, + random = object$tmb_inputs$random, + DLL = "dsem", + profile = NULL ) + + # + par = 
obj$env$parList() + parDataMode <- obj$env$last.par + indx = obj$env$lrandom() + q = sum(indx) + p = length(object$opt$par) + + ## use - for Hess because model returns negative loglikelihood; + Hess_new = -Matrix::Matrix(obj_new$env$f(parDataMode,order=1,type="ADGrad"),sparse = TRUE) + #cov_Psi_inv = -Hess_new[indx,indx]; ## this is the marginal prec mat of REs; + Hess_new = Hess_new[indx,indx] + + ## Joint hessian etc + Hess = -Matrix::Matrix(obj$env$f(parDataMode,order=1,type="ADGrad"),sparse = TRUE) + Hess = Hess[indx,indx] + #negEDF = diag(as.matrix(solve(ddlj.r)) %*% ddlr.r) + negEDF = Matrix::diag(Matrix::solve(Hess, Hess_new)) + # + + if(what=="cAIC"){ + jnll = obj$env$f(parDataMode) + cnll = jnll - obj_new$env$f(parDataMode) + cAIC = 2*cnll + 2*(p+q) - 2*sum(negEDF) + return(cAIC) + } + if(what=="EDF"){ + #Sdims = object$tmb_inputs$tmb_data$Sdims + #group = rep.int( seq_along(Sdims), times=Sdims ) + #names(negEDF) = names(obj$env$last.par)[indx] + EDF = length(negEDF) - sum(negEDF) + return(EDF) + } +} diff --git a/R/compute_nll.R b/R/compute_nll.R new file mode 100644 index 0000000..ddd83e9 --- /dev/null +++ b/R/compute_nll.R @@ -0,0 +1,255 @@ +# model function +compute_nll <- +function( parlist, + model, + y_tj, + family, + options, + log_prior, + simulate_data = FALSE, + simulate_gmrf = FALSE ) { + # options[1] -> 0: full rank; 1: rank-reduced GMRF + # options[2] -> 0: constant conditional variance; 1: constant marginal variance + + "c" <- ADoverload("c") + "[<-" <- ADoverload("[<-") + + # Temporary fix for solve(adsparse) returning dense-matrix + #sparse_solve = function(x){ + # invx = solve(x) + # if( RTMB:::ad_context() ){ + # out = sparseMatrix( + # i = row(invx), + # j = col(invx), + # x = 1, + # ) + # out = AD(out) + # out@x = invx + # #out = drop0(out) # drop0 doesn't work + # return(out) + # }else{ + # return(invx) + # } + #} + sparse_solve = solve + + # Unpack parameters explicitly + beta_z = parlist$beta_z + delta0_j = parlist$delta0_j + mu_j = 
parlist$mu_j + sigma_j = exp( parlist$lnsigma_j ) + x_tj = parlist$x_tj + + # + n_z = length(unique(model$parameter)) + #n_p2 = length(unique(subset(model,direction==2)$parameter)) + #n_p1 = length(unique(subset(model,direction==1)$parameter)) + n_t = nrow(y_tj) + n_j = ncol(y_tj) + n_k = prod(dim(y_tj)) + + # Unpack + #model_unique = model[match(unique(model$parameter),model$parameter),] + #beta_p[which(model_unique$direction==1)] = beta_p1 + #beta_p[which(model_unique$direction==2)] = beta_p2 + + # Build matrices + ram = make_matrices( + beta_p = beta_z, + model = model, + times = as.numeric(time(y_tj)), + variables = colnames(y_tj) ) + + # Assemble + Rho_kk = AD(ram$P_kk) + IminusRho_kk = Diagonal(n_k) - Rho_kk + # Assemble variance + Gamma_kk = AD(ram$G_kk) + V_kk = t(Gamma_kk) %*% Gamma_kk + #V_kk = crossprod( Gamma_kk, Gamma_kk ) # crossprod(A,B) = t(A) %*% B + + # Rescale I-Rho and Gamma if using constant marginal variance options + if( (options[2]==1) || (options[2]==2) ){ + invIminusRho_kk = sparse_solve(IminusRho_kk) # solve(adsparse) returns dense-matrix + + # Hadamard squared LU-decomposition + # See: https://eigen.tuxfamily.org/dox/group__QuickRefPage.html + squared_invIminusRho_kk = invIminusRho_kk + squared_invIminusRho_kk@x = squared_invIminusRho_kk@x^2 + + if( options[2] == 1 ){ + # 1-matrix + ones_k1 = matrix(1, nrow=n_k, ncol=1) + + # Calculate diag( t(Gamma) * Gamma ) + squared_Gamma_kk = Gamma_kk + squared_Gamma_kk@x = squared_Gamma_kk@x^2 + sigma2_k1 = t(squared_Gamma_kk) %*% ones_k1; + + # Rowsums + margvar_k1 = solve(squared_invIminusRho_kk, sigma2_k1) + + # Rescale IminusRho_kk and Gamma + invmargsd_kk = invsigma_kk = AD(Diagonal(n_k)) + invmargsd_kk@x = 1 / sqrt(margvar_k1[,1]) + invsigma_kk@x = 1 / sqrt(sigma2_k1[,1]) + IminusRho_kk = invmargsd_kk %*% IminusRho_kk; + Gamma_kk = invsigma_kk %*% Gamma_kk; + }else{ + # calculate diag(Gamma)^2 + targetvar_k = diag(Gamma_kk)^2 + + # Rescale Gamma + margvar_k = solve(squared_invIminusRho_kk, 
targetvar_k) + diag(Gamma_kk) = sqrt(margvar_k) + } + } + + # Calculate effect of initial condition -- SPARSE version + delta_tj = matrix( 0, nrow=n_t, ncol=n_j ) + if( length(delta0_j)>0 ){ + delta_tj[1,] = delta0_j + delta_k = solve( IminusRho_kk, as.vector(delta_tj) ) + delta_tj[] = delta_k + } + + # + xhat_tj = outer( rep(1,n_t), mu_j ) + + # Probability of GMRF + if( options[1] == 0 ){ + # Full rank GMRF + z_tj = x_tj + + # Works for diagonal + #invV_kk = AD(ram$G_kk) + #invV_kk@x = 1 / Gamma_kk@x^2 + #Q_kk = t(IminusRho_kk) %*% invV_kk %*% IminusRho_kk + + # Works in general + invV_kk = sparse_solve( V_kk ) + Q_kk = t(IminusRho_kk) %*% invV_kk %*% IminusRho_kk + + # Fine from here + jnll_gmrf = -1 * dgmrf( as.vector(z_tj), mu=as.vector(xhat_tj + delta_tj), Q=Q_kk, log=TRUE ) + REPORT( Q_kk ) + + # Only simulate GMRF if also simulating new data + if( isTRUE(simulate_data) & isTRUE(simulate_gmrf) ){ + x_tj[] = z_tj[] = rgmrf( mu=as.vector(xhat_tj + delta_tj), Q=Q_kk ) + } + }else{ + # Reduced rank projection .. 
dgmrf is lower precision than GMRF in CPP + jnll_gmrf = -1 * sum( dnorm(x_tj, mean=0, sd=1, log=TRUE) ) + + # Only simulate GMRF if also simulating new data + if( isTRUE(simulate_data) & isTRUE(simulate_gmrf) ){ + x_tj[] = rnorm(n=prod(dim(x_tj)), mean=0, sd=1) + } + + # + z_k1 = as.vector(x_tj) + z_k2 = Gamma_kk %*% z_k1 + z_k3 = solve(IminusRho_kk, z_k2) + z_tj = matrix(as.vector(z_k3), nrow=n_t, ncol=n_j) + xhat_tj + delta_tj + REPORT( z_k1 ) + REPORT( z_k2 ) + REPORT( z_k3 ) + } + + # Likelihood of data | random effects + loglik_tj = mu_tj = devresid_tj = matrix( 0, nrow=n_t, ncol=n_j ) + pow = function(a,b) a^b + for( t in 1:n_t ){ + for( j in 1:n_j ){ + # familycode = 0 : don't include likelihood + if( family[j]=="fixed" ){ + mu_tj[t,j] = z_tj[t,j]; + if(!is.na(y_tj[t,j])){ + if( isTRUE(simulate_data) ){ + y_tj[t,j] = mu_tj[t,j]; + } + } + devresid_tj[t,j] = 0; + } + # familycode = 1 : normal + if( family[j]=="normal" ){ + mu_tj[t,j] = z_tj[t,j]; + if(!is.na(y_tj[t,j])){ + loglik_tj[t,j] = dnorm( y_tj[t,j], mu_tj[t,j], sigma_j[j], TRUE ); + if( isTRUE(simulate_data) ){ + y_tj[t,j] = rnorm( n=1, mean=mu_tj[t,j], sd=sigma_j[j] ) + } + } + devresid_tj[t,j] = y_tj[t,j] - mu_tj[t,j]; + } + # familycode = 2 : binomial + if( family[j]=="binomial" ){ + mu_tj[t,j] = plogis(z_tj[t,j]); + if(!is.na(y_tj[t,j])){ + loglik_tj[t,j] = dbinom( y_tj[t,j], 1.0, mu_tj[t,j], TRUE ); + if( isTRUE(simulate_data) ){ + y_tj[t,j] = rbinom( n=1, size=1, prob=mu_tj[t,j] ); + } + } + devresid_tj[t,j] = sign(y_tj[t,j] - mu_tj[t,j]) * sqrt( -2*((1-y_tj[t,j])*log(1-mu_tj[t,j]) + y_tj[t,j]*log(mu_tj[t,j])) ); + } + # familycode = 3 : Poisson + if( family[j]=="poisson" ){ + mu_tj[t,j] = exp(z_tj[t,j]); + if(!is.na(y_tj[t,j])){ + loglik_tj[t,j] = dpois( y_tj[t,j], mu_tj[t,j], TRUE ); + if( isTRUE(simulate_data) ){ + y_tj[t,j] = rpois( n=1, lambda=mu_tj[t,j] ); + } + } + devresid_tj[t,j] = sign(y_tj[t,j] - mu_tj[t,j]) * sqrt( 2*(y_tj[t,j]*log((1e-10 + y_tj[t,j])/mu_tj[t,j]) - 
(y_tj[t,j]-mu_tj[t,j])) ); + } + # familycode = 4 : Gamma: shape = 1/CV^2; scale = mean*CV^2 + if( family[j]=="gamma" ){ + mu_tj[t,j] = exp(z_tj[t,j]); + if(!is.na(y_tj[t,j])){ + loglik_tj[t,j] = dgamma( y_tj[t,j], pow(sigma_j[j],-2), mu_tj[t,j]*pow(sigma_j[j],2), TRUE ); + if( isTRUE(simulate_data) ){ + y_tj[t,j] = rgamma( n=1, shape=pow(sigma_j[j],-2), scale=mu_tj[t,j]*pow(sigma_j[j],2) ); + } + } + devresid_tj[t,j] = sign(y_tj[t,j] - mu_tj[t,j]) * sqrt( 2*((y_tj[t,j]-mu_tj[t,j])/mu_tj[t,j] - log(y_tj[t,j]/mu_tj[t,j])) ); + } + }} + + # Calculate priors + log_prior_value = log_prior( parlist ) + + jnll = -1 * sum(loglik_tj) + jnll = jnll + jnll_gmrf - sum(log_prior_value) + + # + REPORT( loglik_tj ) + REPORT( jnll_gmrf ) + REPORT( xhat_tj ) # needed to simulate new GMRF in R + #REPORT( delta_k ) # FIXME> Eliminate in simulate.dsem + REPORT( delta_tj ) # needed to simulate new GMRF in R + REPORT( Rho_kk ) + REPORT( Gamma_kk ) + REPORT( mu_tj ) + REPORT( devresid_tj ) + REPORT( IminusRho_kk ) + REPORT( V_kk ) + REPORT( jnll ) + REPORT( loglik_tj ) + REPORT( jnll_gmrf ) + REPORT( log_prior_value ) + #SIMULATE{ + # REPORT( y_tj ) + #} + REPORT( z_tj ) + ADREPORT( z_tj ) + + if( isTRUE(simulate_data) ){ + list( y_tj=y_tj, z_tj=z_tj) + }else{ + jnll + } +} + diff --git a/R/convert_equations.R b/R/convert_equations.R new file mode 100644 index 0000000..ad16e7b --- /dev/null +++ b/R/convert_equations.R @@ -0,0 +1,188 @@ +#' @title Convert equations notation +#' +#' @description Converts equations to arrow-and-lag notation expected by dsem +#' +#' @param equations Specification for time-series structural equation model structure +#' including lagged or simultaneous effects. See Details section in +#' \code{\link[dsem]{convert_equations}} for more description +#' +#' @details +#' The function modifies code copied from package +#' `sem` under licence GPL (>= 2) with permission from John Fox. 
+#' +#' For specifyEquations, each input line is either a regression equation or the +#' specification of a variance or covariance. Regression equations are of the form +#' y = par1*x1 + par2*x2 + ... + park*xk +#' where y and the xs are variables in the model (either observed or latent), +#' and the pars are parameters. If a parameter is given as a numeric value +#' (e.g., 1) then it is treated as fixed. Note that no error variable is +#' included in the equation; error variances are specified via either +#' the covs argument, via V(y) = par (see immediately below), or are +#' added automatically to the model when, as by default, endog.variances=TRUE. +#' A regression equation may be split over more than one input by breaking at a +, +#' so that + is either the last non-blank character on a line or the +#' first non-blank character on the subsequent line. +#' +#' Variances are specified in the form V(var) = par and +#' covariances in the form C(var1, var2) = par, where the vars are +#' variables (observed or unobserved) in the model. The symbols V and C +#' may be in either lower- or upper-case. If par is a numeric value (e.g., 1) +#' then it is treated as fixed. In conformity with the RAM model, +#' a variance or covariance for an endogenous variable in the +#' model is an error variance or covariance. +#' +#' To set a start value for a free parameter, enclose the numeric +#' start value in parentheses after the parameter name, as parameter(value). 
+#' +#' @export +convert_equations <- +function(equations){ + + # Local functions + not.number <- function (constant){ + save <- options(warn = -1) + on.exit(save) + is.na(as.numeric(constant)) + } + par.start <- function(coef, eq) { + if (length(grep("\\(", coef)) == 0) { + return(c(coef, "0.01")) + } + par.start <- strsplit(coef, "\\(")[[1]] + if (length(par.start) != 2) + stop("Parse error in equation: ", eq, "\n Start values must be given in the form \"parameter(value)\".") + par <- par.start[[1]] + start <- par.start[[2]] + if (length(grep("\\)$", start)) == 0) + stop("Parse error in equation: ", eq, "\n Unbalanced parentheses.") + start <- sub("\\)", "", start) + return(c(par, start)) + } + parseEquation <- function(eqn) { + eq <- eqn + eqn <- gsub("\\s*", "", eqn) + eqn <- strsplit(eqn, "=")[[1]] + if (length(eqn) != 2) + stop("Parse error in equation: ", eq, "\n An equation must have a left- and right-hand side separated by =.") + lhs <- eqn[1] + rhs <- eqn[2] + if (length(grep("^[cC]\\(", lhs)) > 0) { + if (length(grep("\\)$", lhs)) == 0) + stop("Parse error in equation: ", eq, "\n Unbalanced parentheses.") + lhs <- sub("[cC]\\(", "", lhs) + lhs <- sub("\\)", "", lhs) + variables <- strsplit(lhs, ",")[[1]] + if (length(variables) != 2) + stop("Parse error in equation: ", eq, "\n A covariance must be in the form C(var1, var2) = cov12") + if (not.number(rhs)) { + par.start <- par.start(rhs, eq) + if (not.number(par.start[2]) && (par.start[2] != + "NA")) + stop("Parse error in equation: ", eq, "\n Start values must be numeric constants.") + ram <- paste(variables[1], " <-> ", variables[2], + ", ", par.start[1], ", ", par.start[2], sep = "") + } + else { + ram <- paste(variables[1], " <-> ", variables[2], + ", NA, ", rhs, sep = "") + } + } + else if (length(grep("^[vV]\\(", lhs)) > 0) { + lhs <- sub("[vV]\\(", "", lhs) + if (length(grep("\\)$", lhs)) == 0) + stop("Parse error in equation: ", eq, "\n Unbalanced parentheses.") + lhs <- sub("\\)", "", lhs) + if 
(not.number(rhs)) { + par.start <- par.start(rhs, eq) + if (not.number(par.start[2]) && (par.start[2] != + "NA")) + stop("Parse error in equation: ", eq, "\n Start values must be numeric constants.") + ram <- paste(lhs, " <-> ", lhs, ", ", par.start[1], + ", ", par.start[2], sep = "") + } + else { + ram <- paste(lhs, " <-> ", lhs, ", NA, ", rhs, + sep = "") + } + } + else { + terms <- strsplit(rhs, "\\+")[[1]] + terms <- strsplit(terms, "\\*") + ram <- character(length(terms)) + for (term in 1:length(terms)) { + trm <- terms[[term]] + if (length(trm) != 2) + stop("Parse error in equation: ", eq, "\n The term \"", + trm, "\" is malformed.", "\n Each term on the right-hand side of a structural equation must be of the form \"parameter*variable\".") + coef <- trm[1] + if (not.number(coef)) { + par.start <- par.start(coef, eq) + if (not.number(par.start[2]) && (par.start[2] != + "NA")) + stop("Parse error in equation: ", eq, "\n Start values must be numeric constants.") + ram[term] <- paste(trm[2], " -> ", lhs, ", ", + par.start[1], ", ", par.start[2], sep = "") + } + else { + ram[term] <- paste(trm[2], " -> ", lhs, ", NA, ", + coef, sep = "") + } + } + } + ram + } + parse_lag <- function(term){ + term_split = strsplit( term, " -> ", fixed=TRUE )[[1]] + var = term_split[1] + if (length(grep("\\[", var)) == 0) { + return(c(term, "0")) + } + var_split <- strsplit(var, "\\[")[[1]] + if (length(var_split) != 2){ + stop("Parse error in equation: ", term, "\n Lags must be given in the form \"lag[var,integer]\".") + } + par_split = strsplit(var_split[[2]], ",", fixed=TRUE)[[1]] + par = par_split[1] + lag <- par_split[[2]] + if (length(grep("\\]$", lag)) == 0){ + stop("Parse error in equation: ", term, "\n Unbalanced parentheses.") + } + lag <- sub("\\]", "", lag) + term_out = paste0( par, " -> ", term_split[2] ) + return(c(term_out, lag)) + } + add_lags <- function(term){ + term_split = strsplit(term, ", ", fixed=TRUE)[[1]] + lag = parse_lag(term_split[1]) + term_ram = paste0( 
lag[1], ", ", lag[2], ", ", term_split[2], ", ", term_split[3] ) + return(term_ram) + } + + # Read text + equations <- scan(text = equations, what = "", sep = ";", strip.white = TRUE, comment.char = "#") + equations2 <- character(0) + eqn <- 0 + skip <- FALSE + for (equation in equations) { + eqn <- eqn + 1 + if (skip) { + skip <- FALSE + next + } + if (substring(equation, 1, 1) == "+") { + equations2[length(equations2)] <- paste(equations2[length(equations2)], + equation) + } + else if (substring(equation, nchar(equation)) == "+") { + equations2 <- c(equations2, paste(equation, equations[eqn + + 1])) + skip <- TRUE + } + else equations2 <- c(equations2, equation) + } + ram <- unlist(lapply(equations2, parseEquation)) + + # + ram <- unlist(lapply(ram, add_lags)) + return(ram) +} diff --git a/R/data.R b/R/data.R index 612c097..02354e0 100644 --- a/R/data.R +++ b/R/data.R @@ -26,7 +26,7 @@ NULL #' #' @details #' Data extracted from file "Data_wolves_moose_Isle_Royale_June2019.csv" available at -#' \url{https://isleroyalewolf.org/data/data/home.html} and obtained 2023-06-23. +#' \url{https://www.isleroyalewolf.org} and obtained 2023-06-23. #' Reproduced with permission from John Vucetich, and generated by the #' Wolves and Moose of Isle Royale project. #' diff --git a/R/dsem.R b/R/dsem.R index f3a197d..929e419 100644 --- a/R/dsem.R +++ b/R/dsem.R @@ -20,14 +20,27 @@ #' while covs=c("x1", "x2") specifies the variance of x1 and the variance of x2 but not their covariance. #' These same covariances can be added manually via argument `sem`, but using argument `covs` might #' save time for models with many variables. +#' @param prior_negloglike A user-provided function that takes as input the vector of fixed effects out$obj$par +#' returns the negative log-prior probability. 
For example +#' \code{prior_negloglike = function(obj) -1 * dnorm( obj$par[1], mean=0, sd=0.1, log=TRUE)} +#' specifies a normal prior probability +#' for the for the first fixed effect with mean of zero and logsd of 0.1. +#' NOTE: this implementation does not work well with \code{tmbstan} and +#' is highly experimental. If using priors, considering using \code{\link{dsemRTMB}} +#' instead. The option in \code{dsem} is mainly intended to validate its +#' use in \code{dsemRTMB}. Note that the user must load RTMB using +#' \code{library(RTMB)} prior to running the model. #' @param control Output from \code{\link{dsem_control}}, used to define user #' settings, and see documentation for that function for details. #' #' @importFrom TMB compile dynlib MakeADFun sdreport summary.sdreport -#' @importFrom stats sd .preformat.ts na.omit nlminb optimHess pnorm rnorm simulate time tsp<- -#' @importFrom Matrix solve Cholesky +#' @importFrom stats AIC sd .preformat.ts na.omit nlminb optimHess pnorm rbinom rgamma rpois rnorm simulate time tsp<- plogis +#' @importFrom Matrix solve Cholesky sparseMatrix mat2triplet drop0 #' @importFrom sem sem -#' @importFrom igraph plot.igraph graph_from_data_frame +#' @importFrom igraph plot.igraph graph_from_data_frame with_sugiyama layout_ +#' @importFrom ggraph ggraph geom_edge_arc create_layout rectangle geom_node_text theme_graph +#' @importFrom ggplot2 aes +#' @importFrom grid arrow #' @importFrom methods is #' #' @details @@ -71,6 +84,7 @@ #' \item{sdrep}{The output from \code{\link[TMB]{sdreport}}} #' \item{interal}{Objects useful for package function, i.e., all arguments #' passed during the call} +#' \item{run_time}{Total time to run model} #' } #' #' @references @@ -78,9 +92,10 @@ #' **Introducing the package, its features, and comparison with other software #' (to cite when using dsem):** #' -#' Thorson, J. T., Andrews, A., Essington, T., Large, S. (In review). +#' Thorson, J. T., Andrews, A., Essington, T., Large, S. (2024). 
#' Dynamic structural equation models synthesize #' ecosystem dynamics constrained by ecological mechanisms. +#' Methods in Ecology and Evolution. \doi{10.1111/2041-210X.14289} #' #' @examples #' # Define model @@ -120,8 +135,10 @@ function( sem, tsdata, family = rep("fixed",ncol(tsdata)), estimate_delta0 = FALSE, + prior_negloglike = NULL, control = dsem_control(), covs = colnames(tsdata) ){ + start_time = Sys.time() # General error checks if( isFALSE(is(control, "dsem_control")) ) stop("`control` must be made by `dsem_control()`") @@ -135,8 +152,8 @@ function( sem, # General warnings if( isFALSE(control$quiet) ){ tsdata_SD = apply( tsdata, MARGIN=2, FUN=sd, na.rm=TRUE ) - if( any((max(tsdata_SD)/min(tsdata_SD)) > 10, rm.na=TRUE) ){ - warning("Some variables in `tsdata` have much higher variance than others. Please consider rescaling variables to prevent issues with numerical convergence.") + if( any((max(tsdata_SD,rm.na=TRUE)/min(tsdata_SD,rm.na=TRUE)) > 100) ){ + warning("Some variables in `tsdata` have much higher variance than others. 
Please consider rescaling variables to prevent issues with numerical convergence.") } } @@ -158,6 +175,11 @@ function( sem, if( ncol(tsdata) != length(unique(colnames(tsdata))) ){ stop("Please check `colnames(tsdata)` to confirm that all variables (columns) have a unique name") } + if( !all(control$lower == -Inf) | !all(control$upper == Inf) ){ + if( control$newton_loops > 0 ){ + stop("If specifying `lower` or `upper`, please set `dsem_control('newton_loops'=0)`") + } + } # options = c( @@ -223,13 +245,15 @@ function( sem, } # Build object - obj = MakeADFun( data=Data, + obj = TMB::MakeADFun( data=Data, parameters=Params, random=Random, map=Map, profile = control$profile, - DLL="dsem" ) + DLL="dsem", + silent = TRUE ) if(control$quiet==FALSE) list_parameters(obj) + # bundle internal = list( sem = sem, tsdata = tsdata, @@ -238,6 +262,26 @@ function( sem, control = control, covs = covs ) + + # Parse priors + if( !is.null(prior_negloglike) ){ + # prior_negloglike = \(obj) -dnorm(obj$par[1],0,1,log=TRUE) + prior_value = tryCatch( expr = prior_negloglike(obj) ) + if( is.na(prior_value) ) stop("Check `prior_negloglike(obj$par)`") + obj$fn_orig = obj$fn + obj$gr_orig = obj$gr + + # BUild prior evaluator + requireNamespace("RTMB") + priors_obj = RTMB::MakeADFun( func = prior_negloglike, + parameters = list(par=obj$par), + silent = TRUE ) + obj$fn = \(pars) obj$fn_orig(pars) + priors_obj$fn(pars) + obj$gr = \(pars) obj$gr_orig(pars) + priors_obj$gr(pars) + internal$priors_obj = priors_obj + } + + # Further bundle out = list( "obj"=obj, "ram"=ram, "sem_full"=out$model, @@ -251,7 +295,6 @@ function( sem, } # Fit - obj$env$beSilent() # if(!is.null(Random)) #out$opt = fit_tmb( obj, # quiet = control$quiet, # control = list(eval.max=10000, iter.max=10000, trace=ifelse(control$quiet==TRUE,0,1) ), @@ -264,6 +307,8 @@ function( sem, out$opt = nlminb( start = out$opt$par, objective = obj$fn, gradient = obj$gr, + upper = control$upper, + lower = control$lower, control = list( 
eval.max = control$eval.max, iter.max = control$iter.max, trace = control$trace ) ) @@ -277,6 +322,7 @@ function( sem, out$opt$par = out$opt$par - solve(h, g) out$opt$objective = obj$fn(out$opt$par) } + out$internal$parhat = obj$env$parList() if( isTRUE(control$extra_convergence_checks) ){ # Gradient checks @@ -285,7 +331,7 @@ function( sem, warning("Some gradients are higher than 0.01. Some parameters might not be converged. Consider increasing `control$newton_loops`") } # Hessian check ... condition and positive definite - Hess_fixed = optimHess( par=out$opt$par, fn=obj$fn, gr=obj$gr ) + Hess_fixed = optimHess( par=out$opt$par, fn=obj$fn, gr=obj$gr, control=list(ndeps=rep(0.001,length(out$opt$par))) ) Eigen_fixed = eigen( Hess_fixed, only.values=TRUE ) if( (max(Eigen_fixed$values)/min(Eigen_fixed$values)) > 1e6 ){ # See McCullough and Vinod 2003 @@ -302,14 +348,16 @@ function( sem, if( isTRUE(control$getsd) ){ if( isTRUE(control$verbose) ) message("Running sdreport") if( is.null(Hess_fixed) ){ - Hess_fixed = optimHess( par=out$opt$par, fn=obj$fn, gr=obj$gr ) + Hess_fixed = optimHess( par=out$opt$par, fn=obj$fn, gr=obj$gr, control=list(ndeps=rep(0.001,length(out$opt$par))) ) } - out$sdrep = sdreport( obj, + out$sdrep = TMB::sdreport( obj, + par.fixed = out$opt$par, hessian.fixed = Hess_fixed, getJointPrecision = control$getJointPrecision ) }else{ out$sdrep = NULL } + out$run_time = Sys.time() - start_time # output class(out) = "dsem" @@ -354,7 +402,7 @@ function( sem, #' are equivalent when the model includes no lags (only simultaneous effects) and #' no covariances (no two-headed arrows). \code{"diagonal"} and \code{"marginal"} #' are equivalent when the model includes no covariances. 
Given some exogenous covariance, -#' \code{constant_variance = "marginal"} preserves the conditional correlation and has +#' \code{constant_variance = "diagonal"} preserves the conditional correlation and has #' changing conditional variance, while \code{constant_variance = "marginal"} has changing #' conditional correlation along the causal graph. #' @param quiet Boolean indicating whether to run model printing messages to terminal or not; @@ -369,6 +417,10 @@ function( sem, #' to \code{\link[TMB]{sdreport}}. #' @param extra_convergence_checks Boolean indicating whether to run extra checks on model #' convergence. +#' @param lower vectors of lower bounds, replicated to be as long as start and passed to \code{\link[stats]{nlminb}}. +#' If unspecified, all parameters are assumed to be unconstrained. +#' @param upper vectors of upper bounds, replicated to be as long as start and passed to \code{\link[stats]{nlminb}}. +#' If unspecified, all parameters are assumed to be unconstrained. #' #' @return #' An S3 object of class "dsem_control" that specifies detailed model settings, @@ -391,7 +443,9 @@ function( nlminb_loops = 1, parameters = NULL, map = NULL, getJointPrecision = FALSE, - extra_convergence_checks = TRUE ){ + extra_convergence_checks = TRUE, + lower = -Inf, + upper = Inf ){ gmrf_parameterization = match.arg(gmrf_parameterization) constant_variance = match.arg(constant_variance) @@ -413,7 +467,9 @@ function( nlminb_loops = 1, parameters = parameters, map = map, getJointPrecision = getJointPrecision, - extra_convergence_checks = extra_convergence_checks + extra_convergence_checks = extra_convergence_checks, + lower = lower, + upper = upper ), class = "dsem_control" ) } @@ -489,8 +545,11 @@ function( object, ... 
){ #' #' @param x Output from \code{\link{dsem}} #' @param y Not used -#' @param edge_label Whether to plot parameter names or estimated values +#' @param edge_label Whether to plot parameter names, estimated values, +#' or estimated values along with stars indicating significance at +#' 0.05, 0.01, or 0.001 levels (based on two-sided Wald tests) #' @param digits integer indicating the number of decimal places to be used +#' @param style Whether to make a graph using \code{igraph} or \code{ggraph} #' @param ... arguments passed to \code{\link[igraph]{plot.igraph}} #' #' @details @@ -505,12 +564,15 @@ function( object, ... ){ plot.dsem <- function( x, y, - edge_label = c("name","value"), + edge_label = c("name","value","value_and_stars"), digits = 2, + style = c("igraph","ggraph"), ... ){ - # Extract stuff + style = match.arg(style) edge_label = match.arg(edge_label) + + # Extract stuff out = summary(x) # Format inputs @@ -520,12 +582,49 @@ function( x, if( edge_label=="value"){ DF$label = round(out$Estimate, digits=digits) } + if( edge_label == "value_and_stars" ){ + DF$label = round(out$Estimate, digits=digits) + add_stars = cut(out[,'p_value'], breaks=c(1,0.05,0.01,0.001,0) ) + add_stars = c("***","**","*","")[as.numeric(add_stars)] + DF$label = paste0(DF$label, add_stars) + } # Create and plotgraph pg <- graph_from_data_frame( d = DF, directed = TRUE, vertices = data.frame(vertices) ) - plot( pg, ... ) + + # Two plot styles + if(style=="igraph"){ + coords = layout_(pg, with_sugiyama()) + plot( pg, layout = coords, ... 
) + } + if(style=="ggraph"){ + # Modified from phylopath::plot.DAG + algorithm = 'sugiyama' + manual_layout = NULL + text_size = 6 + box_x = 12 + box_y = 8 + edge_width = 1 + curvature = 0 + rotation = 0 + flip_x = FALSE + flip_y = FALSE + label = DF$label + l = ggraph::create_layout(pg, 'igraph', algorithm = algorithm) + arrow = grid::arrow(type = 'closed', 18, grid::unit(15, 'points')) + gplot = ggraph::ggraph(l) + + ggraph::geom_edge_arc( + aes(label = label), + strength = curvature, arrow = arrow, edge_width = edge_width, + end_cap = ggraph::rectangle(box_x, box_y, 'mm'), + start_cap = ggraph::rectangle(box_x, box_y, 'mm') + ) + + ggraph::geom_node_text(ggplot2::aes_(label = ~name), size = text_size) + + ggraph::theme_graph(base_family = 'sans') + plot(gplot) + } return(invisible(pg)) } @@ -852,7 +951,7 @@ function( object, return(out) } -#' @title Marglinal log-likelihood +#' @title Marginal log-likelihood #' #' @description Extract the (marginal) log-likelihood of a dsem model #' diff --git a/R/dsemRTMB.R b/R/dsemRTMB.R new file mode 100644 index 0000000..198030a --- /dev/null +++ b/R/dsemRTMB.R @@ -0,0 +1,285 @@ +#' @title Fit dynamic structural equation model +#' +#' @description Fits a dynamic structural equation model +#' +#' @inheritParams dsem +#' +#' @param log_prior A user-provided function that takes as input the list of +#' parameters \code{out$obj$env$parList()} where \code{out} is the output from +#' \code{dsemRTMB()}, and returns the log-prior probability. For example +#' \code{log_prior = function(p) dnorm( p$beta_z[1], mean=0, sd=0.1, log=TRUE)} +#' specifies a normal prior probability for the first path coefficient +#' with mean of zero and sd of 0.1. Note that the user must load RTMB using +#' \code{library(RTMB)} prior to running the model. 
+#' +#' @importFrom Matrix solve Diagonal sparseMatrix drop0 kronecker crossprod tcrossprod t diag +#' @importFrom RTMB ADoverload AD dgmrf REPORT ADREPORT dnorm dpois dbinom dgamma +#' +#' @details +#' \code{dsemRTMB} is interchangeable with \code{\link{dsem}}, but uses RTMB +#' instead of TMB for estimation. Both are provided for comparison and +#' real-world comparison. +#' +#' @return +#' An object (list) of class `dsem`, fitted using RTMB +#' +#' @examples +#' # Define model +#' sem = " +#' # Link, lag, param_name +#' cprofits -> consumption, 0, a1 +#' cprofits -> consumption, 1, a2 +#' pwage -> consumption, 0, a3 +#' gwage -> consumption, 0, a3 +#' cprofits -> invest, 0, b1 +#' cprofits -> invest, 1, b2 +#' capital -> invest, 0, b3 +#' gnp -> pwage, 0, c2 +#' gnp -> pwage, 1, c3 +#' time -> pwage, 0, c1 +#' " +#' +#' # Load data +#' data(KleinI, package="AER") +#' TS = ts(data.frame(KleinI, "time"=time(KleinI) - 1931)) +#' tsdata = TS[,c("time","gnp","pwage","cprofits",'consumption', +#' "gwage","invest","capital")] +#' +#' # Fit model +#' fit = dsemRTMB( sem=sem, +#' tsdata = tsdata, +#' estimate_delta0 = TRUE, +#' control = dsem_control(quiet=TRUE) ) +#' +#' @export +dsemRTMB <- +function( sem, + tsdata, + family = rep("fixed",ncol(tsdata)), + estimate_delta0 = FALSE, + log_prior = function(p) 0, + control = dsem_control(), + covs = colnames(tsdata) ){ + start_time = Sys.time() + + # General error checks + if( isFALSE(is(control, "dsem_control")) ) stop("`control` must be made by `dsem_control()`") + if( control$gmrf_parameterization=="projection" ){ + if( any(family=="fixed" & colSums(!is.na(tsdata))>0) ){ + stop("`family` cannot be `fixed` using `gmrf_parameterization=projection` for any variable with data") + } + } + if( isFALSE(is(tsdata,"ts")) ) stop("`tsdata` must be a `ts` object") + + # General warnings + if( isFALSE(control$quiet) ){ + tsdata_SD = apply( tsdata, MARGIN=2, FUN=sd, na.rm=TRUE ) + if( 
any((max(tsdata_SD,rm.na=TRUE)/min(tsdata_SD,rm.na=TRUE)) > 100) ){ + warning("Some variables in `tsdata` have much higher variance than others. Please consider rescaling variables to prevent issues with numerical convergence.") + } + } + + # Build RAM + model = read_model( sem = sem, + times = as.numeric(time(tsdata)), + variables = colnames(tsdata), + covs = covs, + quiet = FALSE ) + #ram = make_matrices( + # beta_p = rnorm(nrow(model)), + # model = model, + # times = as.numeric(time(tsdata)), + # variables = colnames(tsdata) ) + + # + options = c( + ifelse(control$gmrf_parameterization=="separable", 0, 1), + switch(control$constant_variance, "conditional"=0, "marginal"=1, "diagonal"=2) + ) + y_tj = tsdata + + # + n_z = length(unique(model$parameter)) + n_t = nrow(y_tj) + n_j = ncol(y_tj) + n_k = prod(dim(y_tj)) + + # Load data in environment for function "dBdt" + data4 = local({ + "c" <- ADoverload("c") + "[<-" <- ADoverload("[<-") + environment() + }) + environment(log_prior) <- data4 + + # Construct parameters + if( is.null(control$parameters) ){ + Params = list( + #beta_p2 = subset(model,direction==2)$start, + #beta_p1 = subset(model,direction==1)$start, + beta_z = 0.01 * rnorm(n_z), + lnsigma_j = rep(0, n_j), + mu_j = rep(0, n_j), + delta0_j = rep(0, n_j), + x_tj = ifelse( is.na(y_tj), 0, y_tj ) + ) + + # Turn off initial conditions + if( estimate_delta0==FALSE ){ + Params$delta0_j = numeric(0) + } + + # Scale starting values with higher value for two-headed than one-headed arrows + which_nonzero = which(model[,5]>0) + beta_type = tapply( model[which_nonzero,8], INDEX=model[which_nonzero,5], max) + Params$beta_z = ifelse(beta_type==1, 0.01, 1) + + # Override starting values if supplied + which_nonzero = which(model[,5]>0) + start_z = tapply( as.numeric(model[which_nonzero,4]), INDEX=model[which_nonzero,5], mean ) + Params$beta_z = ifelse( is.na(start_z), Params$beta_z, start_z) + }else{ + Params = control$parameters + } + + # Construct map + if( 
is.null(control$map) ){ + Map = list() + Map$x_tj = factor(ifelse( is.na(as.vector(y_tj)) | (family[col(y_tj)] %in% c("normal","binomial","poisson","gamma")), seq_len(n_k), NA )) + Map$lnsigma_j = factor( ifelse(family=="fixed", NA, seq_along(Params$lnsigma_j)) ) + + # Map off mean for latent variables + Map$mu_j = factor( ifelse(colSums(!is.na(y_tj))==0, NA, seq_len(n_j)) ) + }else{ + Map = control$map + } + + # Define random + if(isTRUE(control$use_REML)){ + Random = c( "x_tj", "mu_j" ) + }else{ + Random = "x_tj" + } + + # Build object + cmb <- function(f, ...) function(p) f(p, ...) ## Helper to make closure + #f(parlist, model, tsdata, family) + obj = RTMB::MakeADFun( + func = cmb( compute_nll, + model = model, + y_tj = y_tj, + family = family, + options = options, + log_prior = log_prior ), + parameters = Params, + random = Random, + map = Map, + profile = control$profile, + silent = TRUE ) + + if(control$quiet==FALSE) list_parameters(obj) + # bundle + internal = list( + sem = sem, + tsdata = tsdata, + family = family, + estimate_delta0 = estimate_delta0, + control = control, + covs = covs + ) + + # Further bundle + out = list( "obj" = obj, + #"ram" = ram, # Not useful using RTMB structure + "sem_full" = model, + "tmb_inputs"=list("parameters"=Params, "random"=Random, "map"=Map), + #"call" = match.call(), + "internal" = internal ) + + # Export stuff + if( control$run_model==FALSE ){ + return( out ) + } + + # Fit + #out$opt = fit_tmb( obj, + # quiet = control$quiet, + # control = list(eval.max=10000, iter.max=10000, trace=ifelse(control$quiet==TRUE,0,1) ), + # ... 
) + + # Optimize + out$opt = list( "par"=obj$par ) + for( i in seq_len(max(0,control$nlminb_loops)) ){ + if( isFALSE(control$quiet) ) message("Running nlminb_loop #", i) + out$opt = nlminb( start = out$opt$par, + objective = obj$fn, + gradient = obj$gr, + upper = control$upper, + lower = control$lower, + control = list( eval.max = control$eval.max, + iter.max = control$iter.max, + trace = control$trace ) ) + } + + # Newtonsteps + for( i in seq_len(max(0,control$newton_loops)) ){ + if( isFALSE(control$quiet) ) message("Running newton_loop #", i) + g = as.numeric( obj$gr(out$opt$par) ) + h = optimHess(out$opt$par, fn=obj$fn, gr=obj$gr) + out$opt$par = out$opt$par - solve(h, g) + out$opt$objective = obj$fn(out$opt$par) + } + out$internal$parhat = obj$env$parList() + + # + out$simulator = function( parlist = obj$env$parList(), + simulate_gmrf = TRUE ){ + compute_nll( parlist = parlist, + model = model, + y_tj = y_tj, + family = family, + options = options, + log_prior = log_prior, + simulate_gmrf = simulate_gmrf, + simulate_data = TRUE ) + } + + if( isTRUE(control$extra_convergence_checks) ){ + # Gradient checks + Grad_fixed = obj$gr( out$opt$par ) + if( isTRUE(any(Grad_fixed > 0.01)) ){ + warning("Some gradients are higher than 0.01. Some parameters might not be converged. Consider increasing `control$newton_loops`") + } + # Hessian check ... condition and positive definite + Hess_fixed = optimHess( par=out$opt$par, fn=obj$fn, gr=obj$gr, control=list(ndeps=rep(0.001,length(out$opt$par))) ) + Eigen_fixed = eigen( Hess_fixed, only.values=TRUE ) + if( (max(Eigen_fixed$values)/min(Eigen_fixed$values)) > 1e6 ){ + # See McCullough and Vinod 2003 + warning("The ratio of maximum and minimum Hessian eigenvalues is high. Some parameters might not be identifiable.") + } + if( isTRUE(any(Eigen_fixed$values < 0)) ){ + warning("Some Hessian eigenvalue is negative. 
Some parameters might not be converged.") + } + }else{ + Hess_fixed = NULL + } + + # Run sdreport + if( isTRUE(control$getsd) ){ + if( isTRUE(control$verbose) ) message("Running sdreport") + if( is.null(Hess_fixed) ){ + Hess_fixed = optimHess( par=out$opt$par, fn=obj$fn, gr=obj$gr, control=list(ndeps=rep(0.001,length(out$opt$par))) ) + } + out$sdrep = TMB::sdreport( obj, + par.fixed = out$opt$par, + hessian.fixed = Hess_fixed, + getJointPrecision = control$getJointPrecision ) + }else{ + out$sdrep = NULL + } + out$run_time = Sys.time() - start_time + + # output + class(out) = "dsem" + return(out) +} diff --git a/R/make_dsem_ram.R b/R/make_dsem_ram.R index dbbe6e4..e926e1e 100644 --- a/R/make_dsem_ram.R +++ b/R/make_dsem_ram.R @@ -332,22 +332,50 @@ function( sem, } # Loop through paths + G_kk = P_kk = drop0(sparseMatrix( i=1, j=1, x=0, dims=rep(length(variables)*length(times),2) )) # Make with a zero + #P_kk = new("dgCMatrix") + #P_kk = Matrix() + #P_kk@Dim <- as.integer(rep(length(variables)*length(times),2)) + #P_kk@p = integer(length(variables)*length(times)+1L) + #G_kk = P_kk for( i in seq_len(nrow(model)) ){ - for( t in seq_along(times) ){ lag = as.numeric(model[i,2]) - par.no = par.nos[i] - # Get index for "from" - from = c( times[t], model[i,'first'] ) - from_index = match_row( Q_names, from ) - from_index = ifelse( length(from_index)==0, NA, from_index ) - # Get index for "to" - to = c( times[t+lag], model[i,'second'] ) - to_index = match_row( Q_names, to ) - to_index = ifelse( length(to_index)==0, NA, to_index ) - ram_new = data.frame( "heads"=abs(as.numeric(model[i,'direction'])), "to"=to_index, "from"=from_index, "parameter"=par.no, "start"=startvalues[i] ) - ram = rbind( ram, ram_new ) - }} - rownames(ram) = NULL + L_tt = sparseMatrix( i = seq(lag+1,length(times)), + j = seq(1,length(times)-lag), + x = 1, + dims = rep(length(times),2) ) + P_jj = sparseMatrix( i = match(model[i,'second'],variables), + j = match(model[i,'first'],variables), + x = 1, + dims 
= rep(length(variables),2) ) + tmp_kk = kronecker(P_jj, L_tt) + if(abs(as.numeric(model[i,'direction']))==1){ + P_kk = P_kk + tmp_kk * i + }else{ + G_kk = G_kk + tmp_kk * i + } + #for( t in seq_along(times) ){ + # # Get index for "from" + # from = c( times[t], model[i,'first'] ) + # from_index = match_row( Q_names, from ) + # from_index = ifelse( length(from_index)==0, NA, from_index ) + # # Get index for "to" + # to = c( times[t+lag], model[i,'second'] ) + # to_index = match_row( Q_names, to ) + # to_index = ifelse( length(to_index)==0, NA, to_index ) + # ram_new = data.frame( "heads"=abs(as.numeric(model[i,'direction'])), "to"=to_index, "from"=from_index, "parameter"=par.no, "start"=startvalues[i] ) + # ram = rbind( ram, ram_new ) + #} + } + #rownames(ram) = NULL + #f = \(x) sapply(mat2triplet(drop0(x)),cbind) + f = \(x) matrix(unlist(mat2triplet(x)),ncol=3) + ram = rbind( cbind(1, f(P_kk)), + cbind(2, f(G_kk)) ) + ram = data.frame( ram[,1:3,drop=FALSE], + as.numeric(par.nos)[ram[,4]], + as.numeric(startvalues)[ram[,4]] ) + colnames(ram) = c("heads", "to", "from", "parameter", "start") # if( isTRUE(remove_na) ){ diff --git a/R/make_matrices.R b/R/make_matrices.R new file mode 100644 index 0000000..2c1754f --- /dev/null +++ b/R/make_matrices.R @@ -0,0 +1,65 @@ +make_matrices <- +function( beta_p, + model, + times, + variables ){ + + "c" <- ADoverload("c") + "[<-" <- ADoverload("[<-") + model <- as.data.frame(model) + model$parameter = as.integer(model$parameter) + + # Combine fixed, estimated, and mapped parameters into vector + beta_i = rep(0, nrow(model)) + off = which(model[,'parameter'] == 0) + if( length(off) > 0 ){ + beta_i[off] = as.numeric(model[off,'start']) + } + not_off = which(model[,'parameter'] > 0) + if( length(not_off) > 0 ){ + beta_i[not_off] = beta_p[model[not_off,'parameter']] + } + + # Loop through paths + P_kk = drop0(sparseMatrix( i=1, j=1, x=0, dims=rep(length(variables)*length(times),2) )) # Make with a zero + #P_kk = AD(P_kk) + G_kk = 
(P_kk) + for( i in seq_len(nrow(model)) ){ + lag = as.numeric(model[i,2]) + L_tt = sparseMatrix( i = seq(lag+1,length(times)), + j = seq(1,length(times)-lag), + x = 1, + dims = rep(length(times),2) ) + + P_jj = sparseMatrix( i = match(model[i,'second'],variables), + j = match(model[i,'first'],variables), + x = 1, + dims = rep(length(variables),2) ) + + # Assemble + tmp_kk = kronecker(P_jj, L_tt) + if(abs(as.numeric(model[i,'direction']))==1){ + P_kk = P_kk + beta_i[i] * tmp_kk # AD(tmp_kk) + }else{ + G_kk = G_kk + beta_i[i] * tmp_kk # AD(tmp_kk) + } + } + + # Diagonal component + I_kk = Diagonal(nrow(P_kk)) + + # Assemble + IminusP_kk = AD(I_kk - P_kk) + invV_kk = AD(G_kk) + invV_kk@x = 1 / G_kk@x^2 + #Q_kk = t(IminusP_kk) %*% invV_kk %*% IminusP_kk + + out = list( + "P_kk" = P_kk, + "G_kk" = G_kk, + "invV_kk" = invV_kk, + #"Q_kk" = Q_kk, # NOT USED + "IminusP_kk" = IminusP_kk + ) + return(out) +} diff --git a/R/read_model.R b/R/read_model.R new file mode 100644 index 0000000..cb851af --- /dev/null +++ b/R/read_model.R @@ -0,0 +1,137 @@ + +#' @title Make a RAM (Reticular Action Model) +#' +#' @description \code{read_model} converts SEM arrow notation to \code{model} describing SEM parameters +#' +#' @inheritParams dsem +#' @param times A character vector listing the set of times in order +#' @param variables A character vector listing the set of variables +#' @param covs A character vector listing variables for which to estimate a standard deviation +#' @param quiet Boolean indicating whether to print messages to terminal +#' +#' @details +#' See \code{\link{make_dsem_ram}} for details +read_model <- +function( sem, + times, + variables, + covs = NULL, + quiet = FALSE ){ + + "c" <- ADoverload("c") + "[<-" <- ADoverload("[<-") + + ####### Error checks + if( !is.numeric(times) ) stop("`times` must be numeric in `make_dsem_ram`") + + ####### Define local functions + # + add.variances <- function() { + variables <- need.variance() + nvars <- length(variables) + if 
(nvars == 0) + return(model) + message("NOTE: adding ", nvars, " variances to the model") + paths <- character(nvars) + par.names <- character(nvars) + for (i in 1:nvars) { + paths[i] <- paste(variables[i], "<->", variables[i]) + par.names[i] <- paste("V[", variables[i], "]", sep = "") + } + model.2 <- cbind( + 'path' = c(model[, 1], paths), + 'lag' = c(model[,2], rep(0,nvars)), + 'name' = c(model[, 3], par.names), + 'start' = c(model[, 4], rep(NA, length(paths))) ) + model.2 + } + need.variance <- function() { + all.vars <- classify_variables(model) + exo.vars <- all.vars$exogenous + end.vars <- all.vars$endogenous + variables <- logical(0) + for (i in seq_len(nrow(model))) { + paths = model[i,1] + lag = model[i,2] + vars <- gsub(pattern=" ", replacement="", x=paths) + vars <- sub("-*>", "->", sub("<-*", "<-", vars)) + vars <- sub("<->|<-", "->", vars) + vars <- strsplit(vars, "->")[[1]] + if ((vars[1] != vars[2]) | (lag != 0)) { + for (a.variable in vars) { + if (is.na(variables[a.variable])) + variables[a.variable] <- TRUE + } + } + else { + variables[vars[1]] <- FALSE + } + } + if (!exog.variances && length(exo.vars) > 0) + variables[exo.vars] <- FALSE + if (!endog.variances && length(end.vars) > 0) + variables[end.vars] <- FALSE + names(variables)[variables] + } + + ####### Step 2 -- Make RAM + # convert to data frame + model = scan( text = sem, + what = list(path = "", lag = 1, par = "", start = 1, dump = ""), + sep = ",", + strip.white = TRUE, + comment.char = "#", + fill = TRUE, + quiet = quiet) + model$path <- gsub("\\t", " ", model$path) + model$par[model$par == ""] <- NA + model <- data.frame( "path" = model$path, + "lag" = model$lag, + "name" = model$par, + "start" = model$start ) + + # Adding a SD automatically + if( !is.null(covs) ){ + for (cov in covs) { + vars <- strsplit(cov, "[ ,]+")[[1]] + nvar <- length(vars) + for (i in 1:nvar) { + for (j in i:nvar) { + p1 = paste(vars[i], "<->", vars[j]) + p2 = if (i==j) paste("V[", vars[i], "]", sep = "") 
else paste("C[",vars[i], ",", vars[j], "]", sep = "") + p3 = NA + row <- c(p1, 0, p2, p3) + if( any((row[1]==model[,1]) & (row[2]==model[,2])) ){ + next + }else{ + model <- rbind(model, row, deparse.level = 0) + } + }} + } + } + + exog.variances = endog.variances = TRUE + model = add.variances() + + # Add parameter column + par.names = model[, 3] + pars = na.omit(unique(par.names)) + par.nos = apply(outer(pars, par.names, "=="), 2, which) + par.nos = unlist(sapply( par.nos, FUN=\(x) ifelse(length(x)==0, 0, x) )) + model = cbind( model, "parameter"=par.nos ) + + # Add incidence to model + model = cbind( model, first=NA, second=NA, direction=NA ) + for( i in seq_len(nrow(model)) ){ + path = parse_path(model[i,1]) + model[i,c('first','second','direction')] = unlist( path[c('first','second','direction')] ) + } + + ####### Step 2 -- Make RAM + + # Deal with fixed values + #if( max(model$parameter) != length(beta_p) ) stop("Check beta_p") + + return(model) +} + diff --git a/R/rgmrf.R b/R/rgmrf.R new file mode 100644 index 0000000..2e76b06 --- /dev/null +++ b/R/rgmrf.R @@ -0,0 +1,15 @@ +rgmrf <- +function( mu, # estimated fixed and random effects + Q # estimated joint precision + ) { + + # Simulate values + if(missing(mu)) mu = rep(0,nrow(Q)) + z0 = rnorm(length(mu)) + # Q = t(P) * L * t(L) * P + L = Matrix::Cholesky(Q, super=TRUE) + # Calculate t(P) * solve(t(L)) * z0 in two steps + z = Matrix::solve(L, z0, system = "Lt") # z = Lt^-1 * z + z = Matrix::solve(L, z, system = "Pt") # z = Pt * z + return(mu + z) +} diff --git a/R/stepwise_selection.R b/R/stepwise_selection.R new file mode 100644 index 0000000..fbe6694 --- /dev/null +++ b/R/stepwise_selection.R @@ -0,0 +1,68 @@ + +#' @title Stepwise model selection +#' +#' @description Perform stepwise model selection for a \code{dsem} model +#' +#' @param model_options character-vector containing sem elements +#' that could be included or dropped depending upon their +#' parsimony +#' @param model_shared character-vector containing sem elements +#' that
must be included regardless of parsimony +#' @param quiet whether to avoid displaying progress to terminal +#' @param ... arguments passed to \code{\link{dsem}}, +#' other than \code{sem} e.g., \code{tsdata}, \code{family} +#' etc. +#' +#' @details +#' This function conducts stepwise (i.e., forwards and backwards) model +#' selection using marginal AIC, while forcing some model elements to be +#' included and selecting among others. +#' +#' @return +#' An object (list) that includes: +#' \describe{ +#' \item{model}{the string with the selected SEM model} +#' \item{record}{a list showing the AIC and whether each \code{model_options} is included or not} +#' } +#' +#' @export +stepwise_selection <- +function( model_options, + model_shared, + quiet = FALSE, + ... ){ + + # Loop + best = rep(0, length(model_options) ) + record = NULL + while(TRUE){ + if(isFALSE(quiet)) message("Running with ", sum(best), " vars included: ", Sys.time() ) + df_options = outer( rep(1,length(best)), best ) + which_diag = cbind( seq_len(nrow(df_options)), seq_len(nrow(df_options)) ) + df_options[which_diag] = 1 - df_options[which_diag] + df_options = rbind(best, df_options) + AIC_i = rep(NA, nrow(df_options)) + + for(i in 1:nrow(df_options)){ + model = paste( paste(model_options[which(df_options[i,]==1)],collapse="\n"), + paste(model_shared,collapse="\n"), + sep="\n" ) + myfit = dsem( sem = model, ... 
) + AIC_i[i] = AIC(myfit) + } + + # Save record and decide whether to continue + record[[length(record)+1]] = cbind( AIC_i, df_options ) + if(which.min(AIC_i)==1){ + break() + }else{ + best = df_options[which.min(AIC_i),] + } + } + + # Return best + model = paste( paste(model_options[which(best==1)],collapse="\n"), + paste(model_shared,collapse="\n"), + sep="\n" ) + return(list(model=model, record=record)) +} diff --git a/R/utility.R b/R/utility.R new file mode 100644 index 0000000..0bf90af --- /dev/null +++ b/R/utility.R @@ -0,0 +1,128 @@ + +#' @title Calculate leave-one-out residuals +#' +#' @description Calculates quantile residuals using the predictive distribution from +#' a jackknife (i.e., leave-one-out predictive distribution) +#' +#' @param object Output from \code{\link{dsem}} +#' @param nsim Number of simulations to use if \code{family!="fixed"} for some variable, +#' such that simulation residuals are required. +#' @param what whether to return quantile residuals, or samples from the leave-one-out predictive +#' distribution of data, or a table of leave-one-out predictions and standard errors for the +#' latent state +#' @param track_progress whether to track runtimes on terminal +#' @param ... Not used +#' +#' @details +#' Conditional quantile residuals cannot be calculated when using \code{family = "fixed"}, because +#' state-variables are fixed at available measurements and hence the conditional distribution is a Dirac +#' delta function. One alternative is to use leave-one-out residuals, where we calculate the predictive distribution +#' for each state value when dropping the associated observation, and then either use that as the +#' predictive distribution, or sample from that predictive distribution and then calculate +#' a standard quantile distribution for a given non-fixed family. This approach is followed here.
+#' It is currently only implemented when all variables follow \code{family = "fixed"}, but +#' could be generalized to a mix of families upon request. +#' +#' @return +#' A matrix of residuals, with same order and dimensions as argument \code{tsdata} +#' that was passed to \code{dsem}. +#' +#' @export +loo_residuals <- +function( object, + nsim = 100, + what = c("quantiles","samples","loo"), + track_progress = TRUE, + ... ){ + + # Extract and make object + what = match.arg(what) + tsdata = object$internal$tsdata + parlist = object$obj$env$parList() + df = expand.grid( time(tsdata), colnames(tsdata) ) + df$obs = as.vector(tsdata) + df = na.omit(df) + df = cbind( df, est=NA, se=NA ) + + # Loop through observations + for(r in 1:nrow(df) ){ + if( (r %% floor(nrow(df)/10))==1 ){ + if(isTRUE(track_progress)) message("Running leave-one-out fit ",r," of ",nrow(df)," at ",Sys.time()) + } + ts_r = tsdata + ts_r[match(df[r,1],time(tsdata)),match(df[r,2],colnames(tsdata))] = NA + + # + control = object$internal$control + control$quiet = TRUE + control$run_model = FALSE + + # Build inputs + #fit_r = dsem( sem = object$internal$sem, + # tsdata = ts_r, + # family = object$internal$family, + # estimate_delta0 = object$internal$estimate_delta0, + # control = control ) + fit_r = list( tmb_inputs = object$tmb_inputs ) + Data = fit_r$tmb_inputs$data + fit_r$tmb_inputs$map$x_tj = factor(ifelse( is.na(as.vector(ts_r)) | (Data$familycode_j[col(tsdata)] %in% c(1,2,3,4)), seq_len(prod(dim(ts_r))), NA )) + + # Modify inputs + map = fit_r$tmb_inputs$map + parameters = fit_r$tmb_inputs$parameters + parameters[c("beta_z","lnsigma_j","mu_j","delta0_j")] = parlist[c("beta_z","lnsigma_j","mu_j","delta0_j")] + for( v in c("beta_z","lnsigma_j","mu_j","delta0_j") ){ + map[[v]] = factor( rep(NA,length(as.vector(parameters[[v]])))) + } + + # Build object + obj = TMB::MakeADFun( data = fit_r$tmb_inputs$data, + parameters = parameters, + random = NULL, + map = map, + profile = control$profile, + 
DLL="dsem", + silent = TRUE ) + + # Rerun and record + opt = nlminb( start = obj$par, + objective = obj$fn, + gradient = obj$gr ) + sdrep = TMB::sdreport( obj ) + df[r,'est'] = as.list(sdrep, what="Estimate")$x_tj[match(df[r,1],time(tsdata)),match(df[r,2],colnames(tsdata))] + df[r,'se'] = as.list(sdrep, what="Std. Error")$x_tj[match(df[r,1],time(tsdata)),match(df[r,2],colnames(tsdata))] + } + + # Compute quantile residuals + resid_tjr = array( NA, dim=c(dim(tsdata),nsim) ) + if( all(object$internal$family == "fixed") ){ + # analytical quantile residuals + resid_tj = array( NA, dim=dim(tsdata) ) + resid_tj[cbind(match(df[,1],time(tsdata)),match(df[,2],colnames(tsdata)))] = pnorm( q=df$obs, mean=df$est, sd=df$se ) + # Simulations from predictive distribution for use in DHARMa etc + for(z in 1:nrow(df) ){ + resid_tjr[match(df[z,1],time(tsdata)),match(df[z,2],colnames(tsdata)),] = rnorm( n=nsim, mean=df[z,'est'], sd=df[z,'se'] ) + } + }else{ + # Sample from leave-one-out predictive distribution for states + resid_rz = apply( df, MARGIN=1, FUN=\(vec){rnorm(n=nsim, mean=as.numeric(vec['est']), sd=as.numeric(vec['se']))} ) + # Sample from predictive distribution of data given states + for(r in 1:nrow(resid_rz) ){ + parameters = object$obj$env$parList() + parameters$x_tj[which(!is.na(tsdata))] = resid_rz[r,] + obj = TMB::MakeADFun( data = object$tmb_inputs$data, + parameters = parameters, + random = NULL, + map = object$tmb_inputs$map, + profile = NULL, + DLL="dsem", + silent = TRUE ) + resid_tjr[,,r] = obj$simulate()$y_tj + } + # Calculate quantile residuals + resid_tj = apply( resid_tjr > outer(tsdata,rep(1,nsim)), MARGIN=1:2, FUN=mean ) + } + if(what=="quantiles") return( resid_tj ) + if(what=="samples") return( resid_tjr ) + if(what=="loo") return( df ) +} diff --git a/inst/CITATION b/inst/CITATION index d9b79c2..09a9768 100644 --- a/inst/CITATION +++ b/inst/CITATION @@ -1,9 +1,12 @@ citHeader("For methodological details regarding dynamic structural equation models, 
please see Thorson et al. (2024).") bibentry(bibtype = "Article", - textVersion = "Thorson, J. T., Andrews, A. G., Essington, T., & Large, S. (2024). Dynamic structural equation models synthesize ecosystem dynamics constrained by ecological mechanisms. Methods in Ecology and Evolution.", + textVersion = "Thorson, J. T., Andrews, A. G., Essington, T., & Large, S. (2024). Dynamic structural equation models synthesize ecosystem dynamics constrained by ecological mechanisms. Methods in Ecology and Evolution 15(4): 744-755. https://doi.org/10.1111/2041-210X.14289", title = "Dynamic structural equation models synthesize ecosystem dynamics constrained by ecological mechanisms", journal = "Methods in Ecology and Evolution", + volume = "15", + number = "4", + pages = "744-755", year = "2024", doi = "10.1111/2041-210X.14289", url = "https://doi.org/10.1111/2041-210X.14289", diff --git a/man/cAIC.Rd b/man/cAIC.Rd new file mode 100644 index 0000000..df69685 --- /dev/null +++ b/man/cAIC.Rd @@ -0,0 +1,55 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/cAIC.R +\name{cAIC} +\alias{cAIC} +\title{Calculate conditional AIC} +\usage{ +cAIC(object, what = c("cAIC", "EDF")) +} +\arguments{ +\item{object}{Output from \code{\link{dsem}}} + +\item{what}{Whether to return the cAIC or the effective degrees of freedom +(EDF) for each group of random effects.} +} +\value{ +Either the cAIC, or the effective degrees of freedom (EDF) by group +of random effects +} +\description{ +Calculates the conditional Akaike Information criterion (cAIC). +} +\details{ +cAIC is designed to optimize the expected out-of-sample predictive +performance for new data that share the same random effects as the +in-sample (fitted) data, e.g., spatial interpolation. In this sense, +it should be a fast approximation to optimizing the model structure +based on k-fold crossvalidation. 
+By contrast, \code{AIC} calculates the +marginal Akaike Information Criterion, which is designed to optimize +expected predictive performance for new data that have new random effects, +e.g., extrapolation, or inference about generative parameters. + +cAIC also calculates as a byproduct the effective degrees of freedom, +i.e., the number of fixed effects that would have an equivalent impact on +model flexibility as a given random effect. + +Both cAIC and EDF are calculated using Eq. 6 of Zheng Cadigan Thorson 2024. + +Note that, for models that include profiled fixed effects, these profiles +are turned off. +} +\references{ +**Deriving the general approximation to cAIC used here** + +Zheng, N., Cadigan, N., & Thorson, J. T. (2024). +A note on numerical evaluation of conditional Akaike information for +nonlinear mixed-effects models (arXiv:2411.14185). arXiv. +\doi{10.48550/arXiv.2411.14185} + +**The utility of EDF to diagnose hierarchical model behavior** + +Thorson, J. T. (2024). Measuring complexity for hierarchical +models using effective degrees of freedom. Ecology, +105(7), e4327 \doi{10.1002/ecy.4327} +} diff --git a/man/convert_equations.Rd b/man/convert_equations.Rd new file mode 100644 index 0000000..3c8cc8e --- /dev/null +++ b/man/convert_equations.Rd @@ -0,0 +1,44 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/convert_equations.R +\name{convert_equations} +\alias{convert_equations} +\title{Convert equations notation} +\usage{ +convert_equations(equations) +} +\arguments{ +\item{equations}{Specification for time-series structural equation model structure +including lagged or simultaneous effects. See Details section in +\code{\link[dsem]{convert_equations}} for more description} +} +\description{ +Converts equations to arrow-and-lag notation expected by dsem +} +\details{ +The function modifies code copied from package +`sem` under licence GPL (>= 2) with permission from John Fox. 
+ +For specifyEquations, each input line is either a regression equation or the +specification of a variance or covariance. Regression equations are of the form +y = par1*x1 + par2*x2 + ... + park*xk +where y and the xs are variables in the model (either observed or latent), +and the pars are parameters. If a parameter is given as a numeric value +(e.g., 1) then it is treated as fixed. Note that no error variable is +included in the equation; error variances are specified via either +the covs argument, via V(y) = par (see immediately below), or are +added automatically to the model when, as by default, endog.variances=TRUE. +A regression equation may be split over more than one input by breaking at a +, +so that + is either the last non-blank character on a line or the +first non-blank character on the subsequent line. + +Variances are specified in the form V(var) = par and +covariances in the form C(var1, var2) = par, where the vars are +variables (observed or unobserved) in the model. The symbols V and C +may be in either lower- or upper-case. If par is a numeric value (e.g., 1) +then it is treated as fixed. In conformity with the RAM model, +a variance or covariance for an endogenous variable in the +model is an error variance or covariance. + +To set a start value for a free parameter, enclose the numeric +start value in parentheses after the parameter name, as parameter(value). 
+} diff --git a/man/dsem.Rd b/man/dsem.Rd index a0d9900..0c30f23 100644 --- a/man/dsem.Rd +++ b/man/dsem.Rd @@ -9,6 +9,7 @@ dsem( tsdata, family = rep("fixed", ncol(tsdata)), estimate_delta0 = FALSE, + prior_negloglike = NULL, control = dsem_control(), covs = colnames(tsdata) ) @@ -29,6 +30,17 @@ Other options correspond to different specifications of measurement error.} as fixed effects, or alternatively to assume that dynamics start at some stochastic draw away from the stationary distribution} +\item{prior_negloglike}{A user-provided function that takes as input the vector of fixed effects out$obj$par and +returns the negative log-prior probability. For example +\code{prior_negloglike = function(obj) -1 * dnorm( obj$par[1], mean=0, sd=0.1, log=TRUE)} +specifies a normal prior probability +for the first fixed effect with mean of zero and sd of 0.1. +NOTE: this implementation does not work well with \code{tmbstan} and +is highly experimental. If using priors, consider using \code{\link{dsemRTMB}} +instead. The option in \code{dsem} is mainly intended to validate its +use in \code{dsemRTMB}. Note that the user must load RTMB using +\code{library(RTMB)} prior to running the model.} + \item{control}{Output from \code{\link{dsem_control}}, used to define user settings, and see documentation for that function for details.} @@ -51,6 +63,7 @@ An object (list) of class `dsem`. Elements include: \item{sdrep}{The output from \code{\link[TMB]{sdreport}}} \item{interal}{Objects useful for package function, i.e., all arguments passed during the call} +\item{run_time}{Total time to run model} } } \description{ @@ -122,7 +135,8 @@ plot( fit, edge_label="value" ) **Introducing the package, its features, and comparison with other software (to cite when using dsem):** -Thorson, J. T., Andrews, A., Essington, T., Large, S. (In review). +Thorson, J. T., Andrews, A., Essington, T., Large, S. (2024). 
Dynamic structural equation models synthesize ecosystem dynamics constrained by ecological mechanisms. +Methods in Ecology and Evolution. \doi{10.1111/2041-210X.14289} } diff --git a/man/dsemRTMB.Rd b/man/dsemRTMB.Rd new file mode 100644 index 0000000..35e1131 --- /dev/null +++ b/man/dsemRTMB.Rd @@ -0,0 +1,91 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dsemRTMB.R +\name{dsemRTMB} +\alias{dsemRTMB} +\title{Fit dynamic structural equation model} +\usage{ +dsemRTMB( + sem, + tsdata, + family = rep("fixed", ncol(tsdata)), + estimate_delta0 = FALSE, + log_prior = function(p) 0, + control = dsem_control(), + covs = colnames(tsdata) +) +} +\arguments{ +\item{sem}{Specification for time-series structural equation model structure +including lagged or simultaneous effects. See Details section in +\code{\link[dsem]{make_dsem_ram}} for more description} + +\item{tsdata}{time-series data, as outputted using \code{\link[stats]{ts}}} + +\item{family}{Character-vector listing the distribution used for each column of \code{tsdata}, where +each element must be \code{fixed} or \code{normal}. +\code{family="fixed"} is default behavior and assumes that a given variable is measured exactly. +Other options correspond to different specifications of measurement error.} + +\item{estimate_delta0}{Boolean indicating whether to estimate deviations from equilibrium in initial year +as fixed effects, or alternatively to assume that dynamics start at some stochastic draw away from +the stationary distribution} + +\item{log_prior}{A user-provided function that takes as input the list of +parameters \code{out$obj$env$parList()} where \code{out} is the output from +\code{dsemRTMB()}, and returns the log-prior probability. For example +\code{log_prior = function(p) dnorm( p$beta_z[1], mean=0, sd=0.1, log=TRUE)} +specifies a normal prior probability for the first path coefficient +with mean of zero and sd of 0.1. 
Note that the user must load RTMB using +\code{library(RTMB)} prior to running the model.} + +\item{control}{Output from \code{\link{dsem_control}}, used to define user +settings, and see documentation for that function for details.} + +\item{covs}{optional: a character vector of one or more elements, with each element giving a string of variable +names, separated by commas. Variances and covariances among all variables in each such string are +added to the model. Warning: covs="x1, x2" and covs=c("x1", "x2") are not equivalent: +covs="x1, x2" specifies the variance of x1, the variance of x2, and their covariance, +while covs=c("x1", "x2") specifies the variance of x1 and the variance of x2 but not their covariance. +These same covariances can be added manually via argument `sem`, but using argument `covs` might +save time for models with many variables.} +} +\value{ +An object (list) of class `dsem`, fitted using RTMB +} +\description{ +Fits a dynamic structural equation model +} +\details{ +\code{dsemRTMB} is interchangeable with \code{\link{dsem}}, but uses RTMB +instead of TMB for estimation. Both are provided for comparison and +real-world use. 
+} +\examples{ +# Define model +sem = " + # Link, lag, param_name + cprofits -> consumption, 0, a1 + cprofits -> consumption, 1, a2 + pwage -> consumption, 0, a3 + gwage -> consumption, 0, a3 + cprofits -> invest, 0, b1 + cprofits -> invest, 1, b2 + capital -> invest, 0, b3 + gnp -> pwage, 0, c2 + gnp -> pwage, 1, c3 + time -> pwage, 0, c1 +" + +# Load data +data(KleinI, package="AER") +TS = ts(data.frame(KleinI, "time"=time(KleinI) - 1931)) +tsdata = TS[,c("time","gnp","pwage","cprofits",'consumption', + "gwage","invest","capital")] + +# Fit model +fit = dsemRTMB( sem=sem, + tsdata = tsdata, + estimate_delta0 = TRUE, + control = dsem_control(quiet=TRUE) ) + +} diff --git a/man/dsem_control.Rd b/man/dsem_control.Rd index 20b2a9b..dd4a2ae 100644 --- a/man/dsem_control.Rd +++ b/man/dsem_control.Rd @@ -20,7 +20,9 @@ dsem_control( parameters = NULL, map = NULL, getJointPrecision = FALSE, - extra_convergence_checks = TRUE + extra_convergence_checks = TRUE, + lower = -Inf, + upper = Inf ) } \arguments{ @@ -64,7 +66,7 @@ All options are equivalent when the model includes no lags (only simultaneous effects) and no covariances (no two-headed arrows). \code{"diagonal"} and \code{"marginal"} are equivalent when the model includes no covariances. Given some exogenous covariance, -\code{constant_variance = "marginal"} preserves the conditional correlation and has +\code{constant_variance = "diagonal"} preserves the conditional correlation and has changing conditional variance, while \code{constant_variance = "marginal"} has changing conditional correlation along the causal graph.} @@ -85,6 +87,12 @@ to \code{\link[TMB]{sdreport}}.} \item{extra_convergence_checks}{Boolean indicating whether to run extra checks on model convergence.} + +\item{lower}{vectors of lower bounds, replicated to be as long as start and passed to \code{\link[stats]{nlminb}}. 
+If unspecified, all parameters are assumed to be unconstrained.} + +\item{upper}{vectors of upper bounds, replicated to be as long as start and passed to \code{\link[stats]{nlminb}}. +If unspecified, all parameters are assumed to be unconstrained.} } \value{ An S3 object of class "dsem_control" that specifies detailed model settings, diff --git a/man/isle_royale.Rd b/man/isle_royale.Rd index 963ea5a..ce1ddaf 100644 --- a/man/isle_royale.Rd +++ b/man/isle_royale.Rd @@ -12,7 +12,7 @@ Data used to demonstrate and test cross-lagged (vector autoregressive) models } \details{ Data extracted from file "Data_wolves_moose_Isle_Royale_June2019.csv" available at -\url{https://isleroyalewolf.org/data/data/home.html} and obtained 2023-06-23. +\url{https://www.isleroyalewolf.org} and obtained 2023-06-23. Reproduced with permission from John Vucetich, and generated by the Wolves and Moose of Isle Royale project. } diff --git a/man/logLik.dsem.Rd b/man/logLik.dsem.Rd index 58a320c..4da24bc 100644 --- a/man/logLik.dsem.Rd +++ b/man/logLik.dsem.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/dsem.R \name{logLik.dsem} \alias{logLik.dsem} -\title{Marglinal log-likelihood} +\title{Marginal log-likelihood} \usage{ \method{logLik}{dsem}(object, ...) } diff --git a/man/loo_residuals.Rd b/man/loo_residuals.Rd new file mode 100644 index 0000000..bb1c65b --- /dev/null +++ b/man/loo_residuals.Rd @@ -0,0 +1,46 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utility.R +\name{loo_residuals} +\alias{loo_residuals} +\title{Calculate leave-one-out residuals} +\usage{ +loo_residuals( + object, + nsim = 100, + what = c("quantiles", "samples", "loo"), + track_progress = TRUE, + ... 
+) +} +\arguments{ +\item{object}{Output from \code{\link{dsem}}} + +\item{nsim}{Number of simulations to use if \code{family!="fixed"} for some variable, +such that simulation residuals are required.} + +\item{what}{whether to return quantile residuals, or samples from the leave-one-out predictive +distribution of data, or a table of leave-one-out predictions and standard errors for the +latent state} + +\item{track_progress}{whether to track runtimes on terminal} + +\item{...}{Not used} +} +\value{ +A matrix of residuals, with same order and dimensions as argument \code{tsdata} +that was passed to \code{dsem}. +} +\description{ +Calculates quantile residuals using the predictive distribution from + a jackknife (i.e., leave-one-out predictive distribution) +} +\details{ +Conditional quantile residuals cannot be calculated when using \code{family = "fixed"}, because +state-variables are fixed at available measurements and hence the conditional distribution is a Dirac +delta function. One alternative is to use leave-one-out residuals, where we calculate the predictive distribution +for each state value when dropping the associated observation, and then either use that as the +predictive distribution, or sample from that predictive distribution and then calculate +a standard quantile distribution for a given non-fixed family. This approach is followed here. +It is currently only implemented when all variables follow \code{family = "fixed"}, but +could be generalized to a mix of families upon request. +} diff --git a/man/plot.dsem.Rd b/man/plot.dsem.Rd index cbd7648..99889a4 100644 --- a/man/plot.dsem.Rd +++ b/man/plot.dsem.Rd @@ -4,17 +4,28 @@ \alias{plot.dsem} \title{Simulate dsem} \usage{ -\method{plot}{dsem}(x, y, edge_label = c("name", "value"), digits = 2, ...) +\method{plot}{dsem}( + x, + y, + edge_label = c("name", "value", "value_and_stars"), + digits = 2, + style = c("igraph", "ggraph"), + ... 
+) } \arguments{ \item{x}{Output from \code{\link{dsem}}} \item{y}{Not used} -\item{edge_label}{Whether to plot parameter names or estimated values} +\item{edge_label}{Whether to plot parameter names, estimated values, +or estimated values along with stars indicating significance at +0.05, 0.01, or 0.001 levels (based on two-sided Wald tests)} \item{digits}{integer indicating the number of decimal places to be used} +\item{style}{Whether to make a graph using \code{igraph} or \code{ggraph}} + \item{...}{arguments passed to \code{\link[igraph]{plot.igraph}}} } \value{ diff --git a/man/read_model.Rd b/man/read_model.Rd new file mode 100644 index 0000000..b2b6d26 --- /dev/null +++ b/man/read_model.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/read_model.R +\name{read_model} +\alias{read_model} +\title{Make a RAM (Reticular Action Model)} +\usage{ +read_model(sem, times, variables, covs = NULL, quiet = FALSE) +} +\arguments{ +\item{sem}{Specification for time-series structural equation model structure +including lagged or simultaneous effects. 
See Details section in +\code{\link[dsem]{make_dsem_ram}} for more description} + +\item{times}{A character vector listing the set of times in order} + +\item{variables}{A character vector listing the set of variables} + +\item{covs}{A character vector listing variables for which to estimate a standard deviation} + +\item{quiet}{Boolean indicating whether to print messages to terminal} +} +\description{ +\code{read_model} converts SEM arrow notation to \code{model} describing SEM parameters +} +\details{ +See \code{\link{make_dsem_ram}} for details +} diff --git a/man/stepwise_selection.Rd b/man/stepwise_selection.Rd new file mode 100644 index 0000000..fe68407 --- /dev/null +++ b/man/stepwise_selection.Rd @@ -0,0 +1,37 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/stepwise_selection.R +\name{stepwise_selection} +\alias{stepwise_selection} +\title{Stepwise model selection} +\usage{ +stepwise_selection(model_options, model_shared, quiet = FALSE, ...) +} +\arguments{ +\item{model_options}{character-vector containing sem elements +that could be included or dropped depending upon their +parsimony} + +\item{model_shared}{character-vector containing sem elements +that must be included regardless of parsimony} + +\item{quiet}{whether to avoid displaying progress to terminal} + +\item{...}{arguments passed to \code{\link{dsem}}, +other than \code{sem} e.g., \code{tsdata}, \code{family} +etc.} +} +\value{ +An object (list) that includes: +\describe{ +\item{model}{the string with the selected SEM model} +\item{record}{a list showing the AIC and whether each \code{model_options} is included or not} +} +} +\description{ +Stepwise model selection for a fitted \code{dsem} model +} +\details{ +This function conducts stepwise (i.e., forwards and backwards) model +selection using marginal AIC, while forcing some model elements to be +included and selecting among others. 
+} diff --git a/scratch/README.R b/scratch/README.R new file mode 100644 index 0000000..6262421 --- /dev/null +++ b/scratch/README.R @@ -0,0 +1,5 @@ + +To use RTMB, change Makevars to be " +PKG_CPPFLAGS = -DTMBAD_FRAMEWORK -DTMBAD_INDEX_TYPE=uint64_t +" + diff --git a/scratch/debug_simulator.R b/scratch/debug_simulator.R new file mode 100644 index 0000000..0286269 --- /dev/null +++ b/scratch/debug_simulator.R @@ -0,0 +1,44 @@ + + +library(dsem) +library(RTMB) +library(Matrix) + +# Files +source( file.path(R'(C:\Users\James.Thorson\Desktop\Git\dsem\R)', "make_matrices.R") ) +source( file.path(R'(C:\Users\James.Thorson\Desktop\Git\dsem\R)', "compute_nll.R") ) +source( file.path(R'(C:\Users\James.Thorson\Desktop\Git\dsem\R)', "read_model.R") ) +source( file.path(R'(C:\Users\James.Thorson\Desktop\Git\dsem\R)', "dsemRTMB.R") ) + +# Define model +sem = " + # Link, lag, param_name + cprofits -> consumption, 0, a1 + cprofits -> consumption, 1, a2 + pwage -> consumption, 0, a3 + gwage -> consumption, 0, a3 + cprofits -> invest, 0, b1 + cprofits -> invest, 1, b2 + capital -> invest, 0, b3 + gnp -> pwage, 0, c2 + gnp -> pwage, 1, c3 + time -> pwage, 0, c1 +" + +# Load data +data(KleinI, package="AER") +TS = ts(data.frame(KleinI, "time"=time(KleinI) - 1931)) +tsdata = TS[,c("time","gnp","pwage","cprofits",'consumption', + "gwage","invest","capital")] + +# Fit model +fit = dsemRTMB( sem=sem, + tsdata = tsdata, + estimate_delta0 = TRUE, + #family = rep("normal",ncol(tsdata)), + control = dsem_control( quiet = FALSE, + #run_model = TRUE, + #use_REML = TRUE, + gmrf_parameterization = "separable" ) ) + +fit$obj$simulate()$y_tj diff --git a/scratch/example.R b/scratch/example.R new file mode 100644 index 0000000..70b3b49 --- /dev/null +++ b/scratch/example.R @@ -0,0 +1,24 @@ + + +setwd( R'(C:\Users\James.Thorson\Desktop\Git\dsem\scratch)' ) + +example = readRDS( "example.RDS" ) +attach(example) + +# Custom version +dGMRF = function(x, mu, Q ){ + #(2*pi)^(-n_k/2) * + 
#determinant(solve(Q))^(-1/2) * + #exp(-1/2 * (x-mu) %*% Q %*% (x-mu) ) + (-length(as.vector(x))/2) * log(2*pi) + + (-1/2) * -1 * determinant(Q, log=TRUE)$modulus + + (-1/2 * as.vector(x-mu) %*% Q %*% as.vector(x-mu) )[1,1] +} + +# +library(RTMB) +dgmrf( y, mu=mu, Q=Q, log=TRUE ) +dgmrf( as.vector(y), mu=as.vector(mu), Q=Q, log=TRUE ) +mvtnorm::dmvnorm( y, mu, sigma=as.matrix(solve(Q)), log=TRUE ) +mvtnorm::dmvnorm( as.vector(y), as.vector(mu), sigma=as.matrix(solve(Q)), log=TRUE ) +dGMRF( y, mu=mu, Q=Q ) diff --git a/scratch/example.RDS b/scratch/example.RDS new file mode 100644 index 0000000000000000000000000000000000000000..2dbd1c99a9756dbde44ce082498be426422393bb GIT binary patch literal 1118 zcmV-k1flyMiwFP!000002JP5+Y!p=-!11?7FFdFqSOt-*9CBDxEEI7Usl}r}!6In6 z#x0bx-4@%5pj?VYE@=^O1cjmqT1AR-(-uYXK&b_l#Nc1X7&YR5{v-JPb?0Z6)x?;% z(Mh(+r*CKH&Ac~n-kbUTc2XN-jK_GrF5^kyqe+;MpEF>HG4A$kxQ)wmxiMZhpIydl zQlu>A-$mGA!hf#TigFfe${CP|BqT!)l2$Ijh4!%*qb*W#3EDv#yc8YK5tktibiiDJ zPUws)(FIqbE4raOdY~uL(F?uN2Yqoh`k_AtU?8qR1_t3;T!-s17&jmj41^hqEDXbq z7>=9ZgC8SsGe#mCw;%_ja4T-Z?YINE7>zL)i#&|Oc-)D6Ou${3h)K8`_n-iiF$GgG z4bx%aUIb8x8JLM8%!1sn*$AQpA(WyFb8sKZQGrU_kGXgN^DrL|;vp=+LM*~!EWyKg z1WWNKs;~@?VL4V{C01cI9>*F~qXuj71lHk6JcXz644%bvcpmHV0ybbHUc^h-gw3eM z7QBpCuobUj8@A&$ypB4&fj6TkwhV?$&bLhXFFvv36WgH^TOQ8mO>R$iu?&w)QFnOv zpQ~k~?xs#@PAUC83briFZ&~b= zGU~ycG}8)HmX+)C+eGb72$WbA>~yC_e3?>MG;*|6SzbIZ`ebvgK+r0ZcWz|;M6GPU z8ve)+T<7f9%PAAPXhhGe=hpLU8`_SxrR`~(+OD>(?Q0*jAKDk~kM>FXrG3->X&<$p zB32QOo>wChZ9}6IZBHW=ZCj%j?T1D%+9!=>w0|1eXkRtT(SB>hqt~F(k6w>PLV9f) z73p=#=d}^ntkIHQzeY|v78*tAxM)PBW2DiQj+aJSI(8a$={RZx7P8A&-LDu$j{^vfgBk$-0uYBfn}2+zuW9f0y5Ykn`cAiGm*jra&UZ3nQTp=a z`uP6%{{O{({r= 3 doesn't seem to converge using DSEM or MARSS without penalties +n_factors = 2 + +# Add factors to data +tsdata = harborSealWA[,c("SJI","EBays","SJF","PSnd","HC")] +newcols = array( NA, + dim = c(nrow(tsdata),n_factors), + dimnames = list(NULL,paste0("F",seq_len(n_factors))) ) +tsdata = ts( cbind(tsdata, newcols), start=1978) + +# 
Scale and center (matches with MARSS does when fitting a DFA) +tsdata = scale( tsdata, center=TRUE, scale=TRUE ) + +# +sem = make_dfa( variables = c("SJI","EBays","SJF","PSnd","HC"), + n_factors = n_factors ) + +# Initial fit +mydsem0 = dsem( tsdata = tsdata, + sem = sem, + family = c( rep("normal",5), rep("fixed",n_factors) ), + estimate_delta0 = TRUE, + control = dsem_control( quiet = TRUE, + run_model = FALSE, + gmrf_parameterization = "projection" ) ) + +# fix all measurement errors at diagonal and equal +map = mydsem0$tmb_inputs$map +map$lnsigma_j = factor( rep(1,ncol(tsdata)) ) + +# Fix factors to have initial value, and variables to not +map$delta0_j = factor( c(rep(NA,ncol(harborSealWA)-1), 1:n_factors) ) + +# Fix variables to have no stationary mean except what's predicted by initial value +map$mu_j = factor( rep(NA,ncol(tsdata)) ) + +# profile "delta0_j" to match MARSS (which treats initial condition as unpenalized random effect) +mydfa = dsem( tsdata = tsdata, + sem = sem, + family = c( rep("normal",5), rep("fixed",n_factors) ), + estimate_delta0 = TRUE, + control = dsem_control( quiet = TRUE, + map = map, + use_REML = TRUE, + #profile = "delta0_j", + gmrf_parameterization = "projection" ) ) + +cAIC(mydfa) +cAIC(mydfa, what="EDF") + +############### +# Klein example +############### + +# Define model +sem = " + # Link, lag, param_name + cprofits -> consumption, 0, a1 + cprofits -> consumption, 1, a2 + pwage -> consumption, 0, a3 + gwage -> consumption, 0, a3 + cprofits -> invest, 0, b1 + cprofits -> invest, 1, b2 + capital -> invest, 0, b3 + gnp -> pwage, 0, c2 + gnp -> pwage, 1, c3 + time -> pwage, 0, c1 +" + +# Load data +data(KleinI, package="AER") +TS = ts(data.frame(KleinI, "time"=time(KleinI) - 1931)) +tsdata = TS[,c("time","gnp","pwage","cprofits",'consumption', + "gwage","invest","capital")] + +# +n_missing = 20 +df_missing = expand.grid( seq_len(nrow(tsdata)), seq_len(ncol(tsdata)) ) +df_missing = df_missing[ sample(seq_len(nrow(df_missing)), 
size=n_missing, replace=FALSE), ] +tsdata[ as.matrix(df_missing) ] = NA + +# Fit model +fit = dsem( sem=sem, + tsdata = tsdata, + estimate_delta0 = TRUE, + control = dsem_control(quiet=TRUE, + getsd = FALSE, + extra_convergence_checks = FALSE, + newton_loops = 0) ) + +cAIC(fit) +cAIC(fit, what="EDF") + +################### +# Linear model +################### + +# simulate normal distribution +x = rnorm(100) +y = 1 + 0.5 * x + rnorm(100) +data = data.frame(x=x, y=y) + +sem = " + x -> y, 0, beta +" + +# Fit as DSEM +fit = dsem( sem = sem, + tsdata = ts(data), + #family = c("fixed","normal"), + control = dsem_control(quiet=TRUE) ) # gmrf_parameterization = "projection", diff --git a/scratch/test_dsemRTMB.R b/scratch/test_dsemRTMB.R new file mode 100644 index 0000000..0174778 --- /dev/null +++ b/scratch/test_dsemRTMB.R @@ -0,0 +1,225 @@ + +#devtools::install_local( R'(C:\Users\James.Thorson\Desktop\Git\dsem)', force=TRUE, dep=FALSE ) +#devtools::install_github( "kaskr/RTMB/RTMB", force=TRUE, dep=FALSE ) + +library(dsem) +#library(RTMB) +#library(Matrix) + +# Define model +sem = " + # Link, lag, param_name + cprofits -> consumption, 0, a1 + cprofits -> consumption, 1, a2 + pwage -> consumption, 0, a3 + gwage -> consumption, 0, a3 + cprofits -> invest, 0, b1 + cprofits -> invest, 1, b2 + capital -> invest, 0, b3 + gnp -> pwage, 0, c2 + gnp -> pwage, 1, c3 + time -> pwage, 0, c1 +" + +# Load data +data(KleinI, package="AER") +TS = ts(data.frame(KleinI, "time"=time(KleinI) - 1931)) +tsdata = TS[,c("time","gnp","pwage","cprofits",'consumption', + "gwage","invest","capital")] + +# Fit model +fit0 = dsem( sem=sem, + tsdata = tsdata, + estimate_delta0 = TRUE, + family = rep("normal",ncol(tsdata)), + control = dsem_control( quiet=TRUE, + run_model = FALSE, + use_REML = TRUE, + gmrf_parameterization = "projection" ) ) +#ParHat = fit$internal$parhat +#Rep = fit$obj$report() + +# +Map = fit0$tmb_inputs$map +Map$lnsigma_j = factor( rep(NA,ncol(tsdata)) ) +Params = 
fit0$tmb_inputs$parameters +Params$lnsigma_j[] = log(0.1) + +# +prior_negloglike = \(obj) -dnorm(obj$par[1],0,0.1,log=TRUE) + +# Fit model +fit = dsem( sem=sem, + tsdata = tsdata, + estimate_delta0 = TRUE, + family = rep("normal",ncol(tsdata)), + prior_negloglike = prior_negloglike, + control = dsem_control( quiet=TRUE, + run_model = TRUE, + use_REML = TRUE, + gmrf_parameterization = "projection", + constant_variance = c("conditional", "marginal", "diagonal")[2], # marginal not working + map = Map, + parameters = Params ) ) + +# RUN dsemRTMB line-by-line +if( FALSE ){ + control = dsem_control() + covs = colnames(tsdata) +} + +# +fit$obj$simulate()$y_tj + +################### +# dsemRTMB +################### + +#devtools::install_local( R'(C:\Users\James.Thorson\Desktop\Git\dsem)', force=TRUE, dep=FALSE ) +#devtools::install_github( "James-Thorson-NOAA/dsem@dev", force=TRUE, dep=FALSE ) + +library(dsem) + +#devtools::install_github( "kaskr/RTMB/RTMB", force=TRUE, dep=FALSE ) + +# Files +if( FALSE ){ + source( file.path(R'(C:\Users\James.Thorson\Desktop\Git\dsem\R)', "make_matrices.R") ) + source( file.path(R'(C:\Users\James.Thorson\Desktop\Git\dsem\R)', "compute_nll.R") ) + source( file.path(R'(C:\Users\James.Thorson\Desktop\Git\dsem\R)', "read_model.R") ) + source( file.path(R'(C:\Users\James.Thorson\Desktop\Git\dsem\R)', "dsemRTMB.R") ) + source( file.path(R'(C:\Users\James.Thorson\Desktop\Git\dsem\R)', "rgmrf.R") ) +} + +# Define model +sem = " + # Link, lag, param_name + cprofits -> consumption, 0, a1 + cprofits -> consumption, 1, a2 + pwage -> consumption, 0, a3 + gwage -> consumption, 0, a3 + cprofits -> invest, 0, b1 + cprofits -> invest, 1, b2 + capital -> invest, 0, b3 + gnp -> pwage, 0, c2 + gnp -> pwage, 1, c3 + time -> pwage, 0, c1 +" + +# Load data +data(KleinI, package="AER") +TS = ts(data.frame(KleinI, "time"=time(KleinI) - 1931)) +tsdata = TS[,c("time","gnp","pwage","cprofits",'consumption', + "gwage","invest","capital")] + +# Fit model +fit0 = 
dsemRTMB( sem=sem, + tsdata = tsdata, + estimate_delta0 = TRUE, + family = rep("normal",ncol(tsdata)), + control = dsem_control( quiet=TRUE, + run_model = FALSE, + use_REML = TRUE, + gmrf_parameterization = "projection" ) ) +#ParHat = fit$internal$parhat +#Rep = fit$obj$report() + +# +Map = fit0$tmb_inputs$map +Map$lnsigma_j = factor( rep(NA,ncol(tsdata)) ) +Params = fit0$tmb_inputs$parameters +Params$lnsigma_j[] = log(0.1) + +# Define prior +log_prior = function(p) dnorm( p$beta_z[1], mean=0, sd=0.1, log=TRUE) + +fitRTMB = dsemRTMB( sem = sem, + tsdata = tsdata, + estimate_delta0 = TRUE, + family = rep("normal",ncol(tsdata)), + log_prior = log_prior, + control = dsem_control( quiet = TRUE, + run_model = TRUE, + use_REML = TRUE, + gmrf_parameterization = "projection", + constant_variance = c("conditional", "marginal", "diagonal")[2], # marginal not working + map = Map, + parameters = Params ) ) +#Rep = fitRTMB$obj$report() +range(fit$opt$par - fitRTMB$opt$par) +#fitRTMB$obj$report() +fitRTMB$obj$report() + +fitRTMB$simulator(simulate_gmrf=FALSE) + +# Check that it works in a function (no scoping/environment problems) +do_it = \(x) x$simulator(simulate_gmrf=FALSE) +do_it( fitRTMB ) + +# +fitRTMB$obj$simulate() + +if( FALSE ){ + Rep = fitRTMB$obj$report() + dgmrf( as.vector(Rep$z_tj), mu=as.vector(Rep$xhat_tj + Rep$delta_tj), Q=Rep$Q0_kk, log=TRUE ) + + example = list( + Gamma_kk = Rep$Gamma_kk, + Rho_kk = Rep$Rho_kk + ) + saveRDS( example, file=file.path(R'(C:\Users\James.Thorson\Desktop\Git\dsem\scratch)',"example.RDS")) + # Use in example2.R +} + +# +obj = fitRTMB$obj +type = tapply( summary(fit)[,'direction'], INDEX=as.numeric(summary(fit)[,'parameter']), FUN=max ) +lower = rep( -Inf, length(obj$par) ) +lower[names(obj$par)=="beta_z"] = ifelse( type==2, 0, -Inf ) +upper = rep( Inf, length(obj$par) ) + +# Check bounds on priors +cbind( obj$par, lower, upper ) + +# Run MCMC +library(tmbstan) +mcmc = tmbstan( obj, + lower = lower, + upper = upper, + init = 
'last.par.best', + laplace = FALSE ) +traceplot(mcmc, pars=unique(names(obj$par)), inc_warmup=TRUE) + + +if( FALSE ){ + obj = fitRTMB$obj + obj$fn(obj$par) + obj$gr(obj$par) + + # + #obj$fn( fit$opt$par ) + rep0 = fitRTMB$obj$report( fit$obj$env$last.par.best ) + rep1 = fit$obj$report( fit$obj$env$last.par.best ) + + # + fitRTMB$obj$gr( fit$opt$par ) + + range(fit$opt$par - fitRTMB$opt$par) +} + +# +if( FALSE ){ + Rep = fitRTMB$obj$report() + dgmrf( as.vector(Rep$z_tj), mu=as.vector(Rep$xhat_tj + Rep$delta_tj), Q=Rep$Q_kk, log=TRUE ) + solve(Rep$V_kk, Rep$IminusRho_kk) + + fit$tmb_inputs$parameters$x_tj + fitRTMB$tmb_inputs$parameters$x_tj + fit$obj$report()$jnll_gmrf + fitRTMB$obj$report()$jnll_gmrf + + # + sum(dnorm( fit$tmb_inputs$parameters$x_tj, log=TRUE )) +} + +# diff --git a/src/dsem.cpp b/src/dsem.cpp index 3b33ba4..58d0699 100644 --- a/src/dsem.cpp +++ b/src/dsem.cpp @@ -181,7 +181,7 @@ Type objective_function::operator() () REPORT( Q_kk ); }else{ // Rank-deficient (projection) method - jnll_gmrf += GMRF(I_kk)( x_tj ); + jnll_gmrf = GMRF(I_kk)( x_tj ); // Forward-format matrix matrix z_k1( n_t*n_j, int(1) ); diff --git a/tests/testthat/test-platform.R b/tests/testthat/test-platform.R index 3987d6c..923193e 100644 --- a/tests/testthat/test-platform.R +++ b/tests/testthat/test-platform.R @@ -1,53 +1,41 @@ test_that("dsem example is working ", { #skip_on_ci() + data(KleinI, package="AER") + TS = ts(data.frame(KleinI, "time"=time(KleinI) - 1931)) + + # Specify by declaring each arrow and lag sem = " - Profits -> Consumption, 0, a2 - Profits -> Consumption, -1, a3 - Priv_wage -> Consumption, 0, a4 - Gov_wage -> Consumption, 0, a4 - Consumption <-> Consumption, 0, v1 - Consumption -> Consumption, -1, ar1 - Consumption -> Consumption, -2, ar2 - Profits -> Investment, 0, b2 - Profits -> Investment, -1, b3 - Capital_stock -> Investment, -1, b4 - Investment <-> Investment, 0, v2 - neg_Gov_wage <-> neg_Gov_wage, 0, v3 - GNP -> Priv_wage, 0, c2 - Taxes -> Priv_wage, 
0, c2 - neg_Gov_wage -> Priv_wage, 0, c2 - GNP -> Priv_wage, -1, c3 - Taxes -> Priv_wage, -1, c3 - neg_Gov_wage -> Priv_wage, -1, c3 - Time -> Priv_wage, 0, c4 - Priv_wage <-> Priv_wage, 0, v4 - GNP <-> GNP, 0, v5 - Profits <-> Profits, 0, v6 - Capital_stock <-> Capital_stock, 0, v7 - Taxes <-> Taxes, 0, v8 - Time <-> Time, 0, v9 - Gov_wage <-> Gov_wage, 0, v10 - Gov_expense <-> Gov_expense, 0, v11 - " + # Link, lag, param_name + cprofits -> consumption, 0, a1 + cprofits -> consumption, 1, a2 + pwage -> consumption, 0, a3 + gwage -> consumption, 0, a3 - # Load data - data(KleinI, package="AER") - Data = as.data.frame(KleinI) - Data = cbind( Data, "time" = seq(1,22)-11 ) - colnames(Data) = sapply( colnames(Data), FUN=switch, - "consumption"="Consumption", "invest"="Investment", - "cprofits"="Profits", "capital"="Capital_stock", "gwage"="Gov_wage", - "pwage"="Priv_wage", "gexpenditure"="Gov_expense", "taxes"="Taxes", - "time"="Time", "gnp"="GNP") - Z = ts( cbind(Data, "neg_Gov_wage"=-1*Data[,'Gov_wage']) ) + cprofits -> invest, 0, b1 + cprofits -> invest, 1, b2 + capital -> invest, 0, b3 + + gnp -> pwage, 0, c2 + gnp -> pwage, 1, c3 + time -> pwage, 0, c1 + " + tsdata = TS[,c("time","gnp","pwage","cprofits",'consumption', + "gwage","invest","capital")] # Fit model fit = dsem( sem=sem, - tsdata=Z, + tsdata=tsdata, control = dsem_control(getJointPrecision=TRUE) ) # Check objective function - expect_equal( as.numeric(fit$opt$obj), 587.4755, tolerance=1e-2 ) + expect_equal( as.numeric(fit$opt$obj), 431.6122, tolerance=1e-2 ) + + # + fitRTMB = dsemRTMB( sem=sem, + tsdata=tsdata, + control = dsem_control(getsd=FALSE) ) + # Check objective function + expect_equal( as.numeric(fit$opt$obj), as.numeric(fitRTMB$opt$obj), tolerance=1e-2 ) # Convert and plot using phylopath as_fitted_DAG(fit) @@ -62,19 +50,35 @@ test_that("dsem example is working ", { as_sem(fit) predict(fit, type="link") predict(fit, type="response") - predict(fit, type="link", newdata=Z) + predict(fit, 
type="link", newdata=tsdata) simulate(fit, variance = "none") simulate(fit, variance = "random") simulate(fit, variance = "both") simulate(fit, resimulate_gmrf=TRUE) - # Refit with measurement errors + # Refit with measurement errors .. ignore quality of fit, just checking that it runs fit1 = dsem( sem=sem, - tsdata=Z, - family = c("normal","gamma",rep("fixed",ncol(Z)-2)), - control = dsem_control(getsd=FALSE, newton_loops=0) ) + tsdata = tsdata, + family = c("normal","gamma",rep("fixed",ncol(tsdata)-2)), + control = dsem_control(getsd=FALSE, + extra_convergence_checks = FALSE, + newton_loops=0) ) residuals(fit1, type="deviance") residuals(fit1, type="response") + + # Test equations including lags, starting values, and mirrored params + equations = " + consumption = a1(0.1)*cprofits + a2(0.1)*lag[cprofits,1]+ a3(0.1)*pwage + a3(0.2)*gwage + invest = b1*cprofits + b2*lag[cprofits,1] + b3*capital + pwage = c1*time + c2*gnp + c3*lag[gnp,1] + " + sem_equations = convert_equations(equations) + # Fit model + fit_equations = dsem( sem=sem_equations, + tsdata=tsdata, + control = dsem_control(getsd=FALSE) ) + # Check objective function + expect_equal( as.numeric(fit$opt$obj), as.numeric(fit_equations$opt$obj), tolerance=1e-2 ) }) test_that("dsem adds variances ", { @@ -97,7 +101,7 @@ test_that("dsem adds variances ", { test_that("bering sea example is stable ", { data(bering_sea) - + # Z = ts( bering_sea ) family = rep( "fixed", ncol(bering_sea) ) @@ -112,7 +116,7 @@ test_that("bering sea example is stable ", { log_Esummer -> log_PercentEuph, 0, Esummer_to_Suph log_Cfall -> log_PercentCop, 0, Cfall_to_Scop SSB -> log_RperS, 0, SSB_to_RperS - + log_seaice -> log_seaice, 1, AR1, 0.001 log_CP -> log_CP, 1, AR2, 0.001 log_Cfall -> log_Cfall, 1, AR4, 0.001 @@ -122,14 +126,42 @@ test_that("bering sea example is stable ", { log_PercentEuph -> log_PercentEuph, 1, AR8, 0.001 log_PercentCop -> log_PercentCop, 1, AR9, 0.001 " - + # Run model fit = dsem( sem=sem, tsdata=Z, 
family=family, control = dsem_control(use_REML=FALSE) ) - + # Check objective function expect_equal( as.numeric(fit$opt$obj), 189.3005, tolerance=1e-2 ) }) +test_that("Fixing parameters works ", { + sem = " + A -> B, 0, NA, 1 + B -> C, 0, NA, 0.5 + # Extra links + A -> D, 1, beta + B -> D, 1, beta + " + set.seed(101) + nobs = 40 + A = rnorm(nobs) + B = A + rnorm(nobs) + C = 0.5 * B + rnorm(nobs) + D = rnorm(nobs) + tsdata = ts(cbind(A=A, B=B, C=C, D=D)) + + # Run models + fit = dsem( sem=sem, + tsdata=tsdata, + control = dsem_control(getsd=FALSE) ) + fitRTMB = dsemRTMB( sem=sem, + tsdata=tsdata, + control = dsem_control(getsd=FALSE) ) + # Check objective function + expect_equal( as.numeric(fit$opt$obj), 224.2993, tolerance=1e-2 ) + expect_equal( as.numeric(fit$opt$obj), as.numeric(fitRTMB$opt$obj), tolerance=1e-2 ) + +}) \ No newline at end of file diff --git a/tests/testthat/test-priors.R b/tests/testthat/test-priors.R new file mode 100644 index 0000000..66c4ba0 --- /dev/null +++ b/tests/testthat/test-priors.R @@ -0,0 +1,65 @@ + +test_that("priors interface is working ", { + #skip_on_cran() + #skip_on_ci() + + # Requires loading RTMB + library(RTMB) + + # Load data + data(bering_sea) + + # + Z = ts( bering_sea ) + family = rep( "fixed", ncol(bering_sea) ) + + # Specify model + sem = " + log_seaice -> log_CP, 0, seaice_to_CP + log_CP -> log_Cfall, 0, CP_to_Cfall + log_CP -> log_Esummer, 0, CP_to_E + log_PercentEuph -> log_RperS, 0, Seuph_to_RperS + log_PercentCop -> log_RperS, 0, Scop_to_RperS + log_Esummer -> log_PercentEuph, 0, Esummer_to_Suph + log_Cfall -> log_PercentCop, 0, Cfall_to_Scop + SSB -> log_RperS, 0, SSB_to_RperS + + log_seaice -> log_seaice, 1, AR1, 0.001 + log_CP -> log_CP, 1, AR2, 0.001 + log_Cfall -> log_Cfall, 1, AR4, 0.001 + log_Esummer -> log_Esummer, 1, AR5, 0.001 + SSB -> SSB, 1, AR6, 0.001 + log_RperS -> log_RperS, 1, AR7, 0.001 + log_PercentEuph -> log_PercentEuph, 1, AR8, 0.001 + log_PercentCop -> log_PercentCop, 1, AR9, 0.001 + " + + # 
Using fitRTMB + log_prior = function(p){ + "c" <- ADoverload("c") + "[<-" <- ADoverload("[<-") + sum(dnorm( p$beta_z[9:16], mean=0, sd=0.25, log=TRUE)) + } + fitRTMB = dsemRTMB( sem = sem, + tsdata = Z, + family = family, + log_prior = log_prior, + control = dsem_control( use_REML = FALSE) ) + + # Run model using dsem ... not working in testthat mode + neglog_prior = function(obj){ + "c" <- ADoverload("c") + "[<-" <- ADoverload("[<-") + -1 * sum(dnorm( obj$par[9:16], mean=0, sd=0.25, log=TRUE)) + } + fit = dsem( sem=sem, + tsdata=Z, + family=family, + prior_negloglike = neglog_prior, + control = dsem_control(use_REML=FALSE) ) + + # Check objective function + expect_equal( as.numeric(fitRTMB$opt$obj), 198.1363, tolerance=1e-2 ) + expect_equal( as.numeric(fit$opt$obj), as.numeric(fitRTMB$opt$obj), tolerance=1e-2 ) +}) + diff --git a/vignettes/dynamic_factor_analysis.Rmd b/vignettes/dynamic_factor_analysis.Rmd index d9402f5..51c4897 100644 --- a/vignettes/dynamic_factor_analysis.Rmd +++ b/vignettes/dynamic_factor_analysis.Rmd @@ -153,9 +153,34 @@ tsdata = ts( cbind(tsdata, newcols), start=1978) # Scale and center (matches with MARSS does when fitting a DFA) tsdata = scale( tsdata, center=TRUE, scale=TRUE ) -# -sem = make_dfa( variables = c("SJI","EBays","SJF","PSnd","HC"), - n_factors = n_factors ) +# Automated version +#sem = make_dfa( variables = c("SJI","EBays","SJF","PSnd","HC"), +# n_factors = n_factors ) +# Manual specification to show structure, using equations-and-lags interface +equations = " + # Loadings of variables onto factors + SJI = L11(0.1) * F1 + EBays = L12(0.1) * F1 + L22(0.1) * F2 + SJF = L13(0.1) * F1 + L23(0.1) * F2 + PSnd = L14(0.1) * F1 + L24(0.1) * F2 + HC = L15(0.1) * F1 + L25(0.1) * F2 + + # random walk for factors + F1 = NA(1) * lag[F1,1] + F2 = NA(1) * lag[F2,1] + + # Unit variance for factors + V(F1) = NA(1) + V(F2) = NA(1) + + # Zero residual variance for variables + V(SJI) = NA(0) + V(EBays) = NA(0) + V(SJF) = NA(0) + V(PSnd) = NA(0) + 
V(HC) = NA(0) +" +sem = convert_equations(equations) # Initial fit mydsem0 = dsem( tsdata = tsdata, diff --git a/vignettes/vignette.Rmd b/vignettes/vignette.Rmd index 7b41557..67ada8f 100644 --- a/vignettes/vignette.Rmd +++ b/vignettes/vignette.Rmd @@ -14,6 +14,7 @@ knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) +set.seed(101) # Install locally # devtools::install_local( R'(C:\Users\James.Thorson\Desktop\Git\dsem)', force=TRUE ) # Build @@ -31,20 +32,73 @@ library(phylopath) `dsem` is an R package for fitting dynamic structural equation models (DSEMs) with a simple user-interface and generic specification of simultaneous and lagged effects in a non-recursive structure. We here highlight a few features in particular. +## Comparison with linear models + +We first show that `dsem` is identical to a linear model. To do so, we simulate data with a single response and single predictor: + +```{r, echo=TRUE, message=FALSE, fig.width=7, fig.height=7} +# simulate normal distribution +x = rnorm(100) +y = 1 + 0.5 * x + rnorm(100) +data = data.frame(x=x, y=y) + +# Fit as linear model +Lm = lm( y ~ x, data=data ) + +# Fit as DSEM +fit = dsem( sem = "x -> y, 0, beta", + tsdata = ts(data), + control = dsem_control(quiet=TRUE) ) + +# Display output +m1 = rbind( + "lm" = summary(Lm)$coef[2,1:2], + "dsem" = summary(fit)[1,9:10] +) +knitr::kable( m1, digits=3) +``` + +This shows that linear and dynamic structural equation models give identical estimates of the single path coefficient. 
+ +We can also calculate leave-one-out residuals and display them using DHARMa: + +```{r, echo=TRUE, message=FALSE, fig.width=7, fig.height=7} +# sample-based quantile residuals +samples = loo_residuals(fit, what="samples", track_progress=FALSE) +which_use = which(!is.na(data)) +fitResp = loo_residuals( fit, what="loo", track_progress=FALSE)[,'est'] +simResp = apply(samples, MARGIN=3, FUN=as.vector)[which_use,] + +# Build and display DHARMa object +res = DHARMa::createDHARMa( + simulatedResponse = simResp, + observedResponse = unlist(data)[which_use], + fittedPredictedResponse = fitResp ) +plot(res) +``` + +This shows no pattern in the quantile-quantile plot, as expected given that we have a correctly specified model. We can also confirm that this gives identical to results to the linear model: + +```{r, echo=TRUE, message=FALSE, fig.width=7, fig.height=7} +# Get DSEM Loo residuals and LM working residuals +res = loo_residuals(fit, what="quantiles", track_progress=FALSE) +res0 = resid(Lm,"working") + +# Plot comparison +plot( x=res0, y=qnorm(res[,2]), + xlab="linear model residuals", ylab="dsem residuals" ) +abline(a=0,b=1) +``` + ## Comparison with dynamic linear models -We first demonstrate that `dsem` gives identical results to `dynlm` for a well-known econometric model, the Klein-1 model. +We next demonstrate that `dsem` using a well-known econometric model, the Klein-1 model. 
```{r, echo=TRUE, message=FALSE, fig.width=7, fig.height=7} data(KleinI, package="AER") TS = ts(data.frame(KleinI, "time"=time(KleinI) - 1931)) -# dynlm -fm_cons <- dynlm(consumption ~ cprofits + L(cprofits) + I(pwage + gwage), data = TS) -fm_inv <- dynlm(invest ~ cprofits + L(cprofits) + capital, data = TS) # -fm_pwage <- dynlm(pwage ~ gnp + L(gnp) + time, data = TS) - -# dsem +# Specify by declaring each arrow and lag sem = " # Link, lag, param_name cprofits -> consumption, 0, a1 @@ -68,8 +122,37 @@ fit = dsem( sem=sem, control = dsem_control( quiet = TRUE, newton_loops = 0) ) +``` + +This model could instead be specified using equation-and-lag notation, which makes the model structure more clear: + +```{r, echo=TRUE, message=FALSE, fig.width=7, fig.height=7} +# Specify using equations +equations = " + consumption = a1*cprofits + a2*lag[cprofits,1]+ a3*pwage + a3*gwage + invest = b1*cprofits + b2*lag[cprofits,1] + b3*capital + pwage = c1*time + c2*gnp + c3*lag[gnp,1] +" -# Compile +# Convert and run +sem_equations = convert_equations(equations) +fit = dsem( sem = sem_equations, + tsdata = tsdata, + estimate_delta0 = TRUE, + control = dsem_control( + quiet = TRUE, + newton_loops = 0) ) +``` + +We first demonstrate that `dsem` gives identical results to `dynlm`: + +```{r, echo=TRUE, message=FALSE, fig.width=7, fig.height=7} +# dynlm +fm_cons <- dynlm(consumption ~ cprofits + L(cprofits) + I(pwage + gwage), data = TS) +fm_inv <- dynlm(invest ~ cprofits + L(cprofits) + capital, data = TS) +fm_pwage <- dynlm(pwage ~ gnp + L(gnp) + time, data = TS) + +# Compile results m1 = rbind( summary(fm_cons)$coef[-1,], summary(fm_inv)$coef[-1,], summary(fm_pwage)$coef[-1,] )[,1:2] @@ -80,8 +163,7 @@ m = rbind( ) m = cbind(m, "lower"=m$Estimate-m$Std..Error, "upper"=m$Estimate+m$Std..Error ) -# ggplot estimates - +# ggplot display of estimates longform = melt( as.data.frame(KleinI) ) longform$year = rep( time(KleinI), 9 ) p1 = ggplot( data=longform, aes(x=year, y=value) ) +