Maintain feature parity with model-res-avm (#19)

* Update renv * Update ingest through data pulls to match res pipeline * Add char_data to file_dict.csv * Pausing work on ingest. Section 4 of ingest is next up. * Linting * Correct assessment data year * Remove time interval from assessment data * Ingest running * Preliminary train changes * Preliminary evaluate updates * Preliminary interpret changes * Preliminary finalize changes * Preliminary upload script changes * Initial export changes * Initial assess stage changes * Assess stage changes * More ingest changes * Additional train updates * Additional finalize stage updates * Use noctua in export stage * Use noctua in api stage * Linting * Re-add removed summary stat hack in evaluate * Revert some line removals in interpret * Minor formatting changes * Linting * Updates to description and params * Remove time_split from ingest * Running through train * Linting * Linting * Pipeline running through export stage * Shift to res avm run note style * Remove step_nzv from the linear model recipe * Ingest running with tunable lin model, haven't tried with cv * Update renv profiles * Docker stuff * Remove python-related files * Remove .renvignore from home dir * Remove renv settings file * Add terraform config files * Add github workflows * Update pre-commit config * Update dvc deps * Update snapshot type in settings * Linting * Linting * Linting * Linting * Evaluate running again but depending on hack * Update lien date * Remove superfluous git workflow scripts * Add new secrets syntax and run_type choices * Remove additional redundant code
ccao-data · Jan 30, 2024 · 54870fa · 54870fa
1 parent 9acbc33
commit 54870fa
Show file tree

Hide file tree

Showing 39 changed files with 4,059 additions and 4,181 deletions.
diff --git a/.dockerignore b/.dockerignore
@@ -0,0 +1,8 @@
+README.Rmd
+README.md
+renv/profile
+docs/
+input/
+renv/library/
+renv/sandbox/
+renv/staging/
diff --git a/.github/workflows/build-and-run-model.yaml b/.github/workflows/build-and-run-model.yaml
@@ -13,6 +13,48 @@ on:
   pull_request:
     types: [opened, reopened, synchronize, closed]
   workflow_dispatch:
+    inputs:
+      run_type:
+        type: choice
+        description: Run type or purpose
+        options:
+        - junk
+        - rejected
+        - test
+        - baseline
+        - candidate
+        - final
+        default: test
+        required: true
+      run_note:
+        type: string
+        description: Note to include with run
+        required: true
+      report_additional_pins:
+        type: string
+        description: Create reports for additional PINs
+        default: '13253180150000 05174150240000'
+        required: false
+      upload_enable:
+        type: boolean
+        description: Upload results to S3
+        default: true
+        required: true
+      cv_enable:
+        type: boolean
+        description: Run cross-validation
+        default: false
+        required: true
+      comp_enable:
+        type: boolean
+        description: Run comparables finding
+        default: false
+        required: true
+      shap_enable:
+        type: boolean
+        description: Calculate SHAP values
+        default: false
+        required: true
   push:
     branches: [master]
 
@@ -29,11 +71,28 @@ jobs:
       packages: write
     uses: ccao-data/actions/.github/workflows/build-and-run-batch-job.yaml@main
     with:
-      vcpu: "16.0"
-      memory: "65536"
-      role-duration-seconds: 14400  # Worst-case time for a full model run
+      backend: "ec2"
+      vcpu: "40"
+      memory: "158000"
+      # Maximum pipeline runtime. This is slightly below 6 hours, which
+      # is the maximum length of any single GitHub Actions job
+      role-duration-seconds: 21000
+      # Disable Batch job status polling since this workflow often takes
+      # more than 6 hours
+      poll_for_status: false
+      # Set these env vars in the container
+      container_env_vars: |
+        WORKFLOW_RUN_TYPE=${{ inputs.run_type }}
+        WORKFLOW_RUN_NOTE=${{ inputs.run_note }}
+        UPLOAD_ENABLE_OVERRIDE=${{ inputs.upload_enable }}
+        CV_ENABLE_OVERRIDE=${{ inputs.cv_enable }}
+        COMP_ENABLE_OVERRIDE=${{ inputs.comp_enable }}
+        SHAP_ENABLE_OVERRIDE=${{ inputs.shap_enable }}
+        REPORT_ADDITIONAL_PINS=${{ inputs.report_additional_pins }}
     secrets:
       AWS_IAM_ROLE_TO_ASSUME_ARN: ${{ secrets.AWS_IAM_ROLE_TO_ASSUME_ARN }}
       AWS_ACCOUNT_ID: ${{ secrets.AWS_ACCOUNT_ID }}
+      # Set these env vars as secrets so they get masked in the GitHub
+      # Actions logs
       CONTAINER_ENV_VARS: |
         AWS_SNS_ARN_MODEL_STATUS=${{ secrets.AWS_SNS_ARN_MODEL_STATUS }}
diff --git a/.gitignore b/.gitignore
@@ -4,10 +4,11 @@
 
 # R project files
 .Rproj.user/
+reports/*_files/
 
 # knitr and R markdown default cache directories
-/*_cache/
-/cache/
+*_cache/
+cache/
 
 # Temporary files created by R markdown
 *.utf8.md
@@ -21,7 +22,7 @@
 *.xlsx
 *.xlsm
 *.html
+*.rmarkdown
 
-# Ignore python environment
-pipenv/
-pipenv
+# Ignore scratch documents
+scratch*.*
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -2,17 +2,18 @@
 # R specific hooks: https://github.com/lorenzwalthert/precommit
 repos:
 -   repo: https://github.com/lorenzwalthert/precommit
-    rev: v0.3.2.9013
+    rev: v0.3.2.9027
     hooks:
     -   id: style-files
         args: [--style_pkg=styler, --style_fun=tidyverse_style]
     -   id: lintr
     -   id: readme-rmd-rendered
+        exclude: reports/README.md
     -   id: parsable-R
     -   id: no-browser-statement
     -   id: no-debug-statement
 -   repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.4.0
+    rev: v4.5.0
     hooks:
     -   id: check-added-large-files
         args: ['--maxkb=200']
@@ -24,4 +25,4 @@ repos:
         name: Don't commit common R artifacts
         entry: Cannot commit .Rhistory, .RData, .Rds or .rds.
         language: fail
-        files: '\.(Rhistory|RData|Rds|rds)$'
+        files: '\.(Rhistory|RData|Rds|rds)$'
diff --git a/.renvignore b/.renvignore
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,83 @@
 Type: Project
-Description: Condo valuation model of the Cook County Assessor's Office
+Description: Condominium valuation model of the Cook County Assessor's Office
+Depends:
+    arrow,
+    assessr,
+    aws.s3,
+    aws.ec2metadata,
+    butcher,
+    ccao,
+    conflicted,
+    dplyr,
+    embed,
+    furrr,
+    git2r,
+    glue,
+    hardhat,
+    here,
+    knitr,
+    parsnip,
+    purrr,
+    lightgbm,
+    lightsnip,
+    lubridate,
+    paws.analytics,
+    paws.application.integration,
+    recipes,
+    rlang,
+    rsample,
+    stringr,
+    tictoc,
+    tidyr,
+    tune,
+    workflows,
+    yaml,
+    yardstick
+Remotes:
+    ccao-data/assessr,
+    ccao-data/ccao,
+    ccao-data/lightsnip
+Config/renv/profiles/dev/dependencies:
+    commonmark,
+    DBI,
+    igraph,
+    markdown,
+    noctua,
+    openxlsx,
+    readr,
+    rmarkdown
 Config/renv/profiles/reporting/dependencies:
-    quarto,
+    backports,
+    brio,
+    broom,
+    DBI,
+    decor,
+    desc,
+    DT,
+    digest,
+    fastmap,
+    ggplot2,
+    ggrepel,
+    gmodels,
+    grid,
+    gridExtra,
+    gtools,
+    htmltools,
+    htmlwidgets,
+    kableExtra,
     leaflet,
+    magrittr,
+    markdown,
     plotly,
-    sf
+    quarto,
+    repr,
+    scales,
+    shapviz,
+    sf,
+    skimr,
+    stringr,
+    survey,
+    svglite,
+    tableone,
+    tools,
+    xgboost
diff --git a/Dockerfile b/Dockerfile
@@ -1,4 +1,4 @@
-FROM rocker/r-ver:4.3.1
+FROM rocker/r-ver:4.3.2
 
 # Set the working directory to setup. Uses a dedicated directory instead of
 # root since otherwise renv will try to scan every subdirectory
@@ -37,13 +37,13 @@ COPY renv/ renv/
 RUN Rscript -e 'renv::restore(packages = "renv"); renv::restore()'
 RUN Rscript -e 'renv::restore(lockfile = "reporting-renv.lock")'
 
-# Set the working directory to the app dir
+# Set the working directory to the model directory
 WORKDIR /model-condo-avm/
 
 # Copy the directory into the container
 COPY ./ .
 
-# Copy R dependencies into the app directory
+# Copy R dependencies into the model directory
 RUN rm -Rf /model-condo-avm/renv && \
     mv /setup/renv /model-condo-avm/renv
 

diff --git a/Pipfile b/Pipfile