Cohort covariates (#167)

* Adding covariate builders based on other cohorts * Fixing build * Regenerating jar using maven * Implementing runTestsOnEunomia switch for new tests * Some fixes to cohort-based covariates * Adding a vignette for cohort-based covariates. * Update CreatingCovariatesBasedOnOtherCohorts.Rmd * Throw warning if using pre-spec analysis ID for cohort-based covariate. * Adding count type temporal covariates based on other cohorts * Allowing cohortId to be vector in cohort-based covariate builder * Update vignette * Expand cohort covariates unit tests (#204) --------- Co-authored-by: Schuemie <MSCHUEMI@its.jnj.com> Co-authored-by: Admin_mschuemi <Admin_mschuemi@its.jnj.com> Co-authored-by: Anthony Sena <asena5@its.jnj.com> Co-authored-by: Ger Inberg <ginberg@gmail.com> Co-authored-by: Anthony Sena <anthonysena@users.noreply.github.com>
OHDSI · Jun 28, 2023 · cf4a169 · cf4a169
1 parent 1e46b67
commit cf4a169
Show file tree

Hide file tree

Showing 24 changed files with 2,099 additions and 12 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -1,7 +1,6 @@
 pom.xml
 extras
 docs
-bin
 man-roxygen
 ^.*\.Rproj$
 ^\.Rproj\.user$

diff --git a/.github/workflows/R_CMD_check_Hades.yaml b/.github/workflows/R_CMD_check_Hades.yaml
@@ -17,13 +17,13 @@ jobs:
     name: ${{ matrix.config.os }} (${{ matrix.config.r }})
 
     strategy:
+      max-parallel: 1
       fail-fast: false
       matrix:
         config:
           - {os: windows-latest, r: 'release'}  # Does not appear to have Java 32-bit, hence the --no-multiarch
           - {os: macOS-latest, r: 'release'}
           - {os: ubuntu-20.04, r: 'release', rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest"}
-          #- {os: ubuntu-20.04, r: 'devel', rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest"}
 
     env:
       R_REMOTES_NO_ERRORS_FROM_WARNINGS: true

diff --git a/DESCRIPTION b/DESCRIPTION
@@ -29,7 +29,8 @@ Imports:
   readr,
   rlang,
   RSQLite,
-  DBI
+  DBI,
+  checkmate
 Suggests:
   testthat,
   knitr,

diff --git a/NAMESPACE b/NAMESPACE
@@ -6,6 +6,8 @@ export(computeStandardizedDifference)
 export(convertPrespecSettingsToDetailedSettings)
 export(createAnalysisDetails)
 export(createCohortAttrCovariateSettings)
+export(createCohortBasedCovariateSettings)
+export(createCohortBasedTemporalCovariateSettings)
 export(createCovariateSettings)
 export(createDefaultCovariateSettings)
 export(createDefaultTemporalCovariateSettings)
@@ -18,6 +20,7 @@ export(createTemporalSequenceCovariateSettings)
 export(filterByCohortDefinitionId)
 export(filterByRowId)
 export(getDbCohortAttrCovariatesData)
+export(getDbCohortBasedCovariatesData)
 export(getDbCovariateData)
 export(getDbDefaultCovariateData)
 export(getDefaultTable1Specifications)
@@ -44,3 +47,4 @@ importFrom(rlang,.data)
 importFrom(stats,aggregate)
 importFrom(stats,quantile)
 importFrom(stats,sd)
+importFrom(utils,read.csv)
diff --git a/R/FeatureExtraction.R b/R/FeatureExtraction.R
@@ -21,6 +21,7 @@
 
 #' @importFrom SqlRender loadRenderTranslateSql translate render
 #' @importFrom methods is
+#' @importFrom utils read.csv
 #' @importFrom stats aggregate quantile sd
 #' @importFrom rlang .data
 #' @import DatabaseConnector

diff --git a/R/GetCovariatesFromOtherCohorts.R b/R/GetCovariatesFromOtherCohorts.R
diff --git a/R/HelperFunctions.R b/R/HelperFunctions.R
@@ -102,3 +102,12 @@ filterByCohortDefinitionId <- function(covariateData, cohortId) {
   attr(class(result), "package") <- "FeatureExtraction"
   return(result)
 }
+
+# Assert that a covariate ID is a numeric vector holding only whole numbers.
+#
+# The check is on the values rather than the storage type, so whole-valued doubles pass.
+#
+# covariateId  Value to check.
+# len          Exact required length, or NULL for no exact-length constraint.
+# min.len      Minimum required length. NULL (the default) means a minimum of 1, which is
+#              the value that was previously hard-coded.
+# null.ok      If TRUE, a NULL covariateId is accepted.
+# add          Optional checkmate assert collection; when supplied, failures are added to
+#              it instead of being thrown.
+.assertCovariateId <- function(covariateId, len = NULL, min.len = NULL, null.ok = FALSE, add = NULL) {
+  # Honor a caller-supplied min.len instead of silently ignoring it.
+  minLength <- if (is.null(min.len)) 1 else min.len
+  checkmate::assertNumeric(covariateId, null.ok = null.ok, len = len, min.len = minLength, add = add)
+  # Only test for whole numbers on numeric input: with an assert collection the call above
+  # does not stop on failure, and round() would throw on a non-numeric value. This also
+  # skips NULL, as before.
+  if (is.numeric(covariateId)) {
+    # Recover the expression the caller passed, so the failure message names their variable.
+    varName <- sprintf(
+      "Variable '%s' is a (64-bit) integer",
+      paste0(deparse(eval.parent(substitute(substitute(covariateId))), width.cutoff = 500L), collapse = "\n")
+    )
+    checkmate::assertTRUE(all(covariateId == round(covariateId)), .var.name = varName, add = add)
+  }
+}
diff --git a/README.md b/README.md
@@ -45,6 +45,7 @@ The documentation website can be found at [https://ohdsi.github.io/FeatureExtrac
 * Vignette: [Using FeatureExtraction](https://raw.githubusercontent.com/OHDSI/FeatureExtraction/main/inst/doc/UsingFeatureExtraction.pdf)
 * Vignette: [Creating covariates using cohort attributes](https://raw.githubusercontent.com/OHDSI/FeatureExtraction/main/inst/doc/CreatingCovariatesUsingCohortAttributes.pdf)
 * Vignette: [Creating custom covariate builders](https://raw.githubusercontent.com/OHDSI/FeatureExtraction/main/inst/doc/CreatingCustomCovariateBuilders.pdf)
+* Vignette: [Creating covariates based on other cohorts](https://raw.githubusercontent.com/OHDSI/FeatureExtraction/main/inst/doc/CreatingCovariatesBasedOnOtherCohorts.pdf)
 * Package manual: [FeatureExtraction manual](https://raw.githubusercontent.com/OHDSI/FeatureExtraction/main/extras/FeatureExtraction.pdf) 
 
 These vignettes are also available in Korean:

diff --git a/extras/CohortBasedCovariatesVignetteDataFetch.R b/extras/CohortBasedCovariatesVignetteDataFetch.R
@@ -0,0 +1,116 @@
+# Copyright 2022 Observational Health Data Sciences and Informatics
+#
+# This file is part of FeatureExtraction
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This code should be used to fetch the data that is used in the cohort-based covariates vignette.  
+library(FeatureExtraction)
+library(SqlRender)
+
+# Folder where the fetched covariate data is saved.
+vignetteFolder <- "s:/temp/vignetteFeatureExtractionCohortBased"
+
+# MDCD on RedShift
+connectionDetails <- createConnectionDetails(
+  dbms = "redshift",
+  connectionString = keyring::key_get("redShiftConnectionStringOhdaMdcd"),
+  user = keyring::key_get("redShiftUserName"),
+  password = keyring::key_get("redShiftPassword")
+)
+cdmDatabaseSchema <- "cdm_truven_mdcd_v1978"
+cohortDatabaseSchema <- "scratch_mschuemi"
+cohortTable <- "feature_extraction_cohort_based"
+cdmVersion <- "5"
+
+
+# Create cohorts -------------------------------------------------------
+connection <- connect(connectionDetails)
+# The finally clause closes the connection even when one of the statements fails.
+tryCatch(
+  {
+    sql <- readSql(system.file("sql", "sql_server", "covariateCohorts.sql", package = "FeatureExtraction"))
+    renderTranslateExecuteSql(
+      connection = connection,
+      sql = sql,
+      cdm_database_schema = cdmDatabaseSchema,
+      cohort_database_schema = cohortDatabaseSchema,
+      cohort_table = cohortTable
+    )
+
+    # Check number of subjects per cohort (kept as the last expression so the result is
+    # still shown when the script is run interactively):
+    sql <- paste("SELECT cohort_definition_id, 
+                COUNT(*) AS count",
+             "FROM @cohort_database_schema.@cohort_table",
+             "GROUP BY cohort_definition_id")
+    renderTranslateQuerySql(
+      connection = connection,
+      sql = sql,
+      cohort_database_schema = cohortDatabaseSchema,
+      cohort_table = cohortTable
+    )
+  },
+  finally = disconnect(connection)
+)
+
+
+# Construct covariates -----------------------------------------------
+# Cohort 2 is used as the covariate cohort; cohort 1 is the cohort the covariates are built for.
+covariateCohorts <- tibble(
+  cohortId = 2,
+  cohortName = "Type 2 diabetes"
+)
+
+covariateSettings <- createCohortBasedCovariateSettings(
+  analysisId = 999,
+  covariateCohorts = covariateCohorts,
+  valueType = "binary",
+  startDay = -365,
+  endDay = 0
+)
+
+# Per-person covariates:
+covariateData <- getDbCovariateData(
+  connectionDetails = connectionDetails,
+  cdmDatabaseSchema = cdmDatabaseSchema,
+  cohortDatabaseSchema = cohortDatabaseSchema,
+  cohortTable = cohortTable,
+  cohortId = 1,
+  rowIdField = "subject_id",
+  covariateSettings = covariateSettings
+)
+
+saveCovariateData(covariateData, file.path(vignetteFolder, "covariatesPerPerson"))
+# covariateData <- loadCovariateData(file.path(vignetteFolder, "covariatesPerPerson"))
+summary(covariateData)
+covariateData$covariateRef
+
+
+# Aggregated covariates, combining demographics with the cohort-based covariate:
+covariateSettings1 <- createCovariateSettings(
+  useDemographicsGender = TRUE,
+  useDemographicsAgeGroup = TRUE,
+  useDemographicsRace = TRUE,
+  useDemographicsEthnicity = TRUE,
+  useDemographicsIndexYear = TRUE,
+  useDemographicsIndexMonth = TRUE
+)
+
+# covariateCohorts is unchanged from above and is reused here rather than redefined.
+covariateSettings2 <- createCohortBasedCovariateSettings(
+  analysisId = 999,
+  covariateCohorts = covariateCohorts,
+  valueType = "binary",
+  startDay = -365,
+  endDay = 0
+)
+
+covariateSettingsList <- list(covariateSettings1, covariateSettings2)
+
+covariateData <- getDbCovariateData(
+  connectionDetails = connectionDetails,
+  cdmDatabaseSchema = cdmDatabaseSchema,
+  cohortDatabaseSchema = cohortDatabaseSchema,
+  cohortTable = cohortTable,
+  cohortId = 1,
+  rowIdField = "subject_id",
+  covariateSettings = covariateSettingsList,
+  aggregated = TRUE
+)
+
+saveCovariateData(covariateData, file.path(vignetteFolder, "covariatesAggregated"))
+# covariateData <- loadCovariateData(file.path(vignetteFolder, "covariatesAggregated"))
+summary(covariateData)
+
+# Clean up ---------------------------------------------------------------------
+connection <- connect(connectionDetails)
+sql <- "DROP TABLE @cohort_database_schema.@cohort_table"
+# As above, make sure the connection is closed even if dropping the table fails.
+tryCatch(
+  renderTranslateExecuteSql(
+    connection = connection,
+    sql = sql,
+    cohort_database_schema = cohortDatabaseSchema,
+    cohort_table = cohortTable
+  ),
+  finally = disconnect(connection)
+)
diff --git a/extras/PackageMaintenance.R b/extras/PackageMaintenance.R
@@ -57,6 +57,13 @@ rmarkdown::render("vignettes/UsingFeatureExtraction.Rmd",
                                           number_sections = TRUE))
 unlink("inst/doc/UsingFeatureExtraction.tex")
 
+rmarkdown::render("vignettes/CreatingCovariatesBasedOnOtherCohorts.Rmd",
+                  output_file = "../inst/doc/CreatingCovariatesBasedOnOtherCohorts.pdf",
+                  rmarkdown::pdf_document(latex_engine = "pdflatex",
+                                          toc = TRUE,
+                                          number_sections = TRUE))
+unlink("inst/doc/CreatingCovariatesBasedOnOtherCohorts.tex")
+
 # Note: these LaTex packages are required to render the Korean vignettes, but for 
 # some reason are not installed automatically:
 # - kotex*

diff --git a/inst/csv/OtherSqlToLoad.csv b/inst/csv/OtherSqlToLoad.csv
@@ -0,0 +1,3 @@
+analysisName,sqlFileName
+cohort,CohortBasedBinaryCovariates.sql
+cohortCount,CohortBasedCountCovariates.sql
diff --git a/inst/csv/jarChecksum.txt b/inst/csv/jarChecksum.txt
@@ -1 +1 @@
-4c4c70d5446f1b6a33cf0f11faddc1b958fe9795798e3dc4f960ae09d65320af
+5133cf2f456e6ac9c6b0cf9ea8be76bc8b16867baa0bc63f796cf1b92510b56b
diff --git a/inst/doc/CreatingCovariatesBasedOnOtherCohorts.pdf b/inst/doc/CreatingCovariatesBasedOnOtherCohorts.pdf
diff --git a/inst/java/featureExtraction-3.2.0-SNAPSHOT.jar b/inst/java/featureExtraction-3.2.0-SNAPSHOT.jar
diff --git a/inst/sql/sql_server/CohortBasedBinaryCovariates.sql b/inst/sql/sql_server/CohortBasedBinaryCovariates.sql
@@ -0,0 +1,107 @@
+-- Feature construction: one binary covariate per covariate cohort that overlaps the window (or time period) around the index date
+SELECT 
+	CAST(covariate_cohort_id AS BIGINT) * 1000 + @analysis_id AS covariate_id, -- covariate ID = covariate cohort ID * 1000 + analysis ID
+{@temporal | @temporal_sequence} ? {
+    time_id,
+}	
+{@aggregated} ? {
+	cohort_definition_id,
+	COUNT(*) AS sum_value -- number of distinct cohort entries (definition, subject, start date) having the covariate
+} : {
+	row_id,
+	1 AS covariate_value -- presence of the covariate is recorded as 1
+}
+INTO @covariate_table
+FROM (
+	SELECT DISTINCT covariate_cohort.cohort_definition_id AS covariate_cohort_id, -- DISTINCT collapses multiple matching covariate cohort entries into one row
+{@temporal} ? {
+		time_id,
+}
+{@temporal_sequence} ? {
+		FLOOR(DATEDIFF(@time_part, covariate_cohort.cohort_start_date, cohort.cohort_start_date)*1.0/@time_interval ) as time_id, -- whole number of time intervals from covariate cohort start to index
+}
+{@aggregated} ? {
+		cohort.cohort_definition_id,
+		cohort.subject_id,
+		cohort.cohort_start_date
+} : {
+		cohort.@row_id_field AS row_id
+}
+	FROM @cohort_table cohort
+	INNER JOIN @covariate_cohort_table covariate_cohort
+		ON cohort.subject_id = covariate_cohort.subject_id
+	INNER JOIN #covariate_cohort_ref covariate_cohort_ref -- restricts to the covariate cohorts listed in the reference temp table
+		ON covariate_cohort.cohort_definition_id = covariate_cohort_ref.cohort_id
+{@temporal} ? {
+	INNER JOIN #time_period time_period
+		ON covariate_cohort.cohort_start_date <= DATEADD(DAY, time_period.end_day, cohort.cohort_start_date) -- covariate cohort starts on or before the end of the time period
+	WHERE CASE WHEN covariate_cohort.cohort_end_date IS NULL THEN covariate_cohort.cohort_start_date ELSE covariate_cohort.cohort_end_date END  >= DATEADD(DAY, time_period.start_day, cohort.cohort_start_date) -- and ends (or starts, if it has no end date) on or after the period start
+} : {
+	WHERE covariate_cohort.cohort_start_date <= DATEADD(DAY, {@temporal_sequence} ? {@sequence_end_day} : {@end_day}, cohort.cohort_start_date) -- covariate cohort starts on or before the window end
+{@start_day != 'anyTimePrior'} ? {		
+		AND CASE WHEN covariate_cohort.cohort_end_date IS NULL THEN covariate_cohort.cohort_start_date ELSE covariate_cohort.cohort_end_date END >= DATEADD(DAY, {@temporal_sequence} ? {@sequence_start_day} : {@start_day}, cohort.cohort_start_date) -- and ends (or starts, if it has no end date) on or after the window start
+}
+}
+{@included_cov_table != ''} ? {		AND CAST(covariate_cohort.cohort_definition_id AS BIGINT) * 1000 + @analysis_id IN (SELECT id FROM @included_cov_table)} -- optional restriction to a set of covariate IDs
+{@cohort_definition_id != -1} ? {		AND cohort.cohort_definition_id IN (@cohort_definition_id)} -- optional restriction to specific target cohorts
+) by_row_id
+{@aggregated} ? {		
+GROUP BY cohort_definition_id,
+	covariate_cohort_id
+{@temporal | @temporal_sequence} ? {
+    ,time_id
+} 
+} 
+;
+
+-- Reference construction: register the covariates that were actually created, and the analysis they belong to
+INSERT INTO #cov_ref (
+	covariate_id,
+	covariate_name,
+	analysis_id,
+	concept_id
+	)
+SELECT covariate_id,
+{@temporal | @temporal_sequence} ? {
+	CAST(CONCAT('cohort: ', cohort_name) AS VARCHAR(512)) AS covariate_name,
+} : {
+{@start_day == 'anyTimePrior'} ? {
+	CAST(CONCAT('cohort any time prior through @end_day days relative to index: ', cohort_name) AS VARCHAR(512)) AS covariate_name,
+} : {
+	CAST(CONCAT('cohort during day @start_day through @end_day days relative to index: ', cohort_name) AS VARCHAR(512)) AS covariate_name,
+}
+}
+	@analysis_id AS analysis_id,
+	0 AS concept_id
+FROM (
+	SELECT DISTINCT covariate_id -- only covariates present in the constructed covariate table get a reference row
+	FROM @covariate_table
+	) t1
+LEFT JOIN #covariate_cohort_ref -- matched by decoding the cohort ID back out of the covariate ID; NOTE(review): the INT cast below may not fit very large cohort IDs - confirm
+	ON cohort_id = CAST((covariate_id - @analysis_id) / 1000 AS INT);
+
+INSERT INTO #analysis_ref (
+	analysis_id,
+	analysis_name,
+	domain_id,
+{!@temporal} ? {
+	start_day,
+	end_day,
+}
+	is_binary,
+	missing_means_zero
+	)
+SELECT @analysis_id AS analysis_id,
+	CAST('@analysis_name' AS VARCHAR(512)) AS analysis_name,
+	CAST('cohort' AS VARCHAR(20)) AS domain_id,
+{!@temporal} ? {
+{@start_day == 'anyTimePrior'} ? {
+	CAST(NULL AS INT) AS start_day, -- no start day is recorded when the window is open-ended in the past
+} : {
+
+	{@temporal_sequence} ? {@sequence_start_day} : {@start_day}  AS start_day,
+}
+	{@temporal_sequence} ? {@sequence_end_day} : {@end_day} AS end_day,
+}
+	CAST('Y' AS VARCHAR(1)) AS is_binary, -- covariates from this analysis are flagged as binary
+	CAST(NULL AS VARCHAR(1)) AS missing_means_zero;
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		4c4c70d5446f1b6a33cf0f11faddc1b958fe9795798e3dc4f960ae09d65320af
		5133cf2f456e6ac9c6b0cf9ea8be76bc8b16867baa0bc63f796cf1b92510b56b