-
Notifications
You must be signed in to change notification settings - Fork 60
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Adding covariate builders based on other cohorts * Fixing build * Regenerating jar using maven * Implementing runTestsOnEunomia switch for new tests * Some fixes to cohort-based covariates * Adding a vignette for cohort-based covariates. * Update CreatingCovariatesBasedOnOtherCohorts.Rmd * Throw warning if using pre-spec analysis ID for cohort-based covariate. * Adding count type temporal covariates based on other cohorts * Allowing cohortId to be vector in cohort-based covariate builder * Update vignette * Expand cohort covariates unit tests (#204) --------- Co-authored-by: Schuemie <MSCHUEMI@its.jnj.com> Co-authored-by: Admin_mschuemi <Admin_mschuemi@its.jnj.com> Co-authored-by: Anthony Sena <asena5@its.jnj.com> Co-authored-by: Ger Inberg <ginberg@gmail.com> Co-authored-by: Anthony Sena <anthonysena@users.noreply.github.com>
- Loading branch information
1 parent
1e46b67
commit cf4a169
Showing
24 changed files
with
2,099 additions
and
12 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,6 @@ | ||
pom.xml | ||
extras | ||
docs | ||
bin | ||
man-roxygen | ||
^.*\.Rproj$ | ||
^\.Rproj\.user$ | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -29,7 +29,8 @@ Imports: | |
readr, | ||
rlang, | ||
RSQLite, | ||
DBI | ||
DBI, | ||
checkmate | ||
Suggests: | ||
testthat, | ||
knitr, | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,116 @@ | ||
# Copyright 2022 Observational Health Data Sciences and Informatics | ||
# | ||
# This file is part of FeatureExtraction | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
# This code should be used to fetch the data that is used in the cohort-based covariates vignette.
library(FeatureExtraction)
library(SqlRender)

# Local folder that receives the covariate data objects saved by this script.
vignetteFolder <- "s:/temp/vignetteFeatureExtractionCohortBased"
# Create the output folder up front so the save calls further down do not fail
# on a missing directory.
if (!dir.exists(vignetteFolder)) {
  dir.create(vignetteFolder, recursive = TRUE)
}

# MDCD on RedShift
# Credentials are read from the system keyring rather than stored in the script.
connectionDetails <- createConnectionDetails(
  dbms = "redshift",
  connectionString = keyring::key_get("redShiftConnectionStringOhdaMdcd"),
  user = keyring::key_get("redShiftUserName"),
  password = keyring::key_get("redShiftPassword")
)
cdmDatabaseSchema <- "cdm_truven_mdcd_v1978"
cohortDatabaseSchema <- "scratch_mschuemi"
cohortTable <- "feature_extraction_cohort_based"
cdmVersion <- "5"
|
||
|
||
# Create cohorts -------------------------------------------------------
# Instantiates the cohorts from the packaged covariateCohorts.sql and prints
# the number of rows per cohort definition. The connection is closed in
# `finally`, so it is released even when one of the SQL statements fails.
connection <- connect(connectionDetails)
tryCatch(
  {
    sql <- readSql(system.file("sql", "sql_server", "covariateCohorts.sql", package = "FeatureExtraction"))
    renderTranslateExecuteSql(
      connection = connection,
      sql = sql,
      cdm_database_schema = cdmDatabaseSchema,
      cohort_database_schema = cohortDatabaseSchema,
      cohort_table = cohortTable
    )

    # Check number of subjects per cohort:
    sql <- paste(
      "SELECT cohort_definition_id, COUNT(*) AS count",
      "FROM @cohort_database_schema.@cohort_table",
      "GROUP BY cohort_definition_id"
    )
    # Printed explicitly so the counts are shown even when the script is sourced.
    print(renderTranslateQuerySql(
      connection = connection,
      sql = sql,
      cohort_database_schema = cohortDatabaseSchema,
      cohort_table = cohortTable
    ))
  },
  finally = disconnect(connection)
)
|
||
|
||
# Construct covariates -----------------------------------------------
# Per-person covariates: a single binary covariate indicating presence of the
# covariate cohort (ID 2) in the 365 days up to and including the index date.

# tibble() is namespace-qualified because this script only attaches
# FeatureExtraction and SqlRender; an unqualified call relies on some other
# package happening to be attached in the session.
covariateCohorts <- tibble::tibble(
  cohortId = 2,
  cohortName = "Type 2 diabetes"
)

covariateSettings <- createCohortBasedCovariateSettings(
  analysisId = 999,
  covariateCohorts = covariateCohorts,
  valueType = "binary",
  startDay = -365,
  endDay = 0
)

covariateData <- getDbCovariateData(
  connectionDetails = connectionDetails,
  cdmDatabaseSchema = cdmDatabaseSchema,
  cohortDatabaseSchema = cohortDatabaseSchema,
  cohortTable = cohortTable,
  cohortId = 1,
  rowIdField = "subject_id",
  covariateSettings = covariateSettings
)

saveCovariateData(covariateData, file.path(vignetteFolder, "covariatesPerPerson"))
# covariateData <- loadCovariateData(file.path(vignetteFolder, "covariatesPerPerson"))
summary(covariateData)
covariateData$covariateRef
|
||
|
||
# Aggregated covariates: demographics combined with the cohort-based covariate,
# summarised over the target cohort instead of reported per person.
covariateSettings1 <- createCovariateSettings(
  useDemographicsGender = TRUE,
  useDemographicsAgeGroup = TRUE,
  useDemographicsRace = TRUE,
  useDemographicsEthnicity = TRUE,
  useDemographicsIndexYear = TRUE,
  useDemographicsIndexMonth = TRUE
)

# tibble() is namespace-qualified because this script only attaches
# FeatureExtraction and SqlRender.
covariateCohorts <- tibble::tibble(
  cohortId = 2,
  cohortName = "Type 2 diabetes"
)

covariateSettings2 <- createCohortBasedCovariateSettings(
  analysisId = 999,
  covariateCohorts = covariateCohorts,
  valueType = "binary",
  startDay = -365,
  endDay = 0
)

# Both settings objects are passed together as a list.
covariateSettingsList <- list(covariateSettings1, covariateSettings2)

covariateData <- getDbCovariateData(
  connectionDetails = connectionDetails,
  cdmDatabaseSchema = cdmDatabaseSchema,
  cohortDatabaseSchema = cohortDatabaseSchema,
  cohortTable = cohortTable,
  cohortId = 1,
  rowIdField = "subject_id",
  covariateSettings = covariateSettingsList,
  aggregated = TRUE
)

saveCovariateData(covariateData, file.path(vignetteFolder, "covariatesAggregated"))
# covariateData <- loadCovariateData(file.path(vignetteFolder, "covariatesAggregated"))
summary(covariateData)
|
||
# Clean up ---------------------------------------------------------------------
# Drops the scratch cohort table. The disconnect sits in `finally` so the
# connection is closed even if the DROP fails (for example, table already gone).
connection <- connect(connectionDetails)
tryCatch(
  {
    sql <- "DROP TABLE @cohort_database_schema.@cohort_table"
    renderTranslateExecuteSql(
      connection = connection,
      sql = sql,
      cohort_database_schema = cohortDatabaseSchema,
      cohort_table = cohortTable
    )
  },
  finally = disconnect(connection)
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
analysisName,sqlFileName | ||
cohort,CohortBasedBinaryCovariates.sql | ||
cohortCount,CohortBasedCountCovariates.sql |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1 @@ | ||
4c4c70d5446f1b6a33cf0f11faddc1b958fe9795798e3dc4f960ae09d65320af | ||
5133cf2f456e6ac9c6b0cf9ea8be76bc8b16867baa0bc63f796cf1b92510b56b |
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,107 @@ | ||
-- Feature construction
-- Writes one row per covariate cohort that overlaps the requested window
-- relative to the start date of the target cohort entry. The covariate ID is
-- the covariate cohort ID times 1000 plus the analysis ID. In aggregated mode
-- the rows are counted per target cohort definition; otherwise each row
-- carries the row ID and a value of 1.
SELECT
	CAST(covariate_cohort_id AS BIGINT) * 1000 + @analysis_id AS covariate_id,
{@temporal | @temporal_sequence} ? {
	time_id,
}
{@aggregated} ? {
	cohort_definition_id,
	COUNT(*) AS sum_value
} : {
	row_id,
	1 AS covariate_value
}
INTO @covariate_table
FROM (
	-- DISTINCT collapses multiple overlapping entries of the same covariate
	-- cohort into a single hit per target cohort entry (and time ID, if any).
	SELECT DISTINCT
		covariate_cohort.cohort_definition_id AS covariate_cohort_id,
{@temporal} ? {
		time_id,
}
{@temporal_sequence} ? {
		FLOOR(DATEDIFF(@time_part, covariate_cohort.cohort_start_date, cohort.cohort_start_date)*1.0/@time_interval ) as time_id,
}
{@aggregated} ? {
		cohort.cohort_definition_id,
		cohort.subject_id,
		cohort.cohort_start_date
} : {
		cohort.@row_id_field AS row_id
}
	FROM @cohort_table cohort
	INNER JOIN @covariate_cohort_table covariate_cohort
		ON cohort.subject_id = covariate_cohort.subject_id
	-- Restrict to the covariate cohorts listed in the reference temp table.
	INNER JOIN #covariate_cohort_ref covariate_cohort_ref
		ON covariate_cohort.cohort_definition_id = covariate_cohort_ref.cohort_id
{@temporal} ? {
	INNER JOIN #time_period time_period
		ON covariate_cohort.cohort_start_date <= DATEADD(DAY, time_period.end_day, cohort.cohort_start_date)
	-- A missing end date is treated as a one-day cohort entry ending on its start date.
	WHERE COALESCE(covariate_cohort.cohort_end_date, covariate_cohort.cohort_start_date) >= DATEADD(DAY, time_period.start_day, cohort.cohort_start_date)
} : {
	WHERE covariate_cohort.cohort_start_date <= DATEADD(DAY, {@temporal_sequence} ? {@sequence_end_day} : {@end_day}, cohort.cohort_start_date)
{@start_day != 'anyTimePrior'} ? {
		-- A missing end date is treated as a one-day cohort entry ending on its start date.
		AND COALESCE(covariate_cohort.cohort_end_date, covariate_cohort.cohort_start_date) >= DATEADD(DAY, {@temporal_sequence} ? {@sequence_start_day} : {@start_day}, cohort.cohort_start_date)
}
}
{@included_cov_table != ''} ? {		AND CAST(covariate_cohort.cohort_definition_id AS BIGINT) * 1000 + @analysis_id IN (SELECT id FROM @included_cov_table)}
{@cohort_definition_id != -1} ? {		AND cohort.cohort_definition_id IN (@cohort_definition_id)}
) by_row_id
{@aggregated} ? {
GROUP BY cohort_definition_id,
	covariate_cohort_id
{@temporal | @temporal_sequence} ? {
	,time_id
}
}
;
|
||
-- Reference construction
-- Adds every distinct covariate ID found in the covariate table to the
-- covariate reference, named after the covariate cohort it was derived from.
INSERT INTO #cov_ref (
	covariate_id,
	covariate_name,
	analysis_id,
	concept_id
	)
SELECT t1.covariate_id,
{@temporal | @temporal_sequence} ? {
	CAST(CONCAT('cohort: ', covariate_cohort_ref.cohort_name) AS VARCHAR(512)) AS covariate_name,
} : {
{@start_day == 'anyTimePrior'} ? {
	CAST(CONCAT('cohort any time prior through @end_day days relative to index: ', covariate_cohort_ref.cohort_name) AS VARCHAR(512)) AS covariate_name,
} : {
	CAST(CONCAT('cohort during day @start_day through @end_day days relative to index: ', covariate_cohort_ref.cohort_name) AS VARCHAR(512)) AS covariate_name,
}
}
	@analysis_id AS analysis_id,
	0 AS concept_id
FROM (
	SELECT DISTINCT covariate_id
	FROM @covariate_table
	) t1
-- Recover the cohort ID by reversing the covariate ID formula (cohort ID times
-- 1000 plus analysis ID). The result is cast to BIGINT, matching the BIGINT
-- cast used when the covariate ID is built, so large cohort IDs do not
-- overflow a 32-bit INT.
LEFT JOIN #covariate_cohort_ref covariate_cohort_ref
	ON covariate_cohort_ref.cohort_id = CAST((t1.covariate_id - @analysis_id) / 1000 AS BIGINT);
|
||
-- Analysis reference
-- Registers this analysis with domain cohort and flags it as binary. The
-- start and end day columns are only written when the analysis is not
-- temporal; an any-time-prior window is stored with a NULL start day.
INSERT INTO #analysis_ref (
	analysis_id,
	analysis_name,
	domain_id,
{!@temporal} ? {
	start_day,
	end_day,
}
	is_binary,
	missing_means_zero
	)
SELECT
	@analysis_id AS analysis_id,
	CAST('@analysis_name' AS VARCHAR(512)) AS analysis_name,
	CAST('cohort' AS VARCHAR(20)) AS domain_id,
{!@temporal} ? {
{@start_day == 'anyTimePrior'} ? {
	CAST(NULL AS INT) AS start_day,
} : {
	{@temporal_sequence} ? {@sequence_start_day} : {@start_day} AS start_day,
}
	{@temporal_sequence} ? {@sequence_end_day} : {@end_day} AS end_day,
}
	CAST('Y' AS VARCHAR(1)) AS is_binary,
	CAST(NULL AS VARCHAR(1)) AS missing_means_zero;
Oops, something went wrong.