Skip to content

Commit

Permalink
Cohort covariates (#167)
Browse files Browse the repository at this point in the history
* Adding covariate builders based on other cohorts

* Fixing build

* Regenerating jar using maven

* Implementing runTestsOnEunomia switch for new tests

* Some fixes to cohort-based covariates

* Adding a vignette for cohort-based covariates.

* Update CreatingCovariatesBasedOnOtherCohorts.Rmd

* Throw warning if using pre-spec analysis ID for cohort-based covariate.

* Adding count type temporal covariates based on other cohorts

* Allowing cohortId to be a vector in cohort-based covariate builder

* Update vignette

* Expand cohort covariates unit tests (#204)

---------

Co-authored-by: Schuemie <MSCHUEMI@its.jnj.com>
Co-authored-by: Admin_mschuemi <Admin_mschuemi@its.jnj.com>
Co-authored-by: Anthony Sena <asena5@its.jnj.com>
Co-authored-by: Ger Inberg <ginberg@gmail.com>
Co-authored-by: Anthony Sena <anthonysena@users.noreply.github.com>
  • Loading branch information
6 people authored Jun 28, 2023
1 parent 1e46b67 commit cf4a169
Show file tree
Hide file tree
Showing 24 changed files with 2,099 additions and 12 deletions.
1 change: 0 additions & 1 deletion .Rbuildignore
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
pom.xml
extras
docs
bin
man-roxygen
^.*\.Rproj$
^\.Rproj\.user$
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/R_CMD_check_Hades.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,13 @@ jobs:
name: ${{ matrix.config.os }} (${{ matrix.config.r }})

strategy:
max-parallel: 1
fail-fast: false
matrix:
config:
- {os: windows-latest, r: 'release'} # Does not appear to have Java 32-bit, hence the --no-multiarch
- {os: macOS-latest, r: 'release'}
- {os: ubuntu-20.04, r: 'release', rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest"}
#- {os: ubuntu-20.04, r: 'devel', rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest"}

env:
R_REMOTES_NO_ERRORS_FROM_WARNINGS: true
Expand Down
3 changes: 2 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@ Imports:
readr,
rlang,
RSQLite,
DBI
DBI,
checkmate
Suggests:
testthat,
knitr,
Expand Down
4 changes: 4 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ export(computeStandardizedDifference)
export(convertPrespecSettingsToDetailedSettings)
export(createAnalysisDetails)
export(createCohortAttrCovariateSettings)
export(createCohortBasedCovariateSettings)
export(createCohortBasedTemporalCovariateSettings)
export(createCovariateSettings)
export(createDefaultCovariateSettings)
export(createDefaultTemporalCovariateSettings)
Expand All @@ -18,6 +20,7 @@ export(createTemporalSequenceCovariateSettings)
export(filterByCohortDefinitionId)
export(filterByRowId)
export(getDbCohortAttrCovariatesData)
export(getDbCohortBasedCovariatesData)
export(getDbCovariateData)
export(getDbDefaultCovariateData)
export(getDefaultTable1Specifications)
Expand All @@ -44,3 +47,4 @@ importFrom(rlang,.data)
importFrom(stats,aggregate)
importFrom(stats,quantile)
importFrom(stats,sd)
importFrom(utils,read.csv)
1 change: 1 addition & 0 deletions R/FeatureExtraction.R
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

#' @importFrom SqlRender loadRenderTranslateSql translate render
#' @importFrom methods is
#' @importFrom utils read.csv
#' @importFrom stats aggregate quantile sd
#' @importFrom rlang .data
#' @import DatabaseConnector
Expand Down
286 changes: 286 additions & 0 deletions R/GetCovariatesFromOtherCohorts.R

Large diffs are not rendered by default.

9 changes: 9 additions & 0 deletions R/HelperFunctions.R
Original file line number Diff line number Diff line change
Expand Up @@ -102,3 +102,12 @@ filterByCohortDefinitionId <- function(covariateData, cohortId) {
attr(class(result), "package") <- "FeatureExtraction"
return(result)
}

# Asserts that `covariateId` is a numeric vector holding only whole numbers.
#
# Covariate IDs can exceed the 32-bit integer range, so they are accepted as
# any numeric type and checked for integrality by comparing against round().
#
# Arguments:
#   covariateId  Value to validate.
#   len          Exact required length, or NULL for no exact-length check.
#   min.len      Minimum required length. When NULL, a minimum of 1 is used.
#   null.ok      If TRUE, a NULL `covariateId` is accepted.
#   add          Optional checkmate assert collection; when supplied, failures
#                are pushed onto it instead of being raised immediately.
.assertCovariateId <- function(covariateId, len = NULL, min.len = NULL, null.ok = FALSE, add = NULL) {
  # Honour the caller's min.len; keep the previous minimum of one element as the default.
  minLength <- if (is.null(min.len)) 1 else min.len
  checkmate::assertNumeric(covariateId, null.ok = null.ok, len = len, min.len = minLength, add = add)
  # is.numeric() is FALSE for NULL. It also keeps round() away from non-numeric
  # input, which can reach this point when failures are collected in `add`
  # rather than raised by the assertion above.
  if (is.numeric(covariateId)) {
    # Label the assertion with the expression the caller passed for covariateId.
    varLabel <- sprintf(
      "Variable '%s' is a (64-bit) integer",
      paste0(deparse(eval.parent(substitute(substitute(covariateId))), width.cutoff = 500L), collapse = "\n")
    )
    checkmate::assertTRUE(all(covariateId == round(covariateId)), .var.name = varLabel, add = add)
  }
}
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ The documentation website can be found at [https://ohdsi.github.io/FeatureExtrac
* Vignette: [Using FeatureExtraction](https://raw.githubusercontent.com/OHDSI/FeatureExtraction/main/inst/doc/UsingFeatureExtraction.pdf)
* Vignette: [Creating covariates using cohort attributes](https://raw.githubusercontent.com/OHDSI/FeatureExtraction/main/inst/doc/CreatingCovariatesUsingCohortAttributes.pdf)
* Vignette: [Creating custom covariate builders](https://raw.githubusercontent.com/OHDSI/FeatureExtraction/main/inst/doc/CreatingCustomCovariateBuilders.pdf)
* Vignette: [Creating covariates based on other cohorts](https://raw.githubusercontent.com/OHDSI/FeatureExtraction/main/inst/doc/CreatingCovariatesBasedOnOtherCohorts.pdf)
* Package manual: [FeatureExtraction manual](https://raw.githubusercontent.com/OHDSI/FeatureExtraction/main/extras/FeatureExtraction.pdf)

These vignettes are also available in Korean:
Expand Down
116 changes: 116 additions & 0 deletions extras/CohortBasedCovariatesVignetteDataFetch.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
# Copyright 2022 Observational Health Data Sciences and Informatics
#
# This file is part of FeatureExtraction
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Data-fetch script behind the cohort-based covariates vignette: run it to
# regenerate the covariate data objects that the vignette uses.
library(FeatureExtraction)
library(SqlRender)

# Folder that receives the saved covariate data.
vignetteFolder <- "s:/temp/vignetteFeatureExtractionCohortBased"

# MDCD on RedShift. All credentials are looked up in the keyring, so nothing
# secret is stored in this script.
connectionDetails <- createConnectionDetails(
  dbms = "redshift",
  connectionString = keyring::key_get("redShiftConnectionStringOhdaMdcd"),
  user = keyring::key_get("redShiftUserName"),
  password = keyring::key_get("redShiftPassword")
)

# Schemas and table used by the sections below.
cdmDatabaseSchema <- "cdm_truven_mdcd_v1978"
cohortDatabaseSchema <- "scratch_mschuemi"
cohortTable <- "feature_extraction_cohort_based"
cdmVersion <- "5"


# Create cohorts -------------------------------------------------------
# Executes the packaged covariateCohorts.sql against the cohort table, then
# queries the number of rows per cohort definition as a sanity check.
connection <- connect(connectionDetails)

sql <- readSql(
  system.file("sql", "sql_server", "covariateCohorts.sql", package = "FeatureExtraction")
)
renderTranslateExecuteSql(
  connection = connection,
  sql = sql,
  cdm_database_schema = cdmDatabaseSchema,
  cohort_database_schema = cohortDatabaseSchema,
  cohort_table = cohortTable
)

# Check number of subjects per cohort:
sql <- paste(
  "SELECT cohort_definition_id,\nCOUNT(*) AS count",
  "FROM @cohort_database_schema.@cohort_table",
  "GROUP BY cohort_definition_id"
)
renderTranslateQuerySql(
  connection = connection,
  sql = sql,
  cohort_database_schema = cohortDatabaseSchema,
  cohort_table = cohortTable
)

disconnect(connection)


# Construct covariates -----------------------------------------------
# Per-person covariates: a binary covariate for cohort 2 ("Type 2 diabetes")
# over days -365 through 0, extracted for cohort 1.
covariateCohorts <- tibble(
  cohortId = 2,
  cohortName = "Type 2 diabetes"
)

covariateSettings <- createCohortBasedCovariateSettings(
  analysisId = 999,
  covariateCohorts = covariateCohorts,
  valueType = "binary",
  startDay = -365,
  endDay = 0
)

covariateData <- getDbCovariateData(
  connectionDetails = connectionDetails,
  cdmDatabaseSchema = cdmDatabaseSchema,
  cohortDatabaseSchema = cohortDatabaseSchema,
  cohortTable = cohortTable,
  cohortId = 1,
  rowIdField = "subject_id",
  covariateSettings = covariateSettings
)

# Store the result for the vignette; the commented line reloads it later.
saveCovariateData(covariateData, file.path(vignetteFolder, "covariatesPerPerson"))
# covariateData <- loadCovariateData(file.path(vignetteFolder, "covariatesPerPerson"))

# Inspect what was extracted.
summary(covariateData)
covariateData$covariateRef


# Aggregated covariates: demographics combined with the cohort-based binary
# covariate, requested as aggregated statistics for cohort 1.
covariateSettings1 <- createCovariateSettings(
  useDemographicsGender = TRUE,
  useDemographicsAgeGroup = TRUE,
  useDemographicsRace = TRUE,
  useDemographicsEthnicity = TRUE,
  useDemographicsIndexYear = TRUE,
  useDemographicsIndexMonth = TRUE
)

covariateCohorts <- tibble(
  cohortId = 2,
  cohortName = "Type 2 diabetes"
)

covariateSettings2 <- createCohortBasedCovariateSettings(
  analysisId = 999,
  covariateCohorts = covariateCohorts,
  valueType = "binary",
  startDay = -365,
  endDay = 0
)

# Both settings objects are passed together as a list.
covariateSettingsList <- list(covariateSettings1, covariateSettings2)

covariateData <- getDbCovariateData(
  connectionDetails = connectionDetails,
  cdmDatabaseSchema = cdmDatabaseSchema,
  cohortDatabaseSchema = cohortDatabaseSchema,
  cohortTable = cohortTable,
  cohortId = 1,
  rowIdField = "subject_id",
  covariateSettings = covariateSettingsList,
  aggregated = TRUE
)

# Store the result for the vignette; the commented line reloads it later.
saveCovariateData(covariateData, file.path(vignetteFolder, "covariatesAggregated"))
# covariateData <- loadCovariateData(file.path(vignetteFolder, "covariatesAggregated"))
summary(covariateData)

# Clean up ---------------------------------------------------------------------
# Drops the cohort table that this script created.
connection <- connect(connectionDetails)

sql <- "DROP TABLE @cohort_database_schema.@cohort_table"
renderTranslateExecuteSql(
  connection = connection,
  sql = sql,
  cohort_database_schema = cohortDatabaseSchema,
  cohort_table = cohortTable
)

disconnect(connection)
7 changes: 7 additions & 0 deletions extras/PackageMaintenance.R
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,13 @@ rmarkdown::render("vignettes/UsingFeatureExtraction.Rmd",
number_sections = TRUE))
unlink("inst/doc/UsingFeatureExtraction.tex")

# Render the cohort-based covariates vignette to PDF under inst/doc, then
# delete the matching .tex file from inst/doc.
rmarkdown::render(
  input = "vignettes/CreatingCovariatesBasedOnOtherCohorts.Rmd",
  output_format = rmarkdown::pdf_document(
    latex_engine = "pdflatex",
    toc = TRUE,
    number_sections = TRUE
  ),
  output_file = "../inst/doc/CreatingCovariatesBasedOnOtherCohorts.pdf"
)
unlink("inst/doc/CreatingCovariatesBasedOnOtherCohorts.tex")

# Note: these LaTeX packages are required to render the Korean vignettes, but for
# some reason are not installed automatically:
# - kotex*
Expand Down
3 changes: 3 additions & 0 deletions inst/csv/OtherSqlToLoad.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
analysisName,sqlFileName
cohort,CohortBasedBinaryCovariates.sql
cohortCount,CohortBasedCountCovariates.sql
2 changes: 1 addition & 1 deletion inst/csv/jarChecksum.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
4c4c70d5446f1b6a33cf0f11faddc1b958fe9795798e3dc4f960ae09d65320af
5133cf2f456e6ac9c6b0cf9ea8be76bc8b16867baa0bc63f796cf1b92510b56b
Binary file not shown.
Binary file modified inst/java/featureExtraction-3.2.0-SNAPSHOT.jar
Binary file not shown.
107 changes: 107 additions & 0 deletions inst/sql/sql_server/CohortBasedBinaryCovariates.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
-- Feature construction
-- Writes the covariate table. A covariate ID is the covariate cohort ID times
-- 1000 plus the analysis ID. In the temporal modes the rows also carry a
-- time_id.
SELECT
CAST(covariate_cohort_id AS BIGINT) * 1000 + @analysis_id AS covariate_id,
{@temporal | @temporal_sequence} ? {
time_id,
}
-- Aggregated output counts the distinct rows of the subquery per target cohort
-- and covariate. Per-row output emits a constant value of 1 for each row_id.
{@aggregated} ? {
cohort_definition_id,
COUNT(*) AS sum_value
} : {
row_id,
1 AS covariate_value
}
INTO @covariate_table
FROM (
-- DISTINCT collapses several qualifying covariate cohort entries for the same
-- target row (or target cohort entry, when aggregated) into a single row.
SELECT DISTINCT covariate_cohort.cohort_definition_id AS covariate_cohort_id,
{@temporal} ? {
time_id,
}
-- Temporal sequence mode derives time_id from the difference between the
-- covariate cohort start and the target cohort start, divided by the time
-- interval and rounded down.
{@temporal_sequence} ? {
FLOOR(DATEDIFF(@time_part, covariate_cohort.cohort_start_date, cohort.cohort_start_date)*1.0/@time_interval ) as time_id,
}
{@aggregated} ? {
cohort.cohort_definition_id,
cohort.subject_id,
cohort.cohort_start_date
} : {
cohort.@row_id_field AS row_id
}
-- Target cohort entries are matched to covariate cohort entries of the same
-- subject, limited to the cohorts listed in the covariate cohort reference.
FROM @cohort_table cohort
INNER JOIN @covariate_cohort_table covariate_cohort
ON cohort.subject_id = covariate_cohort.subject_id
INNER JOIN #covariate_cohort_ref covariate_cohort_ref
ON covariate_cohort.cohort_definition_id = covariate_cohort_ref.cohort_id
-- Window overlap test. A covariate cohort entry qualifies when it starts on or
-- before the window end and ends on or after the window start, with the start
-- date standing in for a missing end date. When the start day is anyTimePrior
-- no lower bound is applied.
{@temporal} ? {
INNER JOIN #time_period time_period
ON covariate_cohort.cohort_start_date <= DATEADD(DAY, time_period.end_day, cohort.cohort_start_date)
WHERE CASE WHEN covariate_cohort.cohort_end_date IS NULL THEN covariate_cohort.cohort_start_date ELSE covariate_cohort.cohort_end_date END >= DATEADD(DAY, time_period.start_day, cohort.cohort_start_date)
} : {
WHERE covariate_cohort.cohort_start_date <= DATEADD(DAY, {@temporal_sequence} ? {@sequence_end_day} : {@end_day}, cohort.cohort_start_date)
{@start_day != 'anyTimePrior'} ? {
AND CASE WHEN covariate_cohort.cohort_end_date IS NULL THEN covariate_cohort.cohort_start_date ELSE covariate_cohort.cohort_end_date END >= DATEADD(DAY, {@temporal_sequence} ? {@sequence_start_day} : {@start_day}, cohort.cohort_start_date)
}
}
-- Optional filters. The first keeps only covariate IDs present in the included
-- covariate table. The second restricts the target cohorts and is skipped when
-- the cohort definition ID is -1.
{@included_cov_table != ''} ? { AND CAST(covariate_cohort.cohort_definition_id AS BIGINT) * 1000 + @analysis_id IN (SELECT id FROM @included_cov_table)}
{@cohort_definition_id != -1} ? { AND cohort.cohort_definition_id IN (@cohort_definition_id)}
) by_row_id
{@aggregated} ? {
GROUP BY cohort_definition_id,
covariate_cohort_id
{@temporal | @temporal_sequence} ? {
,time_id
}
}
;

-- Reference construction
-- Adds one covariate reference row per distinct covariate ID found in the
-- covariate table. The covariate name embeds the cohort name and, outside the
-- temporal modes, a description of the day window. The concept ID is always 0.
INSERT INTO #cov_ref (
covariate_id,
covariate_name,
analysis_id,
concept_id
)
SELECT covariate_id,
{@temporal | @temporal_sequence} ? {
CAST(CONCAT('cohort: ', cohort_name) AS VARCHAR(512)) AS covariate_name,
} : {
{@start_day == 'anyTimePrior'} ? {
CAST(CONCAT('cohort any time prior through @end_day days relative to index: ', cohort_name) AS VARCHAR(512)) AS covariate_name,
} : {
CAST(CONCAT('cohort during day @start_day through @end_day days relative to index: ', cohort_name) AS VARCHAR(512)) AS covariate_name,
}
}
@analysis_id AS analysis_id,
0 AS concept_id
FROM (
SELECT DISTINCT covariate_id
FROM @covariate_table
) t1
-- The cohort ID is recovered from the covariate ID by subtracting the analysis
-- ID and dividing by 1000, and is then used to look up the cohort name.
LEFT JOIN #covariate_cohort_ref
ON cohort_id = CAST((covariate_id - @analysis_id) / 1000 AS INT);

-- Analysis reference
-- Registers this analysis with a single row. The start and end day columns are
-- left out in temporal mode. A start day of anyTimePrior is stored as NULL, and
-- temporal sequence mode uses the sequence start and end days instead.
-- is_binary is always Y and missing_means_zero is always NULL.
INSERT INTO #analysis_ref (
analysis_id,
analysis_name,
domain_id,
{!@temporal} ? {
start_day,
end_day,
}
is_binary,
missing_means_zero
)
SELECT @analysis_id AS analysis_id,
CAST('@analysis_name' AS VARCHAR(512)) AS analysis_name,
CAST('cohort' AS VARCHAR(20)) AS domain_id,
{!@temporal} ? {
{@start_day == 'anyTimePrior'} ? {
CAST(NULL AS INT) AS start_day,
} : {

{@temporal_sequence} ? {@sequence_start_day} : {@start_day} AS start_day,
}
{@temporal_sequence} ? {@sequence_end_day} : {@end_day} AS end_day,
}
CAST('Y' AS VARCHAR(1)) AS is_binary,
CAST(NULL AS VARCHAR(1)) AS missing_means_zero;
Loading

0 comments on commit cf4a169

Please sign in to comment.