diff --git a/.Rbuildignore b/.Rbuildignore index 53275cd7..8eed46ac 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -1,7 +1,6 @@ pom.xml extras docs -bin man-roxygen ^.*\.Rproj$ ^\.Rproj\.user$ diff --git a/.github/workflows/R_CMD_check_Hades.yaml b/.github/workflows/R_CMD_check_Hades.yaml index 63227ce0..8c5d59fa 100644 --- a/.github/workflows/R_CMD_check_Hades.yaml +++ b/.github/workflows/R_CMD_check_Hades.yaml @@ -17,13 +17,13 @@ jobs: name: ${{ matrix.config.os }} (${{ matrix.config.r }}) strategy: + max-parallel: 1 fail-fast: false matrix: config: - {os: windows-latest, r: 'release'} # Does not appear to have Java 32-bit, hence the --no-multiarch - {os: macOS-latest, r: 'release'} - {os: ubuntu-20.04, r: 'release', rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest"} - #- {os: ubuntu-20.04, r: 'devel', rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest"} env: R_REMOTES_NO_ERRORS_FROM_WARNINGS: true diff --git a/DESCRIPTION b/DESCRIPTION index 905d2dd4..17338684 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -29,7 +29,8 @@ Imports: readr, rlang, RSQLite, - DBI + DBI, + checkmate Suggests: testthat, knitr, diff --git a/NAMESPACE b/NAMESPACE index ab4fcd70..3659d2d4 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -6,6 +6,8 @@ export(computeStandardizedDifference) export(convertPrespecSettingsToDetailedSettings) export(createAnalysisDetails) export(createCohortAttrCovariateSettings) +export(createCohortBasedCovariateSettings) +export(createCohortBasedTemporalCovariateSettings) export(createCovariateSettings) export(createDefaultCovariateSettings) export(createDefaultTemporalCovariateSettings) @@ -18,6 +20,7 @@ export(createTemporalSequenceCovariateSettings) export(filterByCohortDefinitionId) export(filterByRowId) export(getDbCohortAttrCovariatesData) +export(getDbCohortBasedCovariatesData) export(getDbCovariateData) export(getDbDefaultCovariateData) export(getDefaultTable1Specifications) @@ -44,3 +47,4 @@ importFrom(rlang,.data) 
importFrom(stats,aggregate) importFrom(stats,quantile) importFrom(stats,sd) +importFrom(utils,read.csv) diff --git a/R/FeatureExtraction.R b/R/FeatureExtraction.R index 2562d5fc..b411a1cd 100644 --- a/R/FeatureExtraction.R +++ b/R/FeatureExtraction.R @@ -21,6 +21,7 @@ #' @importFrom SqlRender loadRenderTranslateSql translate render #' @importFrom methods is +#' @importFrom utils read.csv #' @importFrom stats aggregate quantile sd #' @importFrom rlang .data #' @import DatabaseConnector diff --git a/R/GetCovariatesFromOtherCohorts.R b/R/GetCovariatesFromOtherCohorts.R new file mode 100644 index 00000000..f981348d --- /dev/null +++ b/R/GetCovariatesFromOtherCohorts.R @@ -0,0 +1,286 @@ +# Copyright 2022 Observational Health Data Sciences and Informatics +# +# This file is part of FeatureExtraction +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#' Get covariate information from the database based on other cohorts +#' +#' @description +#' Constructs covariates using other cohorts. +#' +#' @param covariateSettings An object of type \code{covariateSettings} as created using the +#' \code{\link{createCohortBasedCovariateSettings}} or +#' \code{\link{createCohortBasedTemporalCovariateSettings}} functions. 
+#'
+#' @template GetCovarParams
+#'
+#' @export
+getDbCohortBasedCovariatesData <- function(connection,
+                                           oracleTempSchema = NULL,
+                                           cdmDatabaseSchema,
+                                           cohortTable = "#cohort_person",
+                                           cohortId = -1,
+                                           cdmVersion = "5",
+                                           rowIdField = "subject_id",
+                                           covariateSettings,
+                                           aggregated = FALSE) {
+  # Collect all argument violations first so they are reported in one go.
+  argumentChecks <- checkmate::makeAssertCollection()
+  checkmate::assertClass(connection, "DatabaseConnectorConnection", add = argumentChecks)
+  checkmate::assertCharacter(oracleTempSchema, len = 1, null.ok = TRUE, add = argumentChecks)
+  checkmate::assertCharacter(cdmDatabaseSchema, len = 1, null.ok = TRUE, add = argumentChecks)
+  checkmate::assertCharacter(cohortTable, len = 1, add = argumentChecks)
+  checkmate::assertIntegerish(cohortId, add = argumentChecks)
+  # cdmVersion is not validated here (its assertion is commented out):
+  # checkmate::assertCharacter(cdmVersion, len = 1, add = errorMessages)
+  checkmate::assertCharacter(rowIdField, len = 1, add = argumentChecks)
+  checkmate::assertClass(covariateSettings, "covariateSettings", add = argumentChecks)
+  checkmate::assertLogical(aggregated, len = 1, add = argumentChecks)
+  checkmate::reportAssertions(collection = argumentChecks)
+
+  start <- Sys.time()
+  message("Constructing covariates from other cohorts")
+
+  # Upload the cohort ID / name pairs as temp table #covariate_cohort_ref; the
+  # column names are converted to snake case on insert.
+  cohortReference <- select(covariateSettings$covariateCohorts, cohortId, cohortName)
+  DatabaseConnector::insertTable(connection,
+                                 tableName = "#covariate_cohort_ref",
+                                 data = cohortReference,
+                                 dropTableIfExists = TRUE,
+                                 createTable = TRUE,
+                                 tempTable = TRUE,
+                                 oracleTempSchema = oracleTempSchema,
+                                 camelCaseToSnakeCase = TRUE)
+
+  # Resolve where the covariate cohorts live: fall back to the main cohort
+  # table, and only schema-qualify the name when a schema was supplied.
+  covariateCohortTable <- covariateSettings$covariateCohortTable
+  if (is.null(covariateCohortTable)) {
+    covariateCohortTable <- cohortTable
+  } else if (!is.null(covariateSettings$covariateCohortDatabaseSchema)) {
+    covariateCohortTable <- paste(covariateSettings$covariateCohortDatabaseSchema,
+                                  covariateCohortTable,
+                                  sep = ".")
+  }
+
+  # Temporal settings take their windows from temporalStartDays/temporalEndDays;
+  # non-temporal settings pass a single startDay/endDay window as SQL parameters.
+  if (covariateSettings$temporal) {
+    analysisName <- "CohortTemporal"
+    windowParameters <- list()
+  } else {
+    analysisName <- "Cohort"
+    windowParameters <- list(startDay = covariateSettings$startDay,
+                             endDay = covariateSettings$endDay)
+  }
+  # The SQL template depends only on the value type, not on temporal-ness.
+  if (covariateSettings$valueType == "binary") {
+    sqlFileName <- "CohortBasedBinaryCovariates.sql"
+  } else {
+    sqlFileName <- "CohortBasedCountCovariates.sql"
+  }
+  parameters <- c(list(covariateCohortTable = covariateCohortTable,
+                       analysisId = covariateSettings$analysisId,
+                       analysisName = analysisName),
+                  windowParameters)
+
+  # NOTE(review): the settings' includedCovariateIds are forwarded as
+  # includedCovariateConceptIds while includedCovariateIds itself stays empty;
+  # confirm that this is the intended filter.
+  analysisDetail <- createAnalysisDetails(analysisId = covariateSettings$analysisId,
+                                          sqlFileName = sqlFileName,
+                                          parameters = parameters,
+                                          includedCovariateConceptIds = covariateSettings$includedCovariateIds,
+                                          addDescendantsToInclude = FALSE,
+                                          excludedCovariateConceptIds = c(),
+                                          addDescendantsToExclude = FALSE,
+                                          includedCovariateIds = c())
+  if (covariateSettings$temporal) {
+    detailedSettings <- createDetailedTemporalCovariateSettings(analyses = list(analysisDetail),
+                                                                temporalStartDays = covariateSettings$temporalStartDays,
+                                                                temporalEndDays = covariateSettings$temporalEndDays)
+  } else {
+    detailedSettings <- createDetailedCovariateSettings(analyses = list(analysisDetail))
+  }
+
+  # detailedSettings is passed positionally, so it fills the first formal of
+  # getDbDefaultCovariateData() that is not matched by name.
+  result <- getDbDefaultCovariateData(connection = connection,
+                                      oracleTempSchema = oracleTempSchema,
+                                      cdmDatabaseSchema = cdmDatabaseSchema,
+                                      cohortTable = cohortTable,
+                                      cohortId = cohortId,
+                                      cdmVersion = cdmVersion,
+                                      rowIdField = rowIdField,
+                                      detailedSettings,
+                                      aggregated = aggregated)
+
+  # Remove the temp reference table again before handing back the result.
+  DatabaseConnector::renderTranslateExecuteSql(connection = connection,
+                                               sql = "TRUNCATE TABLE #covariate_cohort_ref; DROP TABLE #covariate_cohort_ref;",
+                                               progressBar = FALSE,
+                                               reportOverallTime = FALSE)
+  return(result)
+}
+
+#' Create settings for covariates based on other cohorts
+#'
+#' @details
+#' Creates an object specifying covariates to be constructed based on the presence of other cohorts.
+#'
+#' @param analysisId                     A unique identifier for this analysis; must be a single
+#'                                       integer between 1 and 999.
+#' @param covariateCohortDatabaseSchema  The database schema holding the cohorts used to define the
+#'                                       covariates. If \code{NULL}, the covariate cohort table name
+#'                                       is used without a schema prefix.
+#' @param covariateCohortTable           The table holding the cohorts used to define the covariates.
+#'                                       If \code{NULL}, the table of the main cohorts is used.
+#' @param covariateCohorts               A data frame with at least two columns: 'cohortId' and
+#'                                       'cohortName'. The cohort ID should correspond to the
+#'                                       \code{cohort_definition_id} of the cohort to use for
+#'                                       creating a covariate.
+#' @param valueType                      Either 'binary' or 'count'. When \code{valueType = 'count'},
+#'                                       the covariate value will be the number of times the cohort
+#'                                       was observed in the window.
+#' @param startDay                       Start day of the covariate window, relative to the index
+#'                                       date. Must not be later than \code{endDay}.
+#' @param endDay                         End day of the covariate window, relative to the index date.
+#' @param includedCovariateIds           Covariate IDs to restrict the covariates to.
+#' @param warnOnAnalysisIdOverlap        Warn if the provided `analysisId` overlaps with any
+#'                                       predefined analysis as available in the
+#'                                       `createCovariateSettings()` function.
+#'
+#' @return
+#' An object of type \code{covariateSettings}, to be used in other functions.
+#'
+#' @export
+createCohortBasedCovariateSettings <- function(analysisId,
+                                               covariateCohortDatabaseSchema = NULL,
+                                               covariateCohortTable = NULL,
+                                               covariateCohorts,
+                                               valueType = "binary",
+                                               startDay = -365,
+                                               endDay = 0,
+                                               includedCovariateIds = c(),
+                                               warnOnAnalysisIdOverlap = TRUE) {
+  # Validate everything up front; failures are collected and reported together.
+  argumentChecks <- checkmate::makeAssertCollection()
+  checkmate::assertInt(analysisId, lower = 1, upper = 999, add = argumentChecks)
+  checkmate::assertCharacter(covariateCohortDatabaseSchema, len = 1, null.ok = TRUE, add = argumentChecks)
+  checkmate::assertCharacter(covariateCohortTable, len = 1, null.ok = TRUE, add = argumentChecks)
+  checkmate::assertDataFrame(covariateCohorts, min.rows = 1, add = argumentChecks)
+  checkmate::assertNames(colnames(covariateCohorts), must.include = c("cohortId", "cohortName"), add = argumentChecks)
+  checkmate::assertChoice(valueType, c("binary", "count"), add = argumentChecks)
+  checkmate::assertInt(startDay, add = argumentChecks)
+  checkmate::assertInt(endDay, add = argumentChecks)
+  checkmate::assertTRUE(startDay <= endDay, add = argumentChecks)
+  .assertCovariateId(includedCovariateIds, null.ok = TRUE, add = argumentChecks)
+  checkmate::assertLogical(warnOnAnalysisIdOverlap, len = 1, add = argumentChecks)
+  checkmate::reportAssertions(collection = argumentChecks)
+
+  if (warnOnAnalysisIdOverlap) {
+    warnIfPredefined(analysisId)
+  }
+
+  # Start from the fixed flags, then copy every argument of this function into
+  # the settings. Assigning NULL to a not-yet-existing list element adds
+  # nothing, so arguments left at NULL are simply absent from the result.
+  settings <- list(temporal = FALSE,
+                   temporalSequence = FALSE)
+  for (argumentName in names(formals(createCohortBasedCovariateSettings))) {
+    settings[[argumentName]] <- get(argumentName)
+  }
+  # The "fun" attribute names the function that extracts these covariates.
+  attr(settings, "fun") <- "getDbCohortBasedCovariatesData"
+  class(settings) <- "covariateSettings"
+  return(settings)
+}
+
+#' Create settings for temporal covariates based on other cohorts
+#'
+#' @details
+#' Creates an object specifying temporal covariates to be constructed based on the presence of other cohorts.
+#' +#' @param analysisId A unique identifier for this analysis. +#' @param covariateCohortDatabaseSchema The database schema where the cohorts used to define the covariates +#' can be found. If set to \code{NULL}, the database schema will be +#' guessed, for example using the same one as for the main cohorts. +#' @param covariateCohortTable The table where the cohorts used to define the covariates +#' can be found. If set to \code{NULL}, the table will be +#' guessed, for example using the same one as for the main cohorts. +#' @param covariateCohorts A data frame with at least two columns: 'cohortId' and 'cohortName'. The +#' cohort ID should correspond to the \code{cohort_definition_id} of the cohort +#' to use for creating a covariate. +#' @param valueType Either 'binary' or 'count'. When \code{valueType = 'count'}, the covariate +#' value will be the number of times the cohort was observed in the window. +#' @param temporalStartDays A list of integers representing the start of a time +#' period, relative to the index date. 0 indicates the +#' index date, -1 indicates the day before the index +#' date, etc. The start day is included in the time +#' period. +#' @param temporalEndDays A list of integers representing the end of a time +#' period, relative to the index date. 0 indicates the +#' index date, -1 indicates the day before the index +#' date, etc. The end day is included in the time +#' period. +#' @param includedCovariateIds A list of covariate IDs that should be restricted to. +#' @param warnOnAnalysisIdOverlap Warn if the provided `analysisId` overlaps with any predefined analysis as +#' available in the `createTemporalCovariateSettings()` function. +#' +#' @return +#' An object of type \code{covariateSettings}, to be used in other functions. 
+#'
+#' @export
+createCohortBasedTemporalCovariateSettings <- function(analysisId,
+                                                       covariateCohortDatabaseSchema = NULL,
+                                                       covariateCohortTable = NULL,
+                                                       covariateCohorts,
+                                                       valueType = "binary",
+                                                       temporalStartDays = -365:-1,
+                                                       temporalEndDays = -365:-1,
+                                                       includedCovariateIds = c(),
+                                                       warnOnAnalysisIdOverlap = TRUE) {
+  # Validate everything up front; failures are collected and reported together.
+  errorMessages <- checkmate::makeAssertCollection()
+  checkmate::assertInt(analysisId, lower = 1, upper = 999, add = errorMessages)
+  checkmate::assertCharacter(covariateCohortDatabaseSchema, len = 1, null.ok = TRUE, add = errorMessages)
+  checkmate::assertCharacter(covariateCohortTable, len = 1, null.ok = TRUE, add = errorMessages)
+  checkmate::assertDataFrame(covariateCohorts, min.rows = 1, add = errorMessages)
+  checkmate::assertNames(colnames(covariateCohorts), must.include = c("cohortId", "cohortName"), add = errorMessages)
+  checkmate::assertChoice(valueType, c("binary", "count"), add = errorMessages)
+  checkmate::assertIntegerish(temporalStartDays, add = errorMessages)
+  checkmate::assertIntegerish(temporalEndDays, add = errorMessages)
+  # Every window must start on or before the day it ends.
+  checkmate::assertTRUE(all(temporalStartDays <= temporalEndDays), add = errorMessages)
+  .assertCovariateId(includedCovariateIds, null.ok = TRUE, add = errorMessages)
+  checkmate::assertLogical(warnOnAnalysisIdOverlap, len = 1, add = errorMessages)
+  checkmate::reportAssertions(collection = errorMessages)
+
+  if (warnOnAnalysisIdOverlap) {
+    # TRUE selects the prespecified temporal analyses for the overlap check.
+    warnIfPredefined(analysisId, TRUE)
+  }
+
+  # Start from the fixed flags, then copy every argument of this function into
+  # the settings. Assigning NULL to a not-yet-existing list element adds
+  # nothing, so arguments left at NULL are simply absent from the result.
+  covariateSettings <- list(temporal = TRUE,
+                            temporalSequence = FALSE)
+  formalNames <- names(formals(createCohortBasedTemporalCovariateSettings))
+  for (name in formalNames) {
+    value <- get(name)
+    covariateSettings[[name]] <- value
+  }
+  # The "fun" attribute names the function that extracts these covariates.
+  attr(covariateSettings, "fun") <- "getDbCohortBasedCovariatesData"
+  class(covariateSettings) <- "covariateSettings"
+  return(covariateSettings)
+}
+
+# Warns when `analysisId` is already used by one of the prespecified analyses
+# listed in the CSV files shipped with the package.
+#
+# analysisId  Analysis ID to look up.
+# temporal    If TRUE, check the prespecified temporal analyses instead of the
+#             regular ones.
+warnIfPredefined <- function(analysisId, temporal = FALSE) {
+  if (temporal) {
+    csvFile <- system.file("csv", "PrespecTemporalAnalyses.csv", package = "FeatureExtraction")
+  } else {
+    csvFile <- system.file("csv", "PrespecAnalyses.csv", package = "FeatureExtraction")
+  }
+  preSpecAnalysis <- read.csv(csvFile) %>%
+    filter(.data$analysisId == !!analysisId)
+  if (nrow(preSpecAnalysis) > 0) {
+    # Collapse the names so that several matches still yield one readable
+    # warning instead of several messages glued together.
+    warning(sprintf("Analysis ID %d also used for prespecified analysis '%s'.",
+                    analysisId,
+                    paste(preSpecAnalysis$analysisName, collapse = "', '")))
+  }
+}
diff --git a/R/HelperFunctions.R b/R/HelperFunctions.R
index 7cd65b78..c7baaf42 100644
--- a/R/HelperFunctions.R
+++ b/R/HelperFunctions.R
@@ -102,3 +102,25 @@ filterByCohortDefinitionId <- function(covariateData, cohortId) {
   attr(class(result), "package") <- "FeatureExtraction"
   return(result)
 }
+
+# Asserts that `covariateId` is numeric and holds only whole numbers.
+#
+# covariateId  Value to check.
+# len          Exact required length, or NULL for no exact-length check.
+# min.len      Minimum required length; NULL means a minimum of 1.
+# null.ok      If TRUE, a NULL covariateId is accepted.
+# add          Optional checkmate assert collection that gathers failures.
+.assertCovariateId <- function(covariateId, len = NULL, min.len = NULL, null.ok = FALSE, add = NULL) {
+  # Honor a caller-supplied min.len; fall back to 1 when none is given.
+  if (is.null(min.len)) {
+    min.len <- 1
+  }
+  checkmate::assertNumeric(covariateId, null.ok = null.ok, len = len, min.len = min.len, add = add)
+  if (!is.null(covariateId)) {
+    # Deparse the argument expression, resolved one frame up, so the failure message can name the offending input.
+    message <- sprintf("Variable '%s' is a (64-bit) integer",
+                       paste0(deparse(eval.parent(substitute(substitute(covariateId))), width.cutoff = 500L), collapse = "\n"))
+    # Doubles are accepted as long as every value is a whole number.
+    checkmate::assertTRUE(all(covariateId == round(covariateId)), .var.name = message, add = add)
+  }
+}
diff --git a/README.md b/README.md
index dac18e69..3adf6a95 100644
--- a/README.md
+++ b/README.md
@@ -45,6 +45,7 @@ The documentation website can be found at [https://ohdsi.github.io/FeatureExtrac
 * Vignette: [Using FeatureExtraction](https://raw.githubusercontent.com/OHDSI/FeatureExtraction/main/inst/doc/UsingFeatureExtraction.pdf)
 * Vignette: [Creating covariates using cohort attributes](https://raw.githubusercontent.com/OHDSI/FeatureExtraction/main/inst/doc/CreatingCovariatesUsingCohortAttributes.pdf)
 * Vignette: [Creating custom covariate builders](https://raw.githubusercontent.com/OHDSI/FeatureExtraction/main/inst/doc/CreatingCustomCovariateBuilders.pdf)
+* Vignette: [Creating covariates based on other
cohorts](https://raw.githubusercontent.com/OHDSI/FeatureExtraction/main/inst/doc/CreatingCovariatesBasedOnOtherCohorts.pdf)
 * Package manual: [FeatureExtraction manual](https://raw.githubusercontent.com/OHDSI/FeatureExtraction/main/extras/FeatureExtraction.pdf)
 
 These vignettes are also available in Korean:
diff --git a/extras/CohortBasedCovariatesVignetteDataFetch.R b/extras/CohortBasedCovariatesVignetteDataFetch.R
new file mode 100644
index 00000000..f6f079ac
--- /dev/null
+++ b/extras/CohortBasedCovariatesVignetteDataFetch.R
@@ -0,0 +1,116 @@
+# Copyright 2022 Observational Health Data Sciences and Informatics
+#
+# This file is part of FeatureExtraction
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Fetches the data used in the cohort-based covariates vignette: runs the cohort SQL, extracts per-person and aggregated covariates, and saves both under vignetteFolder.
+library(FeatureExtraction)
+library(SqlRender)
+vignetteFolder <- "s:/temp/vignetteFeatureExtractionCohortBased" # Folder the extracted covariate data are saved to
+
+# MDCD on RedShift (connection string and credentials are read from the keyring)
+connectionDetails <- createConnectionDetails(dbms = "redshift",
+                                             connectionString = keyring::key_get("redShiftConnectionStringOhdaMdcd"),
+                                             user = keyring::key_get("redShiftUserName"),
+                                             password = keyring::key_get("redShiftPassword"))
+cdmDatabaseSchema <- "cdm_truven_mdcd_v1978"
+cohortDatabaseSchema <- "scratch_mschuemi"
+cohortTable <- "feature_extraction_cohort_based"
+cdmVersion <- "5"
+
+
+# Create cohorts -------------------------------------------------------
+connection <- connect(connectionDetails)
+sql <- readSql(system.file("sql", "sql_server", "covariateCohorts.sql", package = "FeatureExtraction")) # SQL file shipped with the package
+renderTranslateExecuteSql(connection = connection,
+                          sql = sql,
+                          cdm_database_schema = cdmDatabaseSchema,
+                          cohort_database_schema = cohortDatabaseSchema,
+                          cohort_table = cohortTable)
+
+# Check number of subjects per cohort:
+sql <- paste("SELECT cohort_definition_id,
+  COUNT(*) AS count",
+             "FROM @cohort_database_schema.@cohort_table",
+             "GROUP BY cohort_definition_id")
+renderTranslateQuerySql(connection = connection,
+                        sql = sql,
+                        cohort_database_schema = cohortDatabaseSchema,
+                        cohort_table = cohortTable)
+disconnect(connection)
+
+
+# Construct covariates -----------------------------------------------
+covariateCohorts <- tibble(cohortId = 2,
+                           cohortName = "Type 2 diabetes") # NOTE(review): tibble() is not attached explicitly above; presumably provided by an attached dependency - confirm
+
+covariateSettings <- createCohortBasedCovariateSettings(analysisId = 999,
+                                                        covariateCohorts = covariateCohorts,
+                                                        valueType = "binary",
+                                                        startDay = -365,
+                                                        endDay = 0)
+
+covariateData <- getDbCovariateData(connectionDetails = connectionDetails,
+                                    cdmDatabaseSchema = cdmDatabaseSchema,
+                                    cohortDatabaseSchema = cohortDatabaseSchema,
+                                    cohortTable = cohortTable,
+                                    cohortId = 1,
+                                    rowIdField = "subject_id",
+                                    covariateSettings = covariateSettings)
+
+saveCovariateData(covariateData, file.path(vignetteFolder, "covariatesPerPerson"))
+# covariateData <- loadCovariateData(file.path(vignetteFolder, "covariatesPerPerson"))
+summary(covariateData)
+covariateData$covariateRef
+
+
+covariateSettings1 <- createCovariateSettings(useDemographicsGender = TRUE,
+                                              useDemographicsAgeGroup = TRUE,
+                                              useDemographicsRace = TRUE,
+                                              useDemographicsEthnicity = TRUE,
+                                              useDemographicsIndexYear = TRUE,
+                                              useDemographicsIndexMonth = TRUE)
+
+covariateCohorts <- tibble(cohortId = 2, # Identical to the covariateCohorts definition above
+                           cohortName = "Type 2 diabetes")
+
+covariateSettings2 <- createCohortBasedCovariateSettings(analysisId = 999, # Same arguments as covariateSettings above
+                                                         covariateCohorts = covariateCohorts,
+                                                         valueType = "binary",
+                                                         startDay = -365,
+                                                         endDay = 0)
+
+covariateSettingsList <- list(covariateSettings1, covariateSettings2) # Demographics plus the cohort-based covariate
+
+covariateData <- getDbCovariateData(connectionDetails = connectionDetails,
+                                    cdmDatabaseSchema = cdmDatabaseSchema,
+                                    cohortDatabaseSchema = cohortDatabaseSchema,
+                                    cohortTable = cohortTable,
+                                    cohortId = 1,
+                                    rowIdField = "subject_id",
+                                    covariateSettings = covariateSettingsList,
+                                    aggregated = TRUE) # Request aggregated output this time
+
+saveCovariateData(covariateData, file.path(vignetteFolder, "covariatesAggregated"))
+# covariateData <- loadCovariateData(file.path(vignetteFolder, "covariatesAggregated"))
+summary(covariateData)
+
+# Clean up ---------------------------------------------------------------------
+connection <- connect(connectionDetails)
+sql <- "DROP TABLE @cohort_database_schema.@cohort_table" # Removes the cohort table used by this script
+renderTranslateExecuteSql(connection = connection,
+                          sql = sql,
+                          cohort_database_schema = cohortDatabaseSchema,
+                          cohort_table = cohortTable)
+disconnect(connection)
diff --git a/extras/PackageMaintenance.R b/extras/PackageMaintenance.R
index 7fa810da..7ec402ea 100644
--- a/extras/PackageMaintenance.R
+++ b/extras/PackageMaintenance.R
@@ -57,6 +57,13 @@ rmarkdown::render("vignettes/UsingFeatureExtraction.Rmd",
                   number_sections = TRUE))
 unlink("inst/doc/UsingFeatureExtraction.tex")
 
+rmarkdown::render("vignettes/CreatingCovariatesBasedOnOtherCohorts.Rmd",
+                  output_file =
"../inst/doc/CreatingCovariatesBasedOnOtherCohorts.pdf", + rmarkdown::pdf_document(latex_engine = "pdflatex", + toc = TRUE, + number_sections = TRUE)) +unlink("inst/doc/CreatingCovariatesBasedOnOtherCohorts.tex") + # Note: these LaTex packages are required to render the Korean vignettes, but for # some reason are not installed automatically: # - kotex* diff --git a/inst/csv/OtherSqlToLoad.csv b/inst/csv/OtherSqlToLoad.csv new file mode 100644 index 00000000..7a8d93f9 --- /dev/null +++ b/inst/csv/OtherSqlToLoad.csv @@ -0,0 +1,3 @@ +analysisName,sqlFileName +cohort,CohortBasedBinaryCovariates.sql +cohortCount,CohortBasedCountCovariates.sql diff --git a/inst/csv/jarChecksum.txt b/inst/csv/jarChecksum.txt index 6d09f530..80c68cbd 100644 --- a/inst/csv/jarChecksum.txt +++ b/inst/csv/jarChecksum.txt @@ -1 +1 @@ -4c4c70d5446f1b6a33cf0f11faddc1b958fe9795798e3dc4f960ae09d65320af +5133cf2f456e6ac9c6b0cf9ea8be76bc8b16867baa0bc63f796cf1b92510b56b diff --git a/inst/doc/CreatingCovariatesBasedOnOtherCohorts.pdf b/inst/doc/CreatingCovariatesBasedOnOtherCohorts.pdf new file mode 100644 index 00000000..0835fda5 Binary files /dev/null and b/inst/doc/CreatingCovariatesBasedOnOtherCohorts.pdf differ diff --git a/inst/java/featureExtraction-3.2.0-SNAPSHOT.jar b/inst/java/featureExtraction-3.2.0-SNAPSHOT.jar index 33e6f464..355905cb 100644 Binary files a/inst/java/featureExtraction-3.2.0-SNAPSHOT.jar and b/inst/java/featureExtraction-3.2.0-SNAPSHOT.jar differ diff --git a/inst/sql/sql_server/CohortBasedBinaryCovariates.sql b/inst/sql/sql_server/CohortBasedBinaryCovariates.sql new file mode 100644 index 00000000..b3db28db --- /dev/null +++ b/inst/sql/sql_server/CohortBasedBinaryCovariates.sql @@ -0,0 +1,107 @@ +-- Feature construction +SELECT + CAST(covariate_cohort_id AS BIGINT) * 1000 + @analysis_id AS covariate_id, +{@temporal | @temporal_sequence} ? { + time_id, +} +{@aggregated} ? 
{ + cohort_definition_id, + COUNT(*) AS sum_value +} : { + row_id, + 1 AS covariate_value +} +INTO @covariate_table +FROM ( + SELECT DISTINCT covariate_cohort.cohort_definition_id AS covariate_cohort_id, +{@temporal} ? { + time_id, +} +{@temporal_sequence} ? { + FLOOR(DATEDIFF(@time_part, covariate_cohort.cohort_start_date, cohort.cohort_start_date)*1.0/@time_interval ) as time_id, +} +{@aggregated} ? { + cohort.cohort_definition_id, + cohort.subject_id, + cohort.cohort_start_date +} : { + cohort.@row_id_field AS row_id +} + FROM @cohort_table cohort + INNER JOIN @covariate_cohort_table covariate_cohort + ON cohort.subject_id = covariate_cohort.subject_id + INNER JOIN #covariate_cohort_ref covariate_cohort_ref + ON covariate_cohort.cohort_definition_id = covariate_cohort_ref.cohort_id +{@temporal} ? { + INNER JOIN #time_period time_period + ON covariate_cohort.cohort_start_date <= DATEADD(DAY, time_period.end_day, cohort.cohort_start_date) + WHERE CASE WHEN covariate_cohort.cohort_end_date IS NULL THEN covariate_cohort.cohort_start_date ELSE covariate_cohort.cohort_end_date END >= DATEADD(DAY, time_period.start_day, cohort.cohort_start_date) +} : { + WHERE covariate_cohort.cohort_start_date <= DATEADD(DAY, {@temporal_sequence} ? {@sequence_end_day} : {@end_day}, cohort.cohort_start_date) +{@start_day != 'anyTimePrior'} ? { + AND CASE WHEN covariate_cohort.cohort_end_date IS NULL THEN covariate_cohort.cohort_start_date ELSE covariate_cohort.cohort_end_date END >= DATEADD(DAY, {@temporal_sequence} ? {@sequence_start_day} : {@start_day}, cohort.cohort_start_date) +} +} +{@included_cov_table != ''} ? { AND CAST(covariate_cohort.cohort_definition_id AS BIGINT) * 1000 + @analysis_id IN (SELECT id FROM @included_cov_table)} +{@cohort_definition_id != -1} ? { AND cohort.cohort_definition_id IN (@cohort_definition_id)} +) by_row_id +{@aggregated} ? { +GROUP BY cohort_definition_id, + covariate_cohort_id +{@temporal | @temporal_sequence} ? 
{ + ,time_id +} +} +; + +-- Reference construction +INSERT INTO #cov_ref ( + covariate_id, + covariate_name, + analysis_id, + concept_id + ) +SELECT covariate_id, +{@temporal | @temporal_sequence} ? { + CAST(CONCAT('cohort: ', cohort_name) AS VARCHAR(512)) AS covariate_name, +} : { +{@start_day == 'anyTimePrior'} ? { + CAST(CONCAT('cohort any time prior through @end_day days relative to index: ', cohort_name) AS VARCHAR(512)) AS covariate_name, +} : { + CAST(CONCAT('cohort during day @start_day through @end_day days relative to index: ', cohort_name) AS VARCHAR(512)) AS covariate_name, +} +} + @analysis_id AS analysis_id, + 0 AS concept_id +FROM ( + SELECT DISTINCT covariate_id + FROM @covariate_table + ) t1 +LEFT JOIN #covariate_cohort_ref + ON cohort_id = CAST((covariate_id - @analysis_id) / 1000 AS INT); + +INSERT INTO #analysis_ref ( + analysis_id, + analysis_name, + domain_id, +{!@temporal} ? { + start_day, + end_day, +} + is_binary, + missing_means_zero + ) +SELECT @analysis_id AS analysis_id, + CAST('@analysis_name' AS VARCHAR(512)) AS analysis_name, + CAST('cohort' AS VARCHAR(20)) AS domain_id, +{!@temporal} ? { +{@start_day == 'anyTimePrior'} ? { + CAST(NULL AS INT) AS start_day, +} : { + + {@temporal_sequence} ? {@sequence_start_day} : {@start_day} AS start_day, +} + {@temporal_sequence} ? {@sequence_end_day} : {@end_day} AS end_day, +} + CAST('Y' AS VARCHAR(1)) AS is_binary, + CAST(NULL AS VARCHAR(1)) AS missing_means_zero; diff --git a/inst/sql/sql_server/CohortBasedCountCovariates.sql b/inst/sql/sql_server/CohortBasedCountCovariates.sql new file mode 100644 index 00000000..60f5a42a --- /dev/null +++ b/inst/sql/sql_server/CohortBasedCountCovariates.sql @@ -0,0 +1,259 @@ +-- Feature construction +{@aggregated} ? 
{ +DROP TABLE IF EXISTS #occ_count_data; +DROP TABLE IF EXISTS #occ_count_stats; +DROP TABLE IF EXISTS #occ_count_prep; +DROP TABLE IF EXISTS #occ_count_prep2; +} + +SELECT CAST(covariate_cohort.cohort_definition_id AS BIGINT) * 1000 + @analysis_id AS covariate_id, +{@temporal} ? { + time_id, +} +{@aggregated} ? { + COUNT(DISTINCT covariate_cohort.cohort_start_date) AS occurrence_count, + main_cohort.cohort_definition_id, + main_cohort.subject_id, + main_cohort.cohort_start_date +INTO #occ_count_data +} : { + COUNT(DISTINCT covariate_cohort.cohort_start_date) AS covariate_value, + main_cohort.@row_id_field AS row_id +INTO @covariate_table +} +FROM @cohort_table main_cohort +INNER JOIN @covariate_cohort_table covariate_cohort + ON main_cohort.subject_id = covariate_cohort.subject_id +INNER JOIN #covariate_cohort_ref covariate_cohort_ref + ON covariate_cohort.cohort_definition_id = covariate_cohort_ref.cohort_id +{@temporal} ? { +INNER JOIN #time_period time_period + ON covariate_cohort.cohort_start_date <= DATEADD(DAY, time_period.end_day, main_cohort.cohort_start_date) + AND CASE WHEN covariate_cohort.cohort_end_date IS NULL THEN covariate_cohort.cohort_start_date ELSE covariate_cohort.cohort_end_date END >= DATEADD(DAY, time_period.start_day, main_cohort.cohort_start_date) +} : { +WHERE covariate_cohort.cohort_start_date <= DATEADD(DAY, @end_day, main_cohort.cohort_start_date) +{@start_day != 'anyTimePrior'} ? { + AND CASE WHEN covariate_cohort.cohort_end_date IS NULL THEN covariate_cohort.cohort_start_date ELSE covariate_cohort.cohort_end_date END >= DATEADD(DAY, @start_day, main_cohort.cohort_start_date) +} +} +{@included_cov_table != ''} ? { AND CAST(covariate_cohort.cohort_definition_id AS BIGINT) * 1000 + @analysis_id IN (SELECT id FROM @included_cov_table)} +{@cohort_definition_id != -1} ? { AND main_cohort.cohort_definition_id IN (@cohort_definition_id)} +GROUP BY covariate_cohort.cohort_definition_id, +{@temporal} ? { + time_id, +} +{@aggregated} ? 
{ + main_cohort.cohort_definition_id, + main_cohort.subject_id, + main_cohort.cohort_start_date +} : { + main_cohort.@row_id_field +} +; + +{@aggregated} ? { +WITH t1 AS ( + SELECT cohort_definition_id, + COUNT(*) AS cnt + FROM @cohort_table +{@cohort_definition_id != -1} ? { WHERE cohort_definition_id IN (@cohort_definition_id)} + GROUP BY cohort_definition_id + ), +t2 AS ( + SELECT cohort_definition_id, + COUNT(*) AS cnt, + covariate_id, +{@temporal} ? { + time_id, +} + MIN(occurrence_count) AS min_occurrence_count, + MAX(occurrence_count) AS max_occurrence_count, + SUM(CAST(occurrence_count AS BIGINT)) AS sum_occurrence_count, + SUM(CAST(occurrence_count AS BIGINT) * CAST(occurrence_count AS BIGINT)) AS squared_occurrence_count + FROM #occ_count_data + GROUP BY cohort_definition_id, +{@temporal} ? { + time_id, +} + covariate_id + ) +SELECT t1.cohort_definition_id, + CASE WHEN t2.cnt = t1.cnt THEN t2.min_occurrence_count ELSE 0 END AS min_value, + t2.max_occurrence_count AS max_value, + covariate_id, +{@temporal} ? { + time_id, +} + CAST(t2.sum_occurrence_count / (1.0 * t1.cnt) AS FLOAT) AS average_value, + CAST(CASE + WHEN t2.cnt = 1 THEN 0 + ELSE SQRT((1.0 * t2.cnt*t2.squared_occurrence_count - 1.0 * t2.sum_occurrence_count*t2.sum_occurrence_count) / (1.0 * t2.cnt*(1.0 * t2.cnt - 1))) + END AS FLOAT) AS standard_deviation, + t2.cnt AS count_value, + t1.cnt - t2.cnt AS count_no_value, + t1.cnt AS population_size +INTO #occ_count_stats +FROM t1 +INNER JOIN t2 + ON t1.cohort_definition_id = t2.cohort_definition_id; + +SELECT cohort_definition_id, +{@temporal} ? { + time_id, +} + occurrence_count, + COUNT(*) AS total, + covariate_id, +{@temporal} ? { + ROW_NUMBER() OVER (PARTITION BY cohort_definition_id, covariate_id, time_id ORDER BY occurrence_count) AS rn +} : { + ROW_NUMBER() OVER (PARTITION BY cohort_definition_id, covariate_id ORDER BY occurrence_count) AS rn +} +INTO #occ_count_prep +FROM #occ_count_data +GROUP BY cohort_definition_id, +{@temporal} ? 
{ + time_id, +} + covariate_id, + occurrence_count; + +SELECT s.cohort_definition_id, +{@temporal} ? { + s.time_id, +} + s.covariate_id, + s.occurrence_count, + SUM(p.total) AS accumulated +INTO #occ_count_prep2 +FROM #occ_count_prep s +INNER JOIN #occ_count_prep p + ON p.rn <= s.rn + AND p.cohort_definition_id = s.cohort_definition_id + AND p.covariate_id = s.covariate_id +{@temporal} ? { + AND p.time_id = s.time_id +} +GROUP BY s.cohort_definition_id, +{@temporal} ? { + s.time_id, +} + s.covariate_id, + s.occurrence_count; + +SELECT o.covariate_id, + o.cohort_definition_id, +{@temporal} ? { + o.time_id, +} + o.count_value, + o.min_value, + o.max_value, + CAST(o.average_value AS FLOAT) average_value, + CAST(o.standard_deviation AS FLOAT) standard_deviation, + CASE + WHEN .50 * o.population_size < count_no_value THEN 0 + ELSE MIN(CASE WHEN p.accumulated + count_no_value >= .50 * o.population_size THEN occurrence_count END) + END AS median_value, + CASE + WHEN .10 * o.population_size < count_no_value THEN 0 + ELSE MIN(CASE WHEN p.accumulated + count_no_value >= .10 * o.population_size THEN occurrence_count END) + END AS p10_value, + CASE + WHEN .25 * o.population_size < count_no_value THEN 0 + ELSE MIN(CASE WHEN p.accumulated + count_no_value >= .25 * o.population_size THEN occurrence_count END) + END AS p25_value, + CASE + WHEN .75 * o.population_size < count_no_value THEN 0 + ELSE MIN(CASE WHEN p.accumulated + count_no_value >= .75 * o.population_size THEN occurrence_count END) + END AS p75_value, + CASE + WHEN .90 * o.population_size < count_no_value THEN 0 + ELSE MIN(CASE WHEN p.accumulated + count_no_value >= .90 * o.population_size THEN occurrence_count END) + END AS p90_value +INTO @covariate_table +FROM #occ_count_prep2 p +INNER JOIN #occ_count_stats o + ON p.covariate_id = o.covariate_id + AND p.cohort_definition_id = o.cohort_definition_id +{@temporal} ? { + AND p.time_id = o.time_id +} +{@included_cov_table != ''} ? 
{WHERE covariate_id IN (SELECT id FROM @included_cov_table)} +GROUP BY o.cohort_definition_id, +{@temporal} ? { + o.time_id, +} + o.count_value, + o.count_no_value, + o.min_value, + o.max_value, + o.average_value, + o.standard_deviation, + o.covariate_id, + o.population_size; + +TRUNCATE TABLE #occ_count_data; +DROP TABLE #occ_count_data; + +TRUNCATE TABLE #occ_count_stats; +DROP TABLE #occ_count_stats; + +TRUNCATE TABLE #occ_count_prep; +DROP TABLE #occ_count_prep; + +TRUNCATE TABLE #occ_count_prep2; +DROP TABLE #occ_count_prep2; +} + +-- Reference construction +INSERT INTO #cov_ref ( + covariate_id, + covariate_name, + analysis_id, + concept_id + ) +SELECT covariate_id, +{@temporal | @temporal_sequence} ? { + CAST(CONCAT('cohort count: ', cohort_name) AS VARCHAR(512)) AS covariate_name, +} : { +{@start_day == 'anyTimePrior'} ? { + CAST(CONCAT('cohort count any time prior through @end_day days relative to index: ', cohort_name) AS VARCHAR(512)) AS covariate_name, +} : { + CAST(CONCAT('cohort count during day @start_day through @end_day days relative to index: ', cohort_name) AS VARCHAR(512)) AS covariate_name, +} +} + @analysis_id AS analysis_id, + 0 AS concept_id +FROM ( + SELECT DISTINCT covariate_id + FROM @covariate_table + ) t1 +LEFT JOIN #covariate_cohort_ref + ON cohort_id = CAST((covariate_id - @analysis_id) / 1000 AS INT); + +INSERT INTO #analysis_ref ( + analysis_id, + analysis_name, + domain_id, +{!@temporal} ? { + start_day, + end_day, +} + is_binary, + missing_means_zero + ) +SELECT @analysis_id AS analysis_id, + CAST('@analysis_name' AS VARCHAR(512)) AS analysis_name, + CAST('cohort' AS VARCHAR(20)) AS domain_id, +{!@temporal} ? { +{@start_day == 'anyTimePrior'} ? { + CAST(NULL AS INT) AS start_day, +} : { + + {@temporal_sequence} ? {@sequence_start_day} : {@start_day} AS start_day, +} + {@temporal_sequence} ? 
{@sequence_end_day} : {@end_day} AS end_day, +} + CAST('N' AS VARCHAR(1)) AS is_binary, + CAST('Y' AS VARCHAR(1)) AS missing_means_zero; diff --git a/inst/sql/sql_server/covariateCohorts.sql b/inst/sql/sql_server/covariateCohorts.sql new file mode 100644 index 00000000..87d60a9f --- /dev/null +++ b/inst/sql/sql_server/covariateCohorts.sql @@ -0,0 +1,56 @@ +/************************ +File covariateCohorts.sql +*************************/ +DROP TABLE IF EXISTS @cohort_database_schema.@cohort_table; + +CREATE TABLE @cohort_database_schema.@cohort_table ( + cohort_definition_id INT, + subject_id BIGINT, + cohort_start_date DATE, + cohort_end_date DATE + ); + +INSERT INTO @cohort_database_schema.@cohort_table ( + cohort_definition_id, + subject_id, + cohort_start_date, + cohort_end_date + ) +SELECT 1, + person_id, + MIN(drug_era_start_date), + MIN(drug_era_end_date) +FROM @cdm_database_schema.drug_era +WHERE drug_concept_id = 1124300 --diclofenac +GROUP BY person_id; + +INSERT INTO @cohort_database_schema.@cohort_table ( + cohort_definition_id, + subject_id, + cohort_start_date, + cohort_end_date + ) +SELECT 2, + condition_occurrence.person_id, + MIN(condition_start_date), + MIN(observation_period_end_date) +FROM @cdm_database_schema.condition_occurrence +INNER JOIN @cdm_database_schema.drug_exposure + ON condition_occurrence.person_id = drug_exposure.person_id + AND drug_exposure_start_date >= condition_start_date + AND drug_exposure_start_date < DATEADD(DAY, 30, condition_start_date) +INNER JOIN @cdm_database_schema.observation_period + ON condition_occurrence.person_id = observation_period.person_id + AND condition_start_date >= observation_period_start_date + AND condition_start_date <= observation_period_end_date +WHERE condition_concept_id IN ( + SELECT descendant_concept_id + FROM @cdm_database_schema.concept_ancestor + WHERE ancestor_concept_id = 201826 -- Type 2 diabetes mellitus + ) + AND drug_concept_id IN ( + SELECT descendant_concept_id + FROM 
@cdm_database_schema.concept_ancestor + WHERE ancestor_concept_id = 21600712 -- DRUGS USED IN DIABETES (ATC A10) + ) +GROUP BY condition_occurrence.person_id; \ No newline at end of file diff --git a/java/org/ohdsi/featureExtraction/FeatureExtraction.java b/java/org/ohdsi/featureExtraction/FeatureExtraction.java index 0661aed7..ae259058 100644 --- a/java/org/ohdsi/featureExtraction/FeatureExtraction.java +++ b/java/org/ohdsi/featureExtraction/FeatureExtraction.java @@ -30,7 +30,6 @@ import java.util.LinkedHashMap; import java.util.List; import java.util.Map; -import java.util.Objects; import java.util.Set; import java.util.concurrent.locks.ReentrantLock; @@ -76,20 +75,20 @@ public class FeatureExtraction { private static String ADD_DESCENDANTS_SQL = "SELECT descendant_concept_id AS id\nINTO @target_temp\nFROM @cdm_database_schema.concept_ancestor\nINNER JOIN @source_temp\n\tON ancestor_concept_id = id;\n\n"; public static void main(String[] args) { - //init("C:/Users/mschuemi/git/FeatureExtraction/inst"); + init("C:/Users/mschuemi/git/FeatureExtraction/inst"); // init("C:/R/R-3.3.1/library/FeatureExtraction"); - init("D:/git/OHDSI/FeatureExtraction/inst"); +// init("D:/git/OHDSI/FeatureExtraction/inst"); // System.out.println(convertSettingsPrespecToDetails("{\"temporal\":false,\"DemographicsGender\":true,\"DemographicsAge\":true,\"longTermStartDays\":-365,\"mediumTermStartDays\":-180,\"shortTermStartDays\":-30,\"endDays\":0,\"includedCovariateConceptIds\":[],\"addDescendantsToInclude\":false,\"excludedCovariateConceptIds\":[1,2,3],\"addDescendantsToExclude\":false,\"includedCovariateIds\":[]}")); // System.out.println(getDefaultPrespecAnalyses()); // // System.out.println(getDefaultPrespecAnalyses()); // //String settings = getDefaultPrespecTemporalAnalyses(); - String settings = getDefaultPrespecTemporalSequenceAnalyses(); +// String settings = getDefaultPrespecTemporalSequenceAnalyses(); // String settings = 
convertSettingsPrespecToDetails(getDefaultPrespecTemporalAnalyses()); // System.out.println(convertSettingsPrespecToDetails(getDefaultPrespecAnalyses())); - // String settings = - // "{\"temporal\":false,\"analyses\":[{\"analysisId\":301,\"sqlFileName\":\"DomainConcept.sql\",\"parameters\":{\"analysisId\":301,\"startDay\":-365,\"endDay\":0,\"inpatient\":\"\",\"domainTable\":\"drug_exposure\",\"domainConceptId\":\"drug_concept_id\",\"domainStartDate\":\"drug_exposure_start_date\",\"domainEndDate\":\"drug_exposure_start_date\"},\"addDescendantsToExclude\":true,\"includedCovariateConceptIds\":[1,2,21600537410],\"excludedCovariateConceptIds\":{},\"addDescendantsToInclude\":true,\"includedCovariateIds\":12301}]}"; + String settings = + "{\"temporal\":false,\"temporalSequence\":false,\"analyses\":[{\"analysisId\":999,\"sqlFileName\":\"CohortBasedBinaryCovariates.sql\",\"parameters\":{\"covariateCohortTable\":\"cohort\",\"analysisId\":999,\"analysisName\":\"Cohort\",\"startDay\":-365,\"endDay\":0},\"includedCovariateConceptIds\":[],\"addDescendantsToInclude\":false,\"excludedCovariateConceptIds\":[],\"addDescendantsToExclude\":false,\"includedCovariateIds\":[]}]}"; // String settings = convertSettingsPrespecToDetails(getDefaultPrespecAnalyses()); System.out.println(createSql(settings, true, "#temp_cohort", "row_id", -1, "cdm_synpuf")); // System.out.println(createSql(getDefaultPrespecAnalyses(), true, "#temp_cohort", "row_id", -1, "cdm_synpuf")); @@ -115,6 +114,7 @@ public static void init(String packageFolder) { nameToPrespecAnalysis = loadPrespecAnalysis(packageFolder, "PrespecAnalyses.csv"); nameToPrespecTemporalAnalysis = loadPrespecAnalysis(packageFolder, "PrespecTemporalAnalyses.csv"); nameToPrespecTemporalSequenceAnalysis = loadPrespecAnalysis(packageFolder, "PrespecTemporalSequenceAnalyses.csv"); + loadPrespecAnalysis(packageFolder, "OtherSqlToLoad.csv"); // Called for side-effect of adding SQL filenames to nameToSql keys. 
loadTemplateSql(packageFolder); createCovRefTableSql = loadSqlFile(packageFolder, "CreateCovAnalysisRefTables.sql"); } diff --git a/man/createCohortBasedCovariateSettings.Rd b/man/createCohortBasedCovariateSettings.Rd new file mode 100644 index 00000000..23bde00c --- /dev/null +++ b/man/createCohortBasedCovariateSettings.Rd @@ -0,0 +1,54 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/GetCovariatesFromOtherCohorts.R +\name{createCohortBasedCovariateSettings} +\alias{createCohortBasedCovariateSettings} +\title{Create settings for covariates based on other cohorts} +\usage{ +createCohortBasedCovariateSettings( + analysisId, + covariateCohortDatabaseSchema = NULL, + covariateCohortTable = NULL, + covariateCohorts, + valueType = "binary", + startDay = -365, + endDay = 0, + includedCovariateIds = c(), + warnOnAnalysisIdOverlap = TRUE +) +} +\arguments{ +\item{analysisId}{A unique identifier for this analysis.} + +\item{covariateCohortDatabaseSchema}{The database schema where the cohorts used to define the covariates +can be found. If set to \code{NULL}, the database schema will be +guessed, for example using the same one as for the main cohorts.} + +\item{covariateCohortTable}{The table where the cohorts used to define the covariates +can be found. If set to \code{NULL}, the table will be +guessed, for example using the same one as for the main cohorts.} + +\item{covariateCohorts}{A data frame with at least two columns: 'cohortId' and 'cohortName'. The +cohort ID should correspond to the \code{cohort_definition_id} of the cohort +to use for creating a covariate.} + +\item{valueType}{Either 'binary' or 'count'. 
When \code{valueType = 'count'}, the covariate +value will be the number of times the cohort was observed in the window.} + +\item{startDay}{What is the start day (relative to the index date) of the covariate window?} + +\item{endDay}{What is the end day (relative to the index date) of the covariate window?} + +\item{includedCovariateIds}{A list of covariate IDs that should be restricted to.} + +\item{warnOnAnalysisIdOverlap}{Warn if the provided `analysisId` overlaps with any predefined analysis as +available in the `createCovariateSettings()` function.} +} +\value{ +An object of type \code{covariateSettings}, to be used in other functions. +} +\description{ +Create settings for covariates based on other cohorts +} +\details{ +Creates an object specifying covariates to be constructed based on the presence of other cohorts. +} diff --git a/man/createCohortBasedTemporalCovariateSettings.Rd b/man/createCohortBasedTemporalCovariateSettings.Rd new file mode 100644 index 00000000..166c73fa --- /dev/null +++ b/man/createCohortBasedTemporalCovariateSettings.Rd @@ -0,0 +1,62 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/GetCovariatesFromOtherCohorts.R +\name{createCohortBasedTemporalCovariateSettings} +\alias{createCohortBasedTemporalCovariateSettings} +\title{Create settings for temporal covariates based on other cohorts} +\usage{ +createCohortBasedTemporalCovariateSettings( + analysisId, + covariateCohortDatabaseSchema = NULL, + covariateCohortTable = NULL, + covariateCohorts, + valueType = "binary", + temporalStartDays = -365:-1, + temporalEndDays = -365:-1, + includedCovariateIds = c(), + warnOnAnalysisIdOverlap = TRUE +) +} +\arguments{ +\item{analysisId}{A unique identifier for this analysis.} + +\item{covariateCohortDatabaseSchema}{The database schema where the cohorts used to define the covariates +can be found.
If set to \code{NULL}, the database schema will be +guessed, for example using the same one as for the main cohorts.} + +\item{covariateCohortTable}{The table where the cohorts used to define the covariates +can be found. If set to \code{NULL}, the table will be +guessed, for example using the same one as for the main cohorts.} + +\item{covariateCohorts}{A data frame with at least two columns: 'cohortId' and 'cohortName'. The +cohort ID should correspond to the \code{cohort_definition_id} of the cohort +to use for creating a covariate.} + +\item{valueType}{Either 'binary' or 'count'. When \code{valueType = 'count'}, the covariate +value will be the number of times the cohort was observed in the window.} + +\item{temporalStartDays}{A list of integers representing the start of a time +period, relative to the index date. 0 indicates the +index date, -1 indicates the day before the index +date, etc. The start day is included in the time +period.} + +\item{temporalEndDays}{A list of integers representing the end of a time +period, relative to the index date. 0 indicates the +index date, -1 indicates the day before the index +date, etc. The end day is included in the time +period.} + +\item{includedCovariateIds}{A list of covariate IDs that should be restricted to.} + +\item{warnOnAnalysisIdOverlap}{Warn if the provided `analysisId` overlaps with any predefined analysis as +available in the `createTemporalCovariateSettings()` function.} +} +\value{ +An object of type \code{covariateSettings}, to be used in other functions. +} +\description{ +Create settings for temporal covariates based on other cohorts +} +\details{ +Creates an object specifying temporal covariates to be constructed based on the presence of other cohorts. 
+} diff --git a/man/getDbCohortBasedCovariatesData.Rd b/man/getDbCohortBasedCovariatesData.Rd new file mode 100644 index 00000000..0633b1e6 --- /dev/null +++ b/man/getDbCohortBasedCovariatesData.Rd @@ -0,0 +1,73 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/GetCovariatesFromOtherCohorts.R +\name{getDbCohortBasedCovariatesData} +\alias{getDbCohortBasedCovariatesData} +\title{Get covariate information from the database based on other cohorts} +\usage{ +getDbCohortBasedCovariatesData( + connection, + oracleTempSchema = NULL, + cdmDatabaseSchema, + cohortTable = "#cohort_person", + cohortId = -1, + cdmVersion = "5", + rowIdField = "subject_id", + covariateSettings, + aggregated = FALSE +) +} +\arguments{ +\item{connection}{A connection to the server containing the schema as created using the +\code{connect} function in the \code{DatabaseConnector} package.} + +\item{oracleTempSchema}{A schema where temp tables can be created in Oracle.} + +\item{cdmDatabaseSchema}{The name of the database schema that contains the OMOP CDM instance. +Requires read permissions to this database. On SQL Server, this should +specify both the database and the schema, so for example +'cdm_instance.dbo'.} + +\item{cohortTable}{Name of the table holding the cohort for which we want to construct +covariates. If it is a temp table, the name should have a hash prefix, +e.g. '#temp_table'. If it is a non-temp table, it should include the +database schema, e.g. 'cdm_database.cohort'.} + +\item{cohortId}{For which cohort ID should covariates be constructed? If set to -1, +covariates will be constructed for all cohorts in the specified cohort +table.} + +\item{cdmVersion}{The version of the Common Data Model used. Currently only +\code{cdmVersion = "5"} is supported.} + +\item{rowIdField}{The name of the field in the cohort temp table that is to be used as the +row_id field in the output table.
This can be especially useful if there +is more than one period per person.} + +\item{covariateSettings}{An object of type \code{covariateSettings} as created using the +\code{\link{createCohortBasedCovariateSettings}} or +\code{\link{createCohortBasedTemporalCovariateSettings}} functions.} + +\item{aggregated}{Should aggregate statistics be computed instead of covariates per +cohort entry?} +} +\value{ +Returns an object of type \code{CovariateData}, which is an Andromeda object containing information on the baseline covariates. +Information about multiple outcomes can be captured at once for efficiency reasons. This object is +a list with the following components: \describe{ \item{covariates}{An ffdf object listing the +baseline covariates per person in the cohorts. This is done using a sparse representation: +covariates with a value of 0 are omitted to save space. The covariates object will have three +columns: rowId, covariateId, and covariateValue. The rowId is usually equal to the person_id, +unless specified otherwise in the rowIdField argument.} \item{covariateRef}{A table +describing the covariates that have been extracted.} }. The CovariateData object will also have a \code{metaData} attribute, a list of objects with +information on how the covariateData object was constructed. +} +\description{ +Constructs covariates using other cohorts. +} +\details{ +This function uses the data in the CDM to construct a large set of covariates for the provided +cohort. The cohort is assumed to be in an existing temp table with these fields: 'subject_id', +'cohort_definition_id', 'cohort_start_date'. Optionally, an extra field can be added containing the +unique identifier that will be used as rowID in the output. Typically, users don't call this +function directly but rather use the \code{\link{getDbCovariateData}} function instead.
+} diff --git a/tests/testthat/setup.R b/tests/testthat/setup.R index 9d08e691..829100d1 100644 --- a/tests/testthat/setup.R +++ b/tests/testthat/setup.R @@ -16,8 +16,6 @@ withr::defer({ # Get all environment variables to determine which DBMS to use for testing -# AGS: At the moment, Oracle & SQL Server have some issues with the structure -# of the visit_detail table so commenting those tests out for now. runTestsOnPostgreSQL <- !(Sys.getenv("CDM5_POSTGRESQL_USER") == "" & Sys.getenv("CDM5_POSTGRESQL_PASSWORD") == "" & Sys.getenv("CDM5_POSTGRESQL_SERVER") == "" & Sys.getenv("CDM5_POSTGRESQL_CDM_SCHEMA") == "" & Sys.getenv("CDM5_POSTGRESQL_OHDSI_SCHEMA") == "") runTestsOnSQLServer <- !(Sys.getenv("CDM5_SQL_SERVER_USER") == "" & Sys.getenv("CDM5_SQL_SERVER_PASSWORD") == "" & Sys.getenv("CDM5_SQL_SERVER_SERVER") == "" & Sys.getenv("CDM5_SQL_SERVER_CDM_SCHEMA") == "" & Sys.getenv("CDM5_SQL_SERVER_OHDSI_SCHEMA") == "") runTestsOnOracle <- !(Sys.getenv("CDM5_ORACLE_USER") == "" & Sys.getenv("CDM5_ORACLE_PASSWORD") == "" & Sys.getenv("CDM5_ORACLE_SERVER") == "" & Sys.getenv("CDM5_ORACLE_CDM_SCHEMA") == "" & Sys.getenv("CDM5_ORACLE_OHDSI_SCHEMA") == "") diff --git a/tests/testthat/test-GetCohortBasedCovariates.R b/tests/testthat/test-GetCohortBasedCovariates.R new file mode 100644 index 00000000..ce4bfa3a --- /dev/null +++ b/tests/testthat/test-GetCohortBasedCovariates.R @@ -0,0 +1,822 @@ +# View coverage for this file using +# library(testthat); library(FeatureExtraction) +# covr::file_report(covr::file_coverage("R/GetCovariates.R", "tests/testthat/test-GetCohortBasedCovariates.R")) +library(testthat) + + +covariateCohorts <- data.frame(cohortId = c(101, 102), + cohortName = c("Foo", "Bar")) + +createCohortBasedCovariateTestData <- function(connection, + databaseSchema, + cohortTableName) { + cohort <- data.frame(cohortDefinitionId = c(1, 1, 101, 101), + cohortStartDate = as.Date(c("2000-02-01", "2000-01-01", "2000-01-01", "2000-01-02")), + cohortEndDate = 
as.Date(c("2000-02-14", "2000-01-14", "2000-01-01", "2000-01-02")), + subjectId = c(1, 2, 1, 1)) + tempTable <- substr(cohortTableName, 1, 1) == "#" + if (tempTable) { + DatabaseConnector::insertTable(connection = connection, + tableName = cohortTableName, + data = cohort, + dropTableIfExists = TRUE, + tempTable = tempTable, + createTable = TRUE, + progressBar = FALSE, + camelCaseToSnakeCase = TRUE) + } else { + DatabaseConnector::insertTable(connection = connection, + databaseSchema = databaseSchema, + tableName = cohortTableName, + data = cohort, + dropTableIfExists = TRUE, + tempTable = tempTable, + createTable = TRUE, + progressBar = FALSE, + camelCaseToSnakeCase = TRUE) + } +} + +dropCohortBasedCovariateTestData <- function(connection, + databaseSchema, + cohortTableName) { + # Handle temp table + if (substr(cohortTableName, 1, 1) == "#") { + DatabaseConnector::renderTranslateExecuteSql(connection = connection, + sql = "DROP TABLE IF EXISTS @cohort_table;", + progressBar = FALSE, + reportOverallTime = FALSE, + cohort_table = cohortTableName) + } else { + DatabaseConnector::renderTranslateExecuteSql(connection = connection, + sql = "DROP TABLE IF EXISTS @database_schema.@cohort_table;", + progressBar = FALSE, + reportOverallTime = FALSE, + database_schema = databaseSchema, + cohort_table = cohortTableName) + } +} + +# Database specific tests --------------- +runCohortBasedBinaryNonAggTest <- function(connection, cdmDatabaseSchema, ohdsiDatabaseSchema, cohortTable) { + createCohortBasedCovariateTestData(connection = connection, + databaseSchema = ohdsiDatabaseSchema, + cohortTableName = cohortTable) + on.exit(dropCohortBasedCovariateTestData(connection = connection, + databaseSchema = ohdsiDatabaseSchema, + cohortTableName = cohortTable)) + settings <- createCohortBasedCovariateSettings(analysisId = 999, + covariateCohorts = covariateCohorts, + valueType = "binary") + + tempTable <- substr(cohortTable, 1, 1) == "#" + covs <- getDbCovariateData(connection = 
connection, + oracleTempSchema = getOption("sqlRenderTempEmulationSchema"), + cdmDatabaseSchema = cdmDatabaseSchema, + cohortDatabaseSchema = ohdsiDatabaseSchema, + cohortTableIsTemp = tempTable, + cohortTable = cohortTable, + cohortId = 1, + cdmVersion = "5", + rowIdField = "subject_id", + covariateSettings = settings, + aggregated = FALSE) + + covariates <- dplyr::collect(covs$covariates) + + expectedCovariates <- data.frame(rowId = 1, + covariateId = 101999, + covariateValue = 1) + expect_equivalent(covariates, expectedCovariates) +} + +runCohortBasedBinaryAggTest <- function(connection, cdmDatabaseSchema, ohdsiDatabaseSchema, cohortTable) { + createCohortBasedCovariateTestData(connection = connection, + databaseSchema = ohdsiDatabaseSchema, + cohortTableName = cohortTable) + on.exit(dropCohortBasedCovariateTestData(connection = connection, + databaseSchema = ohdsiDatabaseSchema, + cohortTableName = cohortTable)) + settings <- createCohortBasedCovariateSettings(analysisId = 999, + covariateCohorts = covariateCohorts, + valueType = "binary") + + tempTable <- substr(cohortTable, 1, 1) == "#" + covs <- getDbCovariateData(connection = connection, + oracleTempSchema = getOption("sqlRenderTempEmulationSchema"), + cdmDatabaseSchema = cdmDatabaseSchema, + cohortDatabaseSchema = ohdsiDatabaseSchema, + cohortTableIsTemp = tempTable, + cohortTable = cohortTable, + cohortId = 1, + cdmVersion = "5", + rowIdField = "subject_id", + covariateSettings = settings, + aggregated = TRUE) + + covariates <- dplyr::collect(covs$covariates) + + expectedCovariates <- data.frame(cohortDefinitionId = 1, + covariateId = 101999, + sumValue = 1, + averageValue = 0.5) + expect_equivalent(covariates, expectedCovariates) +} + +runCohortBasedBinaryNonAggTemporalTest <- function(connection, cdmDatabaseSchema, ohdsiDatabaseSchema, cohortTable) { + createCohortBasedCovariateTestData(connection = connection, + databaseSchema = ohdsiDatabaseSchema, + cohortTableName = cohortTable) + + 
on.exit(dropCohortBasedCovariateTestData(connection = connection, + databaseSchema = ohdsiDatabaseSchema, + cohortTableName = cohortTable)) + + settings <- createCohortBasedTemporalCovariateSettings(analysisId = 999, + covariateCohorts = covariateCohorts) + tempTable <- substr(cohortTable, 1, 1) == "#" + covs <- getDbCovariateData(connection = connection, + oracleTempSchema = getOption("sqlRenderTempEmulationSchema"), + cdmDatabaseSchema = cdmDatabaseSchema, + cohortDatabaseSchema = ohdsiDatabaseSchema, + cohortTableIsTemp = tempTable, + cohortTable = cohortTable, + cohortId = 1, + cdmVersion = "5", + rowIdField = "subject_id", + covariateSettings = settings, + aggregated = FALSE) + + covariates <- dplyr::collect(covs$covariates) + covariates <- dplyr::arrange(covariates, timeId) + + expectedCovariates <- data.frame(rowId = c(1, 1), + covariateId = c(101999, 101999), + covariateValue = c(1,1), + timeId = c(335,336)) + expect_equivalent(covariates, expectedCovariates) +} + +runCohortBasedBinaryAggTemporalTest <- function(connection, cdmDatabaseSchema, ohdsiDatabaseSchema, cohortTable) { + createCohortBasedCovariateTestData(connection = connection, + databaseSchema = ohdsiDatabaseSchema, + cohortTableName = cohortTable) + + on.exit(dropCohortBasedCovariateTestData(connection = connection, + databaseSchema = ohdsiDatabaseSchema, + cohortTableName = cohortTable)) + + settings <- createCohortBasedTemporalCovariateSettings(analysisId = 999, + covariateCohorts = covariateCohorts) + tempTable <- substr(cohortTable, 1, 1) == "#" + covs <- getDbCovariateData(connection = connection, + oracleTempSchema = getOption("sqlRenderTempEmulationSchema"), + cdmDatabaseSchema = cdmDatabaseSchema, + cohortDatabaseSchema = ohdsiDatabaseSchema, + cohortTableIsTemp = tempTable, + cohortTable = cohortTable, + cohortId = 1, + cdmVersion = "5", + rowIdField = "subject_id", + covariateSettings = settings, + aggregated = TRUE) + + covariates <- dplyr::collect(covs$covariates) + covariates <- 
dplyr::arrange(covariates, timeId) + + expectedCovariates <- data.frame(cohortDefinitionId = c(1, 1), + covariateId = c(101999, 101999), + timeId = c(335,336), + sumValue = c(1,1), + averageValue = c(0.5, 0.5)) + expect_equivalent(covariates, expectedCovariates) +} + +runCohortBasedCountsNonAggTest <- function(connection, cdmDatabaseSchema, ohdsiDatabaseSchema, cohortTable) { + createCohortBasedCovariateTestData(connection = connection, + databaseSchema = ohdsiDatabaseSchema, + cohortTableName = cohortTable) + + on.exit(dropCohortBasedCovariateTestData(connection = connection, + databaseSchema = ohdsiDatabaseSchema, + cohortTableName = cohortTable)) + + settings <- createCohortBasedCovariateSettings(analysisId = 999, + covariateCohorts = covariateCohorts, + valueType = "count") + + tempTable <- substr(cohortTable, 1, 1) == "#" + covs <- getDbCovariateData(connection = connection, + oracleTempSchema = getOption("sqlRenderTempEmulationSchema"), + cdmDatabaseSchema = cdmDatabaseSchema, + cohortDatabaseSchema = ohdsiDatabaseSchema, + cohortTable = cohortTable, + cohortTableIsTemp = tempTable, + cohortId = 1, + cdmVersion = "5", + rowIdField = "subject_id", + covariateSettings = settings, + aggregated = FALSE) + + covariates <- dplyr::collect(covs$covariates) + + expectedCovariates <- data.frame(rowId = 1, + covariateId = 101999, + covariateValue = 2) + expect_equivalent(covariates, expectedCovariates) +} + +runCohortBasedCountsAggTest <- function(connection, cdmDatabaseSchema, ohdsiDatabaseSchema, cohortTable) { + createCohortBasedCovariateTestData(connection = connection, + databaseSchema = ohdsiDatabaseSchema, + cohortTableName = cohortTable) + + on.exit(dropCohortBasedCovariateTestData(connection = connection, + databaseSchema = ohdsiDatabaseSchema, + cohortTableName = cohortTable)) + + settings <- createCohortBasedCovariateSettings(analysisId = 999, + covariateCohorts = covariateCohorts, + valueType = "count") + + tempTable <- substr(cohortTable, 1, 1) == "#" + 
covs <- getDbCovariateData(connection = connection, + oracleTempSchema = getOption("sqlRenderTempEmulationSchema"), + cdmDatabaseSchema = cdmDatabaseSchema, + cohortDatabaseSchema = ohdsiDatabaseSchema, + cohortTableIsTemp = tempTable, + cohortTable = cohortTable, + cohortId = 1, + cdmVersion = "5", + rowIdField = "subject_id", + covariateSettings = settings, + aggregated = TRUE) + + covariatesContinuous <- dplyr::collect(covs$covariatesContinuous) + + expectedCovariates <- data.frame(cohortDefinitionId = 1, + covariateId = 101999, + countValue = 1, + minValue = 0, + maxValue = 2, + averageValue = 1) + expect_equivalent(covariatesContinuous[, names(expectedCovariates)], expectedCovariates) +} + +runCohortBasedCountsNonAggTemporalTest <- function(connection, cdmDatabaseSchema, ohdsiDatabaseSchema, cohortTable) { + createCohortBasedCovariateTestData(connection = connection, + databaseSchema = ohdsiDatabaseSchema, + cohortTableName = cohortTable) + + on.exit(dropCohortBasedCovariateTestData(connection = connection, + databaseSchema = ohdsiDatabaseSchema, + cohortTableName = cohortTable)) + + settings <- createCohortBasedTemporalCovariateSettings(analysisId = 999, + covariateCohorts = covariateCohorts, + valueType = "count") + tempTable <- substr(cohortTable, 1, 1) == "#" + covs <- getDbCovariateData(connection = connection, + oracleTempSchema = getOption("sqlRenderTempEmulationSchema"), + cdmDatabaseSchema = cdmDatabaseSchema, + cohortDatabaseSchema = ohdsiDatabaseSchema, + cohortTableIsTemp = tempTable, + cohortTable = cohortTable, + cohortId = 1, + cdmVersion = "5", + rowIdField = "subject_id", + covariateSettings = settings, + aggregated = FALSE) + + covariates <- dplyr::collect(covs$covariates) + covariates <- dplyr::arrange(covariates, timeId) + + expectedCovariates <- data.frame(rowId = c(1, 1), + covariateId = c(101999, 101999), + covariateValue = c(1,1), + timeId = c(335,336)) + expect_equivalent(covariates, expectedCovariates) +} + 
+runCohortBasedCountsAggTemporalTest <- function(connection, cdmDatabaseSchema, ohdsiDatabaseSchema, cohortTable) { + createCohortBasedCovariateTestData(connection = connection, + databaseSchema = ohdsiDatabaseSchema, + cohortTableName = cohortTable) + + on.exit(dropCohortBasedCovariateTestData(connection = connection, + databaseSchema = ohdsiDatabaseSchema, + cohortTableName = cohortTable)) + + settings <- createCohortBasedTemporalCovariateSettings(analysisId = 999, + covariateCohorts = covariateCohorts, + valueType = "count") + tempTable <- substr(cohortTable, 1, 1) == "#" + covs <- getDbCovariateData(connection = connection, + oracleTempSchema = getOption("sqlRenderTempEmulationSchema"), + cdmDatabaseSchema = cdmDatabaseSchema, + cohortDatabaseSchema = ohdsiDatabaseSchema, + cohortTable = cohortTable, + cohortTableIsTemp = tempTable, + cohortId = 1, + cdmVersion = "5", + rowIdField = "subject_id", + covariateSettings = settings, + aggregated = TRUE) + + covariatesContinuous <- dplyr::collect(covs$covariatesContinuous) + covariatesContinuous <- dplyr::arrange(covariatesContinuous, timeId) + + expectedCovariates <- data.frame(cohortDefinitionId = 1, + covariateId = 101999, + countValue = 1, + minValue = 0, + maxValue = 1, + averageValue = c(0.5, 0.5), + timeId = c(335,336)) + expect_equivalent(covariatesContinuous[, names(expectedCovariates)], expectedCovariates) +} + +runCohortBasedCountsAggMultiCohortTest <- function(connection, cdmDatabaseSchema, ohdsiDatabaseSchema, cohortTable) { + createCohortBasedCovariateTestData(connection = connection, + databaseSchema = ohdsiDatabaseSchema, + cohortTableName = cohortTable) + + on.exit(dropCohortBasedCovariateTestData(connection = connection, + databaseSchema = ohdsiDatabaseSchema, + cohortTableName = cohortTable)) + + settings <- createCohortBasedCovariateSettings(analysisId = 999, + covariateCohorts = covariateCohorts, + valueType = "count") + + tempTable <- substr(cohortTable, 1, 1) == "#" + covs <- 
getDbCovariateData(connection = connection, + oracleTempSchema = getOption("sqlRenderTempEmulationSchema"), + cdmDatabaseSchema = cdmDatabaseSchema, + cohortDatabaseSchema = ohdsiDatabaseSchema, + cohortTableIsTemp = tempTable, + cohortTable = cohortTable, + cohortId = c(1, 101), + cdmVersion = "5", + rowIdField = "subject_id", + covariateSettings = settings, + aggregated = TRUE) + covariatesContinuous <- dplyr::collect(covs$covariatesContinuous) + covariatesContinuous <- dplyr::arrange(covariatesContinuous, cohortDefinitionId) + expectedCovariates <- data.frame(cohortDefinitionId = c(1, 101), + covariateId = c(101999, 101999), + countValue = c(1, 2), + minValue = c(0, 1), + maxValue = c(2, 2), + averageValue = c(1, 1.5)) + expect_equivalent(covariatesContinuous[, names(expectedCovariates)], expectedCovariates) +} + +# Eunomia tests ------------ +test_that("Cohort-based covariates: binary, non-aggregated on Eunomia", { + skip_if_not(runTestsOnEunomia) + runCohortBasedBinaryNonAggTest(connection = eunomiaConnection, + cdmDatabaseSchema = eunomiaCdmDatabaseSchema, + ohdsiDatabaseSchema = eunomiaOhdsiDatabaseSchema, + cohortTable = "cohort_cov") +}) + +test_that("Cohort-based covariates: binary, aggregated on Eunomia", { + skip_if_not(runTestsOnEunomia) + runCohortBasedBinaryAggTest(connection = eunomiaConnection, + cdmDatabaseSchema = eunomiaCdmDatabaseSchema, + ohdsiDatabaseSchema = eunomiaOhdsiDatabaseSchema, + cohortTable = "cohort_cov") +}) + +test_that("Cohort-based covariates: binary, non-aggregated, temporal on Eunomia", { + skip_if_not(runTestsOnEunomia) + runCohortBasedBinaryNonAggTemporalTest(connection = eunomiaConnection, + cdmDatabaseSchema = eunomiaCdmDatabaseSchema, + ohdsiDatabaseSchema = eunomiaOhdsiDatabaseSchema, + cohortTable = "cohort_cov") +}) + +test_that("Cohort-based covariates: binary, aggregated, temporal on Eunomia", { + skip_if_not(runTestsOnEunomia) + runCohortBasedBinaryAggTemporalTest(connection = eunomiaConnection, + cdmDatabaseSchema 
= eunomiaCdmDatabaseSchema, + ohdsiDatabaseSchema = eunomiaOhdsiDatabaseSchema, + cohortTable = "cohort_cov") +}) + +test_that("Cohort-based covariates: counts, non-aggregated on Eunomia", { + skip_if_not(runTestsOnEunomia) + runCohortBasedCountsNonAggTest(connection = eunomiaConnection, + cdmDatabaseSchema = eunomiaCdmDatabaseSchema, + ohdsiDatabaseSchema = eunomiaOhdsiDatabaseSchema, + cohortTable = "cohort_cov") +}) + +test_that("Cohort-based covariates: counts, aggregated on Eunomia", { + skip_if_not(runTestsOnEunomia) + runCohortBasedCountsAggTest(connection = eunomiaConnection, + cdmDatabaseSchema = eunomiaCdmDatabaseSchema, + ohdsiDatabaseSchema = eunomiaOhdsiDatabaseSchema, + cohortTable = "cohort_cov") +}) + +test_that("Cohort-based covariates: counts, non-aggregated, temporal on Eunomia", { + skip_if_not(runTestsOnEunomia) + runCohortBasedCountsNonAggTemporalTest(connection = eunomiaConnection, + cdmDatabaseSchema = eunomiaCdmDatabaseSchema, + ohdsiDatabaseSchema = eunomiaOhdsiDatabaseSchema, + cohortTable = "cohort_cov") +}) + +test_that("Cohort-based covariates: counts, aggregated, temporal on Eunomia", { + skip_if_not(runTestsOnEunomia) + runCohortBasedCountsAggTemporalTest(connection = eunomiaConnection, + cdmDatabaseSchema = eunomiaCdmDatabaseSchema, + ohdsiDatabaseSchema = eunomiaOhdsiDatabaseSchema, + cohortTable = "cohort_cov") +}) + +test_that("Cohort-based covariates: counts, aggregated, using multiple cohort IDs on Eunomia", { + skip_if_not(runTestsOnEunomia) + runCohortBasedCountsAggMultiCohortTest(connection = eunomiaConnection, + cdmDatabaseSchema = eunomiaCdmDatabaseSchema, + ohdsiDatabaseSchema = eunomiaOhdsiDatabaseSchema, + cohortTable = "cohort_cov") +}) + +# Postgres tests ------------ +test_that("Cohort-based covariates: binary, non-aggregated on Postgres", { + skip_if_not(runTestsOnPostgreSQL) + connection <- DatabaseConnector::connect(pgConnectionDetails) + on.exit(DatabaseConnector::disconnect(connection)) + 
runCohortBasedBinaryNonAggTest(connection = connection, + cdmDatabaseSchema = pgCdmDatabaseSchema, + ohdsiDatabaseSchema = pgOhdsiDatabaseSchema, + cohortTable = cohortTable) +}) + +test_that("Cohort-based covariates: binary, aggregated on Postgres", { + skip_if_not(runTestsOnPostgreSQL) + connection <- DatabaseConnector::connect(pgConnectionDetails) + on.exit(DatabaseConnector::disconnect(connection)) + runCohortBasedBinaryAggTest(connection = connection, + cdmDatabaseSchema = pgCdmDatabaseSchema, + ohdsiDatabaseSchema = pgOhdsiDatabaseSchema, + cohortTable = cohortTable) +}) + +test_that("Cohort-based covariates: binary, non-aggregated, temporal on Postgres", { + skip_if_not(runTestsOnPostgreSQL) + connection <- DatabaseConnector::connect(pgConnectionDetails) + on.exit(DatabaseConnector::disconnect(connection)) + runCohortBasedBinaryNonAggTemporalTest(connection = connection, + cdmDatabaseSchema = pgCdmDatabaseSchema, + ohdsiDatabaseSchema = pgOhdsiDatabaseSchema, + cohortTable = cohortTable) +}) + +test_that("Cohort-based covariates: binary, aggregated, temporal on Postgres", { + skip_if_not(runTestsOnPostgreSQL) + connection <- DatabaseConnector::connect(pgConnectionDetails) + on.exit(DatabaseConnector::disconnect(connection)) + runCohortBasedBinaryAggTemporalTest(connection = connection, + cdmDatabaseSchema = pgCdmDatabaseSchema, + ohdsiDatabaseSchema = pgOhdsiDatabaseSchema, + cohortTable = cohortTable) +}) + +test_that("Cohort-based covariates: counts, non-aggregated on Postgres", { + skip_if_not(runTestsOnPostgreSQL) + connection <- DatabaseConnector::connect(pgConnectionDetails) + on.exit(DatabaseConnector::disconnect(connection)) + runCohortBasedCountsNonAggTest(connection = connection, + cdmDatabaseSchema = pgCdmDatabaseSchema, + ohdsiDatabaseSchema = pgOhdsiDatabaseSchema, + cohortTable = cohortTable) +}) + +test_that("Cohort-based covariates: counts, aggregated on Postgres", { + skip_if_not(runTestsOnPostgreSQL) + connection <- 
DatabaseConnector::connect(pgConnectionDetails) + on.exit(DatabaseConnector::disconnect(connection)) + runCohortBasedCountsAggTest(connection = connection, + cdmDatabaseSchema = pgCdmDatabaseSchema, + ohdsiDatabaseSchema = pgOhdsiDatabaseSchema, + cohortTable = cohortTable) +}) + +test_that("Cohort-based covariates: counts, non-aggregated, temporal on Postgres", { + skip_if_not(runTestsOnPostgreSQL) + connection <- DatabaseConnector::connect(pgConnectionDetails) + on.exit(DatabaseConnector::disconnect(connection)) + runCohortBasedCountsNonAggTemporalTest(connection = connection, + cdmDatabaseSchema = pgCdmDatabaseSchema, + ohdsiDatabaseSchema = pgOhdsiDatabaseSchema, + cohortTable = cohortTable) +}) + +test_that("Cohort-based covariates: counts, aggregated, temporal on Postgres", { + skip_if_not(runTestsOnPostgreSQL) + connection <- DatabaseConnector::connect(pgConnectionDetails) + on.exit(DatabaseConnector::disconnect(connection)) + runCohortBasedCountsAggTemporalTest(connection = connection, + cdmDatabaseSchema = pgCdmDatabaseSchema, + ohdsiDatabaseSchema = pgOhdsiDatabaseSchema, + cohortTable = cohortTable) +}) + +test_that("Cohort-based covariates: counts, aggregated, using multiple cohort IDs on Postgres", { + skip_if_not(runTestsOnPostgreSQL) + connection <- DatabaseConnector::connect(pgConnectionDetails) + on.exit(DatabaseConnector::disconnect(connection)) + runCohortBasedCountsAggMultiCohortTest(connection = connection, + cdmDatabaseSchema = pgCdmDatabaseSchema, + ohdsiDatabaseSchema = pgOhdsiDatabaseSchema, + cohortTable = cohortTable) +}) + +# SQL Server tests ------------ +test_that("Cohort-based covariates: binary, non-aggregated on SQL Server", { + skip_if_not(runTestsOnSQLServer) + connection <- DatabaseConnector::connect(sqlServerConnectionDetails) + on.exit(DatabaseConnector::disconnect(connection)) + runCohortBasedBinaryNonAggTest(connection = connection, + cdmDatabaseSchema = sqlServerCdmDatabaseSchema, + ohdsiDatabaseSchema = 
sqlServerOhdsiDatabaseSchema, + cohortTable = cohortTable) +}) + +test_that("Cohort-based covariates: binary, aggregated on SQL Server", { + skip_if_not(runTestsOnSQLServer) + connection <- DatabaseConnector::connect(sqlServerConnectionDetails) + on.exit(DatabaseConnector::disconnect(connection)) + runCohortBasedBinaryAggTest(connection = connection, + cdmDatabaseSchema = sqlServerCdmDatabaseSchema, + ohdsiDatabaseSchema = sqlServerOhdsiDatabaseSchema, + cohortTable = cohortTable) +}) + +test_that("Cohort-based covariates: binary, non-aggregated, temporal on SQL Server", { + skip_if_not(runTestsOnSQLServer) + connection <- DatabaseConnector::connect(sqlServerConnectionDetails) + on.exit(DatabaseConnector::disconnect(connection)) + runCohortBasedBinaryNonAggTemporalTest(connection = connection, + cdmDatabaseSchema = sqlServerCdmDatabaseSchema, + ohdsiDatabaseSchema = sqlServerOhdsiDatabaseSchema, + cohortTable = cohortTable) +}) + +test_that("Cohort-based covariates: binary, aggregated, temporal on SQL Server", { + skip_if_not(runTestsOnSQLServer) + connection <- DatabaseConnector::connect(sqlServerConnectionDetails) + on.exit(DatabaseConnector::disconnect(connection)) + runCohortBasedBinaryAggTemporalTest(connection = connection, + cdmDatabaseSchema = sqlServerCdmDatabaseSchema, + ohdsiDatabaseSchema = sqlServerOhdsiDatabaseSchema, + cohortTable = cohortTable) +}) + +test_that("Cohort-based covariates: counts, non-aggregated on SQL Server", { + skip_if_not(runTestsOnSQLServer) + connection <- DatabaseConnector::connect(sqlServerConnectionDetails) + on.exit(DatabaseConnector::disconnect(connection)) + runCohortBasedCountsNonAggTest(connection = connection, + cdmDatabaseSchema = sqlServerCdmDatabaseSchema, + ohdsiDatabaseSchema = sqlServerOhdsiDatabaseSchema, + cohortTable = cohortTable) +}) + +test_that("Cohort-based covariates: counts, aggregated on SQL Server", { + skip_if_not(runTestsOnSQLServer) + connection <- 
DatabaseConnector::connect(sqlServerConnectionDetails) + on.exit(DatabaseConnector::disconnect(connection)) + runCohortBasedCountsAggTest(connection = connection, + cdmDatabaseSchema = sqlServerCdmDatabaseSchema, + ohdsiDatabaseSchema = sqlServerOhdsiDatabaseSchema, + cohortTable = cohortTable) +}) + +test_that("Cohort-based covariates: counts, non-aggregated, temporal on SQL Server", { + skip_if_not(runTestsOnSQLServer) + connection <- DatabaseConnector::connect(sqlServerConnectionDetails) + on.exit(DatabaseConnector::disconnect(connection)) + runCohortBasedCountsNonAggTemporalTest(connection = connection, + cdmDatabaseSchema = sqlServerCdmDatabaseSchema, + ohdsiDatabaseSchema = sqlServerOhdsiDatabaseSchema, + cohortTable = cohortTable) +}) + +test_that("Cohort-based covariates: counts, aggregated, temporal on SQL Server", { + skip_if_not(runTestsOnSQLServer) + connection <- DatabaseConnector::connect(sqlServerConnectionDetails) + on.exit(DatabaseConnector::disconnect(connection)) + runCohortBasedCountsAggTemporalTest(connection = connection, + cdmDatabaseSchema = sqlServerCdmDatabaseSchema, + ohdsiDatabaseSchema = sqlServerOhdsiDatabaseSchema, + cohortTable = cohortTable) +}) + +test_that("Cohort-based covariates: counts, aggregated, using multiple cohort IDs on SQL Server", { + skip_if_not(runTestsOnSQLServer) + connection <- DatabaseConnector::connect(sqlServerConnectionDetails) + on.exit(DatabaseConnector::disconnect(connection)) + runCohortBasedCountsAggMultiCohortTest(connection = connection, + cdmDatabaseSchema = sqlServerCdmDatabaseSchema, + ohdsiDatabaseSchema = sqlServerOhdsiDatabaseSchema, + cohortTable = cohortTable) +}) + +# Oracle tests ------------ +test_that("Cohort-based covariates: binary, non-aggregated on Oracle", { + skip_if_not(runTestsOnOracle) + connection <- DatabaseConnector::connect(oracleConnectionDetails) + on.exit(DatabaseConnector::disconnect(connection)) + runCohortBasedBinaryNonAggTest(connection = connection, + cdmDatabaseSchema = 
oracleCdmDatabaseSchema, + ohdsiDatabaseSchema = oracleOhdsiDatabaseSchema, + cohortTable = cohortTable) +}) + +test_that("Cohort-based covariates: binary, aggregated on Oracle", { + skip_if_not(runTestsOnOracle) + connection <- DatabaseConnector::connect(oracleConnectionDetails) + on.exit(DatabaseConnector::disconnect(connection)) + runCohortBasedBinaryAggTest(connection = connection, + cdmDatabaseSchema = oracleCdmDatabaseSchema, + ohdsiDatabaseSchema = oracleOhdsiDatabaseSchema, + cohortTable = cohortTable) +}) + +test_that("Cohort-based covariates: binary, non-aggregated, temporal on Oracle", { + skip_if_not(runTestsOnOracle) + connection <- DatabaseConnector::connect(oracleConnectionDetails) + on.exit(DatabaseConnector::disconnect(connection)) + runCohortBasedBinaryNonAggTemporalTest(connection = connection, + cdmDatabaseSchema = oracleCdmDatabaseSchema, + ohdsiDatabaseSchema = oracleOhdsiDatabaseSchema, + cohortTable = cohortTable) +}) + +test_that("Cohort-based covariates: binary, aggregated, temporal on Oracle", { + skip_if_not(runTestsOnOracle) + connection <- DatabaseConnector::connect(oracleConnectionDetails) + on.exit(DatabaseConnector::disconnect(connection)) + runCohortBasedBinaryAggTemporalTest(connection = connection, + cdmDatabaseSchema = oracleCdmDatabaseSchema, + ohdsiDatabaseSchema = oracleOhdsiDatabaseSchema, + cohortTable = cohortTable) +}) + +test_that("Cohort-based covariates: counts, non-aggregated on Oracle", { + skip_if_not(runTestsOnOracle) + connection <- DatabaseConnector::connect(oracleConnectionDetails) + on.exit(DatabaseConnector::disconnect(connection)) + runCohortBasedCountsNonAggTest(connection = connection, + cdmDatabaseSchema = oracleCdmDatabaseSchema, + ohdsiDatabaseSchema = oracleOhdsiDatabaseSchema, + cohortTable = cohortTable) +}) + +test_that("Cohort-based covariates: counts, aggregated on Oracle", { + skip_if_not(runTestsOnOracle) + connection <- DatabaseConnector::connect(oracleConnectionDetails) + 
on.exit(DatabaseConnector::disconnect(connection)) + runCohortBasedCountsAggTest(connection = connection, + cdmDatabaseSchema = oracleCdmDatabaseSchema, + ohdsiDatabaseSchema = oracleOhdsiDatabaseSchema, + cohortTable = cohortTable) +}) + +test_that("Cohort-based covariates: counts, non-aggregated, temporal on Oracle", { + skip_if_not(runTestsOnOracle) + connection <- DatabaseConnector::connect(oracleConnectionDetails) + on.exit(DatabaseConnector::disconnect(connection)) + runCohortBasedCountsNonAggTemporalTest(connection = connection, + cdmDatabaseSchema = oracleCdmDatabaseSchema, + ohdsiDatabaseSchema = oracleOhdsiDatabaseSchema, + cohortTable = cohortTable) +}) + +test_that("Cohort-based covariates: counts, aggregated, temporal on Oracle", { + skip_if_not(runTestsOnOracle) + connection <- DatabaseConnector::connect(oracleConnectionDetails) + on.exit(DatabaseConnector::disconnect(connection)) + runCohortBasedCountsAggTemporalTest(connection = connection, + cdmDatabaseSchema = oracleCdmDatabaseSchema, + ohdsiDatabaseSchema = oracleOhdsiDatabaseSchema, + cohortTable = cohortTable) +}) + +test_that("Cohort-based covariates: counts, aggregated, using multiple cohort IDs on Oracle", { + skip_if_not(runTestsOnOracle) + connection <- DatabaseConnector::connect(oracleConnectionDetails) + on.exit(DatabaseConnector::disconnect(connection)) + runCohortBasedCountsAggMultiCohortTest(connection = connection, + cdmDatabaseSchema = oracleCdmDatabaseSchema, + ohdsiDatabaseSchema = oracleOhdsiDatabaseSchema, + cohortTable = cohortTable) +}) + +# RedShift tests ------------ +test_that("Cohort-based covariates: binary, non-aggregated on RedShift", { + skip_if_not(runTestsOnRedshift) + connection <- DatabaseConnector::connect(redshiftConnectionDetails) + on.exit(DatabaseConnector::disconnect(connection)) + runCohortBasedBinaryNonAggTest(connection = connection, + cdmDatabaseSchema = redshiftCdmDatabaseSchema, + ohdsiDatabaseSchema = redshiftOhdsiDatabaseSchema, + cohortTable = 
cohortTable) +}) + +test_that("Cohort-based covariates: binary, aggregated on RedShift", { + skip_if_not(runTestsOnRedshift) + connection <- DatabaseConnector::connect(redshiftConnectionDetails) + on.exit(DatabaseConnector::disconnect(connection)) + runCohortBasedBinaryAggTest(connection = connection, + cdmDatabaseSchema = redshiftCdmDatabaseSchema, + ohdsiDatabaseSchema = redshiftOhdsiDatabaseSchema, + cohortTable = cohortTable) +}) + +test_that("Cohort-based covariates: binary, non-aggregated, temporal on RedShift", { + skip_if_not(runTestsOnRedshift) + connection <- DatabaseConnector::connect(redshiftConnectionDetails) + on.exit(DatabaseConnector::disconnect(connection)) + runCohortBasedBinaryNonAggTemporalTest(connection = connection, + cdmDatabaseSchema = redshiftCdmDatabaseSchema, + ohdsiDatabaseSchema = redshiftOhdsiDatabaseSchema, + cohortTable = cohortTable) +}) + +test_that("Cohort-based covariates: binary, aggregated, temporal on RedShift", { + skip_if_not(runTestsOnRedshift) + connection <- DatabaseConnector::connect(redshiftConnectionDetails) + on.exit(DatabaseConnector::disconnect(connection)) + runCohortBasedBinaryAggTemporalTest(connection = connection, + cdmDatabaseSchema = redshiftCdmDatabaseSchema, + ohdsiDatabaseSchema = redshiftOhdsiDatabaseSchema, + cohortTable = cohortTable) +}) + +test_that("Cohort-based covariates: counts, non-aggregated on RedShift", { + skip_if_not(runTestsOnRedshift) + connection <- DatabaseConnector::connect(redshiftConnectionDetails) + on.exit(DatabaseConnector::disconnect(connection)) + runCohortBasedCountsNonAggTest(connection = connection, + cdmDatabaseSchema = redshiftCdmDatabaseSchema, + ohdsiDatabaseSchema = redshiftOhdsiDatabaseSchema, + cohortTable = cohortTable) +}) + +test_that("Cohort-based covariates: counts, aggregated on RedShift", { + skip_if_not(runTestsOnRedshift) + connection <- DatabaseConnector::connect(redshiftConnectionDetails) + on.exit(DatabaseConnector::disconnect(connection)) + 
runCohortBasedCountsAggTest(connection = connection, + cdmDatabaseSchema = redshiftCdmDatabaseSchema, + ohdsiDatabaseSchema = redshiftOhdsiDatabaseSchema, + cohortTable = cohortTable) +}) + +test_that("Cohort-based covariates: counts, non-aggregated, temporal on RedShift", { + skip_if_not(runTestsOnRedshift) + connection <- DatabaseConnector::connect(redshiftConnectionDetails) + on.exit(DatabaseConnector::disconnect(connection)) + runCohortBasedCountsNonAggTemporalTest(connection = connection, + cdmDatabaseSchema = redshiftCdmDatabaseSchema, + ohdsiDatabaseSchema = redshiftOhdsiDatabaseSchema, + cohortTable = cohortTable) +}) + +test_that("Cohort-based covariates: counts, aggregated, temporal on RedShift", { + skip_if_not(runTestsOnRedshift) + connection <- DatabaseConnector::connect(redshiftConnectionDetails) + on.exit(DatabaseConnector::disconnect(connection)) + runCohortBasedCountsAggTemporalTest(connection = connection, + cdmDatabaseSchema = redshiftCdmDatabaseSchema, + ohdsiDatabaseSchema = redshiftOhdsiDatabaseSchema, + cohortTable = cohortTable) +}) + +test_that("Cohort-based covariates: counts, aggregated, using multiple cohort IDs on RedShift", { + skip_if_not(runTestsOnRedshift) + connection <- DatabaseConnector::connect(redshiftConnectionDetails) + on.exit(DatabaseConnector::disconnect(connection)) + runCohortBasedCountsAggMultiCohortTest(connection = connection, + cdmDatabaseSchema = redshiftCdmDatabaseSchema, + ohdsiDatabaseSchema = redshiftOhdsiDatabaseSchema, + cohortTable = cohortTable) +}) + + +# Non-database specific tests --------------- +test_that("Cohort-based covariates: warning if using pre-defined analysis ID", { + expect_warning(createCohortBasedCovariateSettings(analysisId = 1, + covariateCohorts = covariateCohorts, + valueType = "count"), + "Analysis ID [0-9]+ also used for prespecified analysis") + expect_warning(createCohortBasedTemporalCovariateSettings(analysisId = 1, + covariateCohorts = covariateCohorts), + "Analysis ID [0-9]+ also 
used for prespecified analysis") +}) diff --git a/vignettes/CreatingCovariatesBasedOnOtherCohorts.Rmd b/vignettes/CreatingCovariatesBasedOnOtherCohorts.Rmd new file mode 100644 index 00000000..7dd7d614 --- /dev/null +++ b/vignettes/CreatingCovariatesBasedOnOtherCohorts.Rmd @@ -0,0 +1,229 @@ +--- +title: "Creating covariates based on other cohorts" +author: "Martijn J. Schuemie" +date: "`r Sys.Date()`" +output: + pdf_document: + number_sections: yes + toc: yes + html_document: + number_sections: yes + toc: yes +vignette: > + %\VignetteIndexEntry{Creating covariates based on other cohorts} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r echo=FALSE,message=FALSE,warning=FALSE,eval=TRUE} +library(FeatureExtraction) +vignetteFolder <- "s:/temp/vignetteFeatureExtractionCohortBased" +``` + +# Introduction + +This vignette assumes you are already familiar with the `FeatureExtraction` package. + +The `FeatureExtraction` package can generate a default set of covariates, such as one covariate for each condition found in the `condition_occurrence` table. These covariates are based on the concepts found in the data in the Common Data Model (CDM). For example, if we wish to have a covariate for 'prior diabetes', the default set of covariates includes a covariate based on the condition concept `Diabetes mellitus` and all of its descendant concepts. + +In this vignette we review how we can create covariates based on other cohorts instead of concepts by themselves. In our diabetes example, we may wish to define diabetes not just as the occurrence of a diagnosis code, but also require records of antidiabetic treatments, or blood glucose values exceeding certain thresholds. We can construct a complicated cohort definition, for example using the OHDSI ATLAS tool, encoding this logic. We can instantiate this cohort, and then construct covariates based on the presence or absence of this cohort in a patient's history. 
+ +# Overview + +To construct covariates based on other cohorts, the following steps must be taken: + +1. Populate a table with the cohorts to be used for covariate construction. +2. Use the `createCohortBasedCovariateSettings()` or `createCohortBasedTemporalCovariateSettings()` function to create a `covariateSettings` object pointing to the cohorts mentioned in the previous step. + +## Populate a table with the cohorts to be used for covariate construction. + +The cohorts should be loaded into a cohort table with the standard fields. **This can be the same table as the one containing the main cohorts**. The table should at least have the following fields: + +* `cohort_definition_id`, A unique identifier for the cohort. This ID will be used (together with the analysis ID) to construct the covariate ID, so **ensure that the `cohort_definition_id` fits in a 32-bit integer** (so between -2,147,483,648 and 2,147,483,647). +* `subject_id`, The unique identifier for the person. Should match the `person_id` in the CDM. +* `cohort_start_date`, The date the person enters the cohort. +* `cohort_end_date`, The date the person exits the cohort. If `null`, the person is assumed to exit the cohort on the same date as entering it. + +A person can enter and exit a cohort multiple times, but cannot be in the same cohort at the same time multiple times (this is true for all cohorts in OHDSI). + +# Example + +## Creating the cohorts + +In this example we will create two cohorts: the main cohort is people initiating diclofenac treatment for the first time. The cohort we wish to use to construct a covariate is type 2 diabetes, requiring both a diagnosis code and a treatment code (in the 30 days following the diagnosis). The diabetes cohort is assumed to start at the first diagnosis meeting the criteria, and end when observation ends (chronic). 
+ +```sql +/************************ +File covariateCohorts.sql +*************************/ +DROP TABLE IF EXISTS @cohort_database_schema.@cohort_table; + +CREATE TABLE @cohort_database_schema.@cohort_table ( + cohort_definition_id INT, + subject_id BIGINT, + cohort_start_date DATE, + cohort_end_date DATE + ); + +INSERT INTO @cohort_database_schema.@cohort_table ( + cohort_definition_id, + subject_id, + cohort_start_date, + cohort_end_date + ) +SELECT 1, + person_id, + MIN(drug_era_start_date), + MIN(drug_era_end_date) +FROM @cdm_database_schema.drug_era +WHERE drug_concept_id = 1124300 --diclofenac +GROUP BY person_id; + +INSERT INTO @cohort_database_schema.@cohort_table ( + cohort_definition_id, + subject_id, + cohort_start_date, + cohort_end_date + ) +SELECT 2, + condition_occurrence.person_id, + MIN(condition_start_date), + MIN(observation_period_end_date) +FROM @cdm_database_schema.condition_occurrence +INNER JOIN @cdm_database_schema.drug_exposure + ON condition_occurrence.person_id = drug_exposure.person_id + AND drug_exposure_start_date >= condition_start_date + AND drug_exposure_start_date < DATEADD(DAY, 30, condition_start_date) +INNER JOIN @cdm_database_schema.observation_period + ON condition_occurrence.person_id = observation_period.person_id + AND condition_start_date >= observation_period_start_date + AND condition_start_date <= observation_period_end_date +WHERE condition_concept_id IN ( + SELECT descendant_concept_id + FROM @cdm_database_schema.concept_ancestor + WHERE ancestor_concept_id = 201826 -- Type 2 diabetes mellitus + ) + AND drug_concept_id IN ( + SELECT descendant_concept_id + FROM @cdm_database_schema.concept_ancestor + WHERE ancestor_concept_id = 21600712 -- DRUGS USED IN DIABETES (ATC A10) + ) +GROUP BY condition_occurrence.person_id; +``` +We substitute the arguments in this SQL with actual values, translate it to the right SQL dialect, and execute the SQL: + +```{r tidy=FALSE,eval=FALSE} +library(SqlRender) +sql <- 
readSql("covariateCohorts.sql") +connection <- connect(connectionDetails) +renderTranslateExecuteSql(connection = connection, + sql = sql, + cdm_database_schema = cdmDatabaseSchema, + cohort_database_schema = cohortDatabaseSchema, + cohort_table = cohortTable) +``` + +If all went well, we now have a table with the cohorts. We can count the number of entries in each cohort: + +```{r eval=FALSE} +sql <- paste("SELECT cohort_definition_id, + COUNT(*) AS count", + "FROM @cohort_database_schema.@cohort_table", + "GROUP BY cohort_definition_id") +renderTranslateQuerySql(connection = connection, + sql = sql, + cohort_database_schema = cohortDatabaseSchema, + cohort_table = cohortTable) +``` +```{r echo=FALSE,message=FALSE} +data.frame(cohort_definition_id = c(1, 2),count = c(954179, 979874)) +``` + +## Using the cohort as covariate + +To use the constructed diabetes cohort as a covariate, we need to create a `covariateSettings` object: + +```{r eval=FALSE} +covariateCohorts <- tibble(cohortId = 2, + cohortName = "Type 2 diabetes") + +covariateSettings <- createCohortBasedCovariateSettings(analysisId = 999, + covariateCohorts = covariateCohorts, + valueType = "binary", + startDay = -365, + endDay = 0) +``` + +Here we first create a tibble containing two columns. The `cohortId` column lists the cohort definition IDs used in our cohort table. The `cohortName` column will be used to create the covariate names. In this case, we only have one cohort in this table, but there could be many. A separate covariate will be created for each cohort. + +We then specify an analysis, to which we assign a unique ID (between 1 and 999). We specify we want to create binary covariates, meaning the covariate will have value = 1 if the cohort is found during the window, and value = 0 if not found. (Because we typically use sparse matrices to represent our covariates, entries with value = 0 will not be included). 
Alternatively, we could have set `valueType = "count"`, in which case the covariate value would be the number of times the cohort was observed in the time window. Finally, we specify the covariate capture window spans the 365 days before (and including) entry into the main (diclofenac) cohort. + +We could also specify the cohort database schema and table where the cohorts for constructing the covariates can be found. However, because both the main cohorts and covariate cohorts are in the same table this is not necessary. + +We can now construct the covariates: + +```{r eval=FALSE} +covariateData <- getDbCovariateData(connectionDetails = connectionDetails, + cdmDatabaseSchema = cdmDatabaseSchema, + cohortDatabaseSchema = cohortDatabaseSchema, + cohortTable = cohortTable, + cohortId = 1, + rowIdField = "subject_id", + covariateSettings = covariateSettings) +summary(covariateData) +``` +```{r echo=FALSE,message=FALSE} +if (file.exists(file.path(vignetteFolder, "covariatesPerPerson"))) { + covariateData <- loadCovariateData(file.path(vignetteFolder, "covariatesPerPerson")) + summary(covariateData) +} +``` +```{r eval=FALSE} +covariateData$covariateRef +``` +```{r echo=FALSE,message=FALSE} +if (file.exists(file.path(vignetteFolder, "covariatesPerPerson"))) { + covariateData$covariateRef +} +``` + +In this case we will have only one covariate, diabetes in the year before index. In most cases, we will want our custom covariates in addition to the default covariates. 
We can do this by creating a list of covariate settings: + +```{r eval=FALSE} +covariateSettings1 <- createCovariateSettings(useDemographicsGender = TRUE, + useDemographicsAgeGroup = TRUE, + useDemographicsRace = TRUE, + useDemographicsEthnicity = TRUE, + useDemographicsIndexYear = TRUE, + useDemographicsIndexMonth = TRUE) + +covariateCohorts <- tibble(cohortId = 2, + cohortName = "Type 2 diabetes") + +covariateSettings2 <- createCohortBasedCovariateSettings(analysisId = 999, + covariateCohorts = covariateCohorts, + valueType = "binary", + startDay = -365, + endDay = 0) + +covariateSettingsList <- list(covariateSettings1, covariateSettings2) + +covariateData <- getDbCovariateData(connectionDetails = connectionDetails, + cdmDatabaseSchema = cdmDatabaseSchema, + cohortDatabaseSchema = cohortDatabaseSchema, + cohortTable = cohortTable, + cohortId = 1, + rowIdField = "subject_id", + covariateSettings = covariateSettingsList, + aggregated = TRUE) +summary(covariateData) +``` +```{r echo=FALSE,message=FALSE} +if (file.exists(file.path(vignetteFolder, "covariatesAggregated"))) { + covariateData <- loadCovariateData(file.path(vignetteFolder, "covariatesAggregated")) + summary(covariateData) +} +``` + +In this example both demographic covariates and our diabetes covariate were generated. Note that, for illustration purposes, here we opted for aggregated covariates. +