j

OHDSI · Nov 18, 2024 · 302e719 · 302e719
1 parent 188cb28
commit 302e719
Show file tree

Hide file tree

Showing 3 changed files with 116 additions and 98 deletions.
diff --git a/R/GetConceptRecordCount.R b/R/GetConceptRecordCount.R
@@ -91,28 +91,30 @@ getConceptRecordCount <- function(conceptIds = NULL,
     )
     on.exit(DatabaseConnector::disconnect(connection), add = TRUE)
   }
-
+  
   uploadedConceptTable <- ""
   if (!is.null(conceptIds)) {
     uploadedConceptTable <-
       loadTempConceptTable(
         conceptIds = conceptIds,
-        connection = connection
+        connection = connection,
+        bulkLoad = Sys.getenv("DATABASE_CONNECTOR_BULK_UPLOAD"),
+        tempEmulationSchema = tempEmulationSchema
       )
   }
-
+  
   domainInformation <-
     getDomainInformation()
-
+  
   domainsWide <- domainInformation$wide |>
     dplyr::filter(.data$domainTable %in% c(domainTableName)) |>
     dplyr::filter(.data$isEraTable == FALSE)
-
+  
   domainsLong <- domainInformation$long |>
     dplyr::filter(.data$domainTable %in% c(domainTableName)) |>
     dplyr::filter(.data$eraTable == FALSE)
   # filtering out ERA tables because they are supposed to be derived tables, and counting them is double counting
-
+  
   limitToCohort <- FALSE
   if (all(
     !is.null(cohortDatabaseSchema),
@@ -121,25 +123,24 @@ getConceptRecordCount <- function(conceptIds = NULL,
   )) {
     limitToCohort <- TRUE
   }
-
+  
   # REASON for many SQL --DISTINCT subject_count cannot be computed from aggregation query of calendar month level data
   sql <- "
 
           DROP TABLE IF EXISTS #concept_count_table;
 
           {@concept_id_universe != ''} ? {
             DROP TABLE IF EXISTS #concept_id_unv_2;
-            CREATE TABLE #concept_id_unv_2 as
-              (
-                SELECT DISTINCT u.concept_id
-                FROM @concept_id_universe u
-                INNER JOIN (
-                  SELECT concept_id
-                  FROM @vocabulary_database_schema.CONCEPT
-                  WHERE concept_id > 0
-                            ) c
-               ON u.concept_id = c.concept_id
-              ) ;
+
+            SELECT DISTINCT u.concept_id
+            INTO #concept_id_unv_2
+            FROM @concept_id_universe u
+            INNER JOIN (
+              SELECT concept_id
+              FROM @vocabulary_database_schema.CONCEPT
+              WHERE concept_id > 0
+                        ) c
+           ON u.concept_id = c.concept_id;
           }
 
           --PERCENTILES are difficult and will need subqueries
@@ -227,7 +228,7 @@ getConceptRecordCount <- function(conceptIds = NULL,
                 {@use_date_quarter} ? {DATEPART(quarter, @domain_start_date),}};
 
             "
-
+  
   iterations <- domainsLong |>
     tidyr::crossing(dplyr::tibble(includeConceptId = c("Y", "N", ""))) |>
     tidyr::crossing(dplyr::tibble(genderConceptId = c(0, 8507, 8532))) |>
@@ -263,72 +264,68 @@ getConceptRecordCount <- function(conceptIds = NULL,
     tidyr::crossing(dplyr::tibble(useAgeGroup = c("Y", "N"))) |>
     dplyr::arrange() |>
     dplyr::mutate(combination = dplyr::row_number())
-
+  
   if (!stratifyByGender) {
     iterations <- iterations |>
       dplyr::filter(!.data$genderConceptId %in% c(8507, 8532))
   }
-
+  
   if (!stratifyByYear) {
     iterations <- iterations |>
       dplyr::filter(.data$calendarType != "Y")
   }
-
+  
   if (!stratifyByYearQuarter) {
     iterations <- iterations |>
       dplyr::filter(.data$calendarType != "Q")
   }
-
+  
   if (!stratifyByYearMonth) {
     iterations <- iterations |>
       dplyr::filter(.data$calendarType != "M")
   }
-
+  
   if (!stratifyByAgeGroup) {
     iterations <- iterations |>
       dplyr::filter(.data$useAgeGroup != "Y")
   }
-
+  
   if (!stratifyByIncidence) {
     iterations <- iterations |>
       dplyr::filter(.data$incidence != "Y")
   }
-
+  
   if (!getOverallCounts) {
     iterations <- iterations |>
       dplyr::filter(.data$includeConceptId == "Y")
   }
-
+  
   existingOutput <- c()
-
+  
   for (i in (1:nrow(iterations))) {
     rowData <- iterations[i, ]
-
+    
     extraMessage <-
-      paste0(
-        "Working on ",
-        rowData$domainTable,
-        ".",
-        rowData$domainField,
-        "."
-      )
+      paste0("Working on ",
+             rowData$domainTable,
+             ".",
+             rowData$domainField,
+             ".")
     progress <- (i / nrow(iterations)) * 100
     message <-
-      sprintf(
-        "\rProgress: %d/%d (%0.2f%%)",
-        i,
-        nrow(iterations),
-        progress
-      )
-
+      sprintf("\rProgress: %d/%d (%0.2f%%)",
+              i,
+              nrow(iterations),
+              progress)
+
     ParallelLogger::logInfo(message)
-
+    
     showProgress(
       currentIteration = i,
       totalIterations = nrow(iterations),
       extraMessage = extraMessage
     )
-
+    
     sqlRendered <- SqlRender::render(
       sql = sql,
       cdm_database_schema = cdmDatabaseSchema,
@@ -363,41 +360,38 @@ getConceptRecordCount <- function(conceptIds = NULL,
       cohort_definition_id = cohortDefinitionId,
       use_age_group = (rowData$useAgeGroup == "Y")
     )
-
+    
     # Regular expression to find a comma followed by any whitespace (including line breaks) and a semicolon
     regexPattern <- ",[\\s\\n\\r]*;"
-
+    
     # Replace the pattern with just a semicolon
     sqlRendered <- gsub(regexPattern, ";", sqlRendered, perl = TRUE)
-
+    
     sqlTranslated <-
       SqlRender::translate(
         sql = sqlRendered,
         targetDialect = connection@dbms,
         tempEmulationSchema = tempEmulationSchema
       )
-
+    
     DatabaseConnector::executeSql(
       connection = connection,
       sql = sqlTranslated,
       progressBar = FALSE,
       reportOverallTime = FALSE,
       profile = FALSE
     )
-
+    
     output <- DatabaseConnector::renderTranslateQuerySql(
       connection = connection,
       sql = "SELECT * FROM #concept_count_table;",
       snakeCaseToCamelCase = TRUE
     )
-
-    existingOutput <- dplyr::bind_rows(
-      existingOutput,
-      output
-    ) |>
+
+    existingOutput <- dplyr::bind_rows(existingOutput, output) |>
       dplyr::tibble()
   }
-
+  
   DatabaseConnector::renderTranslateExecuteSql(
     connection = connection,
     profile = FALSE,
@@ -406,7 +400,7 @@ getConceptRecordCount <- function(conceptIds = NULL,
     sql = "DROP TABLE IF EXISTS #concept_count_table;
            DROP TABLE IF EXISTS #concept_id_unv_2;"
   )
-
+  
   existingOutput <- existingOutput |>
     dplyr::inner_join(
       domainInformation$long |>
@@ -419,11 +413,11 @@ getConceptRecordCount <- function(conceptIds = NULL,
       by = c("domainTableShort", "domainFieldShort")
     ) |>
     dplyr::select(-"domainFieldShort", -"domainTableShort")
-
+  
   if (!is.null(minCellCount)) {
     existingOutput <-
       existingOutput |> dplyr::filter(.data$subjectCount > minCellCount)
   }
-
+  
   return(existingOutput)
 }
diff --git a/R/GetStandardMappingRecommendationsForNonStandard.R b/R/GetStandardMappingRecommendationsForNonStandard.R
@@ -38,7 +38,8 @@ getStandardMappingRecommendationsForNonStandard <- function(connectionDetails =
                                                             tempEmulationSchema = getOption("sqlRenderTempEmulationSchema"),
                                                             sourceVocabularyId = c("ICD10CM"),
                                                             sourceCodes,
-                                                            removeSpecialCharacters = TRUE) {
+                                                            removeSpecialCharacters = TRUE,
+                                                            includeDescendants = FALSE) {
   if (is.null(vocabularyDatabaseSchema)) {
     stop("vocabularyDatabaseSchema cannot be NULL.")
   }
@@ -85,7 +86,7 @@ getStandardMappingRecommendationsForNonStandard <- function(connectionDetails =
     tempEmulationSchema = tempEmulationSchema
   ) |>
     dplyr::tibble()
-
+  
 
   #do the fuzzy match - but we expect to 100%match for this work.
   #message will be shown if there are approximate match
@@ -136,42 +137,66 @@ getStandardMappingRecommendationsForNonStandard <- function(connectionDetails =
     dplyr::left_join(numberOfMappedStandardConceptsMappedToGivenSourceDf, by = "givenConceptId")
 
 
-  # get descendants of all standard concepts
-  descendantsOfStandardConcept <- ConceptSetDiagnostics::getConceptDescendant(
-    conceptIds = mappedStandard$conceptId,
-    connection = connection,
-    vocabularyDatabaseSchema = vocabularyDatabaseSchema
-  ) |>
-    dplyr::filter(minLevelsOfSeparation > 0) |>
-    dplyr::select(ancestorConceptId, descendantConceptId) |>
-    dplyr::distinct()
-
-  #get mapped concept for the standard esp descendants
-  mappedSource <- ConceptSetDiagnostics::getMappedSourceConcepts(
-    conceptIds = c(
-      mappedStandard$conceptId,
-      descendantsOfStandardConcept$descendantConceptId
-    ) |> unique(),
-    connection = connection,
-    vocabularyDatabaseSchema = vocabularyDatabaseSchema,
-    tempEmulationSchema = tempEmulationSchema
-  )
-
-  #filter to desired vocabulary
-  mappedSourceFiltered <- mappedSource |>
-    dplyr::filter(vocabularyId %in% c(sourceVocabularyId)) |>
-    dplyr::left_join(
-      output$codesWithConceptId |>
-        dplyr::select(conceptId) |>
-        dplyr::distinct() |>
-        dplyr::mutate(isInputConceptId = 1)
+  if (includeDescendants) {
+    # get descendants of all standard concepts
+    descendantsOfStandardConcept <- ConceptSetDiagnostics::getConceptDescendant(
+      conceptIds = mappedStandard$conceptId,
+      connection = connection,
+      vocabularyDatabaseSchema = vocabularyDatabaseSchema
     ) |>
-    tidyr::replace_na(list(isInputConceptId = 0))
+      dplyr::filter(minLevelsOfSeparation > 0) |>
+      dplyr::select(ancestorConceptId, descendantConceptId) |>
+      dplyr::distinct()
+
+    #get mapped concept for the standard esp descendants
+    mappedSource <- ConceptSetDiagnostics::getMappedSourceConcepts(
+      conceptIds = c(
+        mappedStandard$conceptId,
+        descendantsOfStandardConcept$descendantConceptId
+      ) |> unique(),
+      connection = connection,
+      vocabularyDatabaseSchema = vocabularyDatabaseSchema,
+      tempEmulationSchema = tempEmulationSchema
+    )
+
+    #filter to desired vocabulary
+    mappedSourceFiltered <- mappedSource |>
+      dplyr::filter(vocabularyId %in% c(sourceVocabularyId)) |>
+      dplyr::left_join(
+        output$codesWithConceptId |>
+          dplyr::select(conceptId) |>
+          dplyr::distinct() |>
+          dplyr::mutate(isInputConceptId = 1)
+      ) |>
+      tidyr::replace_na(list(isInputConceptId = 0))
+
+  } else {
+    #get mapped concept for the standard esp descendants
+    mappedSource <- ConceptSetDiagnostics::getMappedSourceConcepts(
+      conceptIds = c(mappedStandard$conceptId) |> unique(),
+      connection = connection,
+      vocabularyDatabaseSchema = vocabularyDatabaseSchema,
+      tempEmulationSchema = tempEmulationSchema
+    )
+
+    #filter to desired vocabulary
+    mappedSourceFiltered <- mappedSource |>
+      dplyr::filter(vocabularyId %in% c(sourceVocabularyId)) |>
+      dplyr::left_join(
+        output$codesWithConceptId |>
+          dplyr::select(conceptId) |>
+          dplyr::distinct() |>
+          dplyr::mutate(isInputConceptId = 1)
+      ) |>
+      tidyr::replace_na(list(isInputConceptId = 0))
+  }
 
   #find all concept id and get their detail
-  conceptIds <- c(output$codesWithConceptId$conceptId,
-                  mappedStandard$conceptId,
-                  mappedSource$conceptId) |>
+  conceptIds <- c(
+    output$codesWithConceptId$conceptId,
+    mappedStandard$conceptId,
+    mappedSource$conceptId
+  ) |>
     unique()
 
   output$conceptIdDetails <- ConceptSetDiagnostics::getConceptIdDetails(
@@ -181,9 +206,7 @@ getStandardMappingRecommendationsForNonStandard <- function(connectionDetails =
   ) |>
     dplyr::arrange(conceptId)
 
-  browser()
-
-  #this is the main output. it has the source and mapped standard  
+  #this is the main output. it has the source and mapped standard
   output$sourceMappedToStandard <- mappedStandard |>
     dplyr::rename(sourceConceptId = givenConceptId, standardConceptId = conceptId) |>
     dplyr::select(

diff --git a/man/getStandardMappingRecommendationsForNonStandard.Rd b/man/getStandardMappingRecommendationsForNonStandard.Rd