Add new function to map source to standard

OHDSI · Nov 7, 2024 · cd0ac4e · cd0ac4e
1 parent 9d01f9e
commit cd0ac4e
Show file tree

Hide file tree

Showing 6 changed files with 392 additions and 4 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -4,8 +4,10 @@ export(convertConceptSetDataFrameToExpression)
 export(convertConceptSetExpressionToDataFrame)
 export(extractConceptSetsInCohortDefinition)
 export(extractConceptSetsInCohortDefinitionSet)
+export(findClosestMatch)
 export(findOrphanConcepts)
 export(findOrphanConceptsForConceptSetExpression)
+export(fuzzyStringJoinDataFrame)
 export(getConceptAncestor)
 export(getConceptDescendant)
 export(getConceptIdDetails)
@@ -25,6 +27,7 @@ export(getRecommendationForConceptSetExpression)
 export(getRecommendedSource)
 export(getRecommendedStandard)
 export(getRelationship)
+export(getStandardMappingRecommendationsForNonStandard)
 export(getVocabulary)
 export(getVocabularyVersion)
 export(instantiateCohortFromConceptSetExpression)

diff --git a/R/FindClosestMatch.R b/R/FindClosestMatch.R
@@ -0,0 +1,50 @@
+# Function to find closest match
+#' @export
+findClosestMatch <- function(sourceString, targetVector) {
+  distances <- stringdist::stringdist(sourceString, targetVector)
+  targetVector[which.min(distances)]
+}
+
+
+# Function to perform fuzzy string join on two data frames
+#' @export
+fuzzyStringJoinDataFrame <- function(df1, df2, field1, field2) {
+  library(tidyverse)
+  library(stringdist)
+
+  # Calculate the closest matches using stringdist and include match score
+  findClosestMatch <- function(sourceString, targetVector) {
+    distances <- stringdist::stringdist(sourceString, targetVector)
+    minIndex <- which.min(distances)
+    score <- 1 - (distances[minIndex] / max(nchar(sourceString), nchar(targetVector[minIndex])))
+    return(c(match = targetVector[minIndex], score = score))
+  }
+
+  # Map each entry in df1's field1 to the closest entry in df2's field2 and include scores
+  closestMatches <- dplyr::tibble(field1 = df1[[field1]]) |> 
+    dplyr::rowwise() |> 
+    dplyr::mutate(
+      closestField2 = findClosestMatch(field1, df2[[field2]])[1],
+      matchScore = as.numeric(findClosestMatch(field1, df2[[field2]])[2])
+    ) |> 
+    dplyr::ungroup()
+
+  # Join the closest matches with df2 to retrieve all corresponding columns
+  mappedData <- closestMatches |> 
+    dplyr::left_join(df2, by = c("closestField2" = field2))
+
+  # Join the mapped data back with df1
+  finalResult <- mappedData |> 
+    dplyr::right_join(df1, by = c("field1" = field1))
+
+  # Rename columns to maintain the original naming convention from df1 and df2
+  finalResult <- finalResult |> 
+    dplyr::rename_with(.cols = everything(), .fn = ~gsub("field1", field1, .)) |> 
+    dplyr::rename_with(.cols = everything(), .fn = ~gsub("closestField2", field2, .))
+
+  finalResult <- df1 |> 
+    dplyr::left_join(finalResult)
+
+  return(finalResult)
+}
+
diff --git a/R/GetStandardMappingRecommendationsForNonStandard.R b/R/GetStandardMappingRecommendationsForNonStandard.R
@@ -0,0 +1,295 @@
+# Copyright 2024 Observational Health Data Sciences and Informatics
+#
+# This file is part of ConceptSetDiagnostics
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+
+#' Get standard mapping recommendations for non standard.
+#'
+#' @description
+#' Get standard mapping recommendations for non standard.
+#'
+#' @template Connection
+#'
+#' @template VocabularyDatabaseSchema
+#'
+#' @template TempEmulationSchema
+#'
+#' @return
+#' Returns a list of objects (to be described.)
+#'
+#' @export
+getStandardMappingRecommendationsForNonStandard <- function(connectionDetails = NULL,
+                                                            connection = NULL,
+                                                            vocabularyDatabaseSchema = cdmDatabaseSchema,
+                                                            cdmDatabaseSchema = NULL,
+                                                            tempEmulationSchema = getOption("sqlRenderTempEmulationSchema"),
+                                                            sourceVocabularyId = c("ICD10CM"),
+                                                            sourceCodes,
+                                                            removeSpecialCharacters = TRUE) {
+  if (is.null(vocabularyDatabaseSchema)) {
+    stop("vocabularyDatabaseSchema cannot be NULL.")
+  }
+
+  output <- c()
+
+  vocabularyIdsToFilter <- quoteAndJoinArray(sourceVocabularyId |> unique())
+
+  if (is.null(connection)) {
+    connection <- DatabaseConnector::connect(connectionDetails)
+    on.exit(
+      DatabaseConnector::dropEmulatedTempTables(connection = connection, tempEmulationSchema = tempEmulationSchema)
+    )
+    on.exit(DatabaseConnector::disconnect(connection), add = TRUE)
+  }
+
+  # Retrieve vocabulary IDs from OMOP to ensure consistency.
+  omopVocabularyId <- ConceptSetDiagnostics::getVocabulary(
+    connection = connection,
+    vocabularyDatabaseSchema = vocabularyDatabaseSchema,
+    tempEmulationSchema = tempEmulationSchema
+  )
+
+  sourceVocabularyIdNotInOmop <- setdiff(sourceVocabularyId, omopVocabularyId$vocabularyId)
+
+  if (length(sourceVocabularyIdNotInOmop) > 1) {
+    stop(paste0(
+      "The following sourceVocabularyId is not in OMOP. ",
+      paste0(sourceVocabularyIdNotInOmop, collapse = ", ")
+    ))
+  }
+
+  omopVocabularyToMatch <- DatabaseConnector::renderTranslateQuerySql(
+    connection = connection,
+    sql = "SELECT concept_id, concept_code, vocabulary_id
+            FROM @vocabulary_database_schema.concept
+            WHERE vocabulary_id IN (@vocabulary_ids);",
+    snakeCaseToCamelCase = TRUE,
+    vocabulary_database_schema = vocabularyDatabaseSchema,
+    vocabulary_ids = vocabularyIdsToFilter,
+    tempEmulationSchema = tempEmulationSchema
+  ) |>
+    dplyr::tibble()
+
+  sourceDf <- dplyr::tibble(conceptCodeSourceOriginal = sourceCodes) |>
+    dplyr::distinct() |>
+    dplyr::mutate(conceptCodeSource = conceptCodeSourceOriginal)
+
+  targetDf <- omopVocabularyToMatch |>
+    dplyr::rename(conceptCodeOmopOriginal = conceptCode) |>
+    dplyr::distinct() |>
+    dplyr::mutate(conceptCodeOmop = conceptCodeOmopOriginal)
+
+  if (removeSpecialCharacters) {
+    #note: in current implementation removeSpecialCharacters only removes periods. This is mostly useful in ICD codes.
+    sourceDf <- sourceDf |>
+      dplyr::mutate(
+        conceptCodeSource = stringr::str_remove_all(string = conceptCodeSourceOriginal, pattern = stringr::fixed("."))
+      )
+
+    targetDf <- targetDf |>
+      dplyr::mutate(
+        conceptCodeOmop = stringr::str_remove_all(string = conceptCodeOmopOriginal, pattern = stringr::fixed("."))
+      )
+  }
+
+  # #fuzzy string matching is slow. we also restrict by vocabularyId
+  codesWithConceptId <- fuzzyStringJoinDataFrame(
+    df1 = sourceDf,
+    df2 = targetDf,
+    field1 = "conceptCodeSource",
+    field2 = "conceptCodeOmop"
+  ) |>
+    dplyr::distinct()
+
+  # find imperfect matches
+  approximateMatch <- codesWithConceptId |>
+    dplyr::filter(conceptCodeSource != conceptCodeOmop)
+  perfectMatch <- codesWithConceptId |>
+    dplyr::filter(conceptCodeSource == conceptCodeOmop)
+
+  approximateMatch <- approximateMatch |>
+    dplyr::anti_join(perfectMatch |>
+                       dplyr::select(conceptCodeSource) |>
+                       dplyr::distinct())
+
+  output$approximateMatch <- approximateMatch
+
+  if (nrow(output$approximateMatch) > 0) {
+    message("There are codes without perfect match. Please look at approximateMatch in output.")
+  }
+
+  mappedStandard <- ConceptSetDiagnostics::getMappedStandardConcepts(
+    conceptIds = codesWithConceptId$conceptId |> unique(),
+    connection = connection,
+    vocabularyDatabaseSchema = vocabularyDatabaseSchema,
+    tempEmulationSchema = tempEmulationSchema
+  )
+
+  output$unmappedCodes <- codesWithConceptId |>
+    dplyr::anti_join(
+      mappedStandard |>
+        dplyr::select(givenConceptId) |>
+        dplyr::rename(conceptId = givenConceptId) |>
+        dplyr::distinct()
+    )
+
+  if (nrow(output$unmappedCodes) > 0) {
+    message("There are concept id without mapped standard. Please look at unmappedCodes.")
+  }
+
+  numberOfMappedStandardConceptsMappedToGivenSource <-
+    mappedStandard |>
+    dplyr::select(givenConceptId, conceptId) |>
+    dplyr::distinct() |>
+    dplyr::group_by(givenConceptId) |>
+    dplyr::summarise(numberOfMappedStandardConceptsMappedToGivenSource = n())
+
+  mappedStandard <- mappedStandard |>
+    dplyr::left_join(numberOfMappedStandardConceptsMappedToGivenSource, by = "givenConceptId")
+
+  descendantsOfStandardConcept <- ConceptSetDiagnostics::getConceptDescendant(
+    conceptIds = mappedStandard$conceptId,
+    connection = connection,
+    vocabularyDatabaseSchema = vocabularyDatabaseSchema
+  ) |>
+    dplyr::filter(minLevelsOfSeparation > 0) |>
+    dplyr::select(ancestorConceptId, descendantConceptId) |>
+    dplyr::distinct()
+
+  mappedSource <- ConceptSetDiagnostics::getMappedSourceConcepts(
+    conceptIds = c(
+      mappedStandard$conceptId,
+      descendantsOfStandardConcept$descendantConceptId
+    ) |> unique(),
+    connection = connection,
+    vocabularyDatabaseSchema = vocabularyDatabaseSchema,
+    tempEmulationSchema = tempEmulationSchema
+  )
+
+  mappedSourceFiltered <- mappedSource |>
+    dplyr::filter(vocabularyId %in% c(sourceVocabularyId)) |>
+    dplyr::left_join(
+      codesWithConceptId |>
+        dplyr::select(conceptId) |>
+        dplyr::distinct() |>
+        dplyr::mutate(isInputConceptId = 1)
+    ) |>
+    tidyr::replace_na(list(isInputConceptId = 0))
+
+  conceptIds <- c(codesWithConceptId$conceptId,
+                  mappedStandard$conceptId,
+                  mappedSource$conceptId) |>
+    unique()
+
+
+  output$conceptIdDetails <- ConceptSetDiagnostics::getConceptIdDetails(
+    conceptIds = conceptIds,
+    connection = connection,
+    vocabularyDatabaseSchema = vocabularyDatabaseSchema
+  ) |>
+    dplyr::arrange(conceptId)
+
+
+  browser()
+
+  output$sourceMappedToStandard <- mappedStandard |>
+    dplyr::rename(sourceConceptId = givenConceptId, standardConceptId = conceptId) |>
+    dplyr::select(
+      sourceConceptId,
+      standardConceptId,
+      numberOfMappedStandardConceptsMappedToGivenSource
+    ) |>
+    dplyr::left_join(
+      output$conceptIdDetails |>
+        dplyr::rename(
+          sourceConceptId = conceptId,
+          sourceConceptName = conceptName,
+          sourceConceptCode = conceptCode,
+          sourceVocabularyId = vocabularyId
+        ) |>
+        dplyr::select(dplyr::starts_with("source")),
+      by = "sourceConceptId"
+    ) |>
+    dplyr::left_join(
+      output$conceptIdDetails |>
+        dplyr::rename(
+          standardConceptId = conceptId,
+          standardConceptName = conceptName,
+          standardConceptCode = conceptCode,
+          standardVocabularyId = vocabularyId
+        ) |>
+        dplyr::select(dplyr::starts_with("standard")),
+      by = "standardConceptId"
+    ) |>
+    dplyr::relocate(dplyr::starts_with("source"),
+                    dplyr::starts_with("standard")) |>
+    dplyr::arrange(sourceConceptId) |>
+    dplyr::mutate(
+      stringDistanceSourceToStandard = stringdist::stringdist(sourceConceptName, standardConceptName, method = "lcs")
+    ) |>
+    dplyr::left_join(
+      mappedSourceFiltered |>
+        dplyr::rename(standardConceptId = givenConceptId) |>
+        dplyr::group_by(standardConceptId) |>
+        dplyr::summarise(mappedNonStandardDirect = n()),
+      by = "standardConceptId"
+    ) |>
+    dplyr::left_join(
+      mappedSourceFiltered |>
+        dplyr::rename(standardConceptId = givenConceptId) |>
+        dplyr::group_by(standardConceptId) |>
+        dplyr::summarise(mappedNonStandardDirectCodes = paste0(conceptCode, collapse = "\n")),
+      by = "standardConceptId"
+    ) |>
+    dplyr::left_join(
+      mappedSourceFiltered |>
+        dplyr::rename(standardConceptId = givenConceptId) |>
+        dplyr::group_by(standardConceptId) |>
+        dplyr::summarise(mappedNonStandardDirectNames = paste0(conceptName, collapse = "\n")),
+      by = "standardConceptId"
+    ) |>
+    dplyr::left_join(
+      mappedSourceFiltered |>
+        dplyr::rename(standardConceptId = givenConceptId) |>
+        dplyr::filter(isInputConceptId == 0) |>
+        dplyr::group_by(standardConceptId) |>
+        dplyr::summarise(mappedNonStandardDirectNotInInput = n()),
+      by = "standardConceptId"
+    ) |>
+    dplyr::left_join(
+      mappedSourceFiltered |>
+        dplyr::rename(standardConceptId = givenConceptId) |>
+        dplyr::filter(isInputConceptId == 0) |>
+        dplyr::group_by(standardConceptId) |>
+        dplyr::summarise(
+          mappedNonStandardDirectCodesNotInInput = paste0(conceptCode, collapse = "\n")
+        ),
+      by = "standardConceptId"
+    ) |>
+    dplyr::left_join(
+      mappedSourceFiltered |>
+        dplyr::rename(standardConceptId = givenConceptId) |>
+        dplyr::filter(isInputConceptId == 0) |>
+        dplyr::group_by(standardConceptId) |>
+        dplyr::summarise(
+          mappedNonStandardDirectNamesNotInInput = paste0(conceptName, collapse = "\n")
+        ),
+      by = "standardConceptId"
+    )
+
+  return(output)
+
+}
diff --git a/R/Private.R b/R/Private.R
@@ -15,6 +15,9 @@
 # limitations under the License.
 #
 
+quoteAndJoinArray <- function(stringArray) {
+  return(paste0(paste0("'", stringArray, "'"), collapse = ", "))
+}
 
 checkIfCohortDefinitionSet <- function(cohortDefinitionSet) {
   errorMessage <- checkmate::makeAssertCollection()

diff --git a/man/getConceptRecordCount.Rd b/man/getConceptRecordCount.Rd