seperate fuzzy match

OHDSI · Nov 18, 2024 · 188cb28 · 188cb28
1 parent cd0ac4e
commit 188cb28
Show file tree

Hide file tree

Showing 4 changed files with 148 additions and 53 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -20,6 +20,7 @@ export(getCountOfSourceCodesMappedToStandardConcept)
 export(getDomain)
 export(getDrugIngredients)
 export(getExcludedConceptsInConceptSetExpression)
+export(getFuzzyMatchOfCodesToOmopConceptCode)
 export(getMappedSourceConcepts)
 export(getMappedStandardConcepts)
 export(getMedraRelationship)

diff --git a/R/GetFuzzyMatchOfCodesToOmopConceptCode.R b/R/GetFuzzyMatchOfCodesToOmopConceptCode.R
@@ -0,0 +1,88 @@
+# Copyright 2024 Observational Health Data Sciences and Informatics
+#
+# This file is part of ConceptSetDiagnostics
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+
+#' Peform fuzzy match of codes.
+#'
+#' @description
+#' Given two data frames, one with codes from external source (e.g. read from spreadsheet) vs
+#' another data frame with OMOP vocabulary, perform fuzzy string match.
+#'
+#' @param sourceCodes An array of codes to match to omop.
+#'
+#' @param omopConcepts A data frame with atleast conceptId, conceptCode
+#'
+#' @param removeSpecialCharacters During matching do you want to remove special characters.e.g. when
+#'        source codes have period removed, but they exist in omop vocabulary concept code.
+#'
+#' @return
+#' Returns a list of objects (to be described.)
+#'
+#' @export
+getFuzzyMatchOfCodesToOmopConceptCode <- function(sourceCodes,
+                                                  omopConcepts,
+                                                  removeSpecialCharacters = TRUE) {
+  sourceDf <- dplyr::tibble(conceptCodeSourceOriginal = sourceCodes) |>
+    dplyr::distinct() |>
+    dplyr::mutate(conceptCodeSource = conceptCodeSourceOriginal)
+
+  targetDf <- omopConcepts |>
+    dplyr::rename(conceptCodeOmopOriginal = conceptCode) |>
+    dplyr::distinct() |>
+    dplyr::mutate(conceptCodeOmop = conceptCodeOmopOriginal)
+
+  if (removeSpecialCharacters) {
+    #note: in current implementation removeSpecialCharacters only removes periods. This is mostly useful in ICD codes.
+    sourceDf <- sourceDf |>
+      dplyr::mutate(
+        conceptCodeSource = stringr::str_remove_all(string = conceptCodeSourceOriginal, pattern = stringr::fixed("."))
+      )
+
+    targetDf <- targetDf |>
+      dplyr::mutate(
+        conceptCodeOmop = stringr::str_remove_all(string = conceptCodeOmopOriginal, pattern = stringr::fixed("."))
+      )
+  }
+
+  codesWithConceptId <- fuzzyStringJoinDataFrame(
+    df1 = sourceDf,
+    df2 = targetDf,
+    field1 = "conceptCodeSource",
+    field2 = "conceptCodeOmop"
+  ) |>
+    dplyr::distinct()
+
+  output <- c()
+
+  # find imperfect matches
+  output$approximateMatch <- codesWithConceptId |>
+    dplyr::filter(conceptCodeSource != conceptCodeOmop)
+  output$perfectMatch <- codesWithConceptId |>
+    dplyr::filter(conceptCodeSource == conceptCodeOmop)
+
+  output$approximateMatch <- output$approximateMatch |>
+    dplyr::anti_join(output$perfectMatch |>
+                       dplyr::select(conceptCodeSource) |>
+                       dplyr::distinct())
+
+  if (nrow(output$approximateMatch) > 0) {
+    message("There are codes without perfect match. Please look at approximateMatch in output.")
+  }
+
+  return(output)
+
+}
diff --git a/R/GetStandardMappingRecommendationsForNonStandard.R b/R/GetStandardMappingRecommendationsForNonStandard.R
@@ -62,6 +62,7 @@ getStandardMappingRecommendationsForNonStandard <- function(connectionDetails =
     tempEmulationSchema = tempEmulationSchema
   )
 
+  #check if the request is for a vocabulary Id that does not map to OMOP.
   sourceVocabularyIdNotInOmop <- setdiff(sourceVocabularyId, omopVocabularyId$vocabularyId)
 
   if (length(sourceVocabularyIdNotInOmop) > 1) {
@@ -71,6 +72,8 @@ getStandardMappingRecommendationsForNonStandard <- function(connectionDetails =
     ))
   }
 
+
+  #download from remote vocabulary table a subset of omop concepts to map.
   omopVocabularyToMatch <- DatabaseConnector::renderTranslateQuerySql(
     connection = connection,
     sql = "SELECT concept_id, concept_code, vocabulary_id
@@ -82,63 +85,34 @@ getStandardMappingRecommendationsForNonStandard <- function(connectionDetails =
     tempEmulationSchema = tempEmulationSchema
   ) |>
     dplyr::tibble()
+
 
-  sourceDf <- dplyr::tibble(conceptCodeSourceOriginal = sourceCodes) |>
-    dplyr::distinct() |>
-    dplyr::mutate(conceptCodeSource = conceptCodeSourceOriginal)
-
-  targetDf <- omopVocabularyToMatch |>
-    dplyr::rename(conceptCodeOmopOriginal = conceptCode) |>
-    dplyr::distinct() |>
-    dplyr::mutate(conceptCodeOmop = conceptCodeOmopOriginal)
-
-  if (removeSpecialCharacters) {
-    #note: in current implementation removeSpecialCharacters only removes periods. This is mostly useful in ICD codes.
-    sourceDf <- sourceDf |>
-      dplyr::mutate(
-        conceptCodeSource = stringr::str_remove_all(string = conceptCodeSourceOriginal, pattern = stringr::fixed("."))
-      )
-
-    targetDf <- targetDf |>
-      dplyr::mutate(
-        conceptCodeOmop = stringr::str_remove_all(string = conceptCodeOmopOriginal, pattern = stringr::fixed("."))
-      )
-  }
-
-  # #fuzzy string matching is slow. we also restrict by vocabularyId
-  codesWithConceptId <- fuzzyStringJoinDataFrame(
-    df1 = sourceDf,
-    df2 = targetDf,
-    field1 = "conceptCodeSource",
-    field2 = "conceptCodeOmop"
-  ) |>
-    dplyr::distinct()
-
-  # find imperfect matches
-  approximateMatch <- codesWithConceptId |>
-    dplyr::filter(conceptCodeSource != conceptCodeOmop)
-  perfectMatch <- codesWithConceptId |>
-    dplyr::filter(conceptCodeSource == conceptCodeOmop)
+  #do the fuzzy match - but we expect to 100%match for this work.
+  #message will be shown if there are approximate match
+  fuzzyMatch <- getFuzzyMatchOfCodesToOmopConceptCode(
+    sourceCodes = sourceCodes,
+    omopConcepts = omopVocabularyToMatch,
+    removeSpecialCharacters = removeSpecialCharacters
+  )
 
-  approximateMatch <- approximateMatch |>
-    dplyr::anti_join(perfectMatch |>
-                       dplyr::select(conceptCodeSource) |>
-                       dplyr::distinct())
+  #only use perfect match moving forward, return all matches to user at codesWithConceptId. return all approximate
+  # matches as FYI. they are ignored moving forward.
 
-  output$approximateMatch <- approximateMatch
+  output$codesWithConceptId <- fuzzyMatch$perfectMatch
+  output$codesWithConceptIdApproximate <- fuzzyMatch$approximateMatch
 
-  if (nrow(output$approximateMatch) > 0) {
-    message("There are codes without perfect match. Please look at approximateMatch in output.")
-  }
+  #only use perfect match moving forward
 
+  # find standard mapping for non standard
   mappedStandard <- ConceptSetDiagnostics::getMappedStandardConcepts(
-    conceptIds = codesWithConceptId$conceptId |> unique(),
+    conceptIds = output$codesWithConceptId$conceptId |> unique(),
     connection = connection,
     vocabularyDatabaseSchema = vocabularyDatabaseSchema,
     tempEmulationSchema = tempEmulationSchema
   )
 
-  output$unmappedCodes <- codesWithConceptId |>
+  #are there any source codes without mapping? there should not be
+  output$unmappedCodes <- output$codesWithConceptId |>
     dplyr::anti_join(
       mappedStandard |>
         dplyr::select(givenConceptId) |>
@@ -150,16 +124,19 @@ getStandardMappingRecommendationsForNonStandard <- function(connectionDetails =
     message("There are concept id without mapped standard. Please look at unmappedCodes.")
   }
 
-  numberOfMappedStandardConceptsMappedToGivenSource <-
+  # a source concept may be mapped to more than one standard. count if that is occurring
+  numberOfMappedStandardConceptsMappedToGivenSourceDf <-
     mappedStandard |>
     dplyr::select(givenConceptId, conceptId) |>
     dplyr::distinct() |>
     dplyr::group_by(givenConceptId) |>
     dplyr::summarise(numberOfMappedStandardConceptsMappedToGivenSource = n())
 
   mappedStandard <- mappedStandard |>
-    dplyr::left_join(numberOfMappedStandardConceptsMappedToGivenSource, by = "givenConceptId")
+    dplyr::left_join(numberOfMappedStandardConceptsMappedToGivenSourceDf, by = "givenConceptId")
+
 
+  # get descendants of all standard concepts
   descendantsOfStandardConcept <- ConceptSetDiagnostics::getConceptDescendant(
     conceptIds = mappedStandard$conceptId,
     connection = connection,
@@ -169,6 +146,7 @@ getStandardMappingRecommendationsForNonStandard <- function(connectionDetails =
     dplyr::select(ancestorConceptId, descendantConceptId) |>
     dplyr::distinct()
 
+  #get mapped concept for the standard esp descendants
   mappedSource <- ConceptSetDiagnostics::getMappedSourceConcepts(
     conceptIds = c(
       mappedStandard$conceptId,
@@ -179,32 +157,33 @@ getStandardMappingRecommendationsForNonStandard <- function(connectionDetails =
     tempEmulationSchema = tempEmulationSchema
   )
 
+  #filter to desired vocabulary
   mappedSourceFiltered <- mappedSource |>
     dplyr::filter(vocabularyId %in% c(sourceVocabularyId)) |>
     dplyr::left_join(
-      codesWithConceptId |>
+      output$codesWithConceptId |>
         dplyr::select(conceptId) |>
         dplyr::distinct() |>
         dplyr::mutate(isInputConceptId = 1)
     ) |>
     tidyr::replace_na(list(isInputConceptId = 0))
 
-  conceptIds <- c(codesWithConceptId$conceptId,
+  #find all concept id and get their detail
+  conceptIds <- c(output$codesWithConceptId$conceptId,
                   mappedStandard$conceptId,
                   mappedSource$conceptId) |>
     unique()
 
-
   output$conceptIdDetails <- ConceptSetDiagnostics::getConceptIdDetails(
     conceptIds = conceptIds,
     connection = connection,
     vocabularyDatabaseSchema = vocabularyDatabaseSchema
   ) |>
     dplyr::arrange(conceptId)
 
-
   browser()
-
+
+  #this is the main output. it has the source and mapped standard  
   output$sourceMappedToStandard <- mappedStandard |>
     dplyr::rename(sourceConceptId = givenConceptId, standardConceptId = conceptId) |>
     dplyr::select(

diff --git a/man/getFuzzyMatchOfCodesToOmopConceptCode.Rd b/man/getFuzzyMatchOfCodesToOmopConceptCode.Rd