Skip to content

Commit

Permalink
seperate fuzzy match
Browse files Browse the repository at this point in the history
  • Loading branch information
gowthamrao committed Nov 18, 2024
1 parent cd0ac4e commit 188cb28
Show file tree
Hide file tree
Showing 4 changed files with 148 additions and 53 deletions.
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ export(getCountOfSourceCodesMappedToStandardConcept)
export(getDomain)
export(getDrugIngredients)
export(getExcludedConceptsInConceptSetExpression)
export(getFuzzyMatchOfCodesToOmopConceptCode)
export(getMappedSourceConcepts)
export(getMappedStandardConcepts)
export(getMedraRelationship)
Expand Down
88 changes: 88 additions & 0 deletions R/GetFuzzyMatchOfCodesToOmopConceptCode.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
# Copyright 2024 Observational Health Data Sciences and Informatics
#
# This file is part of ConceptSetDiagnostics
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#


#' Peform fuzzy match of codes.
#'
#' @description
#' Given two data frames, one with codes from external source (e.g. read from spreadsheet) vs
#' another data frame with OMOP vocabulary, perform fuzzy string match.
#'
#' @param sourceCodes An array of codes to match to omop.
#'
#' @param omopConcepts A data frame with atleast conceptId, conceptCode
#'
#' @param removeSpecialCharacters During matching do you want to remove special characters.e.g. when
#' source codes have period removed, but they exist in omop vocabulary concept code.
#'
#' @return
#' Returns a list of objects (to be described.)
#'
#' @export
getFuzzyMatchOfCodesToOmopConceptCode <- function(sourceCodes,
omopConcepts,
removeSpecialCharacters = TRUE) {
sourceDf <- dplyr::tibble(conceptCodeSourceOriginal = sourceCodes) |>
dplyr::distinct() |>
dplyr::mutate(conceptCodeSource = conceptCodeSourceOriginal)

targetDf <- omopConcepts |>
dplyr::rename(conceptCodeOmopOriginal = conceptCode) |>
dplyr::distinct() |>
dplyr::mutate(conceptCodeOmop = conceptCodeOmopOriginal)

if (removeSpecialCharacters) {
#note: in current implementation removeSpecialCharacters only removes periods. This is mostly useful in ICD codes.
sourceDf <- sourceDf |>
dplyr::mutate(
conceptCodeSource = stringr::str_remove_all(string = conceptCodeSourceOriginal, pattern = stringr::fixed("."))
)

targetDf <- targetDf |>
dplyr::mutate(
conceptCodeOmop = stringr::str_remove_all(string = conceptCodeOmopOriginal, pattern = stringr::fixed("."))
)
}

codesWithConceptId <- fuzzyStringJoinDataFrame(
df1 = sourceDf,
df2 = targetDf,
field1 = "conceptCodeSource",
field2 = "conceptCodeOmop"
) |>
dplyr::distinct()

output <- c()

# find imperfect matches
output$approximateMatch <- codesWithConceptId |>
dplyr::filter(conceptCodeSource != conceptCodeOmop)
output$perfectMatch <- codesWithConceptId |>
dplyr::filter(conceptCodeSource == conceptCodeOmop)

output$approximateMatch <- output$approximateMatch |>
dplyr::anti_join(output$perfectMatch |>
dplyr::select(conceptCodeSource) |>
dplyr::distinct())

if (nrow(output$approximateMatch) > 0) {
message("There are codes without perfect match. Please look at approximateMatch in output.")
}

return(output)

}
85 changes: 32 additions & 53 deletions R/GetStandardMappingRecommendationsForNonStandard.R
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ getStandardMappingRecommendationsForNonStandard <- function(connectionDetails =
tempEmulationSchema = tempEmulationSchema
)

#check if the request is for a vocabulary Id that does not map to OMOP.
sourceVocabularyIdNotInOmop <- setdiff(sourceVocabularyId, omopVocabularyId$vocabularyId)

if (length(sourceVocabularyIdNotInOmop) > 1) {
Expand All @@ -71,6 +72,8 @@ getStandardMappingRecommendationsForNonStandard <- function(connectionDetails =
))
}


#download from remote vocabulary table a subset of omop concepts to map.
omopVocabularyToMatch <- DatabaseConnector::renderTranslateQuerySql(
connection = connection,
sql = "SELECT concept_id, concept_code, vocabulary_id
Expand All @@ -82,63 +85,34 @@ getStandardMappingRecommendationsForNonStandard <- function(connectionDetails =
tempEmulationSchema = tempEmulationSchema
) |>
dplyr::tibble()


sourceDf <- dplyr::tibble(conceptCodeSourceOriginal = sourceCodes) |>
dplyr::distinct() |>
dplyr::mutate(conceptCodeSource = conceptCodeSourceOriginal)

targetDf <- omopVocabularyToMatch |>
dplyr::rename(conceptCodeOmopOriginal = conceptCode) |>
dplyr::distinct() |>
dplyr::mutate(conceptCodeOmop = conceptCodeOmopOriginal)

if (removeSpecialCharacters) {
#note: in current implementation removeSpecialCharacters only removes periods. This is mostly useful in ICD codes.
sourceDf <- sourceDf |>
dplyr::mutate(
conceptCodeSource = stringr::str_remove_all(string = conceptCodeSourceOriginal, pattern = stringr::fixed("."))
)

targetDf <- targetDf |>
dplyr::mutate(
conceptCodeOmop = stringr::str_remove_all(string = conceptCodeOmopOriginal, pattern = stringr::fixed("."))
)
}

# #fuzzy string matching is slow. we also restrict by vocabularyId
codesWithConceptId <- fuzzyStringJoinDataFrame(
df1 = sourceDf,
df2 = targetDf,
field1 = "conceptCodeSource",
field2 = "conceptCodeOmop"
) |>
dplyr::distinct()

# find imperfect matches
approximateMatch <- codesWithConceptId |>
dplyr::filter(conceptCodeSource != conceptCodeOmop)
perfectMatch <- codesWithConceptId |>
dplyr::filter(conceptCodeSource == conceptCodeOmop)
#do the fuzzy match - but we expect to 100%match for this work.
#message will be shown if there are approximate match
fuzzyMatch <- getFuzzyMatchOfCodesToOmopConceptCode(
sourceCodes = sourceCodes,
omopConcepts = omopVocabularyToMatch,
removeSpecialCharacters = removeSpecialCharacters
)

approximateMatch <- approximateMatch |>
dplyr::anti_join(perfectMatch |>
dplyr::select(conceptCodeSource) |>
dplyr::distinct())
#only use perfect match moving forward, return all matches to user at codesWithConceptId. return all approximate
# matches as FYI. they are ignored moving forward.

output$approximateMatch <- approximateMatch
output$codesWithConceptId <- fuzzyMatch$perfectMatch
output$codesWithConceptIdApproximate <- fuzzyMatch$approximateMatch

if (nrow(output$approximateMatch) > 0) {
message("There are codes without perfect match. Please look at approximateMatch in output.")
}
#only use perfect match moving forward

# find standard mapping for non standard
mappedStandard <- ConceptSetDiagnostics::getMappedStandardConcepts(
conceptIds = codesWithConceptId$conceptId |> unique(),
conceptIds = output$codesWithConceptId$conceptId |> unique(),
connection = connection,
vocabularyDatabaseSchema = vocabularyDatabaseSchema,
tempEmulationSchema = tempEmulationSchema
)

output$unmappedCodes <- codesWithConceptId |>
#are there any source codes without mapping? there should not be
output$unmappedCodes <- output$codesWithConceptId |>
dplyr::anti_join(
mappedStandard |>
dplyr::select(givenConceptId) |>
Expand All @@ -150,16 +124,19 @@ getStandardMappingRecommendationsForNonStandard <- function(connectionDetails =
message("There are concept id without mapped standard. Please look at unmappedCodes.")
}

numberOfMappedStandardConceptsMappedToGivenSource <-
# a source concept may be mapped to more than one standard. count if that is occurring
numberOfMappedStandardConceptsMappedToGivenSourceDf <-
mappedStandard |>
dplyr::select(givenConceptId, conceptId) |>
dplyr::distinct() |>
dplyr::group_by(givenConceptId) |>
dplyr::summarise(numberOfMappedStandardConceptsMappedToGivenSource = n())

mappedStandard <- mappedStandard |>
dplyr::left_join(numberOfMappedStandardConceptsMappedToGivenSource, by = "givenConceptId")
dplyr::left_join(numberOfMappedStandardConceptsMappedToGivenSourceDf, by = "givenConceptId")


# get descendants of all standard concepts
descendantsOfStandardConcept <- ConceptSetDiagnostics::getConceptDescendant(
conceptIds = mappedStandard$conceptId,
connection = connection,
Expand All @@ -169,6 +146,7 @@ getStandardMappingRecommendationsForNonStandard <- function(connectionDetails =
dplyr::select(ancestorConceptId, descendantConceptId) |>
dplyr::distinct()

#get mapped concept for the standard esp descendants
mappedSource <- ConceptSetDiagnostics::getMappedSourceConcepts(
conceptIds = c(
mappedStandard$conceptId,
Expand All @@ -179,32 +157,33 @@ getStandardMappingRecommendationsForNonStandard <- function(connectionDetails =
tempEmulationSchema = tempEmulationSchema
)

#filter to desired vocabulary
mappedSourceFiltered <- mappedSource |>
dplyr::filter(vocabularyId %in% c(sourceVocabularyId)) |>
dplyr::left_join(
codesWithConceptId |>
output$codesWithConceptId |>
dplyr::select(conceptId) |>
dplyr::distinct() |>
dplyr::mutate(isInputConceptId = 1)
) |>
tidyr::replace_na(list(isInputConceptId = 0))

conceptIds <- c(codesWithConceptId$conceptId,
#find all concept id and get their detail
conceptIds <- c(output$codesWithConceptId$conceptId,
mappedStandard$conceptId,
mappedSource$conceptId) |>
unique()


output$conceptIdDetails <- ConceptSetDiagnostics::getConceptIdDetails(
conceptIds = conceptIds,
connection = connection,
vocabularyDatabaseSchema = vocabularyDatabaseSchema
) |>
dplyr::arrange(conceptId)


browser()


#this is the main output. it has the source and mapped standard
output$sourceMappedToStandard <- mappedStandard |>
dplyr::rename(sourceConceptId = givenConceptId, standardConceptId = conceptId) |>
dplyr::select(
Expand Down
27 changes: 27 additions & 0 deletions man/getFuzzyMatchOfCodesToOmopConceptCode.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 188cb28

Please sign in to comment.