diff --git a/NAMESPACE b/NAMESPACE index ecf0beb..c0fe8d1 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -4,8 +4,10 @@ export(convertConceptSetDataFrameToExpression) export(convertConceptSetExpressionToDataFrame) export(extractConceptSetsInCohortDefinition) export(extractConceptSetsInCohortDefinitionSet) +export(findClosestMatch) export(findOrphanConcepts) export(findOrphanConceptsForConceptSetExpression) +export(fuzzyStringJoinDataFrame) export(getConceptAncestor) export(getConceptDescendant) export(getConceptIdDetails) @@ -25,6 +27,7 @@ export(getRecommendationForConceptSetExpression) export(getRecommendedSource) export(getRecommendedStandard) export(getRelationship) +export(getStandardMappingRecommendationsForNonStandard) export(getVocabulary) export(getVocabularyVersion) export(instantiateCohortFromConceptSetExpression) diff --git a/R/FindClosestMatch.R b/R/FindClosestMatch.R new file mode 100644 index 0000000..7354a0e --- /dev/null +++ b/R/FindClosestMatch.R @@ -0,0 +1,50 @@ +# Function to find closest match +#' @export +findClosestMatch <- function(sourceString, targetVector) { + distances <- stringdist::stringdist(sourceString, targetVector) + targetVector[which.min(distances)] +} + + +# Function to perform fuzzy string join on two data frames +#' @export +fuzzyStringJoinDataFrame <- function(df1, df2, field1, field2) { + library(tidyverse) + library(stringdist) + + # Calculate the closest matches using stringdist and include match score + findClosestMatch <- function(sourceString, targetVector) { + distances <- stringdist::stringdist(sourceString, targetVector) + minIndex <- which.min(distances) + score <- 1 - (distances[minIndex] / max(nchar(sourceString), nchar(targetVector[minIndex]))) + return(c(match = targetVector[minIndex], score = score)) + } + + # Map each entry in df1's field1 to the closest entry in df2's field2 and include scores + closestMatches <- dplyr::tibble(field1 = df1[[field1]]) |> + dplyr::rowwise() |> + dplyr::mutate( + closestField2 = findClosestMatch(field1, df2[[field2]])[1], + matchScore = as.numeric(findClosestMatch(field1, df2[[field2]])[2]) + ) |> + dplyr::ungroup() + + # Join the closest matches with df2 to retrieve all corresponding columns + mappedData <- closestMatches |> + dplyr::left_join(df2, by = c("closestField2" = field2)) + + # Join the mapped data back with df1 + finalResult <- mappedData |> + dplyr::right_join(df1, by = c("field1" = field1)) + + # Rename columns to maintain the original naming convention from df1 and df2 + finalResult <- finalResult |> + dplyr::rename_with(.cols = everything(), .fn = ~gsub("field1", field1, .)) |> + dplyr::rename_with(.cols = everything(), .fn = ~gsub("closestField2", field2, .)) + + finalResult <- df1 |> + dplyr::left_join(finalResult) + + return(finalResult) +} + diff --git a/R/GetStandardMappingRecommendationsForNonStandard.R b/R/GetStandardMappingRecommendationsForNonStandard.R new file mode 100644 index 0000000..f2372aa --- /dev/null +++ b/R/GetStandardMappingRecommendationsForNonStandard.R @@ -0,0 +1,295 @@ +# Copyright 2024 Observational Health Data Sciences and Informatics +# +# This file is part of ConceptSetDiagnostics +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + + +#' Get standard mapping recommendations for non standard. +#' +#' @description +#' Get standard mapping recommendations for non standard. +#' +#' @template Connection +#' +#' @template VocabularyDatabaseSchema +#' +#' @template TempEmulationSchema +#' +#' @return +#' Returns a list of objects (to be described.) +#' +#' @export +getStandardMappingRecommendationsForNonStandard <- function(connectionDetails = NULL, + connection = NULL, + vocabularyDatabaseSchema = cdmDatabaseSchema, + cdmDatabaseSchema = NULL, + tempEmulationSchema = getOption("sqlRenderTempEmulationSchema"), + sourceVocabularyId = c("ICD10CM"), + sourceCodes, + removeSpecialCharacters = TRUE) { + if (is.null(vocabularyDatabaseSchema)) { + stop("vocabularyDatabaseSchema cannot be NULL.") + } + + output <- c() + + vocabularyIdsToFilter <- quoteAndJoinArray(sourceVocabularyId |> unique()) + + if (is.null(connection)) { + connection <- DatabaseConnector::connect(connectionDetails) + on.exit( + DatabaseConnector::dropEmulatedTempTables(connection = connection, tempEmulationSchema = tempEmulationSchema) + ) + on.exit(DatabaseConnector::disconnect(connection), add = TRUE) + } + + # Retrieve vocabulary IDs from OMOP to ensure consistency. + omopVocabularyId <- ConceptSetDiagnostics::getVocabulary( + connection = connection, + vocabularyDatabaseSchema = vocabularyDatabaseSchema, + tempEmulationSchema = tempEmulationSchema + ) + + sourceVocabularyIdNotInOmop <- setdiff(sourceVocabularyId, omopVocabularyId$vocabularyId) + + if (length(sourceVocabularyIdNotInOmop) > 1) { + stop(paste0( + "The following sourceVocabularyId is not in OMOP. ", + paste0(sourceVocabularyIdNotInOmop, collapse = ", ") + )) + } + + omopVocabularyToMatch <- DatabaseConnector::renderTranslateQuerySql( + connection = connection, + sql = "SELECT concept_id, concept_code, vocabulary_id + FROM @vocabulary_database_schema.concept + WHERE vocabulary_id IN (@vocabulary_ids);", + snakeCaseToCamelCase = TRUE, + vocabulary_database_schema = vocabularyDatabaseSchema, + vocabulary_ids = vocabularyIdsToFilter, + tempEmulationSchema = tempEmulationSchema + ) |> + dplyr::tibble() + + sourceDf <- dplyr::tibble(conceptCodeSourceOriginal = sourceCodes) |> + dplyr::distinct() |> + dplyr::mutate(conceptCodeSource = conceptCodeSourceOriginal) + + targetDf <- omopVocabularyToMatch |> + dplyr::rename(conceptCodeOmopOriginal = conceptCode) |> + dplyr::distinct() |> + dplyr::mutate(conceptCodeOmop = conceptCodeOmopOriginal) + + if (removeSpecialCharacters) { + #note: in current implementation removeSpecialCharacters only removes periods. This is mostly useful in ICD codes. + sourceDf <- sourceDf |> + dplyr::mutate( + conceptCodeSource = stringr::str_remove_all(string = conceptCodeSourceOriginal, pattern = stringr::fixed(".")) + ) + + targetDf <- targetDf |> + dplyr::mutate( + conceptCodeOmop = stringr::str_remove_all(string = conceptCodeOmopOriginal, pattern = stringr::fixed(".")) + ) + } + + # #fuzzy string matching is slow. we also restrict by vocabularyId + codesWithConceptId <- fuzzyStringJoinDataFrame( + df1 = sourceDf, + df2 = targetDf, + field1 = "conceptCodeSource", + field2 = "conceptCodeOmop" + ) |> + dplyr::distinct() + + # find imperfect matches + approximateMatch <- codesWithConceptId |> + dplyr::filter(conceptCodeSource != conceptCodeOmop) + perfectMatch <- codesWithConceptId |> + dplyr::filter(conceptCodeSource == conceptCodeOmop) + + approximateMatch <- approximateMatch |> + dplyr::anti_join(perfectMatch |> + dplyr::select(conceptCodeSource) |> + dplyr::distinct()) + + output$approximateMatch <- approximateMatch + + if (nrow(output$approximateMatch) > 0) { + message("There are codes without perfect match. Please look at approximateMatch in output.") + } + + mappedStandard <- ConceptSetDiagnostics::getMappedStandardConcepts( + conceptIds = codesWithConceptId$conceptId |> unique(), + connection = connection, + vocabularyDatabaseSchema = vocabularyDatabaseSchema, + tempEmulationSchema = tempEmulationSchema + ) + + output$unmappedCodes <- codesWithConceptId |> + dplyr::anti_join( + mappedStandard |> + dplyr::select(givenConceptId) |> + dplyr::rename(conceptId = givenConceptId) |> + dplyr::distinct() + ) + + if (nrow(output$unmappedCodes) > 0) { + message("There are concept id without mapped standard. Please look at unmappedCodes.") + } + + numberOfMappedStandardConceptsMappedToGivenSource <- + mappedStandard |> + dplyr::select(givenConceptId, conceptId) |> + dplyr::distinct() |> + dplyr::group_by(givenConceptId) |> + dplyr::summarise(numberOfMappedStandardConceptsMappedToGivenSource = n()) + + mappedStandard <- mappedStandard |> + dplyr::left_join(numberOfMappedStandardConceptsMappedToGivenSource, by = "givenConceptId") + + descendantsOfStandardConcept <- ConceptSetDiagnostics::getConceptDescendant( + conceptIds = mappedStandard$conceptId, + connection = connection, + vocabularyDatabaseSchema = vocabularyDatabaseSchema + ) |> + dplyr::filter(minLevelsOfSeparation > 0) |> + dplyr::select(ancestorConceptId, descendantConceptId) |> + dplyr::distinct() + + mappedSource <- ConceptSetDiagnostics::getMappedSourceConcepts( + conceptIds = c( + mappedStandard$conceptId, + descendantsOfStandardConcept$descendantConceptId + ) |> unique(), + connection = connection, + vocabularyDatabaseSchema = vocabularyDatabaseSchema, + tempEmulationSchema = tempEmulationSchema + ) + + mappedSourceFiltered <- mappedSource |> + dplyr::filter(vocabularyId %in% c(sourceVocabularyId)) |> + dplyr::left_join( + codesWithConceptId |> + dplyr::select(conceptId) |> + dplyr::distinct() |> + dplyr::mutate(isInputConceptId = 1) + ) |> + tidyr::replace_na(list(isInputConceptId = 0)) + + conceptIds <- c(codesWithConceptId$conceptId, + mappedStandard$conceptId, + mappedSource$conceptId) |> + unique() + + + output$conceptIdDetails <- ConceptSetDiagnostics::getConceptIdDetails( + conceptIds = conceptIds, + connection = connection, + vocabularyDatabaseSchema = vocabularyDatabaseSchema + ) |> + dplyr::arrange(conceptId) + + + browser() + + output$sourceMappedToStandard <- mappedStandard |> + dplyr::rename(sourceConceptId = givenConceptId, standardConceptId = conceptId) |> + dplyr::select( + sourceConceptId, + standardConceptId, + numberOfMappedStandardConceptsMappedToGivenSource + ) |> + dplyr::left_join( + output$conceptIdDetails |> + dplyr::rename( + sourceConceptId = conceptId, + sourceConceptName = conceptName, + sourceConceptCode = conceptCode, + sourceVocabularyId = vocabularyId + ) |> + dplyr::select(dplyr::starts_with("source")), + by = "sourceConceptId" + ) |> + dplyr::left_join( + output$conceptIdDetails |> + dplyr::rename( + standardConceptId = conceptId, + standardConceptName = conceptName, + standardConceptCode = conceptCode, + standardVocabularyId = vocabularyId + ) |> + dplyr::select(dplyr::starts_with("standard")), + by = "standardConceptId" + ) |> + dplyr::relocate(dplyr::starts_with("source"), + dplyr::starts_with("standard")) |> + dplyr::arrange(sourceConceptId) |> + dplyr::mutate( + stringDistanceSourceToStandard = stringdist::stringdist(sourceConceptName, standardConceptName, method = "lcs") + ) |> + dplyr::left_join( + mappedSourceFiltered |> + dplyr::rename(standardConceptId = givenConceptId) |> + dplyr::group_by(standardConceptId) |> + dplyr::summarise(mappedNonStandardDirect = n()), + by = "standardConceptId" + ) |> + dplyr::left_join( + mappedSourceFiltered |> + dplyr::rename(standardConceptId = givenConceptId) |> + dplyr::group_by(standardConceptId) |> + dplyr::summarise(mappedNonStandardDirectCodes = paste0(conceptCode, collapse = "\n")), + by = "standardConceptId" + ) |> + dplyr::left_join( + mappedSourceFiltered |> + dplyr::rename(standardConceptId = givenConceptId) |> + dplyr::group_by(standardConceptId) |> + dplyr::summarise(mappedNonStandardDirectNames = paste0(conceptName, collapse = "\n")), + by = "standardConceptId" + ) |> + dplyr::left_join( + mappedSourceFiltered |> + dplyr::rename(standardConceptId = givenConceptId) |> + dplyr::filter(isInputConceptId == 0) |> + dplyr::group_by(standardConceptId) |> + dplyr::summarise(mappedNonStandardDirectNotInInput = n()), + by = "standardConceptId" + ) |> + dplyr::left_join( + mappedSourceFiltered |> + dplyr::rename(standardConceptId = givenConceptId) |> + dplyr::filter(isInputConceptId == 0) |> + dplyr::group_by(standardConceptId) |> + dplyr::summarise( + mappedNonStandardDirectCodesNotInInput = paste0(conceptCode, collapse = "\n") + ), + by = "standardConceptId" + ) |> + dplyr::left_join( + mappedSourceFiltered |> + dplyr::rename(standardConceptId = givenConceptId) |> + dplyr::filter(isInputConceptId == 0) |> + dplyr::group_by(standardConceptId) |> + dplyr::summarise( + mappedNonStandardDirectNamesNotInInput = paste0(conceptName, collapse = "\n") + ), + by = "standardConceptId" + ) + + return(output) + +} diff --git a/R/Private.R b/R/Private.R index 2bb7a03..613e743 100644 --- a/R/Private.R +++ b/R/Private.R @@ -15,6 +15,9 @@ # limitations under the License. # +quoteAndJoinArray <- function(stringArray) { + return(paste0(paste0("'", stringArray, "'"), collapse = ", ")) +} checkIfCohortDefinitionSet <- function(cohortDefinitionSet) { errorMessage <- checkmate::makeAssertCollection() diff --git a/man/getConceptRecordCount.Rd b/man/getConceptRecordCount.Rd index d042636..8209572 100644 --- a/man/getConceptRecordCount.Rd +++ b/man/getConceptRecordCount.Rd @@ -73,10 +73,6 @@ tables, provide a schema with write privileges where temp tables can be created. \item{domainTableName}{Vector of strings Domains to look for concept IDs. Supported domains include "drug_exposure", "condition_occurrence", "procedure_occurrence", "measurement", "observation".} - -\item{domain}{domains to look for concept id} - -\item{limitToCohort}{Do you want to limit to a cohort_definition_id?} } \value{ Returns a tibble data frame. diff --git a/man/getStandardMappingRecommendationsForNonStandard.Rd b/man/getStandardMappingRecommendationsForNonStandard.Rd new file mode 100644 index 0000000..4c6c08a --- /dev/null +++ b/man/getStandardMappingRecommendationsForNonStandard.Rd @@ -0,0 +1,41 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in +% R/GetStandardMappingRecommendationsForNonStandard.R +\name{getStandardMappingRecommendationsForNonStandard} +\alias{getStandardMappingRecommendationsForNonStandard} +\title{Get standard mapping recommendations for non standard.} +\usage{ +getStandardMappingRecommendationsForNonStandard( + connectionDetails = NULL, + connection = NULL, + vocabularyDatabaseSchema = cdmDatabaseSchema, + cdmDatabaseSchema = NULL, + tempEmulationSchema = getOption("sqlRenderTempEmulationSchema"), + sourceVocabularyId = c("ICD10CM"), + sourceCodes, + removeSpecialCharacters = TRUE +) +} +\arguments{ +\item{connectionDetails}{An object of type \code{connectionDetails} as created using the +\code{\link[DatabaseConnector]{createConnectionDetails}} function in the +DatabaseConnector package. Can be left NULL if \code{connection} is +provided.} + +\item{connection}{An object of type \code{connection} as created using the +\code{\link[DatabaseConnector]{connect}} function in the +DatabaseConnector package. Can be left NULL if \code{connectionDetails} +is provided, in which case a new connection will be opened at the start +of the function, and closed when the function finishes.} + +\item{vocabularyDatabaseSchema}{The schema name of containing the vocabulary tables.} + +\item{tempEmulationSchema}{Some database platforms like Oracle and Impala do not truly support temp tables. To emulate temp +tables, provide a schema with write privileges where temp tables can be created.} +} +\value{ +Returns a list of objects (to be described.) +} +\description{ +Get standard mapping recommendations for non standard. +}