From 0aa1bb86ca6a3669032fa8a42af4d1ce5fbbc22a Mon Sep 17 00:00:00 2001
From: Lukas Fuchs <86153843+f-lukas@users.noreply.github.com>
Date: Fri, 16 Aug 2024 09:09:21 +0200
Subject: [PATCH] fix cluster ids (#239)

* fix cluster ids

* extract logic into function

* fix name of df

---------

Co-authored-by: Antonia Runge <antonia.runge@inwt-statistics.de>
---
 DESCRIPTION                     |  2 +-
 NEWS.md                         |  5 +++++
 R/01-estimateMap.R              |  5 +++++
 R/01-predictNearestCluster.R    | 15 +++++++++++++++
 man/makeClusterIdsContinuous.Rd | 16 ++++++++++++++++
 5 files changed, 42 insertions(+), 1 deletion(-)
 create mode 100644 man/makeClusterIdsContinuous.Rd

diff --git a/DESCRIPTION b/DESCRIPTION
index 6f1aca6b..bd56ac4f 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: DSSM
 Title: Pandora & IsoMemo spatiotemporal modeling
-Version: 24.08.0
+Version: 24.08.1
 Authors@R: c(
             person("Marcus", "Gross", email = "marcus.gross@inwt-statistics.de", role = c("cre", "aut")),
             person("Antonia", "Runge", email = "antonia.runge@inwt-statistics.de", role = c("aut"))
diff --git a/NEWS.md b/NEWS.md
index 2deff5ff..3aef2c61 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,8 @@
+# DSSM 24.08.1
+
+## Bug Fixes
+- fixes cluster ids being non continuous in some cases (#238)
+
 # DSSM 24.08.0
 
 ## New Features
diff --git a/R/01-estimateMap.R b/R/01-estimateMap.R
index f67509e7..57c29499 100644
--- a/R/01-estimateMap.R
+++ b/R/01-estimateMap.R
@@ -2117,6 +2117,8 @@ estimateMapKernel <- function(data,
     cluster_centers$cluster <- 1:nrow(cluster_centers)
     data2 <- merge(data2, cluster_centers, sort = FALSE)
     colnames(data2)[colnames(data2)=="cluster"] <- "spatial_cluster"
+
+    data2$spatial_cluster <- data2$spatial_cluster %>% makeClusterIdsContinuous()
   }
   if(!is.null(Weighting) & !(Weighting == "")){
     model <- try(lapply(1:nSim, function(x){
@@ -2526,6 +2528,9 @@ estimateMap3DKernel <- function(data,
     clust$cluster <- 1:nrow(clust)
     data <- merge(data, clust, sort = FALSE)
     colnames(data)[colnames(data)=="cluster"] <- "temporal_group"
+
+    data$temporal_group <- data$temporal_group %>% makeClusterIdsContinuous()
+    data$spatial_cluster <- data$spatial_cluster %>% makeClusterIdsContinuous()
   }
   if ( class(model)[1] == "try-error") {return("Error in Model Fitting.")}
   sc <- NULL
diff --git a/R/01-predictNearestCluster.R b/R/01-predictNearestCluster.R
index 3642a4b2..0d0159ef 100644
--- a/R/01-predictNearestCluster.R
+++ b/R/01-predictNearestCluster.R
@@ -5,3 +5,18 @@ predictNearestCluster <- function(object, newdata){
   dist_mat <- dist_mat[-seq(n_centers), seq(n_centers)]
   max.col(-dist_mat)
 }
+
+#' Make cluster ids continuous
+#'
+#' When Mclust is used with e.g. 10 clusters, it can still happen that some clusters are empty
+#' In this case we would see a jump in cluster ids e.g. 1,2,5,...
+#' To prevent this, we change the cluster ids in the last step.
+#'
+#' @param column_with_ids A vector with cluster ids
+makeClusterIdsContinuous <- function(column_with_ids) {
+  if (length(column_with_ids) == 0) {
+    return(column_with_ids)
+  }
+
+  match(column_with_ids, sort(unique(column_with_ids)))
+}
diff --git a/man/makeClusterIdsContinuous.Rd b/man/makeClusterIdsContinuous.Rd
new file mode 100644
index 00000000..fc8b5ab1
--- /dev/null
+++ b/man/makeClusterIdsContinuous.Rd
@@ -0,0 +1,16 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/01-predictNearestCluster.R
+\name{makeClusterIdsContinuous}
+\alias{makeClusterIdsContinuous}
+\title{Make cluster ids continuous}
+\usage{
+makeClusterIdsContinuous(column_with_ids)
+}
+\arguments{
+\item{column_with_ids}{A vector with cluster ids}
+}
+\description{
+When Mclust is used with e.g. 10 clusters, it can still happen that some clusters are empty
+In this case we would see a jump in cluster ids e.g. 1,2,5,...
+To prevent this, we change the cluster ids in the last step.
+}