DSSM 24.08.5: Feature/204 tclust (#244)
* tclust wip

* adjust tclust inputs

* add tclust as clustering method

* fix check

* Feature/204 tclust review (#246)

* refactor duplicated code

* do clustering (on original data and after modelling) exactly as for kernelTimeR

* fix export of data

* remove duplicated code

* update news.md

* update namespace

---------

Co-authored-by: Antonia Runge <antonia.runge@inwt-statistics.de>
f-lukas and arunge authored Sep 3, 2024
1 parent 61db709 commit afe8317
Showing 14 changed files with 291 additions and 134 deletions.
3 changes: 2 additions & 1 deletion DESCRIPTION
@@ -1,6 +1,6 @@
Package: DSSM
Title: Pandora & IsoMemo spatiotemporal modeling
Version: 24.08.4
Version: 24.08.5
Authors@R: c(
person("Marcus", "Gross", email = "marcus.gross@inwt-statistics.de", role = c("cre", "aut")),
person("Antonia", "Runge", email = "antonia.runge@inwt-statistics.de", role = c("aut"))
@@ -52,6 +52,7 @@ Imports:
sf,
sp (>= 1.2.7),
stringi (>= 1.1.7),
tclust,
webshot,
yaml,
zip
2 changes: 2 additions & 0 deletions NAMESPACE
@@ -90,6 +90,7 @@ importFrom(coda,raftery.diag)
importFrom(colourpicker,colourInput)
importFrom(dplyr,"%>%")
importFrom(dplyr,arrange)
importFrom(dplyr,desc)
importFrom(dplyr,distinct)
importFrom(dplyr,filter)
importFrom(dplyr,group_by)
@@ -273,6 +274,7 @@ importFrom(stats,runif)
importFrom(stats,sd)
importFrom(stats,var)
importFrom(stringi,stri_escape_unicode)
importFrom(tclust,tclust)
importFrom(utils,available.packages)
importFrom(utils,capture.output)
importFrom(utils,compareVersion)
8 changes: 8 additions & 0 deletions NEWS.md
@@ -1,3 +1,11 @@
# DSSM 24.08.5

## New Features
- Adds tclust as clustering method in KernelR and KernelTimeR (#204)

## Bug Fixes
- solves issue with the plotting and export of cluster data in KernelR

# DSSM 24.08.4

## New Features
3 changes: 2 additions & 1 deletion R/00-Namespace.R
@@ -7,7 +7,7 @@
#' fillIsoData handleDescription has_internet importDataUI importDataServer importUI importServer
#' importOptions prefixSysTime previewDataUI previewDataServer tryCatchWithWarningsAndErrors
#' updateNameEntryIfDuplicate
#' @importFrom dplyr "%>%" arrange distinct filter group_by group_by_at left_join select summarise
#' @importFrom dplyr "%>%" arrange desc distinct filter group_by group_by_at left_join select summarise
#' ungroup
#' @importFrom DT datatable renderDataTable dataTableOutput
#' @importFrom elevatr get_elev_point
@@ -49,6 +49,7 @@
#' qnorm quantile rbeta residuals rgamma rnorm runif sd var
#' @importFrom sp point.in.polygon plot SpatialPoints SpatialPixelsDataFrame
#' @importFrom stringi stri_escape_unicode
#' @importFrom tclust tclust
#' @importFrom utils available.packages compareVersion install.packages head zip
#' packageVersion read.csv read.csv2 write.table installed.packages capture.output
#' @importFrom webshot is_phantomjs_installed
163 changes: 92 additions & 71 deletions R/01-estimateMap.R
@@ -1992,6 +1992,7 @@ invLogit <- function(x){
#' @param Weighting character: name of weighting variable
#' @param clusterMethod character: cluster method
#' @param kMeansAlgo character: kmeans algorithm as in stats:kmeans
#' @param trimRatio numeric: proportion of observations to be trimmed by tclust
#' @param nClust numeric: how many clusters
#' @param nClustRange numeric: range of potential mclust cluster
#' @param restriction numeric vector: spatially restricts model data 4 entries for latitude (min/max) and longitude(min/max)
@@ -2024,6 +2025,7 @@ estimateMapKernel <- function(data,
nClust = 5,
nClustRange = c(2,10),
kMeansAlgo = "Hartigan-Wong",
trimRatio = 0.05,
restriction = c(-90, 90, -180, 180),
nSim = 10,
kdeType = "1"){
@@ -2082,39 +2084,6 @@ estimateMapKernel <- function(data,

# calculate model ----
set.seed(1234)
if(clusterMethod == "kmeans"){
clust <- kmeans(cbind(data2$Longitude, data2$Latitude), nClust, nstart = 25, algorithm = kMeansAlgo)
data2$cluster <- clust$cluster
clust <- as.data.frame(clust$centers)
names(clust) <- c("long_centroid_spatial_cluster", "lat_centroid_spatial_cluster")
clust$cluster <- 1:nrow(clust)
data2 <- merge(data2, clust, sort = FALSE)
colnames(data2)[colnames(data2)=="cluster"] <- "spatial_cluster"
} else if (clusterMethod == "mclust"){

numClusters <- seq(nClustRange[1],nClustRange[2])
cluster_list <- vector("list", length(numClusters))
for(i in 1:length(numClusters)){
set.seed(1234)
cluster_list[[i]] <- mclust::Mclust(data2[,c("Longitude","Latitude")], G = numClusters[i])
}

# select best cluster solution based on bic
cluster_solution <- cluster_list[[which.max(sapply(1:length(cluster_list),
function(x) cluster_list[[x]]$bic))]]

# assign cluster to data
data2$cluster <- cluster_solution$classification

# merge cluster centers
cluster_centers <- data.frame(t(cluster_solution$parameters$mean))
colnames(cluster_centers) <- c("long_centroid_spatial_cluster", "lat_centroid_spatial_cluster")
cluster_centers$cluster <- 1:nrow(cluster_centers)
data2 <- merge(data2, cluster_centers, sort = FALSE)
colnames(data2)[colnames(data2)=="cluster"] <- "spatial_cluster"

data2$spatial_cluster <- data2$spatial_cluster %>% makeClusterIdsContinuous()
}
if(!is.null(Weighting) & !(Weighting == "")){
model <- try(lapply(1:nSim, function(x){
data3 <- data2[sample(1:nrow(data2), nrow(data2), replace = T), ]
@@ -2152,6 +2121,48 @@ estimateMapKernel <- function(data,
if ( class(model)[1] == "try-error") {return("Error in Model Fitting.")}
sc <- NULL
class(model) <- c(class(model), "kde")

# cluster data ----
## cluster on original data
if(clusterMethod == "kmeans"){
clust <- kmeans(cbind(data$Longitude, data$Latitude), nClust, nstart = 25, algorithm = kMeansAlgo)

data <- data %>%
addCentroids(cluster = clust$cluster,
centers = clust$centers)

colnames(data)[colnames(data)=="cluster"] <- "spatial_cluster"
} else if (clusterMethod == "mclust"){
numClusters <- seq(nClustRange[1],nClustRange[2])
cluster_list <- vector("list", length(numClusters))
for(i in 1:length(numClusters)){
set.seed(1234)
cluster_list[[i]] <- mclust::Mclust(data[,c("Longitude","Latitude")], G = numClusters[i])
}

# select best cluster solution based on bic
cluster_solution <- cluster_list[[which.max(sapply(1:length(cluster_list),
function(x) cluster_list[[x]]$bic))]]

# merge cluster centers
data <- data %>%
addCentroids(cluster = cluster_solution$classification,
centers = t(cluster_solution$parameters$mean))

colnames(data)[colnames(data) == "cluster"] <- "spatial_cluster"
data$spatial_cluster <- data$spatial_cluster %>% makeClusterIdsContinuous()
} else if (clusterMethod == "tclust") {
cluster_solution <- tclust(data[,c("Longitude","Latitude")], k = nClust, alpha = trimRatio)

# merge cluster centers
data <- data %>%
addCentroids(cluster = cluster_solution$cluster,
centers = t(cluster_solution$centers))

colnames(data)[colnames(data) == "cluster"] <- "spatial_cluster"
data$spatial_cluster <- data$spatial_cluster %>% makeClusterIdsContinuous()
}

return(list(model = model, data = data, sc = sc, independent = independent))
}
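
For readers new to trimmed clustering, here is a minimal standalone sketch of what the new clusterMethod == "tclust" branch does, assuming the tclust package's tclust(x, k, alpha) interface used above; the coordinates and k are purely illustrative and not part of the commit.

# Sketch only: trimmed clustering on coordinates, mirroring the new tclust branch.
library(tclust)

coords <- data.frame(
  Longitude = c(10.1, 10.3, 11.0, 24.5, 24.9, 25.2, 60.0),  # toy values
  Latitude  = c(50.2, 50.4, 50.9, 38.1, 38.4, 38.0, 10.0)
)

# alpha plays the role of trimRatio: the share of observations left unassigned
cluster_solution <- tclust(coords[, c("Longitude", "Latitude")], k = 2, alpha = 0.05)

coords$spatial_cluster <- cluster_solution$cluster   # 0 marks trimmed points
centers <- t(cluster_solution$centers)               # one row of coordinates per cluster
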

@@ -2193,6 +2204,7 @@ estimateMapKernelWrapper <- function(data, input) {
#' @param nClust numeric: how many clusters
#' @param nClustRange numeric: range of potential mclust cluster
#' @param kMeansAlgo character: kmeans algorithm as in stats:kmeans
#' @param trimRatio numeric: proportion of observations to be trimmed by tclust
#' @param clusterTimeRange numeric vector: time range of cluster
#' @param modelUnc boolean: Include dating uncertainty
#' @param dateUnc character: one of "uniform", "normal", "point"
@@ -2225,6 +2237,7 @@ estimateMap3DKernel <- function(data,
nClust = 5,
nClustRange = c(2,10),
kMeansAlgo = "Hartigan-Wong",
trimRatio = 0.05,
clusterTimeRange = c(0,1000),
modelUnc = FALSE,
dateUnc = "point",
@@ -2387,6 +2400,11 @@ estimateMap3DKernel <- function(data,
}
kde(cbind(data3$Longitude, data3$Latitude, data3$Date2), H = H)}), silent = TRUE)
}

if (class(model)[1] == "try-error") {return("Error in Model Fitting.")}
sc <- NULL
class(model) <- c(class(model), "kde")

if(clusterMethod == "kmeans"){
# K-Means Clustering ----
# discussion about the clustering here: https://github.com/Pandora-IsoMemo/iso-app/issues/54
@@ -2418,15 +2436,13 @@ estimateMap3DKernel <- function(data,
# data$cluster <- NULL

## Filtered data ----
dataC$cluster <- clust$cluster
clust_centroid <- data.frame(cluster=1:nrow(clust$centers),clust$centers)
names(clust_centroid) <- c("cluster","long_centroid_spatial_cluster","lat_centroid_spatial_cluster")
dataC <- merge(dataC, clust_centroid, by = "cluster", sort = FALSE)
data <- data %>% left_join(dataC[,c("id","cluster","long_centroid_spatial_cluster","lat_centroid_spatial_cluster")], by = "id")
data$id <- NULL
colnames(data)[colnames(data)=="cluster"] <- "spatial_cluster"
dataC <- dataC %>%
addCentroids(cluster = clust$cluster,
centers = clust$centers)

data <- data %>% joinCentroidData(dataC)

dataC$cluster <- NULL
dataC <- dataC[order(dataC$id),]

## Optimal Centroids ----
clustDens <- sapply(1:nrow(dataC), function(z) {rowMeans(sapply(1:nSim, function(k) predict(model[[k]], x = cbind(dataC[rep(z, 100), c("Longitude", "Latitude")],
@@ -2447,27 +2463,28 @@ estimateMap3DKernel <- function(data,
function(x) which.min(rowSums((data[rep(x, nClust), c("Longitude", "Latitude")] -
as.matrix(clusterCentroids))^2)))

clust <- clusterCentroids
names(clust) <- c("long_temporal_group_reference_point", "lat_temporal_group_reference_point")
clust$cluster <- 1:nrow(clust)
data <- merge(data, clust, sort = FALSE)
colnames(data)[colnames(data)=="cluster"] <- "temporal_group"
} else if (clusterMethod == "mclust"){
# MCLUST Clustering ----
data <- data %>%
addCentroids(cluster = data$cluster,
centers = clusterCentroids,
type = c("temporal_group_reference_point"))

colnames(data)[colnames(data) == "cluster"] <- "temporal_group"
} else if (clusterMethod %in% c("mclust","tclust")){
# MCLUST & TCLUST Clustering ----
data$id <- 1:nrow(data)

# Clustering on filtered data
dataC <- data[((data$Date - 2*data$Uncertainty) <= clusterTimeRange[2] & (data$Date - 2*data$Uncertainty) >= clusterTimeRange[1]) |
((data$Date + 2*data$Uncertainty) <= clusterTimeRange[2] & (data$Date + 2*data$Uncertainty) >= clusterTimeRange[1]) |
((data$Date) <= clusterTimeRange[2] & (data$Date) >= clusterTimeRange[1]), ]

if(clusterMethod == "mclust"){
numClusters <- seq(nClustRange[1],nClustRange[2])
cluster_list <- vector("list", length(numClusters))
for(i in 1:length(numClusters)){
set.seed(1234)
cluster_list[[i]] <- mclust::Mclust(dataC[,c("Longitude","Latitude")], G = numClusters[i])
}

# select best cluster solution based on bic
best_solution_idx <- which.max(sapply(1:length(cluster_list),function(x) cluster_list[[x]]$bic))
best_solution_cluster <- numClusters[[best_solution_idx]]
@@ -2486,50 +2503,54 @@ estimateMap3DKernel <- function(data,
# data$cluster <- NULL

## Filtered data ----
dataC$cluster <- cluster_solution$classification
clust_centroid <- data.frame(cluster=1:nrow(t(cluster_solution$parameters$mean)),t(cluster_solution$parameters$mean))
names(clust_centroid) <- c("cluster","long_centroid_spatial_cluster","lat_centroid_spatial_cluster")
dataC <- merge(dataC, clust_centroid, by = "cluster", sort = FALSE)
data <- data %>% left_join(dataC[,c("id","cluster","long_centroid_spatial_cluster","lat_centroid_spatial_cluster")], by = "id")
data$id <- NULL
colnames(data)[colnames(data)=="cluster"] <- "spatial_cluster"
dataC$cluster <- NULL
dataC <- dataC[order(dataC$id),]
dataC <- dataC %>%
addCentroids(cluster = cluster_solution$classification,
centers = t(cluster_solution$parameters$mean))

data <- data %>% joinCentroidData(dataC)
} else {
# tclust
cluster_solution <- tclust(dataC[,c("Longitude","Latitude")], k = nClust, alpha = trimRatio)

## Filtered data ----
dataC <- dataC %>%
addCentroids(cluster = cluster_solution$cluster,
centers = t(cluster_solution$centers))

data <- data %>% joinCentroidData(dataC)
}

## optimal centroids: ----
clustDens <- sapply(1:nrow(dataC), function(z) {rowMeans(sapply(1:nSim, function(k) predict(model[[k]], x = cbind(dataC[rep(z, 100), c("Longitude", "Latitude")],
Date2 = (seq(clusterTimeRange[1], clusterTimeRange[2],
length.out = 100) - mean(data$Date)) / (sd(data$Date))))))})
# assign cluster to data
dataC$cluster <- cluster_solution$classification

densM <- colMeans(clustDens)
densSD <- apply(clustDens, 2, sd)
densQ <- densM / densSD

clusterCentroids <- do.call("rbind", (lapply(1:best_solution_cluster, function(j){
clusterCentroids <- do.call("rbind", (lapply(1:max(dataC$cluster), function(j){
dataC[dataC$cluster == j, ][which.max(densQ[dataC$cluster == j]), c("Longitude", "Latitude")]
})))

data$cluster <- sapply(1:nrow(data),
function(x) which.min(rowSums((data[rep(x, best_solution_cluster), c("Longitude", "Latitude")] -
function(x) which.min(rowSums((data[rep(x, max(dataC$cluster)), c("Longitude", "Latitude")] -
as.matrix(clusterCentroids))^2)))
if(length(unique(data$cluster)) < length(unique(dataC$cluster))){
showNotification(paste0("Note: mclust selected ",length(unique(dataC$cluster))," cluster. However the temporal algorithm assigned all data to only ",length(unique(data$cluster))," of these clusters."))
showNotification(paste0("Note: mclust/tclust selected ",length(unique(dataC$cluster))," cluster. However the temporal algorithm assigned all data to only ",length(unique(data$cluster))," of these clusters."))
}

clust <- clusterCentroids
names(clust) <- c("long_temporal_group_reference_point", "lat_temporal_group_reference_point")
clust$cluster <- 1:nrow(clust)
data <- merge(data, clust, sort = FALSE)
colnames(data)[colnames(data)=="cluster"] <- "temporal_group"
data <- data %>%
addCentroids(cluster = data$cluster,
centers = clusterCentroids,
type = c("temporal_group_reference_point"))

colnames(data)[colnames(data) == "cluster"] <- "temporal_group"

data$temporal_group <- data$temporal_group %>% makeClusterIdsContinuous()
data$spatial_cluster <- data$spatial_cluster %>% makeClusterIdsContinuous()
}
if ( class(model)[1] == "try-error") {return("Error in Model Fitting.")}
sc <- NULL
class(model) <- c(class(model), "kde")

return(list(model = model, data = data, sc = sc, independent = independent))
}
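
The "optimal centroids" step above picks, within each cluster, the observation whose simulation-averaged kernel density is both high and stable over the cluster time range (mean divided by SD across the time grid). A simplified sketch of that selection follows; it assumes clustDens, dataC and data exist as built in the surrounding code (rows of clustDens are the 100 time-grid points, columns are the observations in dataC), so it is a restatement for clarity, not new behaviour.

# Sketch of the reference-point selection shown in the diff above.
densM  <- colMeans(clustDens)        # mean density of each observation over the time range
densSD <- apply(clustDens, 2, sd)    # variability of that density across the time grid
densQ  <- densM / densSD             # favour points that are dense AND temporally stable

# per cluster, keep the coordinates of the best-scoring observation
clusterCentroids <- do.call("rbind", lapply(seq_len(max(dataC$cluster)), function(j) {
  inCluster <- dataC$cluster == j
  dataC[inCluster, ][which.max(densQ[inCluster]), c("Longitude", "Latitude")]
}))

# every row of the full data set is then assigned to its nearest reference point
data$temporal_group <- apply(as.matrix(data[, c("Longitude", "Latitude")]), 1, function(p) {
  which.min(colSums((t(as.matrix(clusterCentroids)) - p)^2))
})
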

38 changes: 38 additions & 0 deletions R/01-predictNearestCluster.R
@@ -20,3 +20,41 @@ makeClusterIdsContinuous <- function(column_with_ids) {

match(column_with_ids, sort(unique(column_with_ids)))
}
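
The existing helper above simply remaps arbitrary cluster ids onto a gap-free 1..k sequence while preserving their order. Reconstructed from the visible lines, its behaviour looks like this:

makeClusterIdsContinuous <- function(column_with_ids) {
  # map each id to its rank among the distinct ids, so e.g. ids 2/4/9 become 1/2/3
  match(column_with_ids, sort(unique(column_with_ids)))
}

makeClusterIdsContinuous(c(4, 9, 4, 2, 9))
#> [1] 2 3 2 1 3
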

#' Add centroid of clusters
#'
#' @param dataC Filtered data
#' @param cluster Cluster ids
#' @param centers Cluster centers
#' @param type Type of centroid
#'
#' @return Data with centroids
addCentroids <- function(dataC, cluster, centers, type = c("centroid_spatial_cluster", "temporal_group_reference_point")) {
type <- match.arg(type)

clust_centroid = data.frame(cluster = 1:nrow(centers), centers)
names(clust_centroid) <- c("cluster", sprintf("long_%s", type), sprintf("lat_%s", type))

dataC$cluster <- cluster
dataC <- merge(dataC, clust_centroid, by = "cluster", sort = FALSE)

if (!is.null(dataC$id)) {
dataC <- dataC[order(dataC$id),]
}

dataC
}
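
A small usage example for the new helper, with toy coordinates; the centers matrix has one row per cluster, as returned by kmeans()$centers or t(tclust()$centers):

pts <- data.frame(
  id = 1:4,
  Longitude = c(10, 11, 25, 26),
  Latitude  = c(50, 51, 38, 39)
)
centers <- rbind(c(10.5, 50.5), c(25.5, 38.5))  # cluster centres: (long, lat) per row

withCentroids <- addCentroids(pts, cluster = c(1, 1, 2, 2), centers = centers)
# adds a cluster column plus long_centroid_spatial_cluster / lat_centroid_spatial_cluster,
# and reorders the rows by id because pts carries an id column
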

#' Join centroid data to data
#'
#' @param data Data
#' @param dataC Filtered data with centroid
#'
#' @return Data with centroid
joinCentroidData <- function(data, dataC) {
data <- data %>% left_join(dataC[,c("id", "cluster", "long_centroid_spatial_cluster", "lat_centroid_spatial_cluster")], by = "id")
data$id <- NULL
colnames(data)[colnames(data) == "cluster"] <- "spatial_cluster"

data
}
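
The two helpers are intended to be used together: cluster a (time-)filtered subset, then join the resulting centroid columns back onto the full data by id. A hedged sketch of that flow, with kmeans standing in for any of the three methods; data and clusterTimeRange are assumed to exist, and dplyr must be attached for the left_join inside joinCentroidData:

data$id <- seq_len(nrow(data))                              # id is required for the join
dataC <- data[data$Date >= clusterTimeRange[1] &
                data$Date <= clusterTimeRange[2], ]         # simplified time filter

km <- kmeans(dataC[, c("Longitude", "Latitude")], centers = 3, nstart = 25)
dataC <- addCentroids(dataC, cluster = km$cluster, centers = km$centers)

data <- joinCentroidData(data, dataC)                       # renames cluster to spatial_cluster, drops id
# rows that fell outside the time filter keep NA in the new spatial_cluster columns
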