DSSM 24.08.5: Feature/204 tclust (#244)
* tclust wip

* adjust tclust inputs

* add tclust as clustering method

* fix check

* Feature/204 tclust review (#246)

* refactor duplicated code

* do clustering (on original data and after modelling) exactly as for kernelTimeR

* fix export of data

* remove duplicated code

* update news.md

* update namespace

---------

Co-authored-by: Antonia Runge <antonia.runge@inwt-statistics.de>
f-lukas and arunge authored Sep 3, 2024
1 parent 61db709 commit afe8317
Showing 14 changed files with 291 additions and 134 deletions.
3 changes: 2 additions & 1 deletion DESCRIPTION
@@ -1,6 +1,6 @@
Package: DSSM
Title: Pandora & IsoMemo spatiotemporal modeling
Version: 24.08.4
Version: 24.08.5
Authors@R: c(
person("Marcus", "Gross", email = "marcus.gross@inwt-statistics.de", role = c("cre", "aut")),
person("Antonia", "Runge", email = "antonia.runge@inwt-statistics.de", role = c("aut"))
@@ -52,6 +52,7 @@ Imports:
sf,
sp (>= 1.2.7),
stringi (>= 1.1.7),
tclust,
webshot,
yaml,
zip
2 changes: 2 additions & 0 deletions NAMESPACE
@@ -90,6 +90,7 @@ importFrom(coda,raftery.diag)
importFrom(colourpicker,colourInput)
importFrom(dplyr,"%>%")
importFrom(dplyr,arrange)
importFrom(dplyr,desc)
importFrom(dplyr,distinct)
importFrom(dplyr,filter)
importFrom(dplyr,group_by)
@@ -273,6 +274,7 @@ importFrom(stats,runif)
importFrom(stats,sd)
importFrom(stats,var)
importFrom(stringi,stri_escape_unicode)
importFrom(tclust,tclust)
importFrom(utils,available.packages)
importFrom(utils,capture.output)
importFrom(utils,compareVersion)
8 changes: 8 additions & 0 deletions NEWS.md
@@ -1,3 +1,11 @@
# DSSM 24.08.5

## New Features
- Adds tclust as clustering method in KernelR and KernelTimeR (#204)

## Bug Fixes
- solves issue with the plotting and export of cluster data in KernelR

# DSSM 24.08.4

## New Features
3 changes: 2 additions & 1 deletion R/00-Namespace.R
@@ -7,7 +7,7 @@
#' fillIsoData handleDescription has_internet importDataUI importDataServer importUI importServer
#' importOptions prefixSysTime previewDataUI previewDataServer tryCatchWithWarningsAndErrors
#' updateNameEntryIfDuplicate
#' @importFrom dplyr "%>%" arrange distinct filter group_by group_by_at left_join select summarise
#' @importFrom dplyr "%>%" arrange desc distinct filter group_by group_by_at left_join select summarise
#' ungroup
#' @importFrom DT datatable renderDataTable dataTableOutput
#' @importFrom elevatr get_elev_point
@@ -49,6 +49,7 @@
#' qnorm quantile rbeta residuals rgamma rnorm runif sd var
#' @importFrom sp point.in.polygon plot SpatialPoints SpatialPixelsDataFrame
#' @importFrom stringi stri_escape_unicode
#' @importFrom tclust tclust
#' @importFrom utils available.packages compareVersion install.packages head zip
#' packageVersion read.csv read.csv2 write.table installed.packages capture.output
#' @importFrom webshot is_phantomjs_installed
163 changes: 92 additions & 71 deletions R/01-estimateMap.R
@@ -1992,6 +1992,7 @@ invLogit <- function(x){
#' @param Weighting character: name of weighting variable
#' @param clusterMethod character: cluster method
#' @param kMeansAlgo character: kmeans algorithm as in stats:kmeans
#' @param trimRatio numeric: proportion of observations to be trimmed by tclust
#' @param nClust numeric: how many clusters
#' @param nClustRange numeric: range of potential mclust cluster
#' @param restriction numeric vector: spatially restricts model data 4 entries for latitude (min/max) and longitude(min/max)
@@ -2024,6 +2025,7 @@ estimateMapKernel <- function(data,
nClust = 5,
nClustRange = c(2,10),
kMeansAlgo = "Hartigan-Wong",
trimRatio = 0.05,
restriction = c(-90, 90, -180, 180),
nSim = 10,
kdeType = "1"){
@@ -2082,39 +2084,6 @@ estimateMapKernel <- function(data,

# calculate model ----
set.seed(1234)
if(clusterMethod == "kmeans"){
clust <- kmeans(cbind(data2$Longitude, data2$Latitude), nClust, nstart = 25, algorithm = kMeansAlgo)
data2$cluster <- clust$cluster
clust <- as.data.frame(clust$centers)
names(clust) <- c("long_centroid_spatial_cluster", "lat_centroid_spatial_cluster")
clust$cluster <- 1:nrow(clust)
data2 <- merge(data2, clust, sort = FALSE)
colnames(data2)[colnames(data2)=="cluster"] <- "spatial_cluster"
} else if (clusterMethod == "mclust"){

numClusters <- seq(nClustRange[1],nClustRange[2])
cluster_list <- vector("list", length(numClusters))
for(i in 1:length(numClusters)){
set.seed(1234)
cluster_list[[i]] <- mclust::Mclust(data2[,c("Longitude","Latitude")], G = numClusters[i])
}

# select best cluster solution based on bic
cluster_solution <- cluster_list[[which.max(sapply(1:length(cluster_list),
function(x) cluster_list[[x]]$bic))]]

# assign cluster to data
data2$cluster <- cluster_solution$classification

# merge cluster centers
cluster_centers <- data.frame(t(cluster_solution$parameters$mean))
colnames(cluster_centers) <- c("long_centroid_spatial_cluster", "lat_centroid_spatial_cluster")
cluster_centers$cluster <- 1:nrow(cluster_centers)
data2 <- merge(data2, cluster_centers, sort = FALSE)
colnames(data2)[colnames(data2)=="cluster"] <- "spatial_cluster"

data2$spatial_cluster <- data2$spatial_cluster %>% makeClusterIdsContinuous()
}
if(!is.null(Weighting) & !(Weighting == "")){
model <- try(lapply(1:nSim, function(x){
data3 <- data2[sample(1:nrow(data2), nrow(data2), replace = T), ]
@@ -2152,6 +2121,48 @@ estimateMapKernel <- function(data,
if ( class(model)[1] == "try-error") {return("Error in Model Fitting.")}
sc <- NULL
class(model) <- c(class(model), "kde")

# cluster data ----
## cluster on original data
if(clusterMethod == "kmeans"){
clust <- kmeans(cbind(data$Longitude, data$Latitude), nClust, nstart = 25, algorithm = kMeansAlgo)

data <- data %>%
addCentroids(cluster = clust$cluster,
centers = clust$centers)

colnames(data)[colnames(data)=="cluster"] <- "spatial_cluster"
} else if (clusterMethod == "mclust"){
numClusters <- seq(nClustRange[1],nClustRange[2])
cluster_list <- vector("list", length(numClusters))
for(i in 1:length(numClusters)){
set.seed(1234)
cluster_list[[i]] <- mclust::Mclust(data[,c("Longitude","Latitude")], G = numClusters[i])
}

# select best cluster solution based on bic
cluster_solution <- cluster_list[[which.max(sapply(1:length(cluster_list),
function(x) cluster_list[[x]]$bic))]]

# merge cluster centers
data <- data %>%
addCentroids(cluster = cluster_solution$classification,
centers = t(cluster_solution$parameters$mean))

colnames(data)[colnames(data) == "cluster"] <- "spatial_cluster"
data$spatial_cluster <- data$spatial_cluster %>% makeClusterIdsContinuous()
} else if (clusterMethod == "tclust") {
cluster_solution <- tclust(data[,c("Longitude","Latitude")], k = nClust, alpha = trimRatio)

# merge cluster centers
data <- data %>%
addCentroids(cluster = cluster_solution$cluster,
centers = t(cluster_solution$centers))

colnames(data)[colnames(data) == "cluster"] <- "spatial_cluster"
data$spatial_cluster <- data$spatial_cluster %>% makeClusterIdsContinuous()
}

return(list(model = model, data = data, sc = sc, independent = independent))
}
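
For readers new to trimmed clustering, here is a minimal standalone sketch of what the new clusterMethod == "tclust" branch does, assuming the tclust package's tclust(x, k, alpha) interface used above; the coordinates and k are purely illustrative and not part of the commit.

# Sketch only: trimmed clustering on coordinates, mirroring the new tclust branch.
library(tclust)

coords <- data.frame(
  Longitude = c(10.1, 10.3, 11.0, 24.5, 24.9, 25.2, 60.0),  # toy values
  Latitude  = c(50.2, 50.4, 50.9, 38.1, 38.4, 38.0, 10.0)
)

# alpha plays the role of trimRatio: the share of observations left unassigned
cluster_solution <- tclust(coords[, c("Longitude", "Latitude")], k = 2, alpha = 0.05)

coords$spatial_cluster <- cluster_solution$cluster   # 0 marks trimmed points
centers <- t(cluster_solution$centers)               # one row of coordinates per cluster
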

@@ -2193,6 +2204,7 @@ estimateMapKernelWrapper <- function(data, input) {
#' @param nClust numeric: how many clusters
#' @param nClustRange numeric: range of potential mclust cluster
#' @param kMeansAlgo character: kmeans algorithm as in stats:kmeans
#' @param trimRatio numeric: proportion of observations to be trimmed by tclust
#' @param clusterTimeRange numeric vector: time range of cluster
#' @param modelUnc boolean: Include dating uncertainty
#' @param dateUnc character: one of "uniform", "normal", "point"
@@ -2225,6 +2237,7 @@ estimateMap3DKernel <- function(data,
nClust = 5,
nClustRange = c(2,10),
kMeansAlgo = "Hartigan-Wong",
trimRatio = 0.05,
clusterTimeRange = c(0,1000),
modelUnc = FALSE,
dateUnc = "point",
@@ -2387,6 +2400,11 @@ estimateMap3DKernel <- function(data,
}
kde(cbind(data3$Longitude, data3$Latitude, data3$Date2), H = H)}), silent = TRUE)
}

if (class(model)[1] == "try-error") {return("Error in Model Fitting.")}
sc <- NULL
class(model) <- c(class(model), "kde")

if(clusterMethod == "kmeans"){
# K-Means Clustering ----
# discussion about the clustering here: https://github.com/Pandora-IsoMemo/iso-app/issues/54
@@ -2418,15 +2436,13 @@ estimateMap3DKernel <- function(data,
# data$cluster <- NULL

## Filtered data ----
dataC$cluster <- clust$cluster
clust_centroid <- data.frame(cluster=1:nrow(clust$centers),clust$centers)
names(clust_centroid) <- c("cluster","long_centroid_spatial_cluster","lat_centroid_spatial_cluster")
dataC <- merge(dataC, clust_centroid, by = "cluster", sort = FALSE)
data <- data %>% left_join(dataC[,c("id","cluster","long_centroid_spatial_cluster","lat_centroid_spatial_cluster")], by = "id")
data$id <- NULL
colnames(data)[colnames(data)=="cluster"] <- "spatial_cluster"
dataC <- dataC %>%
addCentroids(cluster = clust$cluster,
centers = clust$centers)

data <- data %>% joinCentroidData(dataC)

dataC$cluster <- NULL
dataC <- dataC[order(dataC$id),]

## Optimal Centroids ----
clustDens <- sapply(1:nrow(dataC), function(z) {rowMeans(sapply(1:nSim, function(k) predict(model[[k]], x = cbind(dataC[rep(z, 100), c("Longitude", "Latitude")],
@@ -2447,27 +2463,28 @@ estimateMap3DKernel <- function(data,
function(x) which.min(rowSums((data[rep(x, nClust), c("Longitude", "Latitude")] -
as.matrix(clusterCentroids))^2)))

clust <- clusterCentroids
names(clust) <- c("long_temporal_group_reference_point", "lat_temporal_group_reference_point")
clust$cluster <- 1:nrow(clust)
data <- merge(data, clust, sort = FALSE)
colnames(data)[colnames(data)=="cluster"] <- "temporal_group"
} else if (clusterMethod == "mclust"){
# MCLUST Clustering ----
data <- data %>%
addCentroids(cluster = data$cluster,
centers = clusterCentroids,
type = c("temporal_group_reference_point"))

colnames(data)[colnames(data) == "cluster"] <- "temporal_group"
} else if (clusterMethod %in% c("mclust","tclust")){
# MCLUST & TCLUST Clustering ----
data$id <- 1:nrow(data)

# Clustering on filtered data
dataC <- data[((data$Date - 2*data$Uncertainty) <= clusterTimeRange[2] & (data$Date - 2*data$Uncertainty) >= clusterTimeRange[1]) |
((data$Date + 2*data$Uncertainty) <= clusterTimeRange[2] & (data$Date + 2*data$Uncertainty) >= clusterTimeRange[1]) |
((data$Date) <= clusterTimeRange[2] & (data$Date) >= clusterTimeRange[1]), ]

if(clusterMethod == "mclust"){
numClusters <- seq(nClustRange[1],nClustRange[2])
cluster_list <- vector("list", length(numClusters))
for(i in 1:length(numClusters)){
set.seed(1234)
cluster_list[[i]] <- mclust::Mclust(dataC[,c("Longitude","Latitude")], G = numClusters[i])
}

# select best cluster solution based on bic
best_solution_idx <- which.max(sapply(1:length(cluster_list),function(x) cluster_list[[x]]$bic))
best_solution_cluster <- numClusters[[best_solution_idx]]
@@ -2486,50 +2503,54 @@ estimateMap3DKernel <- function(data,
# data$cluster <- NULL

## Filtered data ----
dataC$cluster <- cluster_solution$classification
clust_centroid <- data.frame(cluster=1:nrow(t(cluster_solution$parameters$mean)),t(cluster_solution$parameters$mean))
names(clust_centroid) <- c("cluster","long_centroid_spatial_cluster","lat_centroid_spatial_cluster")
dataC <- merge(dataC, clust_centroid, by = "cluster", sort = FALSE)
data <- data %>% left_join(dataC[,c("id","cluster","long_centroid_spatial_cluster","lat_centroid_spatial_cluster")], by = "id")
data$id <- NULL
colnames(data)[colnames(data)=="cluster"] <- "spatial_cluster"
dataC$cluster <- NULL
dataC <- dataC[order(dataC$id),]
dataC <- dataC %>%
addCentroids(cluster = cluster_solution$classification,
centers = t(cluster_solution$parameters$mean))

data <- data %>% joinCentroidData(dataC)
} else {
# tclust
cluster_solution <- tclust(dataC[,c("Longitude","Latitude")], k = nClust, alpha = trimRatio)

## Filtered data ----
dataC <- dataC %>%
addCentroids(cluster = cluster_solution$cluster,
centers = t(cluster_solution$centers))

data <- data %>% joinCentroidData(dataC)
}

## optimal centroids: ----
clustDens <- sapply(1:nrow(dataC), function(z) {rowMeans(sapply(1:nSim, function(k) predict(model[[k]], x = cbind(dataC[rep(z, 100), c("Longitude", "Latitude")],
Date2 = (seq(clusterTimeRange[1], clusterTimeRange[2],
length.out = 100) - mean(data$Date)) / (sd(data$Date))))))})
# assign cluster to data
dataC$cluster <- cluster_solution$classification

densM <- colMeans(clustDens)
densSD <- apply(clustDens, 2, sd)
densQ <- densM / densSD

clusterCentroids <- do.call("rbind", (lapply(1:best_solution_cluster, function(j){
clusterCentroids <- do.call("rbind", (lapply(1:max(dataC$cluster), function(j){
dataC[dataC$cluster == j, ][which.max(densQ[dataC$cluster == j]), c("Longitude", "Latitude")]
})))

data$cluster <- sapply(1:nrow(data),
function(x) which.min(rowSums((data[rep(x, best_solution_cluster), c("Longitude", "Latitude")] -
function(x) which.min(rowSums((data[rep(x, max(dataC$cluster)), c("Longitude", "Latitude")] -
as.matrix(clusterCentroids))^2)))
if(length(unique(data$cluster)) < length(unique(dataC$cluster))){
showNotification(paste0("Note: mclust selected ",length(unique(dataC$cluster))," cluster. However the temporal algorithm assigned all data to only ",length(unique(data$cluster))," of these clusters."))
showNotification(paste0("Note: mclust/tclust selected ",length(unique(dataC$cluster))," cluster. However the temporal algorithm assigned all data to only ",length(unique(data$cluster))," of these clusters."))
}

clust <- clusterCentroids
names(clust) <- c("long_temporal_group_reference_point", "lat_temporal_group_reference_point")
clust$cluster <- 1:nrow(clust)
data <- merge(data, clust, sort = FALSE)
colnames(data)[colnames(data)=="cluster"] <- "temporal_group"
data <- data %>%
addCentroids(cluster = data$cluster,
centers = clusterCentroids,
type = c("temporal_group_reference_point"))

colnames(data)[colnames(data) == "cluster"] <- "temporal_group"

data$temporal_group <- data$temporal_group %>% makeClusterIdsContinuous()
data$spatial_cluster <- data$spatial_cluster %>% makeClusterIdsContinuous()
}
if ( class(model)[1] == "try-error") {return("Error in Model Fitting.")}
sc <- NULL
class(model) <- c(class(model), "kde")

return(list(model = model, data = data, sc = sc, independent = independent))
}
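
The "optimal centroids" step above picks, within each cluster, the observation whose simulation-averaged kernel density is both high and stable over the cluster time range (mean divided by SD across the time grid). A simplified sketch of that selection follows; it assumes clustDens, dataC and data exist as built in the surrounding code (rows of clustDens are the 100 time-grid points, columns are the observations in dataC), so it is a restatement for clarity, not new behaviour.

# Sketch of the reference-point selection shown in the diff above.
densM  <- colMeans(clustDens)        # mean density of each observation over the time range
densSD <- apply(clustDens, 2, sd)    # variability of that density across the time grid
densQ  <- densM / densSD             # favour points that are dense AND temporally stable

# per cluster, keep the coordinates of the best-scoring observation
clusterCentroids <- do.call("rbind", lapply(seq_len(max(dataC$cluster)), function(j) {
  inCluster <- dataC$cluster == j
  dataC[inCluster, ][which.max(densQ[inCluster]), c("Longitude", "Latitude")]
}))

# every row of the full data set is then assigned to its nearest reference point
data$temporal_group <- apply(as.matrix(data[, c("Longitude", "Latitude")]), 1, function(p) {
  which.min(colSums((t(as.matrix(clusterCentroids)) - p)^2))
})
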

38 changes: 38 additions & 0 deletions R/01-predictNearestCluster.R
@@ -20,3 +20,41 @@ makeClusterIdsContinuous <- function(column_with_ids) {

match(column_with_ids, sort(unique(column_with_ids)))
}
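
The existing helper above simply remaps arbitrary cluster ids onto a gap-free 1..k sequence while preserving their order. Reconstructed from the visible lines, its behaviour looks like this:

makeClusterIdsContinuous <- function(column_with_ids) {
  # map each id to its rank among the distinct ids, so e.g. ids 2/4/9 become 1/2/3
  match(column_with_ids, sort(unique(column_with_ids)))
}

makeClusterIdsContinuous(c(4, 9, 4, 2, 9))
#> [1] 2 3 2 1 3
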

#' Add centroid of clusters
#'
#' @param dataC Filtered data
#' @param cluster Cluster ids
#' @param centers Cluster centers
#' @param type Type of centroid
#'
#' @return Data with centroids
addCentroids <- function(dataC, cluster, centers, type = c("centroid_spatial_cluster", "temporal_group_reference_point")) {
type <- match.arg(type)

clust_centroid = data.frame(cluster = 1:nrow(centers), centers)
names(clust_centroid) <- c("cluster", sprintf("long_%s", type), sprintf("lat_%s", type))

dataC$cluster <- cluster
dataC <- merge(dataC, clust_centroid, by = "cluster", sort = FALSE)

if (!is.null(dataC$id)) {
dataC <- dataC[order(dataC$id),]
}

dataC
}
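
A small usage example for the new helper, with toy coordinates; the centers matrix has one row per cluster, as returned by kmeans()$centers or t(tclust()$centers):

pts <- data.frame(
  id = 1:4,
  Longitude = c(10, 11, 25, 26),
  Latitude  = c(50, 51, 38, 39)
)
centers <- rbind(c(10.5, 50.5), c(25.5, 38.5))  # cluster centres: (long, lat) per row

withCentroids <- addCentroids(pts, cluster = c(1, 1, 2, 2), centers = centers)
# adds a cluster column plus long_centroid_spatial_cluster / lat_centroid_spatial_cluster,
# and reorders the rows by id because pts carries an id column
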

#' Join centroid data to data
#'
#' @param data Data
#' @param dataC Filtered data with centroid
#'
#' @return Data with centroid
joinCentroidData <- function(data, dataC) {
data <- data %>% left_join(dataC[,c("id", "cluster", "long_centroid_spatial_cluster", "lat_centroid_spatial_cluster")], by = "id")
data$id <- NULL
colnames(data)[colnames(data) == "cluster"] <- "spatial_cluster"

data
}
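
The two helpers are intended to be used together: cluster a (time-)filtered subset, then join the resulting centroid columns back onto the full data by id. A hedged sketch of that flow, with kmeans standing in for any of the three methods; data and clusterTimeRange are assumed to exist, and dplyr must be attached for the left_join inside joinCentroidData:

data$id <- seq_len(nrow(data))                              # id is required for the join
dataC <- data[data$Date >= clusterTimeRange[1] &
                data$Date <= clusterTimeRange[2], ]         # simplified time filter

km <- kmeans(dataC[, c("Longitude", "Latitude")], centers = 3, nstart = 25)
dataC <- addCentroids(dataC, cluster = km$cluster, centers = km$centers)

data <- joinCentroidData(data, dataC)                       # renames cluster to spatial_cluster, drops id
# rows that fell outside the time filter keep NA in the new spatial_cluster columns
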