Merge pull request #119 from ropensci-review-tools/user-data

mpadge · web-flow · commit c0d8d3e88c71 · 2025-02-21T11:55:01.000+01:00
Fix bug in dashboard pre-process of user data
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: repometrics
 Title: Metrics for Your Code Repository
-Version: 0.1.6.064
+Version: 0.1.6.075
 Authors@R: 
     person("Mark", "Padgham", , "mark.padgham@email.com", role = c("aut", "cre"),
            comment = c(ORCID = "0000-0003-2172-5265"))
diff --git a/R/analyse-users.R b/R/analyse-users.R
@@ -1,7 +1,7 @@
 #' Construct user-by-user square matrices of strengths of relation between
 #' users.
 #'
-#' @param user_data Result of `lapply(logins, repometrics_data_user)`.
+#' @param data_users Result of `lapply(logins, repometrics_data_user)`.
 #' Contains the following fields:
 #' \enumerate{
 #' \item general (not considered here)
@@ -15,36 +15,37 @@
 #' @return A `data.frame` of pairwise user logins, and proportions of overlap
-#' betwen repositories in the six variables described above.
+#' between repositories in the six variables described above.
 #' @noRd
-user_relation_matrices <- function (user_data) {
+user_relation_matrices <- function (data_users) {
 
     # Suppress no visible binding notes:
     followers <- following <- org_repo <- repo <- login <- num_comments <- NULL
 
-    user_names <- names (user_data)
-    user_data <- add_user_login_cols (user_data) |>
+    user_names <- names (data_users)
+    data_users <- add_user_login_cols (data_users) |>
         combine_user_data ()
 
     # Pre-processing to name grouping column "repo" and count column "n":
-    user_data$commit_cmt$repo <-
-        paste0 (user_data$commit_cmt$org, user_data$commit_cmt$repo)
+    data_users$commit_cmt$repo <-
+        paste0 (data_users$commit_cmt$org, data_users$commit_cmt$repo)
 
-    user_data$followers <-
-        dplyr::rename (user_data$followers, repo = followers) |>
+    data_users$followers <-
+        dplyr::rename (data_users$followers, repo = followers) |>
         dplyr::mutate (n = 1L)
-    user_data$following <-
-        dplyr::rename (user_data$following, repo = following) |>
+    data_users$following <-
+        dplyr::rename (data_users$following, repo = following) |>
         dplyr::mutate (n = 1L)
 
-    user_data$issue_cmts <-
-        dplyr::rename (user_data$issue_cmts, repo = org_repo) |>
+    data_users$issue_cmts <-
+        dplyr::rename (data_users$issue_cmts, repo = org_repo) |>
         dplyr::group_by (repo, login) |>
         dplyr::summarise (n = sum (num_comments), .groups = "keep")
-    user_data$issues <- dplyr::rename (user_data$issues, repo = org_repo) |>
+    data_users$issues <- dplyr::rename (data_users$issues, repo = org_repo) |>
         dplyr::group_by (repo, login) |>
         dplyr::summarise (n = dplyr::n (), .groups = "keep")
 
-    overlap <- lapply (names (user_data), function (n) {
-        user_data [[n]] <- user_relate_fields (user_data, user_names, what = n)
+    overlap <- lapply (names (data_users), function (n) {
+        data_users [[n]] <-
+            user_relate_fields (data_users, user_names, what = n)
     })
 
     res <- dplyr::left_join (
@@ -62,18 +63,19 @@ user_relation_matrices <- function (user_data) {
 
 #' Add 'login' columns to all user data, so each element can be combined.
 #' @noRd
-add_user_login_cols <- function (user_data) {
+add_user_login_cols <- function (data_users) {
 
-    nms <- names (user_data)
-    res <- lapply (seq_along (user_data), function (u) {
-        nms_u <- names (user_data [[u]])
-        res_u <- lapply (seq_along (user_data [[u]]), function (i) {
-            ud <- user_data [[u]] [[i]]
+    nms <- names (data_users)
+    res <- lapply (seq_along (data_users), function (u) {
+        nms_u <- names (data_users [[u]])
+        res_u <- lapply (seq_along (data_users [[u]]), function (i) {
+            ud <- data_users [[u]] [[i]]
             if (is.data.frame (ud) && nrow (ud) > 0L) {
-                ud$login <- names (user_data) [u]
+                ud$login <- names (data_users) [u]
             } else if (is.character (ud)) {
-                ud <- data.frame (ud, login = names (user_data) [u])
-                names (ud) [1] <- names (user_data [[u]]) [i]
+                login <- names (data_users) [u] # `u` = user; `i` = field
+                ud <- data.frame (ud, login = rep (login, length (ud)))
+                names (ud) [1] <- names (data_users [[u]]) [i]
             }
             return (ud)
         })
@@ -90,39 +92,39 @@ add_user_login_cols <- function (user_data) {
 #'
 #' The `add_user_login_cols` enables all data to be `rbind`-ed here.
 #' @noRd
-combine_user_data <- function (user_data) {
+combine_user_data <- function (data_users) {
 
-    data <- lapply (names (user_data [[1]]), function (n) {
-        these <- lapply (user_data, function (i) i [[n]])
+    data <- lapply (names (data_users [[1]]), function (n) { # each field name
+        these <- lapply (data_users, function (i) i [[n]]) # field `n` per user
         res <- do.call (rbind, these)
         rownames (res) <- NULL
         return (res)
     })
 
-    names (data) <- names (user_data [[1]])
+    names (data) <- names (data_users [[1]])
     data$general <- NULL
 
     return (data)
 }
 
-user_relate_fields <- function (user_data, user_names, what = "commits") {
+user_relate_fields <- function (data_users, user_names, what = "commits") {
 
     # Suppress no visible binding notes:
     num_commits <- login <- repo <- n <- NULL
 
     user_combs <- t (utils::combn (user_names, m = 2L))
     if (what == "commits") {
-        user_data [[what]] <-
-            dplyr::rename (user_data [[what]], n = num_commits)
+        data_users [[what]] <-
+            dplyr::rename (data_users [[what]], n = num_commits)
     } else if (what == "commit_cmt") {
-        user_data$commit_cmt$n <- 1L
+        data_users$commit_cmt$n <- 1L
     }
 
     res <- apply (user_combs, 1, function (i) {
-        cmt1 <- dplyr::filter (user_data [[what]], login == i [1]) |>
+        cmt1 <- dplyr::filter (data_users [[what]], login == i [1]) |>
             dplyr::group_by (repo) |>
             dplyr::summarise (n1 = sum (n))
-        cmt2 <- dplyr::filter (user_data [[what]], login == i [2]) |>
+        cmt2 <- dplyr::filter (data_users [[what]], login == i [2]) |>
             dplyr::group_by (repo) |>
             dplyr::summarise (n2 = sum (n))
         overlap <- dplyr::inner_join (cmt1, cmt2, by = "repo")
diff --git a/R/quarto-dashboard.R b/R/quarto-dashboard.R
@@ -8,13 +8,37 @@
 #' @param action One of "preview", to start and open a live preview of the
 #' dashboard website, or "render" to render a static version without previewing
 #' or opening.
+#' @param ctb_threshold An optional single numeric value between 0 and 1. If
+#' specified, contributors are arranged in decreasing order of contributions,
+#' and the contributor data reduced to only those whose cumulative
+#' contributions lie within this proportion of all contributions.
+#' @param max_ctbs Optional maximum number of contributors to be included. This
+#' is an alternative way to reduce the number of contributors presented in the
+#' dashboard, and may only be specified if `ctb_threshold` is left at its
+#' default value of `NULL`.
+#'
 #' @return (Invisibly) Path to main "index.html" document of quarto site. Note
 #' that the site must be served with `action = "preview"`, and will not work by
 #' simply opening this "index.html" file.
 #'
 #' @family dashboard
 #' @export
-repometrics_dashboard <- function (data_repo, data_users, action = "preview") {
+repometrics_dashboard <- function (data_repo, data_users, action = "preview",
+                                   ctb_threshold = NULL, max_ctbs = NULL) {
+
+    if (!is.null (ctb_threshold)) {
+        checkmate::assert_numeric (ctb_threshold, len = 1L, lower = 0, upper = 1)
+        if (!is.null (max_ctbs)) {
+            cli::cli_abort ("Only one of 'ctb_threshold' or 'max_ctbs' may be specified.")
+        }
+    }
+    if (!is.null (max_ctbs)) {
+        checkmate::assert_integerish (max_ctbs, len = 1L, lower = 1, upper = length (data_users))
+    }
+
+    if (!is.null (ctb_threshold) || !is.null (max_ctbs)) {
+        data_users <- reduce_data_users (data_users, ctb_threshold, max_ctbs)
+    }
 
     check_dashboard_arg (data_repo)
     data_repo$pkgstats <- timestamps_to_dates (data_repo$pkgstats)
@@ -47,6 +71,31 @@ repometrics_dashboard <- function (data_repo, data_users, action = "preview") {
     })
 }
 
+# Reduce `data_users` to most active contributors, retaining original order.
+# Activity is total rows over all `data.frame` fields of each user. Keeps
+# either the `max_ctbs` most active, or those whose cumulative share of all
+# activity is within `ctb_threshold`, but always at least the most active.
+reduce_data_users <- function (data_users,
+                               ctb_threshold = NULL,
+                               max_ctbs = NULL) {
+
+    # data.frame fields: "commit_cmt", "commits", "issue_cmts", "issues".
+    # `is.data.frame` (not `class ==`) also accepts multi-class tibbles:
+    index <- which (vapply (data_users [[1]], is.data.frame, logical (1L)))
+    n <- vapply (data_users, function (u) {
+        sum (vapply (u [index], nrow, integer (1L)))
+    }, integer (1L))
+    ord <- order (n, decreasing = TRUE)
+
+    if (!is.null (max_ctbs)) {
+        n_keep <- min (max_ctbs, length (ord))
+    } else {
+        ncum <- cumsum (n [ord]) / sum (n)
+        n_keep <- max (1L, sum (ncum <= ctb_threshold, na.rm = TRUE))
+    }
+    return (data_users [sort (ord [seq_len (n_keep)])])
+}
+
 # `range` is used to scale values, and restrict to sufficiently large values.
 # Total range is first re-scaled to maximum of `range[2]`, then values below
 # `range[1]` are removed.
diff --git a/codemeta.json b/codemeta.json
@@ -8,7 +8,7 @@
   "codeRepository": "https://github.com/ropensci-review-tools/repometrics",
   "issueTracker": "https://github.com/ropensci-review-tools/repometrics/issues",
   "license": "https://spdx.org/licenses/GPL-3.0",
-  "version": "0.1.6.064",
+  "version": "0.1.6.075",
   "programmingLanguage": {
     "@type": "ComputerLanguage",
     "name": "R",
diff --git a/man/repometrics_dashboard.Rd b/man/repometrics_dashboard.Rd
diff --git a/tests/testthat/test-dashboard.R b/tests/testthat/test-dashboard.R