|
#' Apply \pkg{pkgstats} across the git history of a package
#'
#' @param path Path to local repository containing an R package.
#' @param step_days Analyse package at intervals of this number of days. The
#' last commit for each day is chosen. For example, `step_days = 7L` will
#' return weekly statistics. Values of zero or less will analyse all commits,
#' including potentially multiple daily commits.
#' @param num_cores Number of cores to use in multi-core processing. Has no
#' effect on Windows operating systems, on which calculations are always
#' single-core only. Negative values are subtracted from number of available
#' cores, determined as `parallel::detectCores()`, so default of `num_cores =
#' -1L` uses `detectCores() - 1L`. Positive values use precisely that number,
#' restricted to maximum available cores, and a value of zero will use all
#' available cores.
#'
#' @return A list of three items:
#' \itemize{
#' \item desc_data Containing data from `DESCRIPTION` files, along with data on
#' numbers of functions.
#' \item loc Containing data on "lines-of-code" for all languages and
#' sub-directories within package.
#' \item stats Containing statistics on (mean, median, and sum) of various
#' properties of each function in package.
#' }
#'
#' @export
repo_pkgstats_history <- function (path,
                                   step_days = 1L,
                                   num_cores = -1L) {

    checkmate::assert_character (path, len = 1L)
    checkmate::assert_directory (path)
    # No lower bound: zero and negative values are documented as "analyse all
    # commits", and `filter_git_log()` passes any value below one straight
    # through.
    checkmate::assert_int (step_days)
    checkmate::assert_int (num_cores)

    num_cores <- set_num_cores (num_cores)

    log <- cm_data_gitlog (path)
    log <- filter_git_log (log, step_days)

    # One commit at a time in this session, or spread over a cluster
    if (num_cores == 1L) {
        res <- extract_pkgstats_data_single (log, path)
    } else {
        res <- extract_pkgstats_data_multi (log, path, num_cores)
    }

    collate_pkgstats (res)
}
| 53 | + |
#' Thin a git log so retained commits are at least `step_days` days apart
#'
#' @param log Data frame of commits with a `timestamp` column which
#' `as.Date()` can convert. The first row for each calendar day is the one
#' retained, so rows are presumably ordered most-recent first (TODO: confirm
#' against `cm_data_gitlog()`).
#' @param step_days Minimum number of days between retained commits. Values
#' below one return `log` unchanged.
#' @return `log` reduced to the retained rows, with an additional `date`
#' column whenever `step_days >= 1`.
#' @noRd
filter_git_log <- function (log, step_days) {

    if (step_days < 1L) {
        return (log)
    }

    log$date <- as.Date (log$timestamp)
    # One commit per calendar day: the first one listed for that day.
    log <- log [!duplicated (log$date), , drop = FALSE]

    if (step_days > 1L) {
        # Walk the log once, keeping a commit only when it lies at least
        # `step_days` from the previously *kept* commit. Comparing each commit
        # only to its immediate neighbour would discard an entire run of
        # closely-spaced commits instead of sampling it at regular intervals.
        day_num <- as.numeric (log$date)
        keep <- rep (FALSE, length (day_num))
        last_kept <- NA_real_
        for (i in seq_along (day_num)) {
            if (is.na (last_kept) ||
                isTRUE (abs (day_num [i] - last_kept) >= step_days)) {
                keep [i] <- TRUE
                last_kept <- day_num [i]
            }
        }
        log <- log [keep, , drop = FALSE]
    }

    return (log)
}
| 70 | + |
| 71 | + |
#' Run `run_one_pkgstats()` on every commit in `log`, one after the other
#'
#' Each commit is checked out with a hard reset, so the work is done in a copy
#' of the repository placed in the session temporary directory. No copy is
#' made when `path` already sits directly inside the temporary directory, or
#' when a directory of the same name already exists there; that existing
#' directory is then used as-is and is not deleted afterwards.
#'
#' @param log Data frame of commits with `hash` and `timestamp` columns.
#' @param path Path to the local repository.
#' @return A list with one `run_one_pkgstats()` result per row of `log`.
#' @noRd
extract_pkgstats_data_single <- function (log, path) {

    path_cp <- fs::path (fs::path_temp (), basename (path))
    if (fs::path (fs::path_dir (path)) != fs::path_temp () &&
        !fs::dir_exists (path_cp)) {
        path_cp <- fs::dir_copy (path, fs::path_temp ())
        # Registered immediately so the copy is removed even if analysing a
        # commit fails; a left-over copy would otherwise be silently reused
        # by the next call.
        on.exit (fs::dir_delete (path_cp), add = TRUE)
    }

    res <- pbapply::pblapply (seq_len (nrow (log)), function (i) {
        gert::git_reset_hard (ref = log$hash [i], repo = path_cp)
        run_one_pkgstats (path = path_cp, pkg_date = log$timestamp [i])
    })

    return (res)
}
| 93 | + |
#' Run `run_one_pkgstats()` on every commit in `log` across a cluster
#'
#' Every task copies the repository into the temporary directory of the
#' process running it, hard-resets that copy to the commit, analyses it, and
#' removes the copy again.
#'
#' @param log Data frame of commits with `hash` and `timestamp` columns.
#' @param path Path to the local repository.
#' @param num_cores Number of worker processes to start.
#' @return A list with one `run_one_pkgstats()` result per row of `log`.
#' @noRd
extract_pkgstats_data_multi <- function (log, path, num_cores) {

    cl <- parallel::makeCluster (num_cores)
    # Shut the workers down on every exit path, including errors raised while
    # analysing a commit.
    on.exit (parallel::stopCluster (cl), add = TRUE)

    parallel::clusterExport (
        cl,
        c ("log", "path", "run_one_pkgstats"),
        envir = environment ()
    )

    res <- pbapply::pblapply (seq_len (nrow (log)), function (i) {
        path_cp <- fs::dir_copy (path, fs::path_temp ())
        # Remove this task's copy even when the reset or analysis fails, so
        # that later tasks on the same worker start from a fresh copy.
        on.exit (fs::dir_delete (path_cp), add = TRUE)
        gert::git_reset_hard (ref = log$hash [i], repo = path_cp)
        run_one_pkgstats (path = path_cp, pkg_date = log$timestamp [i])
    }, cl = cl)

    return (res)
}
| 115 | + |
1 | 116 | run_one_pkgstats <- function (path, pkg_date) {
|
2 | 117 |
|
3 | 118 | s <- pkgstats::pkgstats (path)
|
@@ -62,3 +177,38 @@ run_one_pkgstats <- function (path, pkg_date) {
|
62 | 177 | )
|
63 | 178 | )
|
64 | 179 | }
|
| 180 | + |
#' Combine per-commit results into three tables
#'
#' @param x List of per-commit results. Each element is expected to carry a
#' run of scalar fields (one of them named `date`), followed by `loc` and
#' `stats` data frames.
#' @return List of `desc_data`, `loc`, and `stats`, each holding the
#' corresponding pieces of every element of `x` bound together by row.
#' @noRd
collate_pkgstats <- function (x) {

    # Everything listed before "loc" in the first result is treated as a
    # scalar field, and becomes one column of `desc_data`.
    field_names <- names (x [[1]])
    n_scalar <- which (field_names == "loc") - 1L
    scalar_names <- field_names [seq_len (n_scalar)]

    scalar_cols <- lapply (scalar_names, function (nm) {
        unlist (lapply (x, function (xi) xi [[nm]]))
    })
    desc_data <- data.frame (do.call (cbind, scalar_cols))
    names (desc_data) <- scalar_names

    # Binding the columns together above coerces them to a common type, so
    # dates are rebuilt here from a formatted string. The final argument of
    # `vapply()` is only a template for a length-one character result.
    date_chr <- vapply (
        x,
        function (xi) strftime (xi$date, "%y-%m-%d %H:%M:%S"),
        "character"
    )
    desc_data$date <- strptime (date_chr, format = "%y-%m-%d %H:%M:%S")

    # All fields listed after "date" are converted to integer.
    date_pos <- which (scalar_names == "date")
    int_names <- scalar_names [-seq_len (date_pos)]
    for (nm in int_names) {
        desc_data [[nm]] <- as.integer (desc_data [[nm]])
    }

    loc_list <- lapply (x, function (xi) xi$loc)
    loc <- do.call (rbind, loc_list)

    stats_list <- lapply (x, function (xi) xi$stats)
    stats <- do.call (rbind, stats_list)
    # Row-binding appends digits to repeated row names; strip them to recover
    # the name of each measure, then discard the row names.
    stats$measure <- gsub ("[0-9]+$", "", rownames (stats))
    rownames (stats) <- NULL

    # Give all three tables the class of `loc` (noted by the original author
    # to be a tibble coming from `dplyr`) without an explicit conversion.
    shared_class <- class (loc)
    class (stats) <- shared_class
    class (desc_data) <- shared_class

    list (
        desc_data = desc_data,
        loc = loc,
        stats = stats
    )
}
0 commit comments