Move two internal functions from R/FWGRanges-class.R to R/read.bismark.R and fix the `.contructFWGRangesFromBismarkFiles` name typo

kasperdanielhansen · kasperdanielhansen · commit 9358fabd1b1c · 2025-03-12T14:12:17.000-04:00
diff --git a/R/FWGRanges-class.R b/R/FWGRanges-class.R
@@ -214,172 +214,6 @@ setMethod("findOverlaps", c("FWGRanges", "FWGRanges"), .findOverlaps_FWGRanges)
     unique(x)
 }
 
-# TODO: Document that the default 'sort = TRUE' applies sort(sortSeqlevels())
-#       to the output. This is the default behaviour because it results in
-#       the smallest returned object (albeit at the small cost of a sort).
-.readBismarkAsFWGRanges <- function(file, rmZeroCov = FALSE,
-                                    strandCollapse = FALSE, sort = TRUE,
-                                    nThread = 1L, verbose = FALSE) {
-    # Argument checks ----------------------------------------------------------
-
-    stopifnot(isTRUEorFALSE(rmZeroCov))
-    stopifnot(isTRUEorFALSE(strandCollapse))
-    stopifnot(isTRUEorFALSE(sort))
-
-    # Quieten R CMD check about 'no visible binding for global variable'
-    M <- U <- NULL
-
-    # Read file to construct data.table of valid loci --------------------------
-    if (rmZeroCov) {
-        dt <- .readBismarkAsDT(
-            file = file,
-            col_spec = "BSseq",
-            check = TRUE,
-            verbose = verbose)
-        if (strandCollapse && !is.null(dt[["strand"]]) &&
-            !dt[, all(strand == "*")]) {
-            # Shift loci on negative strand by 1 to the left and then remove
-            # strand since no longer valid.
-            dt[strand == "-", start := start - 1L][, strand := NULL]
-            # Aggregate counts at loci with the same 'seqnames' and 'start'.
-            dt <- dt[,
-                     list(M = sum(M), U = sum(U)), by = c("seqnames", "start")]
-        }
-        # Identify loci with non-zero coverage then drop 'M' and 'U' as no
-        # longer required.
-        dt <- dt[(M + U) > 0][, c("M", "U") := list(NULL, NULL)]
-    } else {
-        dt <- .readBismarkAsDT(
-            file = file,
-            col_spec = "GRanges",
-            check = FALSE,
-            nThread = nThread,
-            verbose = verbose)
-        if (strandCollapse && !is.null(dt[["strand"]]) &&
-            !dt[, all(strand == "*")]) {
-            # Shift loci on negative strand by 1 to the left and then remove
-            # strand since no longer valid.
-            dt[strand == "-", start := start - 1L][, strand := NULL]
-            dt <- data.table:::funique(dt)
-        }
-    }
-
-    # Construct FWGRanges from 'dt' --------------------------------------------
-
-    # NOTE: Sorting results in a smaller FWGRanges object because the
-    #       'seqnames' and 'strand' slots are more compressible in their Rle
-    #       representation.
-    if (sort) {
-        if (is.null(dt[["strand"]])) {
-            setkey(dt, seqnames, start)
-        } else {
-            setkey(dt, seqnames, strand, start)
-        }
-    }
-    seqnames <- Rle(dt[["seqnames"]])
-    dt[, seqnames := NULL]
-    seqinfo <- Seqinfo(seqnames = levels(seqnames))
-    ranges <- .FWIRanges(start = dt[["start"]], width = 1L)
-    dt[, start := NULL]
-    mcols <- make_zero_col_DFrame(length(ranges))
-    if (is.null(dt[["strand"]])) {
-        strand <- strand(Rle("*", length(seqnames)))
-    } else {
-        strand <- Rle(dt[["strand"]])
-        dt[, strand := NULL]
-    }
-    fwgranges <- .FWGRanges(
-        seqnames = seqnames,
-        ranges = ranges,
-        strand = strand,
-        seqinfo = seqinfo,
-        elementMetadata = mcols)
-    # NOTE: Final sort is to re-order with respect to sorted seqlevels.
-    if (sort) {
-        fwgranges <- sort(sortSeqlevels(fwgranges))
-    }
-    fwgranges
-}
-
-# TODO: Document that this applies sort(sortSeqlevels()) to the output. It is
-#       deliberate that there is no option to override this behaviour.
-.contructFWGRangesFromBismarkFiles <- function(files,
-                                               rmZeroCov,
-                                               strandCollapse,
-                                               verbose,
-                                               nThread,
-                                               BPPARAM) {
-    subverbose <- max(as.integer(verbose) - 1L, 0L)
-
-    # TODO: Instead of using the 'largest' file, use the largest
-    #       'cytosine report' file, which will have all loci in the
-    #       reference genome; provided all samples were aligned to the same
-    #       reference genome, this means it contains all loci.
-    # TODO: Initialise using the 'largest' file (i.e. largest number of lines)?
-    #       Would like to do this without reading the data into memory.
-    #       Some benchmarks can be found at
-    #       https://gist.github.com/peterhurford/0d62f49fd43b6cf078168c043412f70a
-    #       My initial tests using /users/phickey/GTExScripts/FlowSortingProject/hdf5/extdata/methylation/nonCG/5248_BA9_neg_CHG_report.txt (32 GB) give:
-    #       wc -l:                                       77.000s
-    #       R.utils::readLines():                      1165.299s
-    #       nrow(fread(..., select = 1, nThread = 1)):  582.721s (359s re-run)
-    #       nrow(fread(..., select = 1, nThread = 10)):  82.029s
-    #       nrow(fread(..., select = 1, nThread = 40)):  81.408s
-    #       file.size():                                  0.000s
-    #       Of course, fread() only works directly with non-[b]gzipped files.
-    #       And subsequent runs of fread() benefit from some cacheing effect
-    #       that I don't fully understand except to know that subsequent runs
-    #       are 'artificially' faster.
-    #       And using file.size() will be innaccurate if files are a mix of
-    #       compressed and uncompressed files.
-    # Initalise `loci_dt` using the first file.
-    if (verbose) {
-        message("[.contructFWGRangesFromBismarkFiles] Extracting loci from ",
-                "'", files[1L], "'")
-    }
-    loci_from_first_file <- .readBismarkAsFWGRanges(
-        file = files[[1L]],
-        rmZeroCov = rmZeroCov,
-        strandCollapse = strandCollapse,
-        nThread = nThread,
-        verbose = subverbose)
-    # Identify loci not found in first file.
-    # TODO: Pre-process loci as a GNCList?
-    # Set number of tasks to ensure the progress bar gives frequent updates.
-    # NOTE: The progress bar increments once per task
-    #       (https://github.com/Bioconductor/BiocParallel/issues/54).
-    #       Although it is somewhat of a bad idea to overrides a user-specified
-    #       bptasks(BPPARAM), the value of bptasks(BPPARAM) doesn't affect
-    #       performance in this instance, and so we opt for a useful progress
-    #       bar. Only SnowParam (and MulticoreParam by inheritance) have a
-    #       bptasks<-() method.
-    if (is(BPPARAM, "SnowParam") && bpprogressbar(BPPARAM)) {
-        bptasks(BPPARAM) <- length(files) - 1L
-    }
-    list_of_loci_from_other_files_not_in_first_file <- bplapply(
-        files[-1L], function(file, loci_from_first_file) {
-            # Read this file.
-            loci_from_this_file <- .readBismarkAsFWGRanges(
-                file = file,
-                rmZeroCov = rmZeroCov,
-                strandCollapse = strandCollapse,
-                verbose = subverbose)
-            subsetByOverlaps(
-                x = loci_from_this_file,
-                ranges = loci_from_first_file,
-                type = "equal",
-                invert = TRUE)
-        }, loci_from_first_file = loci_from_first_file,
-        BPPARAM = BPPARAM)
-    # Identify unique FWGRanges.
-    loci_non_found_in_first_file <- unique(
-        do.call(c, list_of_loci_from_other_files_not_in_first_file))
-    loci <- c(loci_from_first_file, loci_non_found_in_first_file)
-
-    # Sort the loci
-    sort(sortSeqlevels(loci))
-}
-
 # TODOs ------------------------------------------------------------------------
 
 # TODO: Document internal classes, methods, and functions for my own sanity.
diff --git a/R/read.bismark.R b/R/read.bismark.R
@@ -24,6 +24,174 @@
     }, character(1L))
 }
 
+# Read one Bismark 'file' and return its loci as a width-1 FWGRanges.
+# TODO: Document that the default 'sort = TRUE' applies sort(sortSeqlevels())
+#       to the output. This is the default behaviour because it results in
+#       the smallest returned object (albeit at the small cost of a sort).
+.readBismarkAsFWGRanges <- function(file, rmZeroCov = FALSE,
+                                    strandCollapse = FALSE, sort = TRUE,
+                                    nThread = 1L, verbose = FALSE) {
+    # Argument checks ----------------------------------------------------------
+
+    stopifnot(isTRUEorFALSE(rmZeroCov))
+    stopifnot(isTRUEorFALSE(strandCollapse))
+    stopifnot(isTRUEorFALSE(sort))
+
+    # Quieten R CMD check about 'no visible binding for global variable'
+    M <- U <- NULL
+
+    # Read file to construct data.table of valid loci --------------------------
+    if (rmZeroCov) {
+        dt <- .readBismarkAsDT(
+            file = file,
+            col_spec = "BSseq",
+            check = TRUE,
+            verbose = verbose) # NOTE(review): 'nThread' not forwarded here
+        if (strandCollapse && !is.null(dt[["strand"]]) &&
+            !dt[, all(strand == "*")]) {
+            # Shift loci on negative strand by 1 to the left and then remove
+            # strand since no longer valid.
+            dt[strand == "-", start := start - 1L][, strand := NULL]
+            # Aggregate counts at loci with the same 'seqnames' and 'start'.
+            dt <- dt[,
+                     list(M = sum(M), U = sum(U)), by = c("seqnames", "start")]
+        }
+        # Identify loci with non-zero coverage then drop 'M' and 'U' as no
+        # longer required.
+        dt <- dt[(M + U) > 0][, c("M", "U") := list(NULL, NULL)]
+    } else {
+        dt <- .readBismarkAsDT(
+            file = file,
+            col_spec = "GRanges",
+            check = FALSE,
+            nThread = nThread,
+            verbose = verbose)
+        if (strandCollapse && !is.null(dt[["strand"]]) &&
+            !dt[, all(strand == "*")]) {
+            # Shift loci on negative strand by 1 to the left and then remove
+            # strand since no longer valid.
+            dt[strand == "-", start := start - 1L][, strand := NULL]
+            dt <- data.table:::funique(dt) # NOTE(review): unexported helper
+        }
+    }
+
+    # Construct FWGRanges from 'dt' --------------------------------------------
+
+    # NOTE: Sorting results in a smaller FWGRanges object because the
+    #       'seqnames' and 'strand' slots are more compressible in their Rle
+    #       representation.
+    if (sort) {
+        if (is.null(dt[["strand"]])) {
+            setkey(dt, seqnames, start)
+        } else {
+            setkey(dt, seqnames, strand, start)
+        }
+    }
+    seqnames <- Rle(dt[["seqnames"]])
+    dt[, seqnames := NULL]
+    seqinfo <- Seqinfo(seqnames = levels(seqnames))
+    ranges <- .FWIRanges(start = dt[["start"]], width = 1L)
+    dt[, start := NULL]
+    mcols <- make_zero_col_DFrame(length(ranges))
+    if (is.null(dt[["strand"]])) {
+        strand <- strand(Rle("*", length(seqnames)))
+    } else {
+        strand <- Rle(dt[["strand"]])
+        dt[, strand := NULL]
+    }
+    fwgranges <- .FWGRanges(
+        seqnames = seqnames,
+        ranges = ranges,
+        strand = strand,
+        seqinfo = seqinfo,
+        elementMetadata = mcols)
+    # NOTE: Final sort is to re-order with respect to sorted seqlevels.
+    if (sort) {
+        fwgranges <- sort(sortSeqlevels(fwgranges))
+    }
+    fwgranges
+}
+# Return the sorted union of the loci found in the Bismark 'files'.
+# TODO: Document that this applies sort(sortSeqlevels()) to the output. It is
+#       deliberate that there is no option to override this behaviour.
+.constructFWGRangesFromBismarkFiles <- function(files,
+                                               rmZeroCov,
+                                               strandCollapse,
+                                               verbose,
+                                               nThread,
+                                               BPPARAM) {
+    subverbose <- max(as.integer(verbose) - 1L, 0L) # verbosity for helpers
+
+    # TODO: Instead of using the 'largest' file, use the largest
+    #       'cytosine report' file, which will have all loci in the
+    #       reference genome; provided all samples were aligned to the same
+    #       reference genome, this means it contains all loci.
+    # TODO: Initialise using the 'largest' file (i.e. largest number of lines)?
+    #       Would like to do this without reading the data into memory.
+    #       Some benchmarks can be found at
+    #       https://gist.github.com/peterhurford/0d62f49fd43b6cf078168c043412f70a
+    #       My initial tests using /users/phickey/GTExScripts/FlowSortingProject/hdf5/extdata/methylation/nonCG/5248_BA9_neg_CHG_report.txt (32 GB) give:
+    #       wc -l:                                       77.000s
+    #       R.utils::readLines():                      1165.299s
+    #       nrow(fread(..., select = 1, nThread = 1)):  582.721s (359s re-run)
+    #       nrow(fread(..., select = 1, nThread = 10)):  82.029s
+    #       nrow(fread(..., select = 1, nThread = 40)):  81.408s
+    #       file.size():                                  0.000s
+    #       Of course, fread() only works directly with non-[b]gzipped files.
+    #       And subsequent runs of fread() benefit from some caching effect
+    #       that I don't fully understand except to know that subsequent runs
+    #       are 'artificially' faster.
+    #       And using file.size() will be inaccurate if files are a mix of
+    #       compressed and uncompressed files.
+    # Initialise the loci using the first file.
+    if (verbose) {
+        message("[.constructFWGRangesFromBismarkFiles] Extracting loci from ",
+                "'", files[1L], "'")
+    }
+    loci_from_first_file <- .readBismarkAsFWGRanges(
+        file = files[[1L]],
+        rmZeroCov = rmZeroCov,
+        strandCollapse = strandCollapse,
+        nThread = nThread,
+        verbose = subverbose)
+    # Identify loci not found in first file.
+    # TODO: Pre-process loci as a GNCList?
+    # Set number of tasks to ensure the progress bar gives frequent updates.
+    # NOTE: The progress bar increments once per task
+    #       (https://github.com/Bioconductor/BiocParallel/issues/54).
+    #       Although it is somewhat of a bad idea to override a user-specified
+    #       bptasks(BPPARAM), the value of bptasks(BPPARAM) doesn't affect
+    #       performance in this instance, and so we opt for a useful progress
+    #       bar. Only SnowParam (and MulticoreParam by inheritance) have a
+    #       bptasks<-() method.
+    if (is(BPPARAM, "SnowParam") && bpprogressbar(BPPARAM)) {
+        bptasks(BPPARAM) <- length(files) - 1L
+    }
+    list_of_loci_from_other_files_not_in_first_file <- bplapply(
+        files[-1L], function(file, loci_from_first_file) {
+            # Read this file.
+            loci_from_this_file <- .readBismarkAsFWGRanges(
+                file = file,
+                rmZeroCov = rmZeroCov,
+                strandCollapse = strandCollapse,
+                verbose = subverbose) # 'nThread' left at its default (1L)
+            subsetByOverlaps(
+                x = loci_from_this_file,
+                ranges = loci_from_first_file,
+                type = "equal",
+                invert = TRUE)
+        }, loci_from_first_file = loci_from_first_file,
+        BPPARAM = BPPARAM)
+    # Identify unique FWGRanges.
+    loci_non_found_in_first_file <- unique(
+        do.call(c, list_of_loci_from_other_files_not_in_first_file))
+    loci <- c(loci_from_first_file, loci_non_found_in_first_file)
+
+    # Sort the loci
+    sort(sortSeqlevels(loci))
+}
+
+
 # NOTE: In brief benchmarking, readr::read_csv() is ~1.3-1.6x faster than
 #       utils::read.delim() when reading a gzipped file, albeit it with ~1.6-2x
 #       more total memory allocated. Therefore, there may be times users prefer
@@ -442,7 +610,7 @@ read.bismark <- function(files,
             message(
                 "[read.bismark] Parsing files and constructing valid loci ...")
         }
-        loci <- .contructFWGRangesFromBismarkFiles(
+        loci <- .constructFWGRangesFromBismarkFiles(
             files = files,
             rmZeroCov = rmZeroCov,
             strandCollapse = strandCollapse,
@@ -477,7 +645,7 @@ read.bismark <- function(files,
             }
             ptime1 <- proc.time()
             # Construct loci with non-zero coverage in files.
-            loci_from_files <- .contructFWGRangesFromBismarkFiles(
+            loci_from_files <- .constructFWGRangesFromBismarkFiles(
                 files = files,
                 rmZeroCov = rmZeroCov,
                 strandCollapse = strandCollapse,
@@ -534,7 +702,7 @@ read.bismark <- function(files,
     # .BSseq(se, trans = function(x) NULL, parameters = list())
     bsseq <- new2("BSseq", se, check = FALSE)
     if (!is.null(BACKEND) && BACKEND == "HDF5Array") {
-        # NOTE: Save BSseq object; mimicing
+        # NOTE: Save BSseq object; mimicking
         #       HDF5Array::saveHDF5SummarizedExperiment().
         x <- bsseq
         x@assays <- HDF5Array::shorten_assay2h5_links(x@assays)