Skip to content

Commit 9358fab

Browse files
committedMar 12, 2025·
moving a function from one R file to another
1 parent de1be1e commit 9358fab

File tree

2 files changed

+171
-169
lines changed

2 files changed

+171
-169
lines changed
 

‎R/FWGRanges-class.R

-166
Original file line numberDiff line numberDiff line change
@@ -214,172 +214,6 @@ setMethod("findOverlaps", c("FWGRanges", "FWGRanges"), .findOverlaps_FWGRanges)
214214
unique(x)
215215
}
216216

217-
# TODO: Document that the default 'sort = TRUE' applies sort(sortSeqlevels())
218-
# to the output. This is the default behaviour because it results in
219-
# the smallest returned object (albeit at the small cost of a sort).
220-
.readBismarkAsFWGRanges <- function(file, rmZeroCov = FALSE,
221-
strandCollapse = FALSE, sort = TRUE,
222-
nThread = 1L, verbose = FALSE) {
223-
# Argument checks ----------------------------------------------------------
224-
225-
stopifnot(isTRUEorFALSE(rmZeroCov))
226-
stopifnot(isTRUEorFALSE(strandCollapse))
227-
stopifnot(isTRUEorFALSE(sort))
228-
229-
# Quieten R CMD check about 'no visible binding for global variable'
230-
M <- U <- NULL
231-
232-
# Read file to construct data.table of valid loci --------------------------
233-
if (rmZeroCov) {
234-
dt <- .readBismarkAsDT(
235-
file = file,
236-
col_spec = "BSseq",
237-
check = TRUE,
238-
verbose = verbose)
239-
if (strandCollapse && !is.null(dt[["strand"]]) &&
240-
!dt[, all(strand == "*")]) {
241-
# Shift loci on negative strand by 1 to the left and then remove
242-
# strand since no longer valid.
243-
dt[strand == "-", start := start - 1L][, strand := NULL]
244-
# Aggregate counts at loci with the same 'seqnames' and 'start'.
245-
dt <- dt[,
246-
list(M = sum(M), U = sum(U)), by = c("seqnames", "start")]
247-
}
248-
# Identify loci with non-zero coverage then drop 'M' and 'U' as no
249-
# longer required.
250-
dt <- dt[(M + U) > 0][, c("M", "U") := list(NULL, NULL)]
251-
} else {
252-
dt <- .readBismarkAsDT(
253-
file = file,
254-
col_spec = "GRanges",
255-
check = FALSE,
256-
nThread = nThread,
257-
verbose = verbose)
258-
if (strandCollapse && !is.null(dt[["strand"]]) &&
259-
!dt[, all(strand == "*")]) {
260-
# Shift loci on negative strand by 1 to the left and then remove
261-
# strand since no longer valid.
262-
dt[strand == "-", start := start - 1L][, strand := NULL]
263-
dt <- data.table:::funique(dt)
264-
}
265-
}
266-
267-
# Construct FWGRanges from 'dt' --------------------------------------------
268-
269-
# NOTE: Sorting results in a smaller FWGRanges object because the
270-
# 'seqnames' and 'strand' slots are more compressible in their Rle
271-
# representation.
272-
if (sort) {
273-
if (is.null(dt[["strand"]])) {
274-
setkey(dt, seqnames, start)
275-
} else {
276-
setkey(dt, seqnames, strand, start)
277-
}
278-
}
279-
seqnames <- Rle(dt[["seqnames"]])
280-
dt[, seqnames := NULL]
281-
seqinfo <- Seqinfo(seqnames = levels(seqnames))
282-
ranges <- .FWIRanges(start = dt[["start"]], width = 1L)
283-
dt[, start := NULL]
284-
mcols <- make_zero_col_DFrame(length(ranges))
285-
if (is.null(dt[["strand"]])) {
286-
strand <- strand(Rle("*", length(seqnames)))
287-
} else {
288-
strand <- Rle(dt[["strand"]])
289-
dt[, strand := NULL]
290-
}
291-
fwgranges <- .FWGRanges(
292-
seqnames = seqnames,
293-
ranges = ranges,
294-
strand = strand,
295-
seqinfo = seqinfo,
296-
elementMetadata = mcols)
297-
# NOTE: Final sort is to re-order with respect to sorted seqlevels.
298-
if (sort) {
299-
fwgranges <- sort(sortSeqlevels(fwgranges))
300-
}
301-
fwgranges
302-
}
303-
304-
# TODO: Document that this applies sort(sortSeqlevels()) to the output. It is
305-
# deliberate that there is no option to override this behaviour.
306-
.contructFWGRangesFromBismarkFiles <- function(files,
307-
rmZeroCov,
308-
strandCollapse,
309-
verbose,
310-
nThread,
311-
BPPARAM) {
312-
subverbose <- max(as.integer(verbose) - 1L, 0L)
313-
314-
# TODO: Instead of using the 'largest' file, use the largest
315-
# 'cytosine report' file, which will have all loci in the
316-
# reference genome; provided all samples were aligned to the same
317-
# reference genome, this means it contains all loci.
318-
# TODO: Initialise using the 'largest' file (i.e. largest number of lines)?
319-
# Would like to do this without reading the data into memory.
320-
# Some benchmarks can be found at
321-
# https://gist.github.com/peterhurford/0d62f49fd43b6cf078168c043412f70a
322-
# My initial tests using /users/phickey/GTExScripts/FlowSortingProject/hdf5/extdata/methylation/nonCG/5248_BA9_neg_CHG_report.txt (32 GB) give:
323-
# wc -l: 77.000s
324-
# R.utils::readLines(): 1165.299s
325-
# nrow(fread(..., select = 1, nThread = 1)): 582.721s (359s re-run)
326-
# nrow(fread(..., select = 1, nThread = 10)): 82.029s
327-
# nrow(fread(..., select = 1, nThread = 40)): 81.408s
328-
# file.size(): 0.000s
329-
# Of course, fread() only works directly with non-[b]gzipped files.
330-
# And subsequent runs of fread() benefit from some cacheing effect
331-
# that I don't fully understand except to know that subsequent runs
332-
# are 'artificially' faster.
333-
# And using file.size() will be innaccurate if files are a mix of
334-
# compressed and uncompressed files.
335-
# Initalise `loci_dt` using the first file.
336-
if (verbose) {
337-
message("[.contructFWGRangesFromBismarkFiles] Extracting loci from ",
338-
"'", files[1L], "'")
339-
}
340-
loci_from_first_file <- .readBismarkAsFWGRanges(
341-
file = files[[1L]],
342-
rmZeroCov = rmZeroCov,
343-
strandCollapse = strandCollapse,
344-
nThread = nThread,
345-
verbose = subverbose)
346-
# Identify loci not found in first file.
347-
# TODO: Pre-process loci as a GNCList?
348-
# Set number of tasks to ensure the progress bar gives frequent updates.
349-
# NOTE: The progress bar increments once per task
350-
# (https://github.com/Bioconductor/BiocParallel/issues/54).
351-
# Although it is somewhat of a bad idea to overrides a user-specified
352-
# bptasks(BPPARAM), the value of bptasks(BPPARAM) doesn't affect
353-
# performance in this instance, and so we opt for a useful progress
354-
# bar. Only SnowParam (and MulticoreParam by inheritance) have a
355-
# bptasks<-() method.
356-
if (is(BPPARAM, "SnowParam") && bpprogressbar(BPPARAM)) {
357-
bptasks(BPPARAM) <- length(files) - 1L
358-
}
359-
list_of_loci_from_other_files_not_in_first_file <- bplapply(
360-
files[-1L], function(file, loci_from_first_file) {
361-
# Read this file.
362-
loci_from_this_file <- .readBismarkAsFWGRanges(
363-
file = file,
364-
rmZeroCov = rmZeroCov,
365-
strandCollapse = strandCollapse,
366-
verbose = subverbose)
367-
subsetByOverlaps(
368-
x = loci_from_this_file,
369-
ranges = loci_from_first_file,
370-
type = "equal",
371-
invert = TRUE)
372-
}, loci_from_first_file = loci_from_first_file,
373-
BPPARAM = BPPARAM)
374-
# Identify unique FWGRanges.
375-
loci_non_found_in_first_file <- unique(
376-
do.call(c, list_of_loci_from_other_files_not_in_first_file))
377-
loci <- c(loci_from_first_file, loci_non_found_in_first_file)
378-
379-
# Sort the loci
380-
sort(sortSeqlevels(loci))
381-
}
382-
383217
# TODOs ------------------------------------------------------------------------
384218

385219
# TODO: Document internal classes, methods, and functions for my own sanity.

‎R/read.bismark.R

+171-3
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,174 @@
2424
}, character(1L))
2525
}
2626

27+
28+
# TODO: Document that the default 'sort = TRUE' applies sort(sortSeqlevels())
29+
# to the output. This is the default behaviour because it results in
30+
# the smallest returned object (albeit at the small cost of a sort).
31+
.readBismarkAsFWGRanges <- function(file, rmZeroCov = FALSE,
32+
strandCollapse = FALSE, sort = TRUE,
33+
nThread = 1L, verbose = FALSE) {
34+
# Argument checks ----------------------------------------------------------
35+
36+
stopifnot(isTRUEorFALSE(rmZeroCov))
37+
stopifnot(isTRUEorFALSE(strandCollapse))
38+
stopifnot(isTRUEorFALSE(sort))
39+
40+
# Quieten R CMD check about 'no visible binding for global variable'
41+
M <- U <- NULL
42+
43+
# Read file to construct data.table of valid loci --------------------------
44+
if (rmZeroCov) {
45+
dt <- .readBismarkAsDT(
46+
file = file,
47+
col_spec = "BSseq",
48+
check = TRUE,
49+
verbose = verbose)
50+
if (strandCollapse && !is.null(dt[["strand"]]) &&
51+
!dt[, all(strand == "*")]) {
52+
# Shift loci on negative strand by 1 to the left and then remove
53+
# strand since no longer valid.
54+
dt[strand == "-", start := start - 1L][, strand := NULL]
55+
# Aggregate counts at loci with the same 'seqnames' and 'start'.
56+
dt <- dt[,
57+
list(M = sum(M), U = sum(U)), by = c("seqnames", "start")]
58+
}
59+
# Identify loci with non-zero coverage then drop 'M' and 'U' as no
60+
# longer required.
61+
dt <- dt[(M + U) > 0][, c("M", "U") := list(NULL, NULL)]
62+
} else {
63+
dt <- .readBismarkAsDT(
64+
file = file,
65+
col_spec = "GRanges",
66+
check = FALSE,
67+
nThread = nThread,
68+
verbose = verbose)
69+
if (strandCollapse && !is.null(dt[["strand"]]) &&
70+
!dt[, all(strand == "*")]) {
71+
# Shift loci on negative strand by 1 to the left and then remove
72+
# strand since no longer valid.
73+
dt[strand == "-", start := start - 1L][, strand := NULL]
74+
dt <- data.table:::funique(dt)
75+
}
76+
}
77+
78+
# Construct FWGRanges from 'dt' --------------------------------------------
79+
80+
# NOTE: Sorting results in a smaller FWGRanges object because the
81+
# 'seqnames' and 'strand' slots are more compressible in their Rle
82+
# representation.
83+
if (sort) {
84+
if (is.null(dt[["strand"]])) {
85+
setkey(dt, seqnames, start)
86+
} else {
87+
setkey(dt, seqnames, strand, start)
88+
}
89+
}
90+
seqnames <- Rle(dt[["seqnames"]])
91+
dt[, seqnames := NULL]
92+
seqinfo <- Seqinfo(seqnames = levels(seqnames))
93+
ranges <- .FWIRanges(start = dt[["start"]], width = 1L)
94+
dt[, start := NULL]
95+
mcols <- make_zero_col_DFrame(length(ranges))
96+
if (is.null(dt[["strand"]])) {
97+
strand <- strand(Rle("*", length(seqnames)))
98+
} else {
99+
strand <- Rle(dt[["strand"]])
100+
dt[, strand := NULL]
101+
}
102+
fwgranges <- .FWGRanges(
103+
seqnames = seqnames,
104+
ranges = ranges,
105+
strand = strand,
106+
seqinfo = seqinfo,
107+
elementMetadata = mcols)
108+
# NOTE: Final sort is to re-order with respect to sorted seqlevels.
109+
if (sort) {
110+
fwgranges <- sort(sortSeqlevels(fwgranges))
111+
}
112+
fwgranges
113+
}
114+
115+
# TODO: Document that this applies sort(sortSeqlevels()) to the output. It is
116+
# deliberate that there is no option to override this behaviour.
117+
.constructFWGRangesFromBismarkFiles <- function(files,
118+
rmZeroCov,
119+
strandCollapse,
120+
verbose,
121+
nThread,
122+
BPPARAM) {
123+
subverbose <- max(as.integer(verbose) - 1L, 0L)
124+
125+
# TODO: Instead of using the 'largest' file, use the largest
126+
# 'cytosine report' file, which will have all loci in the
127+
# reference genome; provided all samples were aligned to the same
128+
# reference genome, this means it contains all loci.
129+
# TODO: Initialise using the 'largest' file (i.e. largest number of lines)?
130+
# Would like to do this without reading the data into memory.
131+
# Some benchmarks can be found at
132+
# https://gist.github.com/peterhurford/0d62f49fd43b6cf078168c043412f70a
133+
# My initial tests using /users/phickey/GTExScripts/FlowSortingProject/hdf5/extdata/methylation/nonCG/5248_BA9_neg_CHG_report.txt (32 GB) give:
134+
# wc -l: 77.000s
135+
# R.utils::readLines(): 1165.299s
136+
# nrow(fread(..., select = 1, nThread = 1)): 582.721s (359s re-run)
137+
# nrow(fread(..., select = 1, nThread = 10)): 82.029s
138+
# nrow(fread(..., select = 1, nThread = 40)): 81.408s
139+
# file.size(): 0.000s
140+
# Of course, fread() only works directly with non-[b]gzipped files.
141+
# And subsequent runs of fread() benefit from some cacheing effect
142+
# that I don't fully understand except to know that subsequent runs
143+
# are 'artificially' faster.
144+
# And using file.size() will be innaccurate if files are a mix of
145+
# compressed and uncompressed files.
146+
# Initalise `loci_dt` using the first file.
147+
if (verbose) {
148+
message("[.constructFWGRangesFromBismarkFiles] Extracting loci from ",
149+
"'", files[1L], "'")
150+
}
151+
loci_from_first_file <- .readBismarkAsFWGRanges(
152+
file = files[[1L]],
153+
rmZeroCov = rmZeroCov,
154+
strandCollapse = strandCollapse,
155+
nThread = nThread,
156+
verbose = subverbose)
157+
# Identify loci not found in first file.
158+
# TODO: Pre-process loci as a GNCList?
159+
# Set number of tasks to ensure the progress bar gives frequent updates.
160+
# NOTE: The progress bar increments once per task
161+
# (https://github.com/Bioconductor/BiocParallel/issues/54).
162+
# Although it is somewhat of a bad idea to overrides a user-specified
163+
# bptasks(BPPARAM), the value of bptasks(BPPARAM) doesn't affect
164+
# performance in this instance, and so we opt for a useful progress
165+
# bar. Only SnowParam (and MulticoreParam by inheritance) have a
166+
# bptasks<-() method.
167+
if (is(BPPARAM, "SnowParam") && bpprogressbar(BPPARAM)) {
168+
bptasks(BPPARAM) <- length(files) - 1L
169+
}
170+
list_of_loci_from_other_files_not_in_first_file <- bplapply(
171+
files[-1L], function(file, loci_from_first_file) {
172+
# Read this file.
173+
loci_from_this_file <- .readBismarkAsFWGRanges(
174+
file = file,
175+
rmZeroCov = rmZeroCov,
176+
strandCollapse = strandCollapse,
177+
verbose = subverbose)
178+
subsetByOverlaps(
179+
x = loci_from_this_file,
180+
ranges = loci_from_first_file,
181+
type = "equal",
182+
invert = TRUE)
183+
}, loci_from_first_file = loci_from_first_file,
184+
BPPARAM = BPPARAM)
185+
# Identify unique FWGRanges.
186+
loci_non_found_in_first_file <- unique(
187+
do.call(c, list_of_loci_from_other_files_not_in_first_file))
188+
loci <- c(loci_from_first_file, loci_non_found_in_first_file)
189+
190+
# Sort the loci
191+
sort(sortSeqlevels(loci))
192+
}
193+
194+
27195
# NOTE: In brief benchmarking, readr::read_csv() is ~1.3-1.6x faster than
28196
# utils::read.delim() when reading a gzipped file, albeit it with ~1.6-2x
29197
# more total memory allocated. Therefore, there may be times users prefer
@@ -442,7 +610,7 @@ read.bismark <- function(files,
442610
message(
443611
"[read.bismark] Parsing files and constructing valid loci ...")
444612
}
445-
loci <- .contructFWGRangesFromBismarkFiles(
613+
loci <- .constructFWGRangesFromBismarkFiles(
446614
files = files,
447615
rmZeroCov = rmZeroCov,
448616
strandCollapse = strandCollapse,
@@ -477,7 +645,7 @@ read.bismark <- function(files,
477645
}
478646
ptime1 <- proc.time()
479647
# Construct loci with non-zero coverage in files.
480-
loci_from_files <- .contructFWGRangesFromBismarkFiles(
648+
loci_from_files <- .constructFWGRangesFromBismarkFiles(
481649
files = files,
482650
rmZeroCov = rmZeroCov,
483651
strandCollapse = strandCollapse,
@@ -534,7 +702,7 @@ read.bismark <- function(files,
534702
# .BSseq(se, trans = function(x) NULL, parameters = list())
535703
bsseq <- new2("BSseq", se, check = FALSE)
536704
if (!is.null(BACKEND) && BACKEND == "HDF5Array") {
537-
# NOTE: Save BSseq object; mimicing
705+
# NOTE: Save BSseq object; mimicing
538706
# HDF5Array::saveHDF5SummarizedExperiment().
539707
x <- bsseq
540708
x@assays <- HDF5Array::shorten_assay2h5_links(x@assays)

0 commit comments

Comments
 (0)
Please sign in to comment.