|
24 | 24 | }, character(1L))
|
25 | 25 | }
|
26 | 26 |
|
| 27 | + |
| 28 | +# TODO: Document that the default 'sort = TRUE' applies sort(sortSeqlevels()) |
| 29 | +# to the output. This is the default behaviour because it results in |
| 30 | +# the smallest returned object (albeit at the small cost of a sort). |
| 31 | +.readBismarkAsFWGRanges <- function(file, rmZeroCov = FALSE, |
| 32 | + strandCollapse = FALSE, sort = TRUE, |
| 33 | + nThread = 1L, verbose = FALSE) { |
| 34 | + # Argument checks ---------------------------------------------------------- |
| 35 | + |
| 36 | + stopifnot(isTRUEorFALSE(rmZeroCov)) |
| 37 | + stopifnot(isTRUEorFALSE(strandCollapse)) |
| 38 | + stopifnot(isTRUEorFALSE(sort)) |
| 39 | + |
| 40 | + # Quieten R CMD check about 'no visible binding for global variable' |
| 41 | + M <- U <- NULL |
| 42 | + |
| 43 | + # Read file to construct data.table of valid loci -------------------------- |
| 44 | + if (rmZeroCov) { |
| 45 | + dt <- .readBismarkAsDT( |
| 46 | + file = file, |
| 47 | + col_spec = "BSseq", |
| 48 | + check = TRUE, |
| 49 | + verbose = verbose) |
| 50 | + if (strandCollapse && !is.null(dt[["strand"]]) && |
| 51 | + !dt[, all(strand == "*")]) { |
| 52 | + # Shift loci on negative strand by 1 to the left and then remove |
| 53 | + # strand since no longer valid. |
| 54 | + dt[strand == "-", start := start - 1L][, strand := NULL] |
| 55 | + # Aggregate counts at loci with the same 'seqnames' and 'start'. |
| 56 | + dt <- dt[, |
| 57 | + list(M = sum(M), U = sum(U)), by = c("seqnames", "start")] |
| 58 | + } |
| 59 | + # Identify loci with non-zero coverage then drop 'M' and 'U' as no |
| 60 | + # longer required. |
| 61 | + dt <- dt[(M + U) > 0][, c("M", "U") := list(NULL, NULL)] |
| 62 | + } else { |
| 63 | + dt <- .readBismarkAsDT( |
| 64 | + file = file, |
| 65 | + col_spec = "GRanges", |
| 66 | + check = FALSE, |
| 67 | + nThread = nThread, |
| 68 | + verbose = verbose) |
| 69 | + if (strandCollapse && !is.null(dt[["strand"]]) && |
| 70 | + !dt[, all(strand == "*")]) { |
| 71 | + # Shift loci on negative strand by 1 to the left and then remove |
| 72 | + # strand since no longer valid. |
| 73 | + dt[strand == "-", start := start - 1L][, strand := NULL] |
| 74 | + dt <- data.table:::funique(dt) |
| 75 | + } |
| 76 | + } |
| 77 | + |
| 78 | + # Construct FWGRanges from 'dt' -------------------------------------------- |
| 79 | + |
| 80 | + # NOTE: Sorting results in a smaller FWGRanges object because the |
| 81 | + # 'seqnames' and 'strand' slots are more compressible in their Rle |
| 82 | + # representation. |
| 83 | + if (sort) { |
| 84 | + if (is.null(dt[["strand"]])) { |
| 85 | + setkey(dt, seqnames, start) |
| 86 | + } else { |
| 87 | + setkey(dt, seqnames, strand, start) |
| 88 | + } |
| 89 | + } |
| 90 | + seqnames <- Rle(dt[["seqnames"]]) |
| 91 | + dt[, seqnames := NULL] |
| 92 | + seqinfo <- Seqinfo(seqnames = levels(seqnames)) |
| 93 | + ranges <- .FWIRanges(start = dt[["start"]], width = 1L) |
| 94 | + dt[, start := NULL] |
| 95 | + mcols <- make_zero_col_DFrame(length(ranges)) |
| 96 | + if (is.null(dt[["strand"]])) { |
| 97 | + strand <- strand(Rle("*", length(seqnames))) |
| 98 | + } else { |
| 99 | + strand <- Rle(dt[["strand"]]) |
| 100 | + dt[, strand := NULL] |
| 101 | + } |
| 102 | + fwgranges <- .FWGRanges( |
| 103 | + seqnames = seqnames, |
| 104 | + ranges = ranges, |
| 105 | + strand = strand, |
| 106 | + seqinfo = seqinfo, |
| 107 | + elementMetadata = mcols) |
| 108 | + # NOTE: Final sort is to re-order with respect to sorted seqlevels. |
| 109 | + if (sort) { |
| 110 | + fwgranges <- sort(sortSeqlevels(fwgranges)) |
| 111 | + } |
| 112 | + fwgranges |
| 113 | +} |
| 114 | + |
| 115 | +# TODO: Document that this applies sort(sortSeqlevels()) to the output. It is |
| 116 | +# deliberate that there is no option to override this behaviour. |
| 117 | +.constructFWGRangesFromBismarkFiles <- function(files, |
| 118 | + rmZeroCov, |
| 119 | + strandCollapse, |
| 120 | + verbose, |
| 121 | + nThread, |
| 122 | + BPPARAM) { |
| 123 | + subverbose <- max(as.integer(verbose) - 1L, 0L) |
| 124 | + |
| 125 | + # TODO: Instead of using the 'largest' file, use the largest |
| 126 | + # 'cytosine report' file, which will have all loci in the |
| 127 | + # reference genome; provided all samples were aligned to the same |
| 128 | + # reference genome, this means it contains all loci. |
| 129 | + # TODO: Initialise using the 'largest' file (i.e. largest number of lines)? |
| 130 | + # Would like to do this without reading the data into memory. |
| 131 | + # Some benchmarks can be found at |
| 132 | + # https://gist.github.com/peterhurford/0d62f49fd43b6cf078168c043412f70a |
| 133 | + # My initial tests using /users/phickey/GTExScripts/FlowSortingProject/hdf5/extdata/methylation/nonCG/5248_BA9_neg_CHG_report.txt (32 GB) give: |
| 134 | + # wc -l: 77.000s |
| 135 | + # R.utils::readLines(): 1165.299s |
| 136 | + # nrow(fread(..., select = 1, nThread = 1)): 582.721s (359s re-run) |
| 137 | + # nrow(fread(..., select = 1, nThread = 10)): 82.029s |
| 138 | + # nrow(fread(..., select = 1, nThread = 40)): 81.408s |
| 139 | + # file.size(): 0.000s |
| 140 | + # Of course, fread() only works directly with non-[b]gzipped files. |
| 141 | + # And subsequent runs of fread() benefit from some cacheing effect |
| 142 | + # that I don't fully understand except to know that subsequent runs |
| 143 | + # are 'artificially' faster. |
| 144 | + # And using file.size() will be innaccurate if files are a mix of |
| 145 | + # compressed and uncompressed files. |
| 146 | + # Initalise `loci_dt` using the first file. |
| 147 | + if (verbose) { |
| 148 | + message("[.constructFWGRangesFromBismarkFiles] Extracting loci from ", |
| 149 | + "'", files[1L], "'") |
| 150 | + } |
| 151 | + loci_from_first_file <- .readBismarkAsFWGRanges( |
| 152 | + file = files[[1L]], |
| 153 | + rmZeroCov = rmZeroCov, |
| 154 | + strandCollapse = strandCollapse, |
| 155 | + nThread = nThread, |
| 156 | + verbose = subverbose) |
| 157 | + # Identify loci not found in first file. |
| 158 | + # TODO: Pre-process loci as a GNCList? |
| 159 | + # Set number of tasks to ensure the progress bar gives frequent updates. |
| 160 | + # NOTE: The progress bar increments once per task |
| 161 | + # (https://github.com/Bioconductor/BiocParallel/issues/54). |
| 162 | + # Although it is somewhat of a bad idea to overrides a user-specified |
| 163 | + # bptasks(BPPARAM), the value of bptasks(BPPARAM) doesn't affect |
| 164 | + # performance in this instance, and so we opt for a useful progress |
| 165 | + # bar. Only SnowParam (and MulticoreParam by inheritance) have a |
| 166 | + # bptasks<-() method. |
| 167 | + if (is(BPPARAM, "SnowParam") && bpprogressbar(BPPARAM)) { |
| 168 | + bptasks(BPPARAM) <- length(files) - 1L |
| 169 | + } |
| 170 | + list_of_loci_from_other_files_not_in_first_file <- bplapply( |
| 171 | + files[-1L], function(file, loci_from_first_file) { |
| 172 | + # Read this file. |
| 173 | + loci_from_this_file <- .readBismarkAsFWGRanges( |
| 174 | + file = file, |
| 175 | + rmZeroCov = rmZeroCov, |
| 176 | + strandCollapse = strandCollapse, |
| 177 | + verbose = subverbose) |
| 178 | + subsetByOverlaps( |
| 179 | + x = loci_from_this_file, |
| 180 | + ranges = loci_from_first_file, |
| 181 | + type = "equal", |
| 182 | + invert = TRUE) |
| 183 | + }, loci_from_first_file = loci_from_first_file, |
| 184 | + BPPARAM = BPPARAM) |
| 185 | + # Identify unique FWGRanges. |
| 186 | + loci_non_found_in_first_file <- unique( |
| 187 | + do.call(c, list_of_loci_from_other_files_not_in_first_file)) |
| 188 | + loci <- c(loci_from_first_file, loci_non_found_in_first_file) |
| 189 | + |
| 190 | + # Sort the loci |
| 191 | + sort(sortSeqlevels(loci)) |
| 192 | +} |
| 193 | + |
| 194 | + |
27 | 195 | # NOTE: In brief benchmarking, readr::read_csv() is ~1.3-1.6x faster than
|
28 | 196 | # utils::read.delim() when reading a gzipped file, albeit it with ~1.6-2x
|
29 | 197 | # more total memory allocated. Therefore, there may be times users prefer
|
@@ -442,7 +610,7 @@ read.bismark <- function(files,
|
442 | 610 | message(
|
443 | 611 | "[read.bismark] Parsing files and constructing valid loci ...")
|
444 | 612 | }
|
445 |
| - loci <- .contructFWGRangesFromBismarkFiles( |
| 613 | + loci <- .constructFWGRangesFromBismarkFiles( |
446 | 614 | files = files,
|
447 | 615 | rmZeroCov = rmZeroCov,
|
448 | 616 | strandCollapse = strandCollapse,
|
@@ -477,7 +645,7 @@ read.bismark <- function(files,
|
477 | 645 | }
|
478 | 646 | ptime1 <- proc.time()
|
479 | 647 | # Construct loci with non-zero coverage in files.
|
480 |
| - loci_from_files <- .contructFWGRangesFromBismarkFiles( |
| 648 | + loci_from_files <- .constructFWGRangesFromBismarkFiles( |
481 | 649 | files = files,
|
482 | 650 | rmZeroCov = rmZeroCov,
|
483 | 651 | strandCollapse = strandCollapse,
|
@@ -534,7 +702,7 @@ read.bismark <- function(files,
|
534 | 702 | # .BSseq(se, trans = function(x) NULL, parameters = list())
|
535 | 703 | bsseq <- new2("BSseq", se, check = FALSE)
|
536 | 704 | if (!is.null(BACKEND) && BACKEND == "HDF5Array") {
|
537 |
| - # NOTE: Save BSseq object; mimicing |
| 705 | + # NOTE: Save BSseq object; mimicing |
538 | 706 | # HDF5Array::saveHDF5SummarizedExperiment().
|
539 | 707 | x <- bsseq
|
540 | 708 | x@assays <- HDF5Array::shorten_assay2h5_links(x@assays)
|
|
0 commit comments