Merge pull request #7 from HDash/seacr_support

Add support for reading SEACR peak files
neurogenomics · Jun 13, 2024 · daa36c0 · daa36c0
2 parents 5e1a6ff + 7f94650
commit daa36c0
Show file tree

Hide file tree

Showing 7 changed files with 1,681 additions and 2 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -19,6 +19,7 @@ Imports:
   BSgenome,
   GenomicRanges,
   ggplot2,
+  IRanges,
   memes,
   rGADEM,
   rtracklayer,

diff --git a/NAMESPACE b/NAMESPACE
@@ -5,12 +5,15 @@ export(density_plot)
 export(motif_enrichment)
 export(read_motif_file)
 export(read_peak_file)
+export(read_peak_file_seacr)
 export(summit_to_motif)
 import(Biostrings)
 import(GenomicRanges)
+import(IRanges)
 import(ggplot2)
 import(universalmotif)
 importFrom(BSgenome,getSeq)
 importFrom(memes,runAme)
 importFrom(memes,runFimo)
 importFrom(rtracklayer,import)
+importFrom(utils,read.table)
diff --git a/R/read_peak_file_seacr.R b/R/read_peak_file_seacr.R
@@ -0,0 +1,53 @@
+#' Import a SEACR BED file as a GRanges object
+#'
+#' \code{read_peak_file()} reads a SEACR BED file as a GRanges object and adds
+#' an additional metadata column representing the position of the peak summit.
+#'
+#' @importFrom utils read.table
+#' @import IRanges
+#' @import GenomicRanges
+#'
+#' @param file_path Path to a \code{BED} file.
+#'
+#' @examples
+#' \dontrun{
+#' peak_file <- system.file("extdata",
+#'                          "ctcf_seacr.stringent.bed",
+#'                          package = "MotifStats"
+#'                          )
+#' peak_obj <- read_peak_file_seacr(file_path = peak_file)
+#' }
+#'
+#' @export
+read_peak_file_seacr <- function(file_path){
+  # Read BED file as table
+  obj <- utils::read.table(file_path, header = TRUE)
+  names(obj) <- c("chr",
+                  "start",
+                  "end",
+                  "total_signal",
+                  "max_signal",
+                  "max_signal_region")
+
+  # Create summit column
+  separated_cols <- strsplit(as.character(obj$max_signal_region), "[:|-]")
+  obj$max_chr <- sapply(separated_cols, function(x) x[1])
+  obj$max_start <- sapply(separated_cols, function(x) x[2])
+  obj$max_end <- sapply(separated_cols, function(x) x[3])
+  obj$summit <- floor((as.numeric(obj$max_start) + as.numeric(obj$max_end)) / 2)
+
+  # Create GRanges object
+  gr_ranges <- IRanges::IRanges(start = obj$start, end = obj$end)
+
+  gr_obj <- GenomicRanges::GRanges(
+    seqnames = obj$chr,
+    ranges = gr_ranges,
+    strand = "*"
+  )
+
+  GenomicRanges::mcols(gr_obj)$summit <- obj$summit
+  GenomicRanges::mcols(gr_obj)$name <- paste0("peak_", 1:nrow(obj))
+  names(gr_obj) <- GenomicRanges::mcols(gr_obj)$name
+
+  return(gr_obj)
+}
diff --git a/inst/extdata/ctcf_seacr.stringent.bed b/inst/extdata/ctcf_seacr.stringent.bed
diff --git a/man/read_peak_file_seacr.Rd b/man/read_peak_file_seacr.Rd
diff --git a/tests/testthat/test-read_peak_file_seacr.R b/tests/testthat/test-read_peak_file_seacr.R
@@ -0,0 +1,8 @@
+test_that("The output of read_peak_file is a GRanges object", {
+  peak_file <- system.file("extdata",
+                           "ctcf_seacr.stringent.bed",
+                           package = "MotifStats")
+  res <- read_peak_file_seacr(file_path = peak_file)
+
+  expect_s4_class(res, class = "GRanges")
+})
diff --git a/vignettes/MotifStats.Rmd b/vignettes/MotifStats.Rmd
@@ -97,6 +97,11 @@ Next, we load the narrowPeak files from MACS2/3[^f2].
 name_peaks <- read_peak_file("/path/to/peak/file.narrowPeak")
 ```
 
+Alternatively, we may load the BED files from SEACR[^f3].
+```{r eval = FALSE}
+name_peaks <- read_peak_file("/path/to/peak/file.bed")
+```
+
 For this example, we will load the built-in CTCF TIP-seq peaks (as `GRanges`
 object) and CREB motif (as `PWMatrix` object) data.
 ```{r include  = TRUE}
@@ -116,7 +121,7 @@ relative to a set of background sequences.
 - A 0-order background model with the same nucleotide composition as the input
 sequences is used to generate the background sequences. 
 - An additional `out_dir` argument can be used to specify the
-directory to save the AME output files[^f3] and the background model.  
+directory to save the AME output files[^f4] and the background model.  
 
 ```{r include  = TRUE}
 ctcf_read_prop <- motif_enrichment(
@@ -206,5 +211,7 @@ sessionInfo()
 [^f1]: The peak file is a subset of chromosome 19 to reduce the file size.
 [^f2]: narrowPeak files from both version 2 and 3 of [MACS: Model-based Analysis
 for ChIP-Seq](https://github.com/macs3-project/MACS) is supported.
-[^f3]: For more information on the output files, refer to the
+[^f3]: BED files from
+[SEACR: Sparse Enrichment Analysis for CUT&RUN](https://github.com/FredHutch/SEACR)
+[^f4]: For more information on the output files, refer to the
 [AME documentation](https://meme-suite.org/meme/doc/ame.html).