diff --git a/pipeline_versions.txt b/pipeline_versions.txt index 745d09057e..0aafa27cf9 100644 --- a/pipeline_versions.txt +++ b/pipeline_versions.txt @@ -27,14 +27,14 @@ MultiSampleArrays 1.6.2 2024-08-02 ValidateChip 1.16.7 2024-11-04 Arrays 2.6.30 2024-11-04 AnnotationFiltration 1.2.7 2024-11-04 -Multiome 5.9.6 2025-01-21 +Multiome 5.10.0 2025-02-03 snm3C 4.0.4 2024-08-06 SlideSeq 3.4.8 2025-01-13 scATAC 1.3.2 2023-08-03 BuildIndices 4.0.0 2025-01-17 MultiSampleSmartSeq2 2.2.22 2024-09-11 Optimus 7.9.1 2025-01-13 -atac 2.6.0 2025-01-21 -PairedTag 1.10.0 2025-01-21 +atac 2.7.0 2025-02-03 +PairedTag 1.10.1 2025-02-03 SmartSeq2SingleSample 5.1.21 2024-09-11 MultiSampleSmartSeq2SingleNucleus 2.0.7 2025-01-13 diff --git a/pipelines/skylab/atac/atac.changelog.md b/pipelines/skylab/atac/atac.changelog.md index d47665f255..636f57ac71 100644 --- a/pipelines/skylab/atac/atac.changelog.md +++ b/pipelines/skylab/atac/atac.changelog.md @@ -1,3 +1,9 @@ +# 2.7.0 +2025-02-03 (Date of Last Commit) + +* Added an optional PeakCalling task +* Added a boolean variable peak_calling; default is false + # 2.6.0 2025-01-21 (Date of Last Commit) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index 9771e753cc..3c74822c93 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -28,6 +28,8 @@ workflow ATAC { # Option for running files with preindex Boolean preindex = false + # Option for running peak calling + Boolean peak_calling = false # BWA ref File tar_bwa_reference @@ -49,7 +51,7 @@ workflow ATAC { String adapter_seq_read3 = "TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG" } - String pipeline_version = "2.6.0" + String pipeline_version = "2.7.0" # Determine docker prefix based on cloud provider String gcr_docker_prefix = "us.gcr.io/broad-gotc-prod/" @@ -61,7 +63,7 @@ workflow ATAC { String cutadapt_docker = "cutadapt:1.0.0-4.4-1686752919" String samtools_docker = "samtools-dist-bwa:3.0.0" String upstools_docker = 
"upstools:1.0.0-2023.03.03-1704300311" - String snap_atac_docker = "snapatac2:1.1.0" + String snap_atac_docker = "snapatac2:2.0.0" # Make sure either 'gcp' or 'azure' is supplied as cloud_provider input. If not, raise an error if ((cloud_provider != "gcp") && (cloud_provider != "azure")) { @@ -71,7 +73,6 @@ workflow ATAC { } } - parameter_meta { read1_fastq_gzipped: "read 1 FASTQ file as input for the pipeline, contains read 1 of paired reads" read2_fastq_gzipped: "read 2 FASTQ file as input for the pipeline, contains the cellular barcodes corresponding to the reads in the read1 FASTQ and read 3 FASTQ" @@ -160,20 +161,32 @@ workflow ATAC { input_id = input_id } + if (peak_calling) { + call PeakCalling { + input: + output_base_name = input_id, + annotations_gtf = annotations_gtf, + metrics_h5ad = CreateFragmentFile.Snap_metrics, + chrom_sizes = chrom_sizes, + docker_path = docker_prefix + snap_atac_docker + } + } } - + File bam_aligned_output_atac = select_first([BBTag.bb_bam, BWAPairedEndAlignment.bam_aligned_output]) File fragment_file_atac = select_first([BB_fragment.fragment_file, CreateFragmentFile.fragment_file]) File fragment_file_index_atac = select_first([BB_fragment.fragment_file_index, CreateFragmentFile.fragment_file_index]) File snap_metrics_atac = select_first([BB_fragment.Snap_metrics,CreateFragmentFile.Snap_metrics]) File library_metrics = select_first([BB_fragment.atac_library_metrics, CreateFragmentFile.atac_library_metrics]) - + output { File bam_aligned_output = bam_aligned_output_atac File fragment_file = fragment_file_atac File fragment_file_index = fragment_file_index_atac File snap_metrics = snap_metrics_atac File library_metrics_file = library_metrics + File? cellbybin_h5ad_file = PeakCalling.cellbybin_h5ad + File? 
cellbypeak_h5ad_file = PeakCalling.cellbypeak_h5ad } } @@ -269,7 +282,6 @@ task GetNumSplits { } } - # trim read 1 and read 2 adapter sequeunce with cutadapt task TrimAdapters { input { @@ -513,7 +525,6 @@ task CreateFragmentFile { File bam File annotations_gtf File chrom_sizes - File annotations_gtf Boolean preindex Int disk_size = 500 Int mem_size = 64 @@ -536,7 +547,8 @@ task CreateFragmentFile { } command <<< - set -e pipefail + set -euo pipefail + set -x python3 <>> + + runtime { + docker: docker_path + disks: "local-disk ${disk_size} SSD" + memory: "${mem_size} GiB" + cpu: nthreads + } + + output { + File cellbybin_h5ad = "~{output_base_name}.cellbybin.h5ad" + File cellbypeak_h5ad = "~{output_base_name}.cellbypeak.h5ad" + } +} diff --git a/pipelines/skylab/multiome/Multiome.changelog.md b/pipelines/skylab/multiome/Multiome.changelog.md index c73565dbbc..2dd910c46a 100644 --- a/pipelines/skylab/multiome/Multiome.changelog.md +++ b/pipelines/skylab/multiome/Multiome.changelog.md @@ -1,3 +1,9 @@ +# 5.10.0 +2025-02-03 (Date of Last Commit) + +* Added an optional PeakCalling task to the ATAC workflow +* Added a boolean variable run_peak_calling to the Multiome pipeline; default is false + # 5.9.6 2025-01-21 (Date of Last Commit) diff --git a/pipelines/skylab/multiome/Multiome.wdl b/pipelines/skylab/multiome/Multiome.wdl index 1ed1108f07..d079e5c58a 100644 --- a/pipelines/skylab/multiome/Multiome.wdl +++ b/pipelines/skylab/multiome/Multiome.wdl @@ -7,7 +7,7 @@ import "../../../tasks/broad/Utilities.wdl" as utils workflow Multiome { - String pipeline_version = "5.9.6" + String pipeline_version = "5.10.0" input { String cloud_provider @@ -50,6 +50,8 @@ workflow Multiome { # CellBender Boolean run_cellbender = false + # Peak Calling + Boolean run_peak_calling = false } # Determine docker prefix based on cloud provider @@ -121,7 +123,8 @@ workflow Multiome { annotations_gtf = annotations_gtf, atac_nhash_id = atac_nhash_id, adapter_seq_read3 = adapter_seq_read3, - 
atac_expected_cells = expected_cells + atac_expected_cells = expected_cells, + peak_calling = run_peak_calling } call H5adUtils.JoinMultiomeBarcodes as JoinBarcodes { input: @@ -149,6 +152,8 @@ workflow Multiome { File fragment_file_index = JoinBarcodes.atac_fragment_tsv_index File snap_metrics_atac = JoinBarcodes.atac_h5ad_file File atac_library_metrics = Atac.library_metrics_file + File? cellbybin_h5ad_file = Atac.cellbybin_h5ad_file + File? cellbypeak_h5ad_file = Atac.cellbypeak_h5ad_file # optimus outputs File genomic_reference_version_gex = Optimus.genomic_reference_version diff --git a/pipelines/skylab/multiome/test_inputs/Plumbing/10k_pbmc_downsampled_peakcall.json b/pipelines/skylab/multiome/test_inputs/Plumbing/10k_pbmc_downsampled_peakcall.json new file mode 100644 index 0000000000..a567f330b8 --- /dev/null +++ b/pipelines/skylab/multiome/test_inputs/Plumbing/10k_pbmc_downsampled_peakcall.json @@ -0,0 +1,31 @@ +{ + "Multiome.annotations_gtf":"gs://gcp-public-data--broad-references/hg38/v0/star/v2_7_10a/modified_v43.annotation.gtf", + "Multiome.input_id":"10k_PBMC_downsampled", + "Multiome.cloud_provider":"gcp", + "Multiome.gex_r1_fastq":[ + "gs://broad-gotc-test-storage/Multiome/input/plumbing/fastq_R1_gex.fastq.gz" + ], + "Multiome.gex_r2_fastq":[ + "gs://broad-gotc-test-storage/Multiome/input/plumbing/fastq_R2_gex.fastq.gz" + ], + "Multiome.atac_r1_fastq":[ + "gs://broad-gotc-test-storage/Multiome/input/plumbing/fastq_R1_atac.fastq.gz" + ], + "Multiome.atac_r2_fastq":[ + "gs://broad-gotc-test-storage/Multiome/input/plumbing/fastq_R2_atac.fastq.gz" + ], + "Multiome.atac_r3_fastq":[ + "gs://broad-gotc-test-storage/Multiome/input/plumbing/fastq_R3_atac.fastq.gz" + ], + "Multiome.tar_bwa_reference":"gs://gcp-public-data--broad-references/hg38/v0/bwa/v2_2_1/bwa-mem2-2.2.1-Human-GENCODE-build-GRCh38.tar", + "Multiome.tar_star_reference":"gs://gcp-public-data--broad-references/hg38/v0/star/v2_7_10a/modified_star2.7.10a-Human-GENCODE-build-GRCh38-43.tar", + 
"Multiome.chrom_sizes":"gs://broad-gotc-test-storage/Multiome/input/hg38.chrom.sizes", + "Multiome.run_cellbender":"false", + "Multiome.Atac.cpu_platform_bwa":"Intel Cascade Lake", + "Multiome.Atac.num_threads_bwa":"16", + "Multiome.Atac.mem_size_bwa":"64", + "Multiome.soloMultiMappers":"Uniform", + "Multiome.gex_nhash_id":"example_1234", + "Multiome.atac_nhash_id":"example_1234", + "Multiome.run_peak_calling":"true" +} diff --git a/pipelines/skylab/paired_tag/PairedTag.changelog.md b/pipelines/skylab/paired_tag/PairedTag.changelog.md index d26db399ff..403d6ce5b5 100644 --- a/pipelines/skylab/paired_tag/PairedTag.changelog.md +++ b/pipelines/skylab/paired_tag/PairedTag.changelog.md @@ -1,10 +1,16 @@ +# 1.10.1 +2025-02-03 (Date of Last Commit) + +* Added an optional PeakCalling task to the ATAC workflow; this does not affect the outputs of the pipeline +* Added a boolean variable run_peak_calling to the Multiome pipeline; default is false and this does not affect the outputs of the pipeline + # 1.10.0 2025-01-21 (Date of Last Commit) * Added a boolean variable is_slidetags; default is false, but set to true if Slide-Tags pipeline is calling Optimus * Added reference_gtf_file to the output h5ad unstructured metadata * Added the fragment file CSI index as workflow output -* Updated the default STARsolo multimapping parameter to the "EM" tehcnique +* Updated the default STARsolo multimapping parameter to the EM technique # 1.9.0 2024-12-05 (Date of Last Commit) diff --git a/pipelines/skylab/paired_tag/PairedTag.wdl b/pipelines/skylab/paired_tag/PairedTag.wdl index fdf942bf90..05d1b74b96 100644 --- a/pipelines/skylab/paired_tag/PairedTag.wdl +++ b/pipelines/skylab/paired_tag/PairedTag.wdl @@ -8,7 +8,7 @@ import "../../../tasks/broad/Utilities.wdl" as utils workflow PairedTag { - String pipeline_version = "1.10.0" + String pipeline_version = "1.10.1" input { diff --git a/verification/test-wdls/TestMultiome.wdl b/verification/test-wdls/TestMultiome.wdl index 
00df0dd856..9479a9c031 100644 --- a/verification/test-wdls/TestMultiome.wdl +++ b/verification/test-wdls/TestMultiome.wdl @@ -51,6 +51,7 @@ workflow TestMultiome { String results_path Boolean update_truth Boolean run_cellbender = false + Boolean run_peak_calling = false } @@ -85,7 +86,8 @@ workflow TestMultiome { soloMultiMappers = soloMultiMappers, cloud_provider = cloud_provider, gex_nhash_id = gex_nhash_id, - atac_nhash_id = atac_nhash_id + atac_nhash_id = atac_nhash_id, + run_peak_calling = run_peak_calling } diff --git a/website/docs/Pipelines/ATAC/README.md b/website/docs/Pipelines/ATAC/README.md index 86d3eaab3e..0d5ee42458 100644 --- a/website/docs/Pipelines/ATAC/README.md +++ b/website/docs/Pipelines/ATAC/README.md @@ -65,6 +65,7 @@ The following describes the inputs of the ATAC workflow. For more details on how | adapter_seq_read3 | TrimAdapters input: Sequence adapter for read 3 fastq. | | vm_size | String defining the Azure virtual machine family for the workflow (default: "Standard_M128s"). | atac_nhash_id | String that represents an optional library aliquot identifier. When used, it is echoed in the h5ad unstructured data. | +| peak_calling | Optional boolean used to determine if the ATAC pipeline should run Peak Calling; default is "false". | ## ATAC tasks and tools @@ -86,7 +87,7 @@ To see specific tool parameters, select the task WDL link in the table; then vie | [TrimAdapters](https://github.com/broadinstitute/warp/blob/master/pipelines/skylab/atac/atac.wdl) | Cutadapt v4.4 | cutadapt | Trims adaptor sequences. | | [BWAPairedEndAlignment](https://github.com/broadinstitute/warp/blob/master/pipelines/skylab/atac/atac.wdl) | bwa-mem2 | mem | Aligns reads from each set of partitioned FASTQ files to the genome and outputs a BAM with ATAC barcodes in the CB:Z tag. 
| | [CreateFragmentFile](https://github.com/broadinstitute/warp/blob/master/pipelines/skylab/atac/atac.wdl) | make_fragment_file, import_data | SnapATAC2 | Generates a fragment file from the final aligned BAM and outputs per barcode quality metrics in h5ad. A detailed list of these metrics is found in the [ATAC Count Matrix Overview](./count-matrix-overview.md). | - +| [PeakCalling](https://github.com/broadinstitute/warp/blob/master/pipelines/skylab/atac/atac.wdl) | macs3 | SnapATAC2 | Generates two h5ad files (`cellbybin.h5ad` and `cellbypeak.h5ad`) from the CreateFragmentFile h5ad output file (`metrics.h5ad`). The `cellbybin.h5ad` contains the peaks called per cluster in the macs3 unstructured metadata and `cellbypeak.h5ad` contains the merged peaks and the count matrix of peaks per fragment. A detailed list of these metrics is found in the [ATAC Count Matrix Overview](./count-matrix-overview.md). | ## Output variables @@ -95,7 +96,9 @@ To see specific tool parameters, select the task WDL link in the table; then vie | bam_aligned_output | ``.bam | BAM containing aligned reads from ATAC workflow. | | fragment_file | ``.fragments.sorted.tsv.gz | Bgzipped TSV containing fragment start and stop coordinates per barcode. In order, the columns are "Chromosome", "Start", "Stop", "ATAC Barcode", and "Number Reads". | | snap_metrics | ``_`_library_metrics.csv | CSV file containing library-level metrics. Read more in the [Library Metrics Overview](library-metrics.md) + library_metrics | ``_``_library_metrics.csv | CSV file containing library-level metrics. Read more in the [Library Metrics Overview](library-metrics.md) | +| cellbybin_h5ad_file | `<input_id>`.cellbybin.h5ad | Optional h5ad (Anndata) containing peaks (called by MACS3) per cluster; produced only when `peak_calling` is true. [SnapATAC2](https://github.com/kaizhang/SnapATAC2). 
| +| cellbypeak_h5ad_file | `<input_id>`.cellbypeak.h5ad | Optional h5ad (Anndata), produced only when `peak_calling` is true, containing merged peaks (called by MACS3) per cluster and count matrix of insertion sites per peak and cell. 
[SnapATAC2](https://github.com/kaizhang/SnapATAC2).| ## Versioning and testing diff --git a/website/docs/Pipelines/BuildIndices_Pipeline/README.md b/website/docs/Pipelines/BuildIndices_Pipeline/README.md index 547975b821..04b2eea7f0 100644 --- a/website/docs/Pipelines/BuildIndices_Pipeline/README.md +++ b/website/docs/Pipelines/BuildIndices_Pipeline/README.md @@ -7,7 +7,7 @@ slug: /Pipelines/BuildIndices_Pipeline/README | Pipeline Version | Date Updated | Documentation Author | Questions or Feedback | | :----: | :---: | :----: | :--------------: | -| [BuildIndices_v3.0.0](https://github.com/broadinstitute/warp/releases) | December, 2023 | Kaylee Mathews | Please [file an issue in WARP](https://github.com/broadinstitute/warp/issues). | +| [BuildIndices_v4.0.0](https://github.com/broadinstitute/warp/releases) | January, 2025 | WARP Pipelines | Please [file an issue in WARP](https://github.com/broadinstitute/warp/issues). | ![BuildIndices_diagram](./buildindices_diagram.png) @@ -48,6 +48,11 @@ The BuildIndices pipeline can be deployed using [Cromwell](https://cromwell.read The BuildIndices workflow inputs are specified in JSON configuration files. Configuration files for [macaque](https://github.com/broadinstitute/warp/blob/master/pipelines/skylab/build_indices/Macaque.json) and [mouse](https://github.com/broadinstitute/warp/blob/master/pipelines/skylab/build_indices/Mouse.json) references can be found in the WARP repository. #### Input descriptions +The table below describes the input variables for the BuildIndices workflow. + +:::tip +Marmoset scripts expect a custom-modified input Marmoset GTF file and FASTA file. These inputs and accompanying README are located in a [public Google Drive](https://drive.google.com/drive/folders/15JcUhwOqkJwTVS8BOlA0yIdjh4RwJOdz) maintained by Mike Debardine from the BICAN consortium. 
+::: | Parameter name | Description | Type | | --- | --- | --- | @@ -64,7 +69,7 @@ The BuildIndices workflow inputs are specified in JSON configuration files. Conf Overall, the BuildIndices workflow: 1. Checks inputs, modifies reference files, and creates STAR index. 2. Calculates chromosome sizes. -3. Builds reference bundle for bwa. +3. Builds reference bundle for bwa-mem2. The tasks and tools used in the BuildIndices workflow are detailed in the table below. @@ -72,7 +77,7 @@ To see specific tool parameters, select the [workflow WDL link](https://github.c | Task name | Tool | Software | Description | | --- | --- | --- | --- | -| BuildStarSingleNucleus | [modify_gtf.py](https://github.com/broadinstitute/warp-tools/blob/develop/3rd-party-tools/build-indices/modify_gtf.py), STAR | [warp-tools](https://github.com/broadinstitute/warp-tools/tree/develop), [STAR](https://github.com/alexdobin/STAR) | Checks that the input GTF file contains input genome source, genome build version, and annotation version with correct build source information, modifies files for the STAR aligner, and creates STAR index file. | +| BuildStarSingleNucleus | [modify_gtf.py](https://github.com/broadinstitute/warp-tools/blob/develop/3rd-party-tools/build-indices/modify_gtf.py), STAR | [warp-tools](https://github.com/broadinstitute/warp-tools/tree/develop), [STAR](https://github.com/alexdobin/STAR) | Checks that the input GTF file contains input genome source, genome build version, and annotation version with correct build source information, modifies files for the STAR aligner, and creates STAR index file. If "Marmoset" is selected as organism, a [Marmoset-specific custom script](https://github.com/broadinstitute/warp-tools/blob/develop/3rd-party-tools/build-indices/modify_gtf_marmoset.py) is run to modify the GTF | | CalculateChromosomeSizes | faidx | [Samtools](http://www.htslib.org/) | Reads the genome FASTA file to create a FASTA index file that contains the genome chromosome sizes. 
| | BuildBWAreference | index | [bwa-mem2](https://github.com/bwa-mem2/bwa-mem2) | Builds the reference bundle for the bwa aligner. | @@ -84,7 +89,7 @@ The BuildStarSingleNucleus task reads the input GTF file and verifies that the ` **Modify reference files and create STAR index** -The BuildStarSingleNucleus task uses a custom python script, [`modify_gtf.py`](https://github.com/broadinstitute/warp-tools/blob/develop/3rd-party-tools/build-indices/modify_gtf.py), and a list of biotypes ([example](https://github.com/broadinstitute/warp-tools/blob/develop/3rd-party-tools/build-indices/Biotypes.tsv)) to filter the input GTF file for only the biotypes indicated in the list with the value “Y” in the second column. The defaults in the custom code produce reference outputs that are similar to those built with 10x Genomics reference scripts. +The BuildStarSingleNucleus task uses a custom python script, [`modify_gtf.py`](https://github.com/broadinstitute/warp-tools/blob/develop/3rd-party-tools/build-indices/modify_gtf.py) or, for Marmoset, [`modify_gtf_marmoset.py`](https://github.com/broadinstitute/warp-tools/blob/develop/3rd-party-tools/build-indices/modify_gtf_marmoset.py), and a list of biotypes ([example](https://github.com/broadinstitute/warp-tools/blob/develop/3rd-party-tools/build-indices/Biotypes.tsv)) to filter the input GTF file for only the biotypes indicated in the list with the value “Y” in the second column. The defaults in the custom code produce reference outputs that are similar to those built with 10x Genomics reference scripts. The task uses the filtered GTF file and STAR `--runMode genomeGenerate` to generate the index file for the STAR aligner. Outputs of the task include the modified GTF and compressed STAR index files. 
@@ -119,10 +124,127 @@ If you use the BuildIndices Pipeline in your research, please consider citing ou Degatano, K.; Awdeh, A.; Dingman, W.; Grant, G.; Khajouei, F.; Kiernan, E.; Konwar, K.; Mathews, K.; Palis, K.; Petrillo, N.; Van der Auwera, G.; Wang, C.; Way, J.; Pipelines, W. WDL Analysis Research Pipelines: Cloud-Optimized Workflows for Biological Data Processing and Reproducible Analysis. Preprints 2024, 2024012131. https://doi.org/10.20944/preprints202401.2131.v1 ## Consortia support -This pipeline is supported by the [BRAIN Initiative](https://braininitiative.nih.gov/) (BICCN and BICAN). +This pipeline is supported by the [BRAIN Initiative](https://braininitiative.nih.gov/) (BICCN and BICAN) and SCORCH. If your organization also uses this pipeline, we would like to list you! Please reach out to us by [filing an issue in WARP](https://github.com/broadinstitute/warp/issues). +## Example references +Example references are available in the Broad Public Reference bucket, a Google bucket that hosts reference files at no charge to the end-user. 
+ +### Human +| File Type | File Location | +|-------------------------|--------------| +| Genomics Reference | GRCh38, primary assembly (PRI) | +| Gene annotation (PRI) | GENCODE Release 43 GRCh38.p13 | +| Reference README | `gs://gcp-public-data--broad-references/hg38/v0/star/v2_7_10a/v43_README.txt` | +| STAR Index TAR | `gs://gcp-public-data--broad-references/hg38/v0/star/v2_7_10a/modified_star2.7.10a-Human-GENCODE-build-GRCh38-43.tar` | +| STAR Annotation GTF | `gs://gcp-public-data--broad-references/hg38/v0/star/v2_7_10a/modified_v43.annotation.gtf` | +| BWA-MEM2 Index TAR | `gs://gcp-public-data--broad-references/hg38/v0/bwa/v2_2_1/bwa-mem2-2.2.1-Human-GENCODE-build-GRCh38.tar` | +| Chromosome Sizes | `gs://gcp-public-data--broad-references/hg38/v0/bwa/v2_2_1/chrom.sizes` | + +### Mouse + +| File Type | File Location | +|-------------------------|--------------| +| Genomics Reference | GRCm39, primary assembly (PRI) | +| Gene annotation (PRI) | GENCODE Release M32 | +| Reference README | `gs://gcp-public-data--broad-references/GRCm39/star/v2_7_10a/M32_README.txt` | +| STAR Index TAR | `gs://gcp-public-data--broad-references/GRCm39/star/v2_7_10a/modified_star2.7.10a-Mouse-GENCODE-build-GRCm39-M32.tar` | +| STAR Annotation GTF | `gs://gcp-public-data--broad-references/GRCm39/star/v2_7_10a/modified_vM32.annotation.gtf` | +| BWA-MEM2 Index TAR | `gs://gcp-public-data--broad-references/GRCm39/bwa/v2_2_1/bwa-mem2-2.2.1-Mouse-GENCODE-build-GRCm39.tar` | +| Chromosome Sizes | `gs://gcp-public-data--broad-references/GRCm39/bwa/v2_2_1/chrom.sizes` | + +### Macaque +Inputs for the Macaque reference below were modified using a custom tool to handle nuclear mitochondrial inserts, [numty-dumpty](https://github.com/nkschaefer/numty-dumpty). 
See the README for the [STAR index](https://storage.cloud.google.com/gcp-public-data--broad-references/M.mulatta/Mmul_10/star/v2_7_10a/numty_dumpty/README_STAR.txt) and the +[bwa-mem2 index](https://storage.cloud.google.com/gcp-public-data--broad-references/M.mulatta/Mmul_10/bwa/v2_2_1/numty_dumpty/README_BWA.txt). + +| File Type | File Location | +|---------------------|--------------| +| Genomics Reference | mmul10 | +| Gene annotation | RefSeq annotation version 103 | +| STAR Index TAR | `gs://gcp-public-data--broad-references/M.mulatta/Mmul_10/star/v2_7_10a/numty_dumpty/numt_modified_star2.7.10a-Macaque-NCBI-build-GCF_003339765.1-103.tar` | +| BWA Index TAR | `gs://gcp-public-data--broad-references/M.mulatta/Mmul_10/bwa/v2_2_1/numty_dumpty/numt_bwa-mem2-2.2.1-Macaque-NCBI-build-GCF_003339765.1.tar` | +| GTF Annotation | `gs://gcp-public-data--broad-references/M.mulatta/Mmul_10/star/v2_7_10a/numty_dumpty/numt_modified_v103.annotation.gtf` | +| Chromosome Sizes | `gs://gcp-public-data--broad-references/M.mulatta/Mmul_10/bwa/v2_2_1/numty_dumpty/numt_chrom.sizes` | + + +This macaque reference works with the Optimus, Multiome, and Paired-tag workflows. However, mitochondrial genes are not demarcated with an "mt-" tag. A separate text file with MT genes is required. An example is the list below: + +``` +ND1 +ND2 +COX1 +COX2 +ATP8 +ATP6 +COX3 +ND3 +ND4L +ND4 +ND5 +ND6 +CYTB +``` + +An example file with this list is located in a public Google bucket here: `gs://warp-testing-public/references/BuildIndices_outs/Macaque_MT_genes.txt` + +### Marmoset +Marmoset scripts expect a custom-modified input Marmoset GTF file. These inputs and accompanying README are located in a [public Google Drive](https://drive.google.com/drive/folders/15JcUhwOqkJwTVS8BOlA0yIdjh4RwJOdz) maintained by Mike Debardine from the BICAN consortium. 
+ + | File Type | File Location | +|---------------------|--------------| +| Genomics Reference | mCalJa1.2.pat.X (GenBank Accession GCA_011100555.2 and RefSeq Accession GCF_011100555.1) | +| Gene annotation | Custom (see note above table) | +| Chromosome Sizes | `gs://gcp-public-data--broad-references/mCalJa1/mCalJa1.2.pat.X/chrom.sizes` | +| GTF Annotation | `gs://gcp-public-data--broad-references/mCalJa1/mCalJa1.2.pat.X/modified_vGCF_011100555.1-RS_2023_03.annotation.gtf` | +| BWA-MEM2 Index TAR | `gs://gcp-public-data--broad-references/mCalJa1/mCalJa1.2.pat.X/bwa-mem2-2.2.1-Marmoset-RefSeq-build-mCalJa1.2.pat.X.tar` | +| STAR Index TAR | `gs://gcp-public-data--broad-references/mCalJa1/mCalJa1.2.pat.X/modified_star2.7.10a-Marmoset-RefSeq-build-mCalJa1.2.pat.X-GCF_011100555.1-RS_2023_03.tar` | + +### Armadillo +| File Type | File Location | +|---------------------|--------------| +| Genomic Reference | mDasNov1.hap2 (NCBI) | +| Gene Annotation | RefSeq GCF_030445035.1-RS_2023_07 | +| BWA-MEM2 Index TAR | `gs://gcp-public-data--broad-references/D.novemcinctus/mDasNov1.hap2/cleanome/bwa/v2_2_1/bwa-mem2-2.2.1-Armadillo-NCBI-build-mDasNov1.hap2.tar` | +| Chromosome Sizes | `gs://gcp-public-data--broad-references/D.novemcinctus/mDasNov1.hap2/cleanome/bwa/v2_2_1/chrom.sizes` | +| STAR Index TAR | `gs://gcp-public-data--broad-references/D.novemcinctus/mDasNov1.hap2/cleanome/star/v2_7_10a/modified_star2.7.10a-Armadillo-NCBI-build-mDasNov1.hap2-2.2.tar` | +| GTF Annotation | `gs://gcp-public-data--broad-references/D.novemcinctus/mDasNov1.hap2/cleanome/star/v2_7_10a/modified_v2.2.annotation.gtf` | + +### Opossum + +| File Type | File Location | +|---------------------|--------------| +| Genomic Reference | mMonDom1.pri (NCBI) | +| Gene Annotation | RefSeq GCF_027887165.1-RS_2023_05 (RefSeq link) | +| BWA-MEM2 Index TAR | `gs://gcp-public-data--broad-references/M.domestica/mMonDom1.pri/cleanome/bwa/v2_2_1/bwa-mem2-2.2.1-Opossum-NCBI-build-mMonDom1.pri.tar` | +| Chromosome 
Sizes | `gs://gcp-public-data--broad-references/M.domestica/mMonDom1.pri/cleanome/bwa/v2_2_1/chrom.sizes` | +| STAR Index TAR | `gs://gcp-public-data--broad-references/M.domestica/mMonDom1.pri/cleanome/star/v2_7_10a/modified_star2.7.10a-Opossum-NCBI-build-mMonDom1.pri-2.2.tar` | +| GTF Annotation | `gs://gcp-public-data--broad-references/M.domestica/mMonDom1.pri/cleanome/star/v2_7_10a/modified_v2.2.annotation.gtf` | + +### Rat + +| File Type | File Location | +|---------------------|--------------| +| Genomic Reference | mRatBN7.2 (NCBI) | +| Gene Annotation | RefSeq GCF_015227675.2-RS_2023_06 | +| BWA-MEM2 Index TAR | `gs://gcp-public-data--broad-references/R.norvegicus/mRatBN7.2/cleanome/bwa/v2_2_1/bwa-mem2-2.2.1-Rat-NCBI-build-mRatBN7.2.tar` | +| Chromosome Sizes | `gs://gcp-public-data--broad-references/R.norvegicus/mRatBN7.2/cleanome/bwa/v2_2_1/chrom.sizes` | +| STAR Index TAR | `gs://gcp-public-data--broad-references/R.norvegicus/mRatBN7.2/cleanome/star/v2_7_10a/modified_star2.7.10a-Rat-NCBI-build-mRatBN7.2-2.2.tar` | +| GTF Annotation | `gs://gcp-public-data--broad-references/R.norvegicus/mRatBN7.2/cleanome/star/v2_7_10a/modified_v2.2.annotation.gtf` | + +### Pig +| File Type | File Location | +|---------------------|--------------| +| Genomic Reference | Sscrofa11.1 (NCBI) | +| Gene Annotation | NCBI Annotation Release 106 (RefSeq GCF_000003025.6_Sscrofa11.1) | +| BWA-MEM2 Index TAR | `gs://gcp-public-data--broad-references/S.scrofa/Sscrofa11.1/cleanome/bwa/v2_2_1/bwa-mem2-2.2.1-Pig-NCBI-build-Sscrofa11.1.tar` | +| Chromosome Sizes | `gs://gcp-public-data--broad-references/S.scrofa/Sscrofa11.1/cleanome/bwa/v2_2_1/chrom.sizes` | +| STAR Index TAR | `gs://gcp-public-data--broad-references/S.scrofa/Sscrofa11.1/cleanome/star/v2_7_10a/modified_star2.7.10a-Pig-NCBI-build-Sscrofa11.1-2.2.tar` | +| GTF Annotation | `gs://gcp-public-data--broad-references/S.scrofa/Sscrofa11.1/cleanome/star/v2_7_10a/modified_v2.2.annotation.gtf` | + + ## Feedback -Please help us make 
our tools better by [filing an issue in WARP](https://github.com/broadinstitute/warp/issues) for pipeline-related suggestions or questions. \ No newline at end of file +Please help us make our tools better by [filing an issue in WARP](https://github.com/broadinstitute/warp/issues) for pipeline-related suggestions or questions. + diff --git a/website/docs/Pipelines/Multiome_Pipeline/README.md b/website/docs/Pipelines/Multiome_Pipeline/README.md index 131a81aca5..2952e36c0c 100644 --- a/website/docs/Pipelines/Multiome_Pipeline/README.md +++ b/website/docs/Pipelines/Multiome_Pipeline/README.md @@ -7,7 +7,7 @@ slug: /Pipelines/Multiome_Pipeline/README | Pipeline Version | Date Updated | Documentation Author | Questions or Feedback | | :----: | :---: | :----: | :--------------: | -| [Multiome v5.9.1](https://github.com/broadinstitute/warp/releases) | November, 2024 | WARP Pipelines | Please [file an issue in WARP](https://github.com/broadinstitute/warp/issues). | +| [Multiome v5.10.0](https://github.com/broadinstitute/warp/releases) | February, 2025 | WARP Pipelines | Please [file an issue in WARP](https://github.com/broadinstitute/warp/issues). | ![Multiome_diagram](./multiome_diagram.png) @@ -82,6 +82,7 @@ Multiome can be deployed using [Cromwell](https://cromwell.readthedocs.io/en/sta | adapter_seq_read1 | Optional string describing the adapter sequence for ATAC read 1 paired-end reads to be used during adapter trimming with Cutadapt; default is "GTCTCGTGGGCTCGGAGATGTGTATAAGAGACAG". | String | | adapter_seq_read3 | Optional string describing the adapter sequence for ATAC read 2 paired-end reads to be used during adapter trimming with Cutadapt; default is "TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG". | String | | run_cellbender | Optional boolean used to determine if the Optimus (GEX) pipeline should run CellBender on the output gene expression h5ad file, `h5ad_output_file_gex`; default is "false". 
| Boolean | +| run_peak_calling | Optional boolean used to determine if the ATAC pipeline should run Peak Calling; default is "false". | Boolean | | vm_size | String defining the Azure virtual machine family for the workflow (default: "Standard_M128s"). | String | diff --git a/website/docs/Pipelines/Optimus_Pipeline/README.md b/website/docs/Pipelines/Optimus_Pipeline/README.md index 607c2b01a5..a8a3cdd98e 100644 --- a/website/docs/Pipelines/Optimus_Pipeline/README.md +++ b/website/docs/Pipelines/Optimus_Pipeline/README.md @@ -7,7 +7,7 @@ slug: /Pipelines/Optimus_Pipeline/README | Pipeline Version | Date Updated | Documentation Author | Questions or Feedback | | :----: | :---: | :----: | :--------------: | -| [optimus_v7.8.0](https://github.com/broadinstitute/warp/releases?q=optimus&expanded=true) | October, 2024 | WARP Pipelines | Please [file an issue in WARP](https://github.com/broadinstitute/warp/issues) | +| [optimus_v7.9.1](https://github.com/broadinstitute/warp/releases?q=optimus&expanded=true) | January, 2025 | WARP Pipelines | Please [file an issue in WARP](https://github.com/broadinstitute/warp/issues) | ![Optimus_diagram](Optimus_diagram.png) @@ -108,6 +108,7 @@ The example configuration files also contain metadata for the reference files, d | emptydrops_lower | UMI threshold for emptyDrops detection; default is 100. | N/A | | count_exons | Boolean indicating if the workflow should calculate exon counts **when in single-nucleus (sn_rna) mode**. If true, this option will output an additional layer for the h5ad file. By default, it is set to "false". If the parameter is true and used with sc_rnamode, the workflow will return an error. | "true" or "false" (default) | | gex_expected_cells | Optional integer input for the expected number of cells, which is used calculate library-level metrics. The default is set to 3,000. 
| N/A | +| run_cellbender | Optional boolean used to determine if the Optimus (GEX) pipeline should run CellBender on the output gene expression h5ad file, `h5ad_output_file_gex`; default is "false". | Boolean | #### Pseudogene handling The example Optimus reference files are downloaded directly from GENCODE (see Quickstart table) and are not modified to remove pseudogenes. This is in contrast to the [references created for Cell Ranger](https://support.10xgenomics.com/single-cell-multiome-atac-gex/software/release-notes/references#header) which remove pseudogenes and small RNAs. @@ -148,6 +149,7 @@ To see specific tool parameters, select the task WDL link in the table; then vie | [Metrics.CalculateCellMetrics (alias = CellMetrics)](https://github.com/broadinstitute/warp/blob/master/tasks/skylab/Metrics.wdl) | TagSort | [warp-tools](https://github.com/broadinstitute/warp-tools) | Sorts the BAM file by cell using the cell barcode (CB), molecule barcode (UB) and gene ID (GX) tags and computes cell metrics. | | [RunEmptyDrops.RunEmptyDrops](https://github.com/broadinstitute/warp/blob/master/tasks/skylab/RunEmptyDrops.wdl) | npz2rds.sh, emptyDropsWrapper.R, emptyDrops | [DropletUtils](https://bioconductor.org/packages/release/bioc/html/DropletUtils.html) | Runs custom scripts to convert the NPY and NPZ files to RDS and then uses emptyDrops to identify empty lipid droplets. This step only runs when `counting_mode` = "sc_rna".| | [H5adUtils.OptimusH5adGeneration](https://github.com/broadinstitute/warp/blob/master/tasks/skylab/H5adUtils.wdl) | create_h5ad_optimus.py | Python3 | Merges the gene counts, cell metrics, gene metrics, and emptyDrops data into a h5ad formatted cell-by-gene matrix. The h5ad contains exon counts when using sc_rna mode, and whole-gene counts when running in sn_rna mode. It optionally contains an additional layer for exon counts when running sn_rna mode with `exon_counts` set to true. 
| +| CellBender.run_cellbender_remove_background_gpu as CellBender ([WDL](https://raw.githubusercontent.com/broadinstitute/CellBender/v0.3.0/wdl/cellbender_remove_background.wdl)) | cellbender remove-background | [CellBender](https://github.com/broadinstitute/CellBender) | Optional task that runs the `cellbender_remove_background.wdl` WDL script directly from the [CellBender GitHub repository](https://github.com/broadinstitute/CellBender/tree/master); it runs only when the input `run_cellbender` is "true". | More information about the different tags used to flag the data can be found in the [Bam_tags documentation](./Bam_tags.md). @@ -238,8 +240,10 @@ You can determine which type of counts are in the h5ad by looking at the global For sn_rna mode, you can also access whole transcript and exonic counts using AnnData alyers `layers()` method. For example, adata.layers[“exon_counts”]` will return the exonic counts from the output h5ad. +#### 9. Optional: Run CellBender +This task runs only when the `run_cellbender` input is set to "true". CellBender is a tool for removing background UMIs, which helps to flag empty droplets. Learn more in the [CellBender documentation](https://cellbender.readthedocs.io/en/latest/). -#### 9. Outputs +#### 10. Outputs Output files of the pipeline include: @@ -269,6 +273,14 @@ The following table lists the output files produced from the pipeline. For sampl | cell_calls | empty_drops_result.csv | emptyDrops results from the RunEmptyDrops task. | CSV | | h5ad_output_file | `.h5ad` | h5ad file with count data (exonic or whole transcript depending on the counting_mode) and metadata. 
| H5AD | | mtx_files | `.mtx_files.tar` | TAR file with STARsolo matrix market files (barcodes.tsv, features.tsv, and matrix.mtx) | TAR | +| cell_barcodes_csv | `` | Optional output produced when `run_cellbender` is "true"; see CellBender [documentation](https://cellbender.readthedocs.io/en/latest/usage/index.html) and [GitHub repository](https://github.com/broadinstitute/CellBender/tree/master) for more information.| +| checkpoint_file | `` | Optional output produced when `run_cellbender` is "true"; see CellBender [documentation](https://cellbender.readthedocs.io/en/latest/usage/index.html) and [GitHub repository](https://github.com/broadinstitute/CellBender/tree/master) for more information. | +| h5_array | `` | Optional output produced when `run_cellbender` is "true"; see CellBender [documentation](https://cellbender.readthedocs.io/en/latest/usage/index.html) and [GitHub repository](https://github.com/broadinstitute/CellBender/tree/master) for more information. | +| html_report_array | `` | Optional output produced when `run_cellbender` is "true"; see CellBender [documentation](https://cellbender.readthedocs.io/en/latest/usage/index.html) and [GitHub repository](https://github.com/broadinstitute/CellBender/tree/master) for more information. | +| log | `` | Optional output produced when `run_cellbender` is "true"; see CellBender [documentation](https://cellbender.readthedocs.io/en/latest/usage/index.html) and [GitHub repository](https://github.com/broadinstitute/CellBender/tree/master) for more information. | +| metrics_csv_array | `` | Optional output produced when `run_cellbender` is "true"; see CellBender [documentation](https://cellbender.readthedocs.io/en/latest/usage/index.html) and [GitHub repository](https://github.com/broadinstitute/CellBender/tree/master) for more information. 
| +| output_directory | `` | Optional output produced when `run_cellbender` is "true"; see CellBender [documentation](https://cellbender.readthedocs.io/en/latest/usage/index.html) and [GitHub repository](https://github.com/broadinstitute/CellBender/tree/master) for more information. | +| summary_pdf | `` | Optional output produced when `run_cellbender` is "true"; see CellBender [documentation](https://cellbender.readthedocs.io/en/latest/usage/index.html) and [GitHub repository](https://github.com/broadinstitute/CellBender/tree/master) for more information. | The h5ad matrix is the default output. This matrix contains the unnormalized (unfiltered), UMI-corrected count matrices, as well as the gene and cell metrics detailed in the [Optimus Count Matrix Overview](./Loom_schema.md). diff --git a/website/docs/Pipelines/PairedTag_Pipeline/README.md b/website/docs/Pipelines/PairedTag_Pipeline/README.md index 3970472fc1..460326d32b 100644 --- a/website/docs/Pipelines/PairedTag_Pipeline/README.md +++ b/website/docs/Pipelines/PairedTag_Pipeline/README.md @@ -6,7 +6,7 @@ slug: /Pipelines/PairedTag_Pipeline/README | Pipeline Version | Date Updated | Documentation Author | Questions or Feedback | |:---:| :---: | :---: | :---: | -| [PairedTag_v1.8.2](https://github.com/broadinstitute/warp/releases) | November, 2024 | WARP Pipelines | Please [file an issue in WARP](https://github.com/broadinstitute/warp/issues). | +| [PairedTag_v1.10.0](https://github.com/broadinstitute/warp/releases) | January, 2025 | WARP Pipelines | Please [file an issue in WARP](https://github.com/broadinstitute/warp/issues). | ![pairedtag_diagram](pairedtag_diagram.png) @@ -76,7 +76,7 @@ The Paired-Tag workflow inputs are specified in JSON configuration files. Exampl | star_strand_mode | Optional string for the Optimus (GEX) pipeline for performing STARsolo alignment on forward stranded, reverse stranded, or unstranded data; default is "Forward". 
| String | | count_exons | Optional boolean for the Optimus (GEX) pipeline indicating if the workflow should calculate exon counts **when in single-nucleus (sn_rna) mode**; if "true" in sc_rna mode, the workflow will return an error; default is "false". | Boolean | | gex_whitelist | Optional file containing the list of valid barcodes for 10x multiome GEX data; default is "gs://gcp-public-data--broad-references/RNA/resources/arc-v1/737K-arc-v1_gex.txt". | File | -| soloMultiMappers | Optional string describing whether or not the Optimus (GEX) pipeline should run STARsolo with the `--soloMultiMappers` flag; default is "Uniform". | String | +| soloMultiMappers | Optional string describing whether or not the Optimus (GEX) pipeline should run STARsolo with the `--soloMultiMappers` flag; default is "EM". | String | | atac_r1_fastq | Array of read 1 paired-end FASTQ files representing a single paired-tag DNA library. | Array[File] | | atac_r2_fastq | Array of barcodes FASTQ files representing a single paired-tag DNA library. | Array[File] | | atac_r3_fastq | Array of read 2 paired-end FASTQ files representing a single paired-tag DNA library. | Array[File] | @@ -120,8 +120,8 @@ The Paired-Tag workflow calls two WARP subworkflows and an additional task which | library_metrics | `_gex__library_metrics.csv` | Optional CSV file containing all library-level metrics calculated with STARsolo for gene expression data. | | atac_library_final | `_atac__library_metrics` | CSV file containing all the library-level metrics calucalted by SnapATAC2. | | cloud_provider | N/A | String describing the cloud provider that should be used to run the workflow; value should be "gcp" or "azure". 
| -| multimappers_EM_matrix | `UniqueAndMult-EM.mtx` | Optional output produced when `soloMultiMappers` is "EM"; see STARsolo [documentation](https://github.com/alexdobin/STAR/blob/master/docs/STARsolo.md#multi-gene-reads) for more information.| -| multimappers_Uniform_matrix | `UniqueAndMult-Uniform.mtx` | Optional output produced when `soloMultiMappers` is "Uniform" (default); see STARsolo [documentation](https://github.com/alexdobin/STAR/blob/master/docs/STARsolo.md#multi-gene-reads) for more information.| +| multimappers_EM_matrix | `UniqueAndMult-EM.mtx` | Optional output produced when `soloMultiMappers` is "EM" (default); see STARsolo [documentation](https://github.com/alexdobin/STAR/blob/master/docs/STARsolo.md#multi-gene-reads) for more information.| +| multimappers_Uniform_matrix | `UniqueAndMult-Uniform.mtx` | Optional output produced when `soloMultiMappers` is "Uniform"; see STARsolo [documentation](https://github.com/alexdobin/STAR/blob/master/docs/STARsolo.md#multi-gene-reads) for more information.| | multimappers_Rescue_matrix | `UniqueAndMult-Rescue.mtx` | Optional output produced when `soloMultiMappers` is "Rescue"; see STARsolo [documentation](https://github.com/alexdobin/STAR/blob/master/docs/STARsolo.md#multi-gene-reads) for more information. | | multimappers_PropUnique_matrix | `UniqueAndMult-PropUnique.mtx` | Optional output produced when `soloMultiMappers` is "PropUnique"; see STARsolo [documentation](https://github.com/alexdobin/STAR/blob/master/docs/STARsolo.md#multi-gene-reads) for more information.|