Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/np_jw_test_illumina_genotyping_a…
Browse files Browse the repository at this point in the history
…rrays' into np_jw_test_illumina_genotyping_arrays
  • Loading branch information
nikellepetrillo committed Feb 6, 2025
2 parents 9577456 + b3a2074 commit 7c096ec
Show file tree
Hide file tree
Showing 14 changed files with 402 additions and 34 deletions.
6 changes: 3 additions & 3 deletions pipeline_versions.txt
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,14 @@ MultiSampleArrays 1.6.2 2024-08-02
ValidateChip 1.16.7 2024-11-04
Arrays 2.6.30 2024-11-04
AnnotationFiltration 1.2.7 2024-11-04
Multiome 5.9.6 2025-01-21
Multiome 5.10.0 2025-02-03
snm3C 4.0.4 2024-08-06
SlideSeq 3.4.8 2025-01-13
scATAC 1.3.2 2023-08-03
BuildIndices 4.0.0 2025-01-17
MultiSampleSmartSeq2 2.2.22 2024-09-11
Optimus 7.9.1 2025-01-13
atac 2.6.0 2025-01-21
PairedTag 1.10.0 2025-01-21
atac 2.7.0 2025-02-03
PairedTag 1.10.1 2025-02-03
SmartSeq2SingleSample 5.1.21 2024-09-11
MultiSampleSmartSeq2SingleNucleus 2.0.7 2025-01-13
6 changes: 6 additions & 0 deletions pipelines/skylab/atac/atac.changelog.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
# 2.7.0
2025-02-03 (Date of Last Commit)

* Added an optional PeakCalling task
* Added a boolean variable peak_calling; default is false

# 2.6.0
2025-01-21 (Date of Last Commit)

Expand Down
196 changes: 185 additions & 11 deletions pipelines/skylab/atac/atac.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ workflow ATAC {

# Option for running files with preindex
Boolean preindex = false
# Option for running peak calling
Boolean peak_calling = false

# BWA ref
File tar_bwa_reference
Expand All @@ -49,7 +51,7 @@ workflow ATAC {
String adapter_seq_read3 = "TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG"
}

String pipeline_version = "2.6.0"
String pipeline_version = "2.7.0"

# Determine docker prefix based on cloud provider
String gcr_docker_prefix = "us.gcr.io/broad-gotc-prod/"
Expand All @@ -61,7 +63,7 @@ workflow ATAC {
String cutadapt_docker = "cutadapt:1.0.0-4.4-1686752919"
String samtools_docker = "samtools-dist-bwa:3.0.0"
String upstools_docker = "upstools:1.0.0-2023.03.03-1704300311"
String snap_atac_docker = "snapatac2:1.1.0"
String snap_atac_docker = "snapatac2:2.0.0"

# Make sure either 'gcp' or 'azure' is supplied as cloud_provider input. If not, raise an error
if ((cloud_provider != "gcp") && (cloud_provider != "azure")) {
Expand All @@ -71,7 +73,6 @@ workflow ATAC {
}
}


parameter_meta {
read1_fastq_gzipped: "read 1 FASTQ file as input for the pipeline, contains read 1 of paired reads"
read2_fastq_gzipped: "read 2 FASTQ file as input for the pipeline, contains the cellular barcodes corresponding to the reads in the read1 FASTQ and read 3 FASTQ"
Expand Down Expand Up @@ -160,20 +161,32 @@ workflow ATAC {
input_id = input_id

}
if (peak_calling) {
call PeakCalling {
input:
output_base_name = input_id,
annotations_gtf = annotations_gtf,
metrics_h5ad = CreateFragmentFile.Snap_metrics,
chrom_sizes = chrom_sizes,
docker_path = docker_prefix + snap_atac_docker
}
}
}

File bam_aligned_output_atac = select_first([BBTag.bb_bam, BWAPairedEndAlignment.bam_aligned_output])
File fragment_file_atac = select_first([BB_fragment.fragment_file, CreateFragmentFile.fragment_file])
File fragment_file_index_atac = select_first([BB_fragment.fragment_file_index, CreateFragmentFile.fragment_file_index])
File snap_metrics_atac = select_first([BB_fragment.Snap_metrics,CreateFragmentFile.Snap_metrics])
File library_metrics = select_first([BB_fragment.atac_library_metrics, CreateFragmentFile.atac_library_metrics])

output {
File bam_aligned_output = bam_aligned_output_atac
File fragment_file = fragment_file_atac
File fragment_file_index = fragment_file_index_atac
File snap_metrics = snap_metrics_atac
File library_metrics_file = library_metrics
File? cellbybin_h5ad_file = PeakCalling.cellbybin_h5ad
File? cellbypeak_h5ad_file = PeakCalling.cellbypeak_h5ad
}
}

Expand Down Expand Up @@ -269,7 +282,6 @@ task GetNumSplits {
}
}


# trim read 1 and read 2 adapter sequeunce with cutadapt
task TrimAdapters {
input {
Expand Down Expand Up @@ -513,7 +525,6 @@ task CreateFragmentFile {
File bam
File annotations_gtf
File chrom_sizes
File annotations_gtf
Boolean preindex
Int disk_size = 500
Int mem_size = 64
Expand All @@ -536,7 +547,8 @@ task CreateFragmentFile {
}

command <<<
set -e pipefail
set -euo pipefail
set -x

python3 <<CODE
Expand All @@ -559,6 +571,9 @@ task CreateFragmentFile {
# use snap atac2
import snapatac2.preprocessing as pp
import snapatac2 as snap
import scanpy as sc
import numpy as np
import polars as pl
import anndata as ad
from collections import OrderedDict
import csv
Expand All @@ -579,8 +594,7 @@ task CreateFragmentFile {
atac_percent_target = number_of_cells / expected_cells*100
print("Setting percent target in nested dictionary")
data['Cells']['atac_percent_target'] = atac_percent_target
# Flatten the dictionary
flattened_data = []
for category, metrics in data.items():
Expand Down Expand Up @@ -637,8 +651,168 @@ task CreateFragmentFile {
output {
File fragment_file = "~{input_id}.fragments.sorted.tsv.gz"
File fragment_file_index = "~{input_id}.fragments.sorted.tsv.gz.csi"
File Snap_metrics = "~{input_id}.metrics.h5ad"
File atac_library_metrics = "~{input_id}_~{atac_nhash_id}_library_metrics.csv"
}
}
# peak calling using SnapATAC2
task PeakCalling {
input {
File annotations_gtf
File metrics_h5ad
File chrom_sizes
String output_base_name
# SnapATAC2 parameters
Int min_counts = 5000
Int min_tsse = 10
Int max_counts = 100000
Float probability_threshold = 1
# Runtime attributes/docker
String docker_path
Int disk_size = 500
Int mem_size = 64
Int nthreads = 4
}
parameter_meta {
annotations_gtf: "GTF for SnapATAC2 to calculate TSS sites of fragment file."
disk_size: "Disk size used in create fragment file step."
mem_size: "The size of memory used in create fragment file."
docker_path: "The docker image path containing the runtime environment for this task"
}
command <<<
set -euo pipefail
set -x
python3 <<CODE
# use snap atac2
import snapatac2 as snap
import scanpy as sc
import numpy as np
import polars as pl
import pandas as pd
output_base_name = "~{output_base_name}"
atac_gtf = "~{annotations_gtf}"
metrics_h5ad = "~{metrics_h5ad}"
chrom_sizes = "~{chrom_sizes}"
min_counts = "~{min_counts}"
min_tsse = "~{min_tsse}"
max_counts = "~{max_counts}"
probability_threshold = "~{probability_threshold}"
probability_threshold = float(probability_threshold)
print("Peak calling starting...")
atac_data = snap.read(metrics_h5ad)
# Calculate and plot the size distribution of fragments
print("Calculating fragment size distribution")
snap.pl.frag_size_distr(atac_data)
print(atac_data)
# Filter cells
print("Filtering cells")
snap.pp.filter_cells(atac_data, min_counts=min_counts, min_tsse=min_tsse, max_counts=max_counts)
print(atac_data)
# Create a cell by bin matrix containing insertion counts across genome-wide 500-bp bins.
print("Creating cell by bin matrix")
atac_data_mod = snap.pp.add_tile_matrix(atac_data, inplace=False)
print("set obsm")
atac_data_mod.obsm["fragment_paired"] = atac_data.obsm["fragment_paired"]
print("set all uns")
for key in atac_data.uns.keys():
print("set ",key)
atac_data_mod.uns[key] = atac_data.uns[key]
print(atac_data_mod)
# Feature selection
print("Feature selection")
snap.pp.select_features(atac_data_mod)
print(atac_data_mod)
# Run customized scrublet algorithm to identify potential doublets
print("Run scrublet to identify potential doublets")
snap.pp.scrublet(atac_data_mod)
print(atac_data_mod)
# Employ spectral embedding for dimensionality reduction
print("Employ spectral embedding for dimensionality reduction")
snap.tl.spectral(atac_data_mod)
print(atac_data_mod)
# Filter doublets based on scrublet scores
print("Filter doublets based on scrublet scores")
snap.pp.filter_doublets(atac_data_mod, probability_threshold=probability_threshold)
print(atac_data_mod)
# Perform graph-based clustering to identify cell clusters.
# Build a k-nearest neighbour graph using snap.pp.knn
print("Perform knn graph-based clustering to identify cell clusters")
snap.pp.knn(atac_data_mod)
print(atac_data_mod)
# Use the Leiden community detection algorithm to identify densely-connected subgraphs/clusters in the graph
print("Use the Leiden community detection algorithm to identify densely-connected subgraphs/clusters in the graph")
snap.tl.leiden(atac_data_mod)
print(atac_data_mod)
# Create the cell by gene activity matrix
print("Create the cell by gene activity matrix")
gene_mat = snap.pp.make_gene_matrix(atac_data_mod, gene_anno=atac_gtf)
print(atac_data_mod)
# Normalize the gene matrix
print("Normalize the gene matrix")
gene_mat.obs['leiden'] = atac_data_mod.obs['leiden']
sc.pp.normalize_total(gene_mat)
sc.pp.log1p(gene_mat)
sc.tl.rank_genes_groups(gene_mat, groupby="leiden", method="wilcoxon")
for i in np.unique(gene_mat.obs['leiden']):
markers = sc.get.rank_genes_groups_df(gene_mat, group=i).head(7)['names']
print(f"Cluster {i}: {', '.join(markers)}")
print("Peak calling using MACS3")
snap.tl.macs3(atac_data_mod, groupby='leiden', n_jobs=1)
print("Merge peaks and create peak matrix")
# read chrom sizes
chromsize_dict = pd.read_csv(chrom_sizes, sep='\t', header=None)
chromsize_dict = pd.Series(chromsize_dict[1].values, index=chromsize_dict[0]).to_dict()
# merge peaks and create peak matrix
peaks = snap.tl.merge_peaks(atac_data_mod.uns['macs3'], chromsize_dict)
peak_matrix = snap.pp.make_peak_matrix(atac_data_mod, use_rep=peaks['Peaks'])
print("Convert pl.DataFrame to pandas DataFrame")
# Convert pl.DataFrame to pandas DataFrame
for key in atac_data_mod.uns.keys():
if isinstance(atac_data_mod.uns[key], pl.DataFrame):
print(key)
atac_data_mod.uns[key] = atac_data_mod.uns[key].to_pandas()
print("Write into h5ad file")
atac_data_mod.write_h5ad("~{output_base_name}.cellbybin.h5ad")
peak_matrix.write_h5ad("~{output_base_name}.cellbypeak.h5ad")
CODE
>>>
runtime {
docker: docker_path
disks: "local-disk ${disk_size} SSD"
memory: "${mem_size} GiB"
cpu: nthreads
}
output {
File cellbybin_h5ad = "~{output_base_name}.cellbybin.h5ad"
File cellbypeak_h5ad = "~{output_base_name}.cellbypeak.h5ad"
}
}
6 changes: 6 additions & 0 deletions pipelines/skylab/multiome/Multiome.changelog.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
# 5.10.0
2025-02-03 (Date of Last Commit)

* Added an optional PeakCalling task to the ATAC workflow
* Added a boolean variable run_peak_calling to the Multiome pipeline; default is false

# 5.9.6
2025-01-21 (Date of Last Commit)

Expand Down
9 changes: 7 additions & 2 deletions pipelines/skylab/multiome/Multiome.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ import "../../../tasks/broad/Utilities.wdl" as utils

workflow Multiome {

String pipeline_version = "5.9.6"
String pipeline_version = "5.10.0"

input {
String cloud_provider
Expand Down Expand Up @@ -50,6 +50,8 @@ workflow Multiome {

# CellBender
Boolean run_cellbender = false
# Peak Calling
Boolean run_peak_calling = false
}

# Determine docker prefix based on cloud provider
Expand Down Expand Up @@ -121,7 +123,8 @@ workflow Multiome {
annotations_gtf = annotations_gtf,
atac_nhash_id = atac_nhash_id,
adapter_seq_read3 = adapter_seq_read3,
atac_expected_cells = expected_cells
atac_expected_cells = expected_cells,
peak_calling = run_peak_calling
}
call H5adUtils.JoinMultiomeBarcodes as JoinBarcodes {
input:
Expand Down Expand Up @@ -149,6 +152,8 @@ workflow Multiome {
File fragment_file_index = JoinBarcodes.atac_fragment_tsv_index
File snap_metrics_atac = JoinBarcodes.atac_h5ad_file
File atac_library_metrics = Atac.library_metrics_file
File? cellbybin_h5ad_file = Atac.cellbybin_h5ad_file
File? cellbypeak_h5ad_file = Atac.cellbypeak_h5ad_file

# optimus outputs
File genomic_reference_version_gex = Optimus.genomic_reference_version
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
{
"Multiome.annotations_gtf":"gs://gcp-public-data--broad-references/hg38/v0/star/v2_7_10a/modified_v43.annotation.gtf",
"Multiome.input_id":"10k_PBMC_downsampled",
"Multiome.cloud_provider":"gcp",
"Multiome.gex_r1_fastq":[
"gs://broad-gotc-test-storage/Multiome/input/plumbing/fastq_R1_gex.fastq.gz"
],
"Multiome.gex_r2_fastq":[
"gs://broad-gotc-test-storage/Multiome/input/plumbing/fastq_R2_gex.fastq.gz"
],
"Multiome.atac_r1_fastq":[
"gs://broad-gotc-test-storage/Multiome/input/plumbing/fastq_R1_atac.fastq.gz"
],
"Multiome.atac_r2_fastq":[
"gs://broad-gotc-test-storage/Multiome/input/plumbing/fastq_R2_atac.fastq.gz"
],
"Multiome.atac_r3_fastq":[
"gs://broad-gotc-test-storage/Multiome/input/plumbing/fastq_R3_atac.fastq.gz"
],
"Multiome.tar_bwa_reference":"gs://gcp-public-data--broad-references/hg38/v0/bwa/v2_2_1/bwa-mem2-2.2.1-Human-GENCODE-build-GRCh38.tar",
"Multiome.tar_star_reference":"gs://gcp-public-data--broad-references/hg38/v0/star/v2_7_10a/modified_star2.7.10a-Human-GENCODE-build-GRCh38-43.tar",
"Multiome.chrom_sizes":"gs://broad-gotc-test-storage/Multiome/input/hg38.chrom.sizes",
"Multiome.run_cellbender":"false",
"Multiome.Atac.cpu_platform_bwa":"Intel Cascade Lake",
"Multiome.Atac.num_threads_bwa":"16",
"Multiome.Atac.mem_size_bwa":"64",
"Multiome.soloMultiMappers":"Uniform",
"Multiome.gex_nhash_id":"example_1234",
"Multiome.atac_nhash_id":"example_1234",
"Multiome.run_peak_calling":"true"
}
Loading

0 comments on commit 7c096ec

Please sign in to comment.