Skip to content

Commit

Permalink
Merge pull request #28 from broadinstitute/dp-spades
Browse files Browse the repository at this point in the history
update SPAdes and default to rnaviralspades settings
  • Loading branch information
dpark01 authored Dec 2, 2022
2 parents 0784b5a + 5e3983e commit ea463b6
Show file tree
Hide file tree
Showing 5 changed files with 523 additions and 74 deletions.
107 changes: 49 additions & 58 deletions assemble/spades.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def execute(self, args): # pylint: disable=W0221
subprocess.check_call(tool_cmd)

def assemble(self, reads_fwd, reads_bwd, contigs_out, reads_unpaired=None, contigs_trusted=None,
contigs_untrusted=None, kmer_sizes=(55,65), always_succeed=False, max_kmer_sizes=1,
contigs_untrusted=None, kmer_sizes=None, always_succeed=True,
filter_contigs=False, min_contig_len=0, mem_limit_gb=8, threads=None, spades_opts=''):
'''Assemble contigs from RNA-seq reads and (optionally) pre-existing contigs.
Expand All @@ -57,10 +57,9 @@ def assemble(self, reads_fwd, reads_bwd, contigs_out, reads_unpaired=None, conti
contigs_trusted (fasta/q): optionally, already-assembled contigs of high quality
contigs_untrusted (fasta/q): optionally, already-assembled contigs of average quality
Params:
kmer_sizes: if given, use these kmer sizes and combine the resulting contigs. kmer size of 0 or None means use size auto-selected
kmer_sizes: if given, use these comma-separated ascending kmer sizes--all values must be odd and < 127. kmer size of 0 or None means use size auto-selected
by SPAdes based on read length.
always_succeed: if True, if spades fails with an error for a kmer size, pretend it just produced no contigs for that kmer size
max_kmer_sizes: if this many kmer sizes succeed, do not try further ones
always_succeed: if True, if spades fails with an error, pretend it just produced no contigs
filter_contigs: if True, outputs only "long and reliable transcripts with rather high expression" (per SPAdes docs)
min_contig_len: drop contigs shorter than this many bp
mem_limit_gb: max memory to use, in gigabytes
Expand All @@ -83,60 +82,52 @@ def assemble(self, reads_fwd, reads_bwd, contigs_out, reads_unpaired=None, conti
if ((reads_fwd and reads_bwd and os.path.getsize(reads_fwd) > 0 and os.path.getsize(reads_bwd) > 0) or
(reads_unpaired and os.path.getsize(reads_unpaired) > 0)):

kmer_sizes_succeeded = 0
for kmer_size in util.misc.make_seq(kmer_sizes):

with util.file.tmp_dir('_spades') as spades_dir:
log.debug('spades_dir=' + spades_dir)
args = []
if reads_fwd and reads_bwd and os.path.getsize(reads_fwd) > 0 and os.path.getsize(reads_bwd) > 0:
args += ['-1', reads_fwd, '-2', reads_bwd ]
if reads_unpaired and os.path.getsize(reads_unpaired) > 0:
args += [ '--s1', reads_unpaired ]
if contigs_trusted: args += [ '--trusted-contigs', contigs_trusted ]
if contigs_untrusted: args += [ '--untrusted-contigs', contigs_untrusted ]
if kmer_size: args += [ '-k', kmer_size ]
if spades_opts: args += shlex.split(spades_opts)
args += [ '-m' + str(mem_limit_gb), '-t', str(threads), '-o', spades_dir ]

transcripts_fname = os.path.join(spades_dir, ('hard_filtered_' if filter_contigs else '') + 'transcripts.fasta')

try:
self.execute(args=args)
except Exception as e:
if always_succeed:
log.warning('SPAdes failed for k={}: {}'.format(kmer_size, e))
util.file.make_empty(transcripts_fname)
else:
raise

# work around the bug that spades may succeed yet not create the transcripts.fasta file
if not os.path.isfile(transcripts_fname):
msg = 'SPAdes failed to make transcripts.fasta for k={}'.format(kmer_size)
if always_succeed:
log.warning(msg)
util.file.make_empty(transcripts_fname)
else:
raise RuntimeError(msg)

if min_contig_len:
transcripts = Bio.SeqIO.parse(transcripts_fname, 'fasta')
transcripts_sans_short = [r for r in transcripts if len(r.seq) >= min_contig_len]
transcripts_fname = os.path.join(spades_dir, 'transcripts_over_{}bp.fasta'.format(min_contig_len))
Bio.SeqIO.write(transcripts_sans_short, transcripts_fname, 'fasta')

contigs_cumul = os.path.join(spades_dir, 'contigs_cumul.{}.fasta'.format(contigs_cumul_count))
contigs_cumul_count += 1

util.file.concat(inputFilePaths=(contigs_out, transcripts_fname), outputFilePath=contigs_cumul, append=True)
shutil.copyfile(contigs_cumul, contigs_out)

if os.path.getsize(transcripts_fname):
kmer_sizes_succeeded += 1
if kmer_sizes_succeeded >= max_kmer_sizes:
break
# end: with util.file.tmp_dir('_spades') as spades_dir
# end: for kmer_size in util.misc.make_seq(kmer_sizes)
with util.file.tmp_dir('_spades') as spades_dir:
log.debug('spades_dir=' + spades_dir)
args = []
if reads_fwd and reads_bwd and os.path.getsize(reads_fwd) > 0 and os.path.getsize(reads_bwd) > 0:
args += ['-1', reads_fwd, '-2', reads_bwd ]
if reads_unpaired and os.path.getsize(reads_unpaired) > 0:
args += [ '--s1', reads_unpaired ]
if contigs_trusted: args += [ '--trusted-contigs', contigs_trusted ]
if contigs_untrusted: args += [ '--untrusted-contigs', contigs_untrusted ]
if kmer_sizes: args += [ '-k', kmer_sizes ]
if spades_opts: args += shlex.split(spades_opts)
args += [ '-m' + str(mem_limit_gb), '-t', str(threads), '-o', spades_dir ]

transcripts_fname = os.path.join(spades_dir, ('hard_filtered_' if filter_contigs else '') + 'contigs.fasta')

try:
self.execute(args=args)
except Exception as e:
if always_succeed:
log.warning('SPAdes failed: {}'.format(e))
util.file.make_empty(transcripts_fname)
else:
raise

# work around the bug that spades may succeed yet not create the contigs.fasta file
if not os.path.isfile(transcripts_fname):
msg = 'SPAdes failed to make contigs.fasta'
if always_succeed:
log.warning(msg)
util.file.make_empty(transcripts_fname)
else:
raise RuntimeError(msg)

if min_contig_len:
transcripts = Bio.SeqIO.parse(transcripts_fname, 'fasta')
transcripts_sans_short = [r for r in transcripts if len(r.seq) >= min_contig_len]
transcripts_fname = os.path.join(spades_dir, 'contigs_over_{}bp.fasta'.format(min_contig_len))
Bio.SeqIO.write(transcripts_sans_short, transcripts_fname, 'fasta')

contigs_cumul = os.path.join(spades_dir, 'contigs_cumul.{}.fasta'.format(contigs_cumul_count))
contigs_cumul_count += 1

util.file.concat(inputFilePaths=(contigs_out, transcripts_fname), outputFilePath=contigs_cumul, append=True)
shutil.copyfile(contigs_cumul, contigs_out)

# end: with util.file.tmp_dir('_spades') as spades_dir
# if input non-empty
# end: def assemble(self, reads_fwd, reads_bwd, contigs_out, reads_unpaired=None, contigs_trusted=None, ...)
# end: class SpadesTool(tools.Tool)
Expand Down
15 changes: 7 additions & 8 deletions assembly.py
Original file line number Diff line number Diff line change
Expand Up @@ -275,15 +275,14 @@ def assemble_spades(
in_bam,
clip_db,
out_fasta,
spades_opts='--rna',
spades_opts='--rnaviral',
contigs_trusted=None, contigs_untrusted=None,
filter_contigs=False,
min_contig_len=0,
kmer_sizes=(55,65,35),
kmer_sizes=None,
n_reads=10000000,
outReads=None,
always_succeed=False,
max_kmer_sizes=1,
mem_limit_gb=8,
threads=None,
):
Expand All @@ -305,7 +304,7 @@ def assemble_spades(
contigs_untrusted=contigs_untrusted, contigs_trusted=contigs_trusted,
contigs_out=out_fasta, filter_contigs=filter_contigs,
min_contig_len=min_contig_len,
kmer_sizes=kmer_sizes, always_succeed=always_succeed, max_kmer_sizes=max_kmer_sizes,
kmer_sizes=kmer_sizes, always_succeed=always_succeed,
spades_opts=spades_opts, mem_limit_gb=mem_limit_gb,
threads=threads)
except subprocess.CalledProcessError as e:
Expand All @@ -326,17 +325,17 @@ def parser_assemble_spades(parser=argparse.ArgumentParser()):
help='Optional input contigs of high medium quality, previously assembled from the same sample')
parser.add_argument('--nReads', dest='n_reads', type=int, default=10000000,
help='Before assembly, subsample the reads to at most this many')
parser.add_argument('--kmerSizes', dest='kmer_sizes', type=int, nargs='+', default=(55,65,35),
help='Ordered list of kmer sizes to attempt')
parser.add_argument('--kmerSizes', dest='kmer_sizes', default=None,
help='Comma-separated ascending order list of odd-value kmer sizes to attempt')
parser.add_argument('--outReads', default=None, help='Save the trimmomatic/prinseq/subsamp reads to a BAM file')
parser.add_argument('--filterContigs', dest='filter_contigs', default=False, action='store_true',
help='only output contigs SPAdes is sure of (drop lesser-quality contigs from output)')
parser.add_argument('--alwaysSucceed', dest='always_succeed', default=False, action='store_true',
parser.add_argument('--alwaysSucceed', dest='always_succeed', default=True, action='store_true',
help='if assembly fails for any reason, output an empty contigs file, rather than failing with '
'an error code')
parser.add_argument('--minContigLen', dest='min_contig_len', type=int, default=0,
help='only output contigs longer than this many bp')
parser.add_argument('--spadesOpts', dest='spades_opts', default='--rna', help='(advanced) Extra flags to pass to the SPAdes assembler')
parser.add_argument('--spadesOpts', dest='spades_opts', default='--rnaviral', help='(advanced) Extra flags to pass to the SPAdes assembler')
parser.add_argument('--memLimitGb', dest='mem_limit_gb', default=4, type=int, help='Max memory to use, in GB (default: %(default)s)')
util.cmd.common_args(parser, (('threads', None), ('loglevel', None), ('version', None), ('tmp_dir', None)))
util.cmd.attach_main(parser, assemble_spades, split_args=True)
Expand Down
8 changes: 4 additions & 4 deletions requirements-conda.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
gap2seq>=3.1.1a2
mafft>=7.464
mummer4>=4.0.0beta2
muscle>=3.8.1551
spades>=3.12.0
mafft>=7.508
mummer4>=4.0.0rc1
muscle=3.8.1551
spades>=3.15.5
# Python packages below
1 change: 0 additions & 1 deletion test/input/TestAssembleSpades/clipDb.fasta

This file was deleted.

Loading

0 comments on commit ea463b6

Please sign in to comment.