Merge pull request #28 from broadinstitute/dp-spades

update SPAdes and default to rnaviralspades settings
broadinstitute · Dec 2, 2022 · ea463b6 · ea463b6
2 parents 0784b5a + 5e3983e
commit ea463b6
Show file tree

Hide file tree

Showing 5 changed files with 523 additions and 74 deletions.
diff --git a/assemble/spades.py b/assemble/spades.py
@@ -45,7 +45,7 @@ def execute(self, args):    # pylint: disable=W0221
         subprocess.check_call(tool_cmd)
 
     def assemble(self, reads_fwd, reads_bwd, contigs_out, reads_unpaired=None, contigs_trusted=None,
-                 contigs_untrusted=None, kmer_sizes=(55,65), always_succeed=False, max_kmer_sizes=1, 
+                 contigs_untrusted=None, kmer_sizes=None, always_succeed=True,
                  filter_contigs=False, min_contig_len=0, mem_limit_gb=8, threads=None, spades_opts=''):
         '''Assemble contigs from RNA-seq reads and (optionally) pre-existing contigs.
 
@@ -57,10 +57,9 @@ def assemble(self, reads_fwd, reads_bwd, contigs_out, reads_unpaired=None, conti
               contigs_trusted (fasta/q): optionally, already-assembled contigs of high quality
               contigs_untrusted (fasta/q): optionally, already-assembled contigs of average quality
         Params:
-            kmer_sizes: if given, use these kmer sizes and combine the resulting contigs.  kmer size of 0 or None means use size auto-selected
+            kmer_sizes: if given, use these comma-separated ascending kmer sizes--all values must be odd and < 127.  kmer size of 0 or None means use size auto-selected
               by SPAdes based on read length.
-            always_succeed: if True, if spades fails with an error for a kmer size, pretend it just produced no contigs for that kmer size
-            max_kmer_sizes: if this many kmer sizes succeed, do not try further ones
+            always_succeed: if True, if spades fails with an error, pretend it just produced no contigs
             filter_contigs: if True, outputs only "long and reliable transcripts with rather high expression" (per SPAdes docs)
             min_contig_len: drop contigs shorter than this many bp
             mem_limit_gb: max memory to use, in gigabytes
@@ -83,60 +82,52 @@ def assemble(self, reads_fwd, reads_bwd, contigs_out, reads_unpaired=None, conti
         if ((reads_fwd and reads_bwd and os.path.getsize(reads_fwd) > 0 and os.path.getsize(reads_bwd) > 0) or
             (reads_unpaired and os.path.getsize(reads_unpaired) > 0)):
 
-            kmer_sizes_succeeded = 0
-            for kmer_size in util.misc.make_seq(kmer_sizes):
-
-                with util.file.tmp_dir('_spades') as spades_dir:
-                    log.debug('spades_dir=' + spades_dir)
-                    args = []
-                    if reads_fwd and reads_bwd and os.path.getsize(reads_fwd) > 0 and os.path.getsize(reads_bwd) > 0:
-                        args += ['-1', reads_fwd, '-2', reads_bwd ]
-                    if reads_unpaired and os.path.getsize(reads_unpaired) > 0:
-                        args += [ '--s1', reads_unpaired ]
-                    if contigs_trusted: args += [ '--trusted-contigs', contigs_trusted ]
-                    if contigs_untrusted: args += [ '--untrusted-contigs', contigs_untrusted ]
-                    if kmer_size: args += [ '-k', kmer_size ]
-                    if spades_opts: args += shlex.split(spades_opts)
-                    args += [ '-m' + str(mem_limit_gb), '-t', str(threads), '-o', spades_dir ]
-
-                    transcripts_fname = os.path.join(spades_dir, ('hard_filtered_' if filter_contigs else '') + 'transcripts.fasta')
-
-                    try:
-                        self.execute(args=args)
-                    except Exception as e:
-                        if always_succeed:
-                            log.warning('SPAdes failed for k={}: {}'.format(kmer_size, e))
-                            util.file.make_empty(transcripts_fname)
-                        else:
-                            raise
-
-                    # work around the bug that spades may succeed yet not create the transcripts.fasta file
-                    if not os.path.isfile(transcripts_fname):
-                        msg = 'SPAdes failed to make transcripts.fasta for k={}'.format(kmer_size)
-                        if always_succeed:
-                            log.warning(msg)
-                            util.file.make_empty(transcripts_fname)
-                        else:
-                            raise RuntimeError(msg)
-
-                    if min_contig_len:
-                        transcripts = Bio.SeqIO.parse(transcripts_fname, 'fasta')
-                        transcripts_sans_short = [r for r in transcripts if len(r.seq) >= min_contig_len]
-                        transcripts_fname = os.path.join(spades_dir, 'transcripts_over_{}bp.fasta'.format(min_contig_len))
-                        Bio.SeqIO.write(transcripts_sans_short, transcripts_fname, 'fasta')
-
-                    contigs_cumul = os.path.join(spades_dir, 'contigs_cumul.{}.fasta'.format(contigs_cumul_count))
-                    contigs_cumul_count += 1
-
-                    util.file.concat(inputFilePaths=(contigs_out, transcripts_fname), outputFilePath=contigs_cumul, append=True)
-                    shutil.copyfile(contigs_cumul, contigs_out)
-
-                    if os.path.getsize(transcripts_fname):
-                        kmer_sizes_succeeded += 1
-                        if kmer_sizes_succeeded >= max_kmer_sizes:
-                            break
-                # end: with util.file.tmp_dir('_spades') as spades_dir
-            # end: for kmer_size in util.misc.make_seq(kmer_sizes)
+            with util.file.tmp_dir('_spades') as spades_dir:
+                log.debug('spades_dir=' + spades_dir)
+                args = []
+                if reads_fwd and reads_bwd and os.path.getsize(reads_fwd) > 0 and os.path.getsize(reads_bwd) > 0:
+                    args += ['-1', reads_fwd, '-2', reads_bwd ]
+                if reads_unpaired and os.path.getsize(reads_unpaired) > 0:
+                    args += [ '--s1', reads_unpaired ]
+                if contigs_trusted: args += [ '--trusted-contigs', contigs_trusted ]
+                if contigs_untrusted: args += [ '--untrusted-contigs', contigs_untrusted ]
+                if kmer_sizes: args += [ '-k', kmer_sizes ]
+                if spades_opts: args += shlex.split(spades_opts)
+                args += [ '-m' + str(mem_limit_gb), '-t', str(threads), '-o', spades_dir ]
+
+                transcripts_fname = os.path.join(spades_dir, ('hard_filtered_' if filter_contigs else '') + 'contigs.fasta')
+
+                try:
+                    self.execute(args=args)
+                except Exception as e:
+                    if always_succeed:
+                        log.warning('SPAdes failed: {}'.format(e))
+                        util.file.make_empty(transcripts_fname)
+                    else:
+                        raise
+
+                # work around the bug that spades may succeed yet not create the contigs.fasta file
+                if not os.path.isfile(transcripts_fname):
+                    msg = 'SPAdes failed to make contigs.fasta'
+                    if always_succeed:
+                        log.warning(msg)
+                        util.file.make_empty(transcripts_fname)
+                    else:
+                        raise RuntimeError(msg)
+
+                if min_contig_len:
+                    transcripts = Bio.SeqIO.parse(transcripts_fname, 'fasta')
+                    transcripts_sans_short = [r for r in transcripts if len(r.seq) >= min_contig_len]
+                    transcripts_fname = os.path.join(spades_dir, 'contigs_over_{}bp.fasta'.format(min_contig_len))
+                    Bio.SeqIO.write(transcripts_sans_short, transcripts_fname, 'fasta')
+
+                contigs_cumul = os.path.join(spades_dir, 'contigs_cumul.{}.fasta'.format(contigs_cumul_count))
+                contigs_cumul_count += 1
+
+                util.file.concat(inputFilePaths=(contigs_out, transcripts_fname), outputFilePath=contigs_cumul, append=True)
+                shutil.copyfile(contigs_cumul, contigs_out)
+
+            # end: with util.file.tmp_dir('_spades') as spades_dir
         # if input non-empty
     # end: def assemble(self, reads_fwd, reads_bwd, contigs_out, reads_unpaired=None, contigs_trusted=None, ...)
 # end: class SpadesTool(tools.Tool)

diff --git a/assembly.py b/assembly.py
@@ -275,15 +275,14 @@ def assemble_spades(
     in_bam,
     clip_db,
     out_fasta,
-    spades_opts='--rna',
+    spades_opts='--rnaviral',
     contigs_trusted=None, contigs_untrusted=None,
     filter_contigs=False,
     min_contig_len=0,
-    kmer_sizes=(55,65,35),
+    kmer_sizes=None,
     n_reads=10000000,
     outReads=None,
     always_succeed=False,
-    max_kmer_sizes=1,
     mem_limit_gb=8,
     threads=None,
 ):
@@ -305,7 +304,7 @@ def assemble_spades(
                                                contigs_untrusted=contigs_untrusted, contigs_trusted=contigs_trusted,
                                                contigs_out=out_fasta, filter_contigs=filter_contigs,
                                                min_contig_len=min_contig_len,
-                                               kmer_sizes=kmer_sizes, always_succeed=always_succeed, max_kmer_sizes=max_kmer_sizes,
+                                               kmer_sizes=kmer_sizes, always_succeed=always_succeed,
                                                spades_opts=spades_opts, mem_limit_gb=mem_limit_gb,
                                                threads=threads)
         except subprocess.CalledProcessError as e:
@@ -326,17 +325,17 @@ def parser_assemble_spades(parser=argparse.ArgumentParser()):
                         help='Optional input contigs of high medium quality, previously assembled from the same sample')
     parser.add_argument('--nReads', dest='n_reads', type=int, default=10000000, 
                         help='Before assembly, subsample the reads to at most this many')
-    parser.add_argument('--kmerSizes', dest='kmer_sizes', type=int, nargs='+', default=(55,65,35),
-                        help='Ordered list of kmer sizes to attempt')
+    parser.add_argument('--kmerSizes', dest='kmer_sizes', default=None,
+                        help='Comma-separated ascending order list of odd-value kmer sizes to attempt')
     parser.add_argument('--outReads', default=None, help='Save the trimmomatic/prinseq/subsamp reads to a BAM file')
     parser.add_argument('--filterContigs', dest='filter_contigs', default=False, action='store_true', 
                         help='only output contigs SPAdes is sure of (drop lesser-quality contigs from output)')
-    parser.add_argument('--alwaysSucceed', dest='always_succeed', default=False, action='store_true',
+    parser.add_argument('--alwaysSucceed', dest='always_succeed', default=True, action='store_true',
                         help='if assembly fails for any reason, output an empty contigs file, rather than failing with '
                         'an error code')
     parser.add_argument('--minContigLen', dest='min_contig_len', type=int, default=0,
                         help='only output contigs longer than this many bp')
-    parser.add_argument('--spadesOpts', dest='spades_opts', default='--rna', help='(advanced) Extra flags to pass to the SPAdes assembler')
+    parser.add_argument('--spadesOpts', dest='spades_opts', default='--rnaviral', help='(advanced) Extra flags to pass to the SPAdes assembler')
     parser.add_argument('--memLimitGb', dest='mem_limit_gb', default=4, type=int, help='Max memory to use, in GB (default: %(default)s)')
     util.cmd.common_args(parser, (('threads', None), ('loglevel', None), ('version', None), ('tmp_dir', None)))
     util.cmd.attach_main(parser, assemble_spades, split_args=True)

diff --git a/requirements-conda.txt b/requirements-conda.txt
@@ -1,6 +1,6 @@
 gap2seq>=3.1.1a2
-mafft>=7.464
-mummer4>=4.0.0beta2
-muscle>=3.8.1551
-spades>=3.12.0
+mafft>=7.508
+mummer4>=4.0.0rc1
+muscle=3.8.1551
+spades>=3.15.5
 # Python packages below
diff --git a/test/input/TestAssembleSpades/clipDb.fasta b/test/input/TestAssembleSpades/clipDb.fasta