feat: add rescaling feature

maxibor · Jul 16, 2024 · d5ff542 · d5ff542
1 parent 956a9b7
commit d5ff542
Show file tree

Hide file tree

Showing 4 changed files with 80 additions and 50 deletions.
diff --git a/docs/source/output.md b/docs/source/output.md
@@ -29,6 +29,15 @@ The tabular outputs are  comma-separated file (`.csv`) with the following column
 
 Same file as above, but with contigs filtered with `qvalue <= 0.05` and `predicted_accuracy >= threshold` with a user defined filtering threshold (default = 0.5), or determined with the [kneedle](https://ieeexplore.ieee.org/document/5961514) method.
 
+
+### `pydamage_rescaled.bam`
+
+The input alignment file with rescaled base quality scores when running `pydamage analyze` with the `-r` or `--rescale` flag.
+
+The rescaled base calling scores are computed for each read containing ancient DNA damage according to the following formula, with `i` the position in the read, `p_err` the original base calling error probability,`p_pydam` the pydamage computed ancient damage probability, and `p_new` the updated base calling error probability.
+
+`p_new(i) = 1 - (1 - p_err(i)) (1 - p_pydam(i))`
+
 ### Plots
 
 The visual output are PNG files, one per reference contig. They show the frequency of observed C to T, and G to A transition at the 5' end of the sequencing data and overlay it with the fitted models for both the null and the damage model, including 95% confidence intervals. Furthermore, it provides a "residuals versus fitted" plot to help evaluate the fit of the pydamage damage model. Finally, the plot contains informtion on the average coverage along the reference and the p-value calculated from the likelihood-ratio test-statistic using a chi-squared distribution.

diff --git a/pydamage/cli.py b/pydamage/cli.py
@@ -32,7 +32,15 @@ def list_commands(self, ctx):
     show_default=True,
     help="Output directory",
 )
-def cli(ctx, outdir):
+@click.option(
+    "-t",
+    "--threshold",
+    default=0.5,
+    type=float,
+    show_default=True,
+    help="Predicted accuracy filtering threshold. Set to 0 for finding threshold with kneed method",
+)
+def cli(ctx, outdir, threshold):
     """\b
     PyDamage: Damage parameter estimation for ancient DNA
     Author: Maxime Borry
@@ -44,6 +52,7 @@ def cli(ctx, outdir):
     ctx.ensure_object(dict)
 
     ctx.obj["outdir"] = outdir
+    ctx.obj["threshold"] = threshold
     pass
 
 
@@ -89,6 +98,13 @@ def cli(ctx, outdir):
     help="Use entire BAM file as single reference for analysis "
     "(ignore reference headers)",
 )
+@click.option("-n", "--no_ga", is_flag=True, help="Do not use G->A transitions")
+@click.option(
+    "-r",
+    "--rescale",
+    is_flag=True,
+    help="Rescale base quality scores using the PyDamage damage model",
+)
 def analyze(ctx, no_args_is_help=True, **kwargs):
     """\b
     Run the PyDamage analysis
@@ -101,14 +117,6 @@ def analyze(ctx, no_args_is_help=True, **kwargs):
 @cli.command()
 @click.pass_context
 @click.argument("csv", type=click.Path(exists=True))
-@click.option(
-    "-t",
-    "--threshold",
-    default=0.5,
-    type=float,
-    show_default=True,
-    help="Predicted accuracy filtering threshold. Set to 0 for finding threshold with kneed method",
-)
 def filter(ctx, no_args_is_help=True, **kwargs):
     """\b
     Filter PyDamage results on predicted accuracy and qvalue thresholds.

diff --git a/pydamage/main.py b/pydamage/main.py
@@ -43,29 +43,38 @@ def pydamage_analyze(
     bam,
     wlen=30,
     minlen=0,
+    threshold=0.5,
     show_al=False,
     process=1,
     outdir="",
     plot=False,
     verbose=False,
     force=False,
     group=False,
+    rescale=False,
+    no_ga=False,
 ):
     """Runs the pydamage analysis for each reference separately
 
     Args:
         bam(str): Path to alignment (sam/bam/cram) file
         minlen(int): minimum reference sequence length threshold
+        threshold(float): Predicted accuracy threshold
         wlen(int): window length
         show_al(bool): print alignments representations
         process(int):  Number of  processes for parellel computing
         outdir(str): Path to output directory
         verbose(bool): verbose mode
         force(bool): force overwriting of results directory
+        group(bool): Use entire BAM file as single reference for analysis
+        plot(bool): Write damage fitting plots to disk
+        rescale(bool): Rescale base quality scores using the PyDamage damage model
+        no_ga(bool): Do not use G->A transitions
     Returns:
         pd.DataFrame: pandas DataFrame containg pydamage results
 
     """
+    g2a = not no_ga
     if verbose:
         print(f"Pydamage version {__version__}\n")
     utils.makedir(outdir, force=force)
@@ -100,17 +109,19 @@ def pydamage_analyze(
         bam=bam,
         mode=mode,
         wlen=wlen,
+        g2a=g2a,
         show_al=show_al,
         process=process,
         verbose=verbose,
     )
-    print("Estimating and testing Damage")
     if group:
+        print("Estimating and testing Damage")
         filt_res, read_dict = damage.test_damage(
             ref=None,
             bam=bam,
             mode=mode,
             wlen=wlen,
+            g2a=g2a,
             show_al=show_al,
             process=process,
             verbose=verbose,
@@ -119,7 +130,13 @@ def pydamage_analyze(
     else:
         if len(refs) > 0:
             with multiprocessing.Pool(proc) as p:
-                res = list(tqdm(p.imap(test_damage_partial, refs), total=len(refs)))
+                res = list(
+                    tqdm(
+                        p.imap(test_damage_partial, refs),
+                        total=len(refs),
+                        desc="Estimating and testing Damage",
+                    ),
+                )
             filt_res, read_dicts = zip(*res)
             filt_res = [i for i in filt_res if i]
             read_dicts = [i for i in read_dicts if i]
@@ -138,32 +155,30 @@ def pydamage_analyze(
             "No alignments were found, check your alignment file", PyDamageWarning
         )
 
-    rescale = True
-    if rescale:
-        print("\nRescaling quality scores")
-        rescale_bam(
-            bam=bam,
-            threshold=0.5,
-            alpha=0.05,
-            damage_dict=filt_res,
-            read_dict=read_dict,
-            outname=os.path.join(outdir, "rescaled.bam"),
-        )
-
     if plot and len(filt_res) > 0:
         print("\nGenerating Pydamage plots")
         plotdir = f"{outdir}/plots"
         utils.makedir(plotdir, confirm=False)
 
-        plot_partial = partial(damageplot, outdir=plotdir, wlen=wlen)
+        plot_partial = partial(damageplot, outdir=plotdir, wlen=wlen, plot_g2a=g2a)
         with multiprocessing.Pool(proc) as p:
             list(tqdm(p.imap(plot_partial, filt_res), total=len(filt_res)))
-    df_pydamage = utils.pandas_processing(res_dict=filt_res, wlen=wlen)
+    df_pydamage = utils.pandas_processing(res_dict=filt_res, wlen=wlen, g2a=g2a)
 
     prep_df_glm = prepare_data(df_pydamage)
     df_glm = glm_predict(prep_df_glm, glm_model_params)
 
     df = df_glm.merge(df_pydamage, left_index=True, right_index=True)
 
+    rescale = True
+    if rescale:
+        rescale_bam(
+            bam=bam,
+            threshold=threshold,
+            alpha=0.05,
+            damage_dict=df.to_dict(),
+            read_dict=read_dict,
+            outname=os.path.join(outdir, "pydamage_rescaled.bam"),
+        )
     utils.df_to_csv(df, outdir)
     return df
diff --git a/pydamage/rescale.py b/pydamage/rescale.py
@@ -2,6 +2,7 @@
 import numpy as np
 from array import array
 from pydamage.models import damage_model
+from tqdm import tqdm
 
 
 def phred_to_prob(qual):
@@ -13,24 +14,28 @@ def phred_to_prob(qual):
     Returns:
         np.array(int): Array of read error probabilities
     """
-    return 10 ** (-np.array(qual) / 10)
+    return 10 ** (-np.array(qual).astype(int) / 10)
 
 
-def rescale_qual(read_qual, dmg_pmf, damage_bases):
+def rescale_qual(read_qual, dmg_pmf, damage_bases, reverse):
     """Rescale quality scores using damage model
     Args:
         read_qual (array): Array of Phred quality scores
         dmg_pmf (array): Array of damage model probabilities
         damage_bases (array): Array of positions with damage
+        reverse(bool): Read mapped to reverse strand
     Returns:
         np.array(int): Array of rescaled Phred quality scores
     """
     e = phred_to_prob(read_qual)
+    if reverse:
+        e = e[::-1]
     d = np.zeros(len(read_qual))
     d[damage_bases] = dmg_pmf[damage_bases]
-    return array(
-        "B", np.round(-10 * np.log10(1 - np.multiply(1 - e, 1 - d)), 0).astype(int)
-    )
+    r = np.round(-10 * np.log10(1 - np.multiply(1 - e, 1 - d)), 0).astype(int)
+    if reverse:
+        r = r[::-1]
+    return array("B", r)
 
 
 def rescale_bam(bam, threshold, alpha, damage_dict, read_dict, outname):
@@ -44,41 +49,34 @@ def rescale_bam(bam, threshold, alpha, damage_dict, read_dict, outname):
         read_dict (dict): Dictionary of read names
         outname (str): Path to output BAM file
     """
-    damage_dict = {v["reference"]: v for v in damage_dict}
-    print(damage_dict)
     with pysam.AlignmentFile(bam, "rb") as al:
         refs = al.references
         with pysam.AlignmentFile(outname, "wb", template=al) as out:
-            for ref in refs:
+            for ref in tqdm(refs, desc="Rescaling quality scores"):
                 dmg = damage_model()
                 if ref in read_dict:
                     pass_filter = False
                     if (
                         threshold
-                        and threshold <= damage_dict[ref]["predicted_accuracy"]
-                    ) and (alpha and alpha >= damage_dict[ref]["qvalue"]):
+                        and threshold <= damage_dict["predicted_accuracy"][ref]
+                    ) and (alpha and alpha >= damage_dict["qvalue"][ref]):
                         pass_filter = True
                     if pass_filter:
-                        dmg_pmf = dmg.fit(x=np.arange(400), **damage_dict[ref])
+                        dmg_pmf = dmg.fit(
+                            x=np.arange(400),
+                            p=damage_dict["damage_model_p"][ref],
+                            pmin=damage_dict["damage_model_pmin"][ref],
+                            pmax=damage_dict["damage_model_pmax"][ref],
+                        )
                         for read in al.fetch(ref):
                             if read.query_name in read_dict[ref]:
                                 qual = read.query_qualities
                                 read.query_qualities = rescale_qual(
-                                    qual, dmg_pmf, read_dict[ref][read.query_name]
+                                    qual,
+                                    dmg_pmf,
+                                    read_dict[ref][read.query_name],
+                                    reverse=read.is_reverse,
                                 )
-                                print(
-                                    np.mean(np.array(qual)),
-                                    np.mean(
-                                        np.array(
-                                            rescale_qual(
-                                                qual,
-                                                dmg_pmf,
-                                                read_dict[ref][read.query_name],
-                                            )
-                                        )
-                                    ),
-                                )
-
                             out.write(read)
                     else:
                         for read in al.fetch(ref):