Skip to content

Commit

Permalink
feat: add rescaling feature
Browse files Browse the repository at this point in the history
  • Loading branch information
maxibor committed Jul 16, 2024
1 parent 956a9b7 commit d5ff542
Show file tree
Hide file tree
Showing 4 changed files with 80 additions and 50 deletions.
9 changes: 9 additions & 0 deletions docs/source/output.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,15 @@ The tabular outputs are comma-separated file (`.csv`) with the following column

Same file as above, but with contigs filtered with `qvalue <= 0.05` and `predicted_accuracy >= threshold` with a user defined filtering threshold (default = 0.5), or determined with the [kneedle](https://ieeexplore.ieee.org/document/5961514) method.


### `pydamage_rescaled.bam`

The input alignment file with rescaled base quality scores when running `pydamage analyze` with the `-r` or `--rescale` flag.

The rescaled base calling scores are computed for each read containing ancient DNA damage according to the following formula, with `i` the position in the read, `p_err` the original base calling error probability,`p_pydam` the pydamage computed ancient damage probability, and `p_new` the updated base calling error probability.

`p_new(i) = 1 - (1 - p_err(i)) (1 - p_pydam(i))`

### Plots

The visual output are PNG files, one per reference contig. They show the frequency of observed C to T, and G to A transition at the 5' end of the sequencing data and overlay it with the fitted models for both the null and the damage model, including 95% confidence intervals. Furthermore, it provides a "residuals versus fitted" plot to help evaluate the fit of the pydamage damage model. Finally, the plot contains informtion on the average coverage along the reference and the p-value calculated from the likelihood-ratio test-statistic using a chi-squared distribution.
Expand Down
26 changes: 17 additions & 9 deletions pydamage/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,15 @@ def list_commands(self, ctx):
show_default=True,
help="Output directory",
)
def cli(ctx, outdir):
@click.option(
"-t",
"--threshold",
default=0.5,
type=float,
show_default=True,
help="Predicted accuracy filtering threshold. Set to 0 for finding threshold with kneed method",
)
def cli(ctx, outdir, threshold):
"""\b
PyDamage: Damage parameter estimation for ancient DNA
Author: Maxime Borry
Expand All @@ -44,6 +52,7 @@ def cli(ctx, outdir):
ctx.ensure_object(dict)

ctx.obj["outdir"] = outdir
ctx.obj["threshold"] = threshold
pass


Expand Down Expand Up @@ -89,6 +98,13 @@ def cli(ctx, outdir):
help="Use entire BAM file as single reference for analysis "
"(ignore reference headers)",
)
@click.option("-n", "--no_ga", is_flag=True, help="Do not use G->A transitions")
@click.option(
"-r",
"--rescale",
is_flag=True,
help="Rescale base quality scores using the PyDamage damage model",
)
def analyze(ctx, no_args_is_help=True, **kwargs):
"""\b
Run the PyDamage analysis
Expand All @@ -101,14 +117,6 @@ def analyze(ctx, no_args_is_help=True, **kwargs):
@cli.command()
@click.pass_context
@click.argument("csv", type=click.Path(exists=True))
@click.option(
"-t",
"--threshold",
default=0.5,
type=float,
show_default=True,
help="Predicted accuracy filtering threshold. Set to 0 for finding threshold with kneed method",
)
def filter(ctx, no_args_is_help=True, **kwargs):
"""\b
Filter PyDamage results on predicted accuracy and qvalue thresholds.
Expand Down
47 changes: 31 additions & 16 deletions pydamage/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,29 +43,38 @@ def pydamage_analyze(
bam,
wlen=30,
minlen=0,
threshold=0.5,
show_al=False,
process=1,
outdir="",
plot=False,
verbose=False,
force=False,
group=False,
rescale=False,
no_ga=False,
):
"""Runs the pydamage analysis for each reference separately
Args:
bam(str): Path to alignment (sam/bam/cram) file
minlen(int): minimum reference sequence length threshold
threshold(float): Predicted accuracy threshold
wlen(int): window length
show_al(bool): print alignments representations
process(int): Number of processes for parellel computing
outdir(str): Path to output directory
verbose(bool): verbose mode
force(bool): force overwriting of results directory
group(bool): Use entire BAM file as single reference for analysis
plot(bool): Write damage fitting plots to disk
rescale(bool): Rescale base quality scores using the PyDamage damage model
no_ga(bool): Do not use G->A transitions
Returns:
pd.DataFrame: pandas DataFrame containg pydamage results
"""
g2a = not no_ga
if verbose:
print(f"Pydamage version {__version__}\n")
utils.makedir(outdir, force=force)
Expand Down Expand Up @@ -100,17 +109,19 @@ def pydamage_analyze(
bam=bam,
mode=mode,
wlen=wlen,
g2a=g2a,
show_al=show_al,
process=process,
verbose=verbose,
)
print("Estimating and testing Damage")
if group:
print("Estimating and testing Damage")
filt_res, read_dict = damage.test_damage(
ref=None,
bam=bam,
mode=mode,
wlen=wlen,
g2a=g2a,
show_al=show_al,
process=process,
verbose=verbose,
Expand All @@ -119,7 +130,13 @@ def pydamage_analyze(
else:
if len(refs) > 0:
with multiprocessing.Pool(proc) as p:
res = list(tqdm(p.imap(test_damage_partial, refs), total=len(refs)))
res = list(
tqdm(
p.imap(test_damage_partial, refs),
total=len(refs),
desc="Estimating and testing Damage",
),
)
filt_res, read_dicts = zip(*res)
filt_res = [i for i in filt_res if i]
read_dicts = [i for i in read_dicts if i]
Expand All @@ -138,32 +155,30 @@ def pydamage_analyze(
"No alignments were found, check your alignment file", PyDamageWarning
)

rescale = True
if rescale:
print("\nRescaling quality scores")
rescale_bam(
bam=bam,
threshold=0.5,
alpha=0.05,
damage_dict=filt_res,
read_dict=read_dict,
outname=os.path.join(outdir, "rescaled.bam"),
)

if plot and len(filt_res) > 0:
print("\nGenerating Pydamage plots")
plotdir = f"{outdir}/plots"
utils.makedir(plotdir, confirm=False)

plot_partial = partial(damageplot, outdir=plotdir, wlen=wlen)
plot_partial = partial(damageplot, outdir=plotdir, wlen=wlen, plot_g2a=g2a)
with multiprocessing.Pool(proc) as p:
list(tqdm(p.imap(plot_partial, filt_res), total=len(filt_res)))
df_pydamage = utils.pandas_processing(res_dict=filt_res, wlen=wlen)
df_pydamage = utils.pandas_processing(res_dict=filt_res, wlen=wlen, g2a=g2a)

prep_df_glm = prepare_data(df_pydamage)
df_glm = glm_predict(prep_df_glm, glm_model_params)

df = df_glm.merge(df_pydamage, left_index=True, right_index=True)

rescale = True
if rescale:
rescale_bam(
bam=bam,
threshold=threshold,
alpha=0.05,
damage_dict=df.to_dict(),
read_dict=read_dict,
outname=os.path.join(outdir, "pydamage_rescaled.bam"),
)
utils.df_to_csv(df, outdir)
return df
48 changes: 23 additions & 25 deletions pydamage/rescale.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import numpy as np
from array import array
from pydamage.models import damage_model
from tqdm import tqdm


def phred_to_prob(qual):
Expand All @@ -13,24 +14,28 @@ def phred_to_prob(qual):
Returns:
np.array(int): Array of read error probabilities
"""
return 10 ** (-np.array(qual) / 10)
return 10 ** (-np.array(qual).astype(int) / 10)


def rescale_qual(read_qual, dmg_pmf, damage_bases):
def rescale_qual(read_qual, dmg_pmf, damage_bases, reverse):
"""Rescale quality scores using damage model
Args:
read_qual (array): Array of Phred quality scores
dmg_pmf (array): Array of damage model probabilities
damage_bases (array): Array of positions with damage
reverse(bool): Read mapped to reverse strand
Returns:
np.array(int): Array of rescaled Phred quality scores
"""
e = phred_to_prob(read_qual)
if reverse:
e = e[::-1]
d = np.zeros(len(read_qual))
d[damage_bases] = dmg_pmf[damage_bases]
return array(
"B", np.round(-10 * np.log10(1 - np.multiply(1 - e, 1 - d)), 0).astype(int)
)
r = np.round(-10 * np.log10(1 - np.multiply(1 - e, 1 - d)), 0).astype(int)
if reverse:
r = r[::-1]
return array("B", r)


def rescale_bam(bam, threshold, alpha, damage_dict, read_dict, outname):
Expand All @@ -44,41 +49,34 @@ def rescale_bam(bam, threshold, alpha, damage_dict, read_dict, outname):
read_dict (dict): Dictionary of read names
outname (str): Path to output BAM file
"""
damage_dict = {v["reference"]: v for v in damage_dict}
print(damage_dict)
with pysam.AlignmentFile(bam, "rb") as al:
refs = al.references
with pysam.AlignmentFile(outname, "wb", template=al) as out:
for ref in refs:
for ref in tqdm(refs, desc="Rescaling quality scores"):
dmg = damage_model()
if ref in read_dict:
pass_filter = False
if (
threshold
and threshold <= damage_dict[ref]["predicted_accuracy"]
) and (alpha and alpha >= damage_dict[ref]["qvalue"]):
and threshold <= damage_dict["predicted_accuracy"][ref]
) and (alpha and alpha >= damage_dict["qvalue"][ref]):
pass_filter = True
if pass_filter:
dmg_pmf = dmg.fit(x=np.arange(400), **damage_dict[ref])
dmg_pmf = dmg.fit(
x=np.arange(400),
p=damage_dict["damage_model_p"][ref],
pmin=damage_dict["damage_model_pmin"][ref],
pmax=damage_dict["damage_model_pmax"][ref],
)
for read in al.fetch(ref):
if read.query_name in read_dict[ref]:
qual = read.query_qualities
read.query_qualities = rescale_qual(
qual, dmg_pmf, read_dict[ref][read.query_name]
qual,
dmg_pmf,
read_dict[ref][read.query_name],
reverse=read.is_reverse,
)
print(
np.mean(np.array(qual)),
np.mean(
np.array(
rescale_qual(
qual,
dmg_pmf,
read_dict[ref][read.query_name],
)
)
),
)

out.write(read)
else:
for read in al.fetch(ref):
Expand Down

0 comments on commit d5ff542

Please sign in to comment.