Skip to content

Commit 8520f6e

Browse files
committed
Add benchmarking functions.
fix #6 and work on #7 . These functions will be used soon to create a benchmark command.
1 parent 0e4390b commit 8520f6e

File tree

1 file changed

+128
-0
lines changed

1 file changed

+128
-0
lines changed

kbbq/benchmark.py

+128
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
1+
#!/usr/bin/env python3
2+
3+
"""
4+
Utilities for benchmarking calibration methods.
5+
"""
6+
7+
from kbbq import compare_reads
8+
import pysam
9+
10+
def get_ref_dict(reffilename):
11+
fasta = pysam.FastaFile(reffilename)
12+
ref = {chrom : np.array(list(fasta.fetch(reference = chrom)), dtype = np.unicode) for chrom in fasta.references}
13+
return ref
14+
15+
def get_full_skips(refdict, var_sites):
16+
skips = {chrom: np.zeros(len(refdict[chrom]), dtype = np.bool) for chrom in refdict.keys())
17+
for chrom in skips.keys():
18+
variable_positions = var_sites[chrom]
19+
skips[chrom][variable_positions] = True
20+
return skips
21+
22+
def get_bam_readname(read):
23+
"""
24+
Given a bam read, get a canonicalized name.
25+
The name is the query name + a suffix
26+
based on whether it is first or second in pair.
27+
"""
28+
suffix = ("/2" if read.is_read2 else "/1")
29+
return read.query_name + suffix
30+
31+
def get_fastq_readname(read):
32+
"""
33+
Given a fastq read, get a canonicalized name.
34+
This is based on whether the read is first in pair.
35+
"""
36+
return read.name.split(sep = '_')[0]
37+
38+
def get_error_dict(bamfile, refdict, fullskips):
39+
"""
40+
Finds errors given an authoritative BAM, ref, and sites to skip.
41+
42+
The returned dict has readname keys and tuple(np.array, np.array) of bools as values.
43+
"""
44+
edict = dict()
45+
for read in bamfile:
46+
name = get_read_name(read)
47+
edict[name] = find_read_errors(read, refdict, fullskips)
48+
return edict
49+
50+
def find_errors_in_fastq(fqreads, edict)
51+
"""
52+
Finds errors given a readname -> (errors, skips) dict.
53+
"""
54+
errskips = [edict.get(get_fastq_readname(read)) for read in fqreads]
55+
return zip(*errskips) #this will return a list of errors and a list of skips
56+
57+
def calculate_q(errors, quals):
58+
"""
59+
Calculate Actual Q and Predicted Q given a flat
60+
array of errors and flat array of quality scores.
61+
62+
The index of the returned array represents the predicted quality score,
63+
the value represents the actual quality score or the number of bases.
64+
"""
65+
numtotal = np.bincount(quals.reshape(-1))
66+
numerrs = np.bincount(quals.reshape(-1), minlength = len(numtotal))
67+
nonzero = (numtotal != 0)
68+
p = np.true_divide(numerrs[nonzero], numtotal[nonzero])
69+
q = compare_reads.p_to_q(p)
70+
actual_q = np.zeros(len(numtotal), dtype = np.int)
71+
actual_q[nonzero] = q
72+
return actual_q, numtotal
73+
74+
def benchmark_fastq(fqfile, bamfile, fafile, varfile):
75+
var_sites = compare_reads.load_positions(varfile)
76+
ref = get_ref_dict(fafile)
77+
fullskips = get_full_skips(ref, var_sites)
78+
edict = get_error_dict(bamfile, ref, fullskips)
79+
reads = list(pysam.FastxFile(fqfile))
80+
errors, skips = find_errors_in_fastq(reads, edict)
81+
quals = [np.array(r.get_quality_array()) for r in reads]
82+
#turn list of small arrays into 2d arrays
83+
errors = np.array(errors)
84+
skips = np.array(skips)
85+
quals = np.array(quals)
86+
#get rid of skipped sites (we can't tell if these are errors or not)
87+
e = errors[~skips]
88+
quals = quals[~skips]
89+
return calculate_q(e, quals) #actual_q, ntotal
90+
91+
def benchmark_bam(bamfile, fafile, varfile):
92+
"""
93+
TODO
94+
"""
95+
pass
96+
97+
def print_benchmark(actual_q, label, nbases):
98+
"""
99+
Print benchmark data to a tab separated table.
100+
101+
Label should be a string and is used for all values
102+
in the table. No headers are printed. This means
103+
the benchmark command can be called many times and each
104+
table can be concatenated together.
105+
"""
106+
nonzero = (actual_q != 0)
107+
predicted_q = np.arange(len(actual_q))[nonzero]
108+
for pq, aq, nb in zip(predicted_q, actual_q, nbases):
109+
print(pq, aq, label, nb, sep = "\t")
110+
111+
def benchmark(bamfile, fafile, varfile, fastqfile = None, label = None):
112+
"""
113+
Perform the benchmark and print the results to stdout.
114+
115+
If a fastqfile is not given, the benchmark will be taken from
116+
the reads in the BAM file. If a fastqfile is given, it should
117+
have the same read names as the reads in the BAM file. The
118+
names will then be used to look up where errors occur in the
119+
reads.
120+
"""
121+
bam = pysam.AlignmentFile(bamfile, 'r')
122+
if fastqfile is not None:
123+
actual_q, nbases = benchmark_fastq(fastqfile, bamfile, fafile, varfile)
124+
label = (fastqfile if label is None else label)
125+
else:
126+
actual_q, nbases = benchmark_bam(bamfile, fafile, varfile)
127+
label = (bamfile if label is None else label)
128+
print_benchmark(actual_q, label, nbases)

0 commit comments

Comments
 (0)