|
| 1 | +#!/usr/bin/env python3 |
| 2 | + |
| 3 | +""" |
| 4 | +Utilities for benchmarking calibration methods. |
| 5 | +""" |
| 6 | + |
| 7 | +from kbbq import compare_reads |
| 8 | +import pysam |
| 9 | + |
| 10 | +def get_ref_dict(reffilename): |
| 11 | + fasta = pysam.FastaFile(reffilename) |
| 12 | + ref = {chrom : np.array(list(fasta.fetch(reference = chrom)), dtype = np.unicode) for chrom in fasta.references} |
| 13 | + return ref |
| 14 | + |
| 15 | +def get_full_skips(refdict, var_sites): |
| 16 | + skips = {chrom: np.zeros(len(refdict[chrom]), dtype = np.bool) for chrom in refdict.keys()) |
| 17 | + for chrom in skips.keys(): |
| 18 | + variable_positions = var_sites[chrom] |
| 19 | + skips[chrom][variable_positions] = True |
| 20 | + return skips |
| 21 | + |
| 22 | +def get_bam_readname(read): |
| 23 | + """ |
| 24 | + Given a bam read, get a canonicalized name. |
| 25 | + The name is the query name + a suffix |
| 26 | + based on whether it is first or second in pair. |
| 27 | + """ |
| 28 | + suffix = ("/2" if read.is_read2 else "/1") |
| 29 | + return read.query_name + suffix |
| 30 | + |
| 31 | +def get_fastq_readname(read): |
| 32 | + """ |
| 33 | + Given a fastq read, get a canonicalized name. |
| 34 | + This is based on whether the read is first in pair. |
| 35 | + """ |
| 36 | + return read.name.split(sep = '_')[0] |
| 37 | + |
| 38 | +def get_error_dict(bamfile, refdict, fullskips): |
| 39 | + """ |
| 40 | + Finds errors given an authoritative BAM, ref, and sites to skip. |
| 41 | +
|
| 42 | + The returned dict has readname keys and tuple(np.array, np.array) of bools as values. |
| 43 | + """ |
| 44 | + edict = dict() |
| 45 | + for read in bamfile: |
| 46 | + name = get_read_name(read) |
| 47 | + edict[name] = find_read_errors(read, refdict, fullskips) |
| 48 | + return edict |
| 49 | + |
| 50 | +def find_errors_in_fastq(fqreads, edict) |
| 51 | + """ |
| 52 | + Finds errors given a readname -> (errors, skips) dict. |
| 53 | + """ |
| 54 | + errskips = [edict.get(get_fastq_readname(read)) for read in fqreads] |
| 55 | + return zip(*errskips) #this will return a list of errors and a list of skips |
| 56 | + |
| 57 | +def calculate_q(errors, quals): |
| 58 | + """ |
| 59 | + Calculate Actual Q and Predicted Q given a flat |
| 60 | + array of errors and flat array of quality scores. |
| 61 | +
|
| 62 | + The index of the returned array represents the predicted quality score, |
| 63 | + the value represents the actual quality score or the number of bases. |
| 64 | + """ |
| 65 | + numtotal = np.bincount(quals.reshape(-1)) |
| 66 | + numerrs = np.bincount(quals.reshape(-1), minlength = len(numtotal)) |
| 67 | + nonzero = (numtotal != 0) |
| 68 | + p = np.true_divide(numerrs[nonzero], numtotal[nonzero]) |
| 69 | + q = compare_reads.p_to_q(p) |
| 70 | + actual_q = np.zeros(len(numtotal), dtype = np.int) |
| 71 | + actual_q[nonzero] = q |
| 72 | + return actual_q, numtotal |
| 73 | + |
| 74 | +def benchmark_fastq(fqfile, bamfile, fafile, varfile): |
| 75 | + var_sites = compare_reads.load_positions(varfile) |
| 76 | + ref = get_ref_dict(fafile) |
| 77 | + fullskips = get_full_skips(ref, var_sites) |
| 78 | + edict = get_error_dict(bamfile, ref, fullskips) |
| 79 | + reads = list(pysam.FastxFile(fqfile)) |
| 80 | + errors, skips = find_errors_in_fastq(reads, edict) |
| 81 | + quals = [np.array(r.get_quality_array()) for r in reads] |
| 82 | + #turn list of small arrays into 2d arrays |
| 83 | + errors = np.array(errors) |
| 84 | + skips = np.array(skips) |
| 85 | + quals = np.array(quals) |
| 86 | + #get rid of skipped sites (we can't tell if these are errors or not) |
| 87 | + e = errors[~skips] |
| 88 | + quals = quals[~skips] |
| 89 | + return calculate_q(e, quals) #actual_q, ntotal |
| 90 | + |
| 91 | +def benchmark_bam(bamfile, fafile, varfile): |
| 92 | + """ |
| 93 | + TODO |
| 94 | + """ |
| 95 | + pass |
| 96 | + |
| 97 | +def print_benchmark(actual_q, label, nbases): |
| 98 | + """ |
| 99 | + Print benchmark data to a tab separated table. |
| 100 | +
|
| 101 | + Label should be a string and is used for all values |
| 102 | + in the table. No headers are printed. This means |
| 103 | + the benchmark command can be called many times and each |
| 104 | + table can be concatenated together. |
| 105 | + """ |
| 106 | + nonzero = (actual_q != 0) |
| 107 | + predicted_q = np.arange(len(actual_q))[nonzero] |
| 108 | + for pq, aq, nb in zip(predicted_q, actual_q, nbases): |
| 109 | + print(pq, aq, label, nb, sep = "\t") |
| 110 | + |
| 111 | +def benchmark(bamfile, fafile, varfile, fastqfile = None, label = None): |
| 112 | + """ |
| 113 | + Perform the benchmark and print the results to stdout. |
| 114 | +
|
| 115 | + If a fastqfile is not given, the benchmark will be taken from |
| 116 | + the reads in the BAM file. If a fastqfile is given, it should |
| 117 | + have the same read names as the reads in the BAM file. The |
| 118 | + names will then be used to look up where errors occur in the |
| 119 | + reads. |
| 120 | + """ |
| 121 | + bam = pysam.AlignmentFile(bamfile, 'r') |
| 122 | + if fastqfile is not None: |
| 123 | + actual_q, nbases = benchmark_fastq(fastqfile, bamfile, fafile, varfile) |
| 124 | + label = (fastqfile if label is None else label) |
| 125 | + else: |
| 126 | + actual_q, nbases = benchmark_bam(bamfile, fafile, varfile) |
| 127 | + label = (bamfile if label is None else label) |
| 128 | + print_benchmark(actual_q, label, nbases) |
0 commit comments