@@ -34,8 +34,8 @@ def load_positions(posfile):
34
34
d .setdefault (chrom , list ()).append (int (pos )- 1 )
35
35
return d
36
36
37
- def find_rcorrected_sites (uncorrfile , corrfile ):
38
- print (tstamp (), "Finding Rcorrected sites . . ." , file = sys .stderr )
37
+ def find_corrected_sites (uncorrfile , corrfile ):
38
+ print (tstamp (), "Finding corrected sites . . ." , file = sys .stderr )
39
39
uncorr_reads = list (pysam .FastxFile (uncorrfile ))
40
40
corr_reads = list (pysam .FastxFile (corrfile ))
41
41
#verify the sequences are the same and can be accessed by index
@@ -50,7 +50,7 @@ def find_rcorrected_sites(uncorrfile, corrfile):
50
50
names = dict ()
51
51
seqlen = len (uncorr_reads [0 ].get_quality_array ())
52
52
rawquals = np .zeros ([len (uncorr_reads ), seqlen ], dtype = np .int )
53
- rcorrected = np .zeros ([len (uncorr_reads ),seqlen ], dtype = np .bool )
53
+ corrected = np .zeros ([len (uncorr_reads ),seqlen ], dtype = np .bool )
54
54
seqs = np .zeros (len (uncorr_reads ), dtype = 'U' + str (seqlen ))
55
55
rgs = np .zeros (len (uncorr_reads ), dtype = 'U7' )
56
56
for i in range (len (uncorr_reads )):
@@ -61,13 +61,13 @@ def find_rcorrected_sites(uncorrfile, corrfile):
61
61
seqs [i ] = uncorr_reads [i ].sequence
62
62
uncorr_s = np .array (list (uncorr_reads [i ].sequence ), dtype = np .unicode )
63
63
corr_s = np .array (list (corr_reads [i ].sequence ), dtype = np .unicode )
64
- rcorrected [i ] = (uncorr_s == corr_s )
65
- return names , rawquals .copy (), rcorrected .copy (), seqs .copy (), rgs .copy (), seqlen
64
+ corrected [i ] = (uncorr_s == corr_s )
65
+ return names , rawquals .copy (), corrected .copy (), seqs .copy (), rgs .copy (), seqlen
66
66
67
- def train_regression (rawquals , rcorrected , tol = 1e-4 ):
67
+ def train_regression (rawquals , corrected , tol = 1e-4 ):
68
68
print (tstamp (), "Doing Logit Regression" , file = sys .stderr )
69
69
lr = LR (tol = tol )
70
- lr = lr .fit (rawquals .flatten ().reshape (- 1 ,1 ), rcorrected .flatten ())
70
+ lr = lr .fit (rawquals .flatten ().reshape (- 1 ,1 ), corrected .flatten ())
71
71
return lr
72
72
73
73
def recalibrate (lr , q ):
@@ -806,10 +806,10 @@ def main():
806
806
corrfile = "nospace.lighter.fq"
807
807
bamfilename = "only_confident.sorted.recal.bam"
808
808
fastafilename = "../chr1.renamed.fa"
809
- names , rawquals , rcorrected , seqs , rgs , seqlen = find_rcorrected_sites (uncorrfile , corrfile )
809
+ names , rawquals , corrected , seqs , rgs , seqlen = find_corrected_sites (uncorrfile , corrfile )
810
810
811
- #rcorrected is true when the bases match the original, false otherwise
812
- #hence rcorrected == false is where the errors are
811
+ #corrected is true when the bases match the original, false otherwise
812
+ #hence corrected == false is where the errors are
813
813
814
814
bad_positions = load_positions ("variable_sites.txt" )
815
815
cachefile = 'cached_recal_errs.npz'
@@ -824,7 +824,7 @@ def main():
824
824
gatkcalibratedquals , erroneous , skips = find_errors (bamfilename , fastafilename , bad_positions , names , seqlen )
825
825
np .savez_compressed (cachefile , gatkcalibratedquals = gatkcalibratedquals , erroneous = erroneous , skips = skips )
826
826
827
- #important arrays: names, rawquals, rcorrected , calibquals, gatkcalibratedquals, erroneous, hmmquals
827
+ #important arrays: names, rawquals, corrected , calibquals, gatkcalibratedquals, erroneous, hmmquals
828
828
829
829
dinucleotide = get_dinucleotide (seqs , rawquals )
830
830
unique_rgs = np .unique (rgs )
@@ -840,7 +840,7 @@ def main():
840
840
reversecycle = np .zeros (len (names ), dtype = np .bool )
841
841
reversecycle [np .array (list (names .values ()), dtype = np .int )] = np .char .endswith (np .array (list (names .keys ()), dtype = np .unicode ),'/2' )
842
842
843
- dq_calibrated = delta_q_recalibrate (rawquals , rgs , dinucleotide , np .logical_not (rcorrected ), reversecycle )
843
+ dq_calibrated = delta_q_recalibrate (rawquals , rgs , dinucleotide , np .logical_not (corrected ), reversecycle )
844
844
#custom_gatk_calibrated = delta_q_recalibrate(rawquals, rgs, dinucleotide, erroneous, reversecycle)
845
845
#from_table = table_recalibrate(rawquals, tablefile, unique_pus, dinuc_order, seqlen, reversecycle, rgs, dinucleotide)
846
846
#assert np.array_equal(from_table, gatkcalibratedquals)
0 commit comments