-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathannotate_variation.pl
4408 lines (3896 loc) · 212 KB
/
annotate_variation.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env perl
use warnings;
use strict;
use Pod::Usage;
use Getopt::Long;
use File::Spec;
use Cwd;
our $REVISION = '$Revision: 599af129dbcfd4e85a2da9832c4ae59898e2f3a9 $';
our $DATE = '$Date: 2017-07-17 01:17:04 -0400 (Mon, 17 Jul 2017) $';
our $AUTHOR = '$Author: Kai Wang <kaichop@gmail.com> $';
our ($verbose, $help, $man);
our ($queryfile, $dbloc);
our ($outfile, $separate, $batchsize, $dbtype, $neargene, $genomebinsize, $geneanno, $regionanno, $filter, $downdb, $buildver, $score_threshold, $normscore_threshold, $minqueryfrac, $expandbin, $splicing_threshold,
$maf_threshold, $chromosome, $zerostart, $rawscore, $memfree, $memtotal, $sift_threshold, $gff3dbfile, $genericdbfile, $vcfdbfile, $time, $wget, $precedence,
$webfrom, $colsWanted, $comment, $scorecolumn, $poscolumn, $transfun, $exonsort, $avcolumn, $bedfile, $hgvs, $reverse, $indexfilter_threshold, $otherinfo, $seq_padding, $indel_splicing_threshold, $infoasscore,
$firstcodondel, $aamatrixfile, $gff3attr, $exonicsplicing, $infosep, $dbm, $idasscore, $thread, $maxgenethread, $mingenelinecount);
our (%valichr, $dbtype1); # valid chromosome name to process, internal alias of dbtype
our (@precedence, @colsWanted, @avcolumn); # precedence of functional categories to print, desired columns to be printed out, redefine standard ANNOVAR 5-column input
our $aamatrix; # amino acid substitution matrix
my ($pad_fh, $cDNA_pad); # Write seq pad here
sub printerr; # declare a subroutine
# codon table
our %codon1 = (TTT=>"F", TTC=>"F", TCT=>"S", TCC=>"S", TAT=>"Y", TAC=>"Y", TGT=>"C", TGC=>"C", TTA=>"L", TCA=>"S", TAA=>"*", TGA=>"*", TTG=>"L", TCG=>"S", TAG=>"*", TGG=>"W", CTT=>"L", CTC=>"L", CCT=>"P", CCC=>"P", CAT=>"H", CAC=>"H", CGT=>"R", CGC=>"R", CTA=>"L", CTG=>"L", CCA=>"P", CCG=>"P", CAA=>"Q", CAG=>"Q", CGA=>"R", CGG=>"R", ATT=>"I", ATC=>"I", ACT=>"T", ACC=>"T", AAT=>"N", AAC=>"N", AGT=>"S", AGC=>"S", ATA=>"I", ACA=>"T", AAA=>"K", AGA=>"R", ATG=>"M", ACG=>"T", AAG=>"K", AGG=>"R", GTT=>"V", GTC=>"V", GCT=>"A", GCC=>"A", GAT=>"D", GAC=>"D", GGT=>"G", GGC=>"G", GTA=>"V", GTG=>"V", GCA=>"A", GCG=>"A", GAA=>"E", GAG=>"E", GGA=>"G", GGG=>"G");
our %codon3 = (TTT=>"Phe", TTC=>"Phe", TCT=>"Ser", TCC=>"Ser", TAT=>"Tyr", TAC=>"Tyr", TGT=>"Cys", TGC=>"Cys", TTA=>"Leu", TCA=>"Ser", TAA=>"*", TGA=>"*", TTG=>"Leu", TCG=>"Ser", TAG=>"*", TGG=>"Trp", CTT=>"Leu", CTC=>"Leu", CCT=>"Pro", CCC=>"Pro", CAT=>"His", CAC=>"His", CGT=>"Arg", CGC=>"Arg", CTA=>"Leu", CTG=>"Leu", CCA=>"Pro", CCG=>"Pro", CAA=>"Gln", CAG=>"Gln", CGA=>"Arg", CGG=>"Arg", ATT=>"Ile", ATC=>"Ile", ACT=>"Thr", ACC=>"Thr", AAT=>"Asn", AAC=>"Asn", AGT=>"Ser", AGC=>"Ser", ATA=>"Ile", ACA=>"Thr", AAA=>"Lys", AGA=>"Arg", ATG=>"Met", ACG=>"Thr", AAG=>"Lys", AGG=>"Arg", GTT=>"Val", GTC=>"Val", GCT=>"Ala", GCC=>"Ala", GAT=>"Asp", GAC=>"Asp", GGT=>"Gly", GGC=>"Gly", GTA=>"Val", GTG=>"Val", GCA=>"Ala", GCG=>"Ala", GAA=>"Glu", GAG=>"Glu", GGA=>"Gly", GGG=>"Gly");
our %codonfull = (TTT=>"Phenylalanine", TTC=>"Phenylalanine", TCT=>"Serine", TCC=>"Serine", TAT=>"Tyrosine", TAC=>"Tyrosine", TGT=>"Cysteine", TGC=>"Cysteine", TTA=>"Leucine", TCA=>"Serine", TAA=>"Stop", TGA=>"Stop", TTG=>"Leucine", TCG=>"Serine", TAG=>"Stop", TGG=>"Tryptophan", CTT=>"Leucine", CTC=>"Leucine", CCT=>"Proline", CCC=>"Proline", CAT=>"Histidine", CAC=>"Histidine", CGT=>"Arginine", CGC=>"Arginine", CTA=>"Leucine", CTG=>"Leucine", CCA=>"Proline", CCG=>"Proline", CAA=>"Glutamine", CAG=>"Glutamine", CGA=>"Arginine", CGG=>"Arginine", ATT=>"Isoleucine", ATC=>"Isoleucine", ACT=>"Threonine", ACC=>"Threonine", AAT=>"Asparagine", AAC=>"Asparagine", AGT=>"Serine", AGC=>"Serine", ATA=>"Isoleucine", ACA=>"Threonine", AAA=>"Lysine", AGA=>"Arginine", ATG=>"Methionine", ACG=>"Threonine", AAG=>"Lysine", AGG=>"Arginine", GTT=>"Valine", GTC=>"Valine", GCT=>"Alanine", GCC=>"Alanine", GAT=>"Aspartic acid", GAC=>"Aspartic acid", GGT=>"Glycine", GGC=>"Glycine", GTA=>"Valine", GTG=>"Valine", GCA=>"Alanine", GCG=>"Alanine", GAA=>"Glutamic acid", GAG=>"Glutamic acid", GGA=>"Glycine", GGG=>"Glycine");
our %codonr1 = (UUU=>"F", UUC=>"F", UCU=>"S", UCC=>"S", UAU=>"Y", UAC=>"Y", UGU=>"C", UGC=>"C", UUA=>"L", UCA=>"S", UAA=>"*", UGA=>"*", UUG=>"L", UCG=>"S", UAG=>"*", UGG=>"W", CUU=>"L", CUC=>"L", CCU=>"P", CCC=>"P", CAU=>"H", CAC=>"H", CGU=>"R", CGC=>"R", CUA=>"L", CUG=>"L", CCA=>"P", CCG=>"P", CAA=>"Q", CAG=>"Q", CGA=>"R", CGG=>"R", AUU=>"I", AUC=>"I", ACU=>"T", ACC=>"T", AAU=>"N", AAC=>"N", AGU=>"S", AGC=>"S", AUA=>"I", ACA=>"T", AAA=>"K", AGA=>"R", AUG=>"M", ACG=>"T", AAG=>"K", AGG=>"R", GUU=>"V", GUC=>"V", GCU=>"A", GCC=>"A", GAU=>"D", GAC=>"D", GGU=>"G", GGC=>"G", GUA=>"V", GUG=>"V", GCA=>"A", GCG=>"A", GAA=>"E", GAG=>"E", GGA=>"G", GGG=>"G");
our %codonr3 = (UUU=>"Phe", UUC=>"Phe", UCU=>"Ser", UCC=>"Ser", UAU=>"Tyr", UAC=>"Tyr", UGU=>"Cys", UGC=>"Cys", UUA=>"Leu", UCA=>"Ser", UAA=>"*", UGA=>"*", UUG=>"Leu", UCG=>"Ser", UAG=>"*", UGG=>"Trp", CUU=>"Leu", CUC=>"Leu", CCU=>"Pro", CCC=>"Pro", CAU=>"His", CAC=>"His", CGU=>"Arg", CGC=>"Arg", CUA=>"Leu", CUG=>"Leu", CCA=>"Pro", CCG=>"Pro", CAA=>"Gln", CAG=>"Gln", CGA=>"Arg", CGG=>"Arg", AUU=>"Ile", AUC=>"Ile", ACU=>"Thr", ACC=>"Thr", AAU=>"Asn", AAC=>"Asn", AGU=>"Ser", AGC=>"Ser", AUA=>"Ile", ACA=>"Thr", AAA=>"Lys", AGA=>"Arg", AUG=>"Met", ACG=>"Thr", AAG=>"Lys", AGG=>"Arg", GUU=>"Val", GUC=>"Val", GCU=>"Ala", GCC=>"Ala", GAU=>"Asp", GAC=>"Asp", GGU=>"Gly", GGC=>"Gly", GUA=>"Val", GUG=>"Val", GCA=>"Ala", GCG=>"Ala", GAA=>"Glu", GAG=>"Glu", GGA=>"Gly", GGG=>"Gly");
our %codonrfull = (UUU=>"Phenylalanine", UUC=>"Phenylalanine", UCU=>"Serine", UCC=>"Serine", UAU=>"Tyrosine", UAC=>"Tyrosine", UGU=>"Cysteine", UGC=>"Cysteine", UUA=>"Leucine", UCA=>"Serine", UAA=>"Stop", UGA=>"Stop", UUG=>"Leucine", UCG=>"Serine", UAG=>"Stop", UGG=>"Tryptophan", CUU=>"Leucine", CUC=>"Leucine", CCU=>"Proline", CCC=>"Proline", CAU=>"Histidine", CAC=>"Histidine", CGU=>"Arginine", CGC=>"Arginine", CUA=>"Leucine", CUG=>"Leucine", CCA=>"Proline", CCG=>"Proline", CAA=>"Glutamine", CAG=>"Glutamine", CGA=>"Arginine", CGG=>"Arginine", AUU=>"Isoleucine", AUC=>"Isoleucine", ACU=>"Threonine", ACC=>"Threonine", AAU=>"Asparagine", AAC=>"Asparagine", AGU=>"Serine", AGC=>"Serine", AUA=>"Isoleucine", ACA=>"Threonine", AAA=>"Lysine", AGA=>"Arginine", AUG=>"Methionine", ACG=>"Threonine", AAG=>"Lysine", AGG=>"Arginine", GUU=>"Valine", GUC=>"Valine", GCU=>"Alanine", GCC=>"Alanine", GAU=>"Aspartic acid", GAC=>"Aspartic acid", GGU=>"Glycine", GGC=>"Glycine", GUA=>"Valine", GUG=>"Valine", GCA=>"Alanine", GCG=>"Alanine", GAA=>"Glutamic acid", GAG=>"Glutamic acid", GGA=>"Glycine", GGG=>"Glycine");
# mitochondria codon
our %codon1m = (TTT=>"F", TTC=>"F", TCT=>"S", TCC=>"S", TAT=>"Y", TAC=>"Y", TGT=>"C", TGC=>"C", TTA=>"L", TCA=>"S", TAA=>"*", TGA=>"W", TTG=>"L", TCG=>"S", TAG=>"*", TGG=>"W", CTT=>"L", CTC=>"L", CCT=>"P", CCC=>"P", CAT=>"H", CAC=>"H", CGT=>"R", CGC=>"R", CTA=>"L", CTG=>"L", CCA=>"P", CCG=>"P", CAA=>"Q", CAG=>"Q", CGA=>"R", CGG=>"R", ATT=>"I", ATC=>"I", ACT=>"T", ACC=>"T", AAT=>"N", AAC=>"N", AGT=>"S", AGC=>"S", ATA=>"M", ACA=>"T", AAA=>"K", AGA=>"*", ATG=>"M", ACG=>"T", AAG=>"K", AGG=>"*", GTT=>"V", GTC=>"V", GCT=>"A", GCC=>"A", GAT=>"D", GAC=>"D", GGT=>"G", GGC=>"G", GTA=>"V", GTG=>"V", GCA=>"A", GCG=>"A", GAA=>"E", GAG=>"E", GGA=>"G", GGG=>"G");
our %codon3m = (TTT=>"Phe", TTC=>"Phe", TCT=>"Ser", TCC=>"Ser", TAT=>"Tyr", TAC=>"Tyr", TGT=>"Cys", TGC=>"Cys", TTA=>"Leu", TCA=>"Ser", TAA=>"*", TGA=>"Trp", TTG=>"Leu", TCG=>"Ser", TAG=>"*", TGG=>"Trp", CTT=>"Leu", CTC=>"Leu", CCT=>"Pro", CCC=>"Pro", CAT=>"His", CAC=>"His", CGT=>"Arg", CGC=>"Arg", CTA=>"Leu", CTG=>"Leu", CCA=>"Pro", CCG=>"Pro", CAA=>"Gln", CAG=>"Gln", CGA=>"Arg", CGG=>"Arg", ATT=>"Ile", ATC=>"Ile", ACT=>"Thr", ACC=>"Thr", AAT=>"Asn", AAC=>"Asn", AGT=>"Ser", AGC=>"Ser", ATA=>"Met", ACA=>"Thr", AAA=>"Lys", AGA=>"*", ATG=>"Met", ACG=>"Thr", AAG=>"Lys", AGG=>"*", GTT=>"Val", GTC=>"Val", GCT=>"Ala", GCC=>"Ala", GAT=>"Asp", GAC=>"Asp", GGT=>"Gly", GGC=>"Gly", GTA=>"Val", GTG=>"Val", GCA=>"Ala", GCG=>"Ala", GAA=>"Glu", GAG=>"Glu", GGA=>"Gly", GGG=>"Gly");
our %codonfullm = (TTT=>"Phenylalanine", TTC=>"Phenylalanine", TCT=>"Serine", TCC=>"Serine", TAT=>"Tyrosine", TAC=>"Tyrosine", TGT=>"Cysteine", TGC=>"Cysteine", TTA=>"Leucine", TCA=>"Serine", TAA=>"Stop", TGA=>"Tryptophan", TTG=>"Leucine", TCG=>"Serine", TAG=>"Stop", TGG=>"Tryptophan", CTT=>"Leucine", CTC=>"Leucine", CCT=>"Proline", CCC=>"Proline", CAT=>"Histidine", CAC=>"Histidine", CGT=>"Arginine", CGC=>"Arginine", CTA=>"Leucine", CTG=>"Leucine", CCA=>"Proline", CCG=>"Proline", CAA=>"Glutamine", CAG=>"Glutamine", CGA=>"Arginine", CGG=>"Arginine", ATT=>"Isoleucine", ATC=>"Isoleucine", ACT=>"Threonine", ACC=>"Threonine", AAT=>"Asparagine", AAC=>"Asparagine", AGT=>"Serine", AGC=>"Serine", ATA=>"Methionine", ACA=>"Threonine", AAA=>"Lysine", AGA=>"Stop", ATG=>"Methionine", ACG=>"Threonine", AAG=>"Lysine", AGG=>"Stop", GTT=>"Valine", GTC=>"Valine", GCT=>"Alanine", GCC=>"Alanine", GAT=>"Aspartic acid", GAC=>"Aspartic acid", GGT=>"Glycine", GGC=>"Glycine", GTA=>"Valine", GTG=>"Valine", GCA=>"Alanine", GCG=>"Alanine", GAA=>"Glutamic acid", GAG=>"Glutamic acid", GGA=>"Glycine", GGG=>"Glycine");
our %codonr1m = (UUU=>"F", UUC=>"F", UCU=>"S", UCC=>"S", UAU=>"Y", UAC=>"Y", UGU=>"C", UGC=>"C", UUA=>"L", UCA=>"S", UAA=>"*", UGA=>"W", UUG=>"L", UCG=>"S", UAG=>"*", UGG=>"W", CUU=>"L", CUC=>"L", CCU=>"P", CCC=>"P", CAU=>"H", CAC=>"H", CGU=>"R", CGC=>"R", CUA=>"L", CUG=>"L", CCA=>"P", CCG=>"P", CAA=>"Q", CAG=>"Q", CGA=>"R", CGG=>"R", AUU=>"I", AUC=>"I", ACU=>"T", ACC=>"T", AAU=>"N", AAC=>"N", AGU=>"S", AGC=>"S", AUA=>"M", ACA=>"T", AAA=>"K", AGA=>"*", AUG=>"M", ACG=>"T", AAG=>"K", AGG=>"*", GUU=>"V", GUC=>"V", GCU=>"A", GCC=>"A", GAU=>"D", GAC=>"D", GGU=>"G", GGC=>"G", GUA=>"V", GUG=>"V", GCA=>"A", GCG=>"A", GAA=>"E", GAG=>"E", GGA=>"G", GGG=>"G");
our %codonr3m = (UUU=>"Phe", UUC=>"Phe", UCU=>"Ser", UCC=>"Ser", UAU=>"Tyr", UAC=>"Tyr", UGU=>"Cys", UGC=>"Cys", UUA=>"Leu", UCA=>"Ser", UAA=>"*", UGA=>"Trp", UUG=>"Leu", UCG=>"Ser", UAG=>"*", UGG=>"Trp", CUU=>"Leu", CUC=>"Leu", CCU=>"Pro", CCC=>"Pro", CAU=>"His", CAC=>"His", CGU=>"Arg", CGC=>"Arg", CUA=>"Leu", CUG=>"Leu", CCA=>"Pro", CCG=>"Pro", CAA=>"Gln", CAG=>"Gln", CGA=>"Arg", CGG=>"Arg", AUU=>"Ile", AUC=>"Ile", ACU=>"Thr", ACC=>"Thr", AAU=>"Asn", AAC=>"Asn", AGU=>"Ser", AGC=>"Ser", AUA=>"Met", ACA=>"Thr", AAA=>"Lys", AGA=>"*", AUG=>"Met", ACG=>"Thr", AAG=>"Lys", AGG=>"*", GUU=>"Val", GUC=>"Val", GCU=>"Ala", GCC=>"Ala", GAU=>"Asp", GAC=>"Asp", GGU=>"Gly", GGC=>"Gly", GUA=>"Val", GUG=>"Val", GCA=>"Ala", GCG=>"Ala", GAA=>"Glu", GAG=>"Glu", GGA=>"Gly", GGG=>"Gly");
our %codonrfullm = (UUU=>"Phenylalanine", UUC=>"Phenylalanine", UCU=>"Serine", UCC=>"Serine", UAU=>"Tyrosine", UAC=>"Tyrosine", UGU=>"Cysteine", UGC=>"Cysteine", UUA=>"Leucine", UCA=>"Serine", UAA=>"Stop", UGA=>"Tryptophan", UUG=>"Leucine", UCG=>"Serine", UAG=>"Stop", UGG=>"Tryptophan", CUU=>"Leucine", CUC=>"Leucine", CCU=>"Proline", CCC=>"Proline", CAU=>"Histidine", CAC=>"Histidine", CGU=>"Arginine", CGC=>"Arginine", CUA=>"Leucine", CUG=>"Leucine", CCA=>"Proline", CCG=>"Proline", CAA=>"Glutamine", CAG=>"Glutamine", CGA=>"Arginine", CGG=>"Arginine", AUU=>"Isoleucine", AUC=>"Isoleucine", ACU=>"Threonine", ACC=>"Threonine", AAU=>"Asparagine", AAC=>"Asparagine", AGU=>"Serine", AGC=>"Serine", AUA=>"Methionine", ACA=>"Threonine", AAA=>"Lysine", AGA=>"Stop", AUG=>"Methionine", ACG=>"Threonine", AAG=>"Lysine", AGG=>"Stop", GUU=>"Valine", GUC=>"Valine", GCU=>"Alanine", GCC=>"Alanine", GAU=>"Aspartic acid", GAC=>"Aspartic acid", GGU=>"Glycine", GGC=>"Glycine", GUA=>"Valine", GUG=>"Valine", GCA=>"Alanine", GCG=>"Alanine", GAA=>"Glutamic acid", GAG=>"Glutamic acid", GGA=>"Glycine", GGG=>"Glycine");
our %iupac = (R=>'AG', Y=>'CT', S=>'GC', W=>'AT', K=>'GT', M=>'AC', A=>'AA', C=>'CC', G=>'GG', T=>'TT', B=>'CGT', D=>'AGT', H=>'ACT', V=>'ACG', N=>'ACGT', '.'=>'-', '-'=>'-');
# process program arguments, set up default values, check for errors, check for existence of db files
processArguments ();
$time and printerr "NOTICE: Current time (before execution) is ", scalar (localtime), "\n";
# print out output file names
if ($geneanno) {
printerr "NOTICE: Output files were written to $outfile.variant_function, $outfile.exonic_variant_function\n";
} elsif ($regionanno) {
printerr "NOTICE: Output file is written to $outfile.${buildver}_$dbtype1\n";
} elsif ($filter) {
printerr "NOTICE: Variants matching filtering criteria are written to $outfile.${buildver}_${dbtype1}_dropped, other variants are written to $outfile.${buildver}_${dbtype1}_filtered\n";
}
# check number of input lines in query, and adjust number of threads accordingly for gene-based annotation
# my general observation is that when input line is less than 1 million, multi-threading does not really improve performance and should be disabled (--mingenelinecount controls this)
# another observation is that when number of threads are too high, the program runs slower (--maxgenethread controls this)
my ($queryfile_line_count, $chunk_line_count);
if ($thread) {
($queryfile_line_count, $chunk_line_count) = calculateChunkLine ($queryfile, $thread);
if ($geneanno) {
if ($queryfile_line_count < $mingenelinecount) { #for gene-based annotation, only use threading if more than 1 million variants are present
printerr ("NOTICE: threading is disabled for gene-based annotation on file with less than $mingenelinecount input lines\n");
$thread = undef;
}
if ($thread and $thread > $maxgenethread) { #do not use too many threads for gene-based annotation
printerr ("NOTICE: number of threads is reduced to $maxgenethread (use --maxgenethread to change this behavior)\n");
$thread = $maxgenethread;
($queryfile_line_count, $chunk_line_count) = calculateChunkLine ($queryfile, $thread); #re-calculate the two measures after adjusting thread
}
}
}
# begin the main program to process gene, region or filter-based annotation
if ($thread) {
my (@thr, $thr);
for my $i (0 .. $thread-1) {
my $start_line = $i * $chunk_line_count + 1; #the $. linecount starts with 1
my $end_line = $start_line + $chunk_line_count - 1;
if ($end_line > $queryfile_line_count) {
$end_line = $queryfile_line_count;
}
printerr ("NOTICE: Creating new threads for query line $start_line to $end_line\n");
if ($geneanno) {
$thr = threads->create(\&annotateQueryByGeneThread, "$outfile.variant_function.$i", "$outfile.exonic_variant_function.$i",
"$outfile.invalid_input.$i", $start_line, $end_line, $i);
} elsif ($regionanno) {
$thr = threads->create(\&annotateQueryByRegionThread, "$outfile.${buildver}_$dbtype1.$i", "$outfile.invalid_input.$i",
$start_line, $end_line, $i);
} elsif ($filter) {
$thr = threads->create(\&filterQueryThread, "$outfile.${buildver}_${dbtype1}_filtered.$i", "$outfile.${buildver}_${dbtype1}_dropped.$i",
"$outfile.invalid_input.$i", $start_line, $end_line, $i);
}
push @thr, $thr;
}
for my $i (0 .. $thread-1) {
$thr[$i]->join(); #wait for the corresponding thread to complete its execution
}
if ($geneanno) {
combineFile ($outfile, "variant_function", "exonic_variant_function", "invalid_input");
} elsif ($regionanno) {
combineFile ($outfile, "${buildver}_$dbtype1", "invalid_input");
} elsif ($filter) {
combineFile ($outfile, "${buildver}_${dbtype1}_filtered", "${buildver}_${dbtype1}_dropped", "invalid_input");
}
} else {
if ($geneanno) {
annotateQueryByGeneThread ("$outfile.variant_function", "$outfile.exonic_variant_function", "$outfile.invalid_input");
} elsif ($regionanno) {
annotateQueryByRegionThread ("$outfile.${buildver}_$dbtype1", "$outfile.invalid_input");
} elsif ($filter) {
filterQueryThread ("$outfile.${buildver}_${dbtype1}_filtered", "$outfile.${buildver}_${dbtype1}_dropped", "$outfile.invalid_input");
} elsif ($downdb) {
downloadDB ();
}
}
# delete invlid_input file if it has zero size, otherwise print out the number of invalid lines
if (not $downdb) {
if (not -z "$outfile.invalid_input") {
printerr "NOTICE: Variants with invalid input format were written to $outfile.invalid_input\n";
} else {
unlink ("$outfile.invalid_input");
}
}
$time and printerr "NOTICE: Current time (after execution) is ", scalar (localtime), "\n";
# combine serveral files produced by multiple threads into one single file, then delete the individual files
sub combineFile {
my ($outfile, @suffix) = @_;
for my $j (0 .. @suffix-1) {
if (not rename ("$outfile.$suffix[$j].0", "$outfile.$suffix[$j]")) {
die "Error: cannot rename $outfile.$suffix[$j].0 to $outfile.$suffix[$j]\n";
}
}
for my $i (1 .. $thread-1) {
my $msg = qx/cat --help 2>&1/ || ''; #collect the output of the system command
if ($msg =~ m/^Usage/) {
for my $j (0 .. @suffix-1) {
system ("cat $outfile.$suffix[$j].$i >> $outfile.$suffix[$j]");
}
} else {
for my $j (0 .. @suffix-1) {
appendFile ("$outfile.$suffix[$j].$i", "$outfile.$suffix[$j]");
}
}
for my $j (0 .. @suffix-1) {
unlink ("$outfile.$suffix[$j].$i");
}
}
}
# Calculate how many query lines should be in each chunk, given the number of threads
sub calculateChunkLine {
my ($queryfile, $thread) = @_;
my $chunk_line_count;
my $queryfile_line_count = `cat $queryfile | wc -l 2>&1`;
chomp $queryfile_line_count;
if ($queryfile_line_count =~ m/^\d+$/) { #integer should be returned if the command finishes successfully
printerr ("NOTICE: the queryfile contains $queryfile_line_count lines\n");
} else { #otherwise, open the file and count the number of lines
open (QUERY, $queryfile);
$queryfile_line_count++ while (<QUERY>);
close (QUERY);
printerr ("NOTICE: the queryfile contains $queryfile_line_count lines\n");
}
if ($queryfile_line_count % $thread == 0) {
$chunk_line_count = $queryfile_line_count/$thread;
} else {
$chunk_line_count = int($queryfile_line_count/$thread)+1; #max number of lines to be read from the input
}
return ($queryfile_line_count, $chunk_line_count);
}
# Append the content of one file to another, in case "cat" system command is not availalbe in the operating system
sub appendFile {
my ($file1, $file2) = @_;
open (FH1, $file1) or die "Error: cannot read from file $file1: $!\n";
open (FH2, ">>$file2") or die "Error: cannot append to file $file2: $!\n";
while (<FH1>) {
print FH2 $_;
}
close (FH1);
close (FH2);
}
# Parse the arguments to this program and identify any potential errors in the arguments and set up appropriate default values
sub processArguments {
my @command_line = @ARGV; #command line argument
GetOptions('verbose|v'=>\$verbose, 'help|h'=>\$help, 'man|m'=>\$man, 'outfile=s'=>\$outfile, 'separate'=>\$separate,
'batchsize=s'=>\$batchsize, 'dbtype=s'=>\$dbtype, 'neargene=i'=>\$neargene, 'genomebinsize=s'=>\$genomebinsize,
'geneanno'=>\$geneanno, 'regionanno'=>\$regionanno, , 'filter'=>\$filter, 'downdb'=>\$downdb, 'buildver=s'=>\$buildver, 'score_threshold=f'=>\$score_threshold,
'normscore_threshold=i'=>\$normscore_threshold, 'minqueryfrac=f'=>\$minqueryfrac, 'expandbin=i'=>\$expandbin, 'splicing_threshold=i'=>\$splicing_threshold,
'maf_threshold=f'=>\$maf_threshold, 'chromosome=s'=>\$chromosome, 'zerostart'=>\$zerostart, 'rawscore'=>\$rawscore, 'memfree=i'=>\$memfree,
'memtotal=i'=>\$memtotal, 'sift_threshold=f'=>\$sift_threshold, 'gff3dbfile=s'=>\$gff3dbfile, 'genericdbfile=s'=>\$genericdbfile, 'vcfdbfile=s'=>\$vcfdbfile,
'time'=>\$time, 'wget!'=>\$wget, 'precedence=s'=>\$precedence, 'webfrom=s'=>\$webfrom, 'colsWanted=s'=>\$colsWanted, 'comment'=>\$comment,
'scorecolumn=i'=>\$scorecolumn, 'poscolumn=s'=>\$poscolumn, 'transcript_function'=>\$transfun, 'exonsort'=>\$exonsort, 'avcolumn=s'=>\$avcolumn, 'bedfile=s'=>\$bedfile,
'hgvs'=>\$hgvs, 'reverse'=>\$reverse, 'indexfilter_threshold=f'=>\$indexfilter_threshold, 'otherinfo'=>\$otherinfo,
'seq_padding=i'=>\$seq_padding, 'indel_splicing_threshold=i'=>\$indel_splicing_threshold, 'infoasscore'=>\$infoasscore, 'firstcodondel!'=>\$firstcodondel,
'aamatrixfile=s'=>\$aamatrixfile, 'gff3attribute'=>\$gff3attr, 'exonicsplicing'=>\$exonicsplicing, 'infosep'=>\$infosep, 'dbm'=>\$dbm, 'idasscore'=>\$idasscore,
'thread=i'=>\$thread, 'maxgenethread=i'=>\$maxgenethread) or pod2usage ();
$help and pod2usage (-verbose=>1, -exitval=>1, -output=>\*STDOUT);
$man and pod2usage (-verbose=>2, -exitval=>1, -output=>\*STDOUT);
@ARGV or pod2usage (-verbose=>0, -exitval=>1, -output=>\*STDOUT);
@ARGV == 2 or pod2usage ("Syntax error");
($queryfile, $dbloc) = @ARGV;
$dbloc =~ s/[\\\/]+$//; #delete the trailing / or \ sign as part of the directory name
$dbloc =~ s{^~([^/]*)} { $1 ? (getpwnam($1))[7] : ( $ENV{HOME} || $ENV{LOGDIR} || (getpwuid($>))[7])}ex; #expand the ~ tilde to full path name
if (defined $batchsize) {
$batchsize =~ s/k$/000/;
$batchsize =~ s/m$/000000/;
$batchsize =~ m/^\d+$/ or pod2usage ("Error: the --batchsize argument must be a positive integer (suffix of k or m is okay)");
} else {
$batchsize = 5_000_000;
}
if (defined $genomebinsize) {
$genomebinsize =~ s/k$/000/;
$genomebinsize =~ s/m$/000000/;
$genomebinsize =~ m/^\d+$/ or pod2usage ("Error: the --genomebinsize argument must be a positive integer (suffix of k or m is okay)");
$genomebinsize > 1000 or pod2suage ("Error: the --genomebinsize argument must be larger than 1000");
} else {
if ($geneanno) {
$genomebinsize = 100_000; #gene usually span large genomic regions
} else {
$genomebinsize = 10_000; #MCE, TFBS, miRNA, etc are small genomic regions
}
}
$verbose ||= 0; #when it is not specified, it is zero
$neargene ||= 1_000; #for upstream/downstream annotation of variants, specify the distance threshold between variants and genes
$expandbin ||= int(2_000_000/$genomebinsize); #for gene-based annotations, when intergenic variants are found, expand to specified number of nearby bins to find closest genes
$outfile ||= $queryfile; #specify the prefix of output file names
#set up log file
if ($downdb) {
if (not -d $dbloc) {
mkdir ($dbloc) or die "Error: the directory $dbloc does not exist and cannot be created\n";
}
my $errfile = File::Spec->catfile ($dbloc, "annovar_downdb.log");
open (LOG, ">$errfile") or die "Error: cannot write LOG information to log file $errfile: $!\n";
} else {
open (LOG, ">$outfile.log") or die "Error: cannot write LOG information to log file $outfile.log: $!\n";
}
print LOG "ANNOVAR Version:\n\t", q/$Date: 2017-07-17 01:17:04 -0400 (Mon, 17 Jul 2017) $/, "\n";
print LOG "ANNOVAR Information:\n\tFor questions, comments, documentation, bug reports and program update, please visit http://www.openbioinformatics.org/annovar/\n";
print LOG "ANNOVAR Command:\n\t$0 @command_line\n";
print LOG "ANNOVAR Started:\n\t", scalar (localtime), "\n";
my $num = 0;
$geneanno and $num++;
$downdb and $num++;
$filter and $num++;
$regionanno and $num++;
$num <= 1 or pod2usage ("Error in argument: please specify only one of --geneanno, -regionanno, --downdb, --filter");
if (not $num) {
$geneanno++;
printerr "NOTICE: The --geneanno operation is set to ON by default\n";
}
my %dbtype1 = ('gene'=>'refGene', 'refgene'=>'refGene', 'knowngene'=>'knownGene', 'ensgene'=>'ensGene', 'band'=>'cytoBand', 'cytoband'=>'cytoBand', 'tfbs'=>'tfbsConsSites', 'mirna'=>'wgRna',
'mirnatarget'=>'targetScanS', 'segdup'=>'genomicSuperDups', 'omimgene'=>'omimGene', 'gwascatalog'=>'gwasCatalog',
'1000g_ceu'=>'CEU.sites.2009_04', '1000g_yri'=>'YRI.sites.2009_04', '1000g_jptchb'=>'JPTCHB.sites.2009_04',
'1000g2010_ceu'=>'CEU.sites.2010_03', '1000g2010_yri'=>'YRI.sites.2010_03', '1000g2010_jptchb'=>'JPTCHB.sites.2010_03',
'1000g2010jul_ceu'=>'CEU.sites.2010_07', '1000g2010jul_yri'=>'YRI.sites.2010_07', '1000g2010jul_jptchb'=>'JPTCHB.sites.2010_07',
'1000g2010nov_all'=>'ALL.sites.2010_11', '1000g2011may_all'=>'ALL.sites.2011_05'
);
if ($geneanno) {
$dbtype ||= 'refGene';
$dbtype1 = $dbtype1{$dbtype} || $dbtype;
} elsif ($regionanno) {
defined $dbtype or pod2usage ("Error in argument: please specify --dbtype (required for the --regionanno operation)");
$dbtype1 = $dbtype1{$dbtype} || $dbtype;
if ($dbtype =~ m/^mce(\d+)way/) { #added 2010Feb16
$dbtype1 = "phastConsElements$1way";
}
if ($dbtype1 eq 'gff3') {
defined $gff3dbfile or pod2usage ("Error in argument: please specify --gff3dbfile for the --dbtype of 'gff3'");
}
} elsif ($filter) {
defined $dbtype or pod2usage ("Error in argument: please specify --dbtype (required for the --filter operation)");
#as of Feb 2012, I no longer check the validity of the database name for -filter operation, to give users the maximum amount of flexibility in designing and using their own favorite databases
$dbtype =~ m/^avsift|generic|1000g_(ceu|yri|jptchb)|1000g2010_(ceu|yri|jptchb)|1000g20\d\d[a-z]{3}_[a-z]+|snp\d+\w+?|vcf|(ljb_[\w\+]+)|esp\d+_[\w]+$/ or print STDERR "NOTICE: the --dbtype $dbtype is assumed to be in generic ANNOVAR database format\n";
$dbtype1 = $dbtype1{$dbtype} || $dbtype;
if ($dbtype1 =~ m/^1000g(20\d\d)([a-z]{3})_([a-z]+)$/) {
my %monthhash = ('jan'=>'01', 'feb'=>'02', 'mar'=>'03', 'apr'=>'04', 'may'=>'05', 'jun'=>'06', 'jul'=>'07', 'aug'=>'08', 'sep'=>'09', 'oct'=>'10', 'nov'=>'11', 'dec'=>'12');
$dbtype1 = uc ($3) . '.sites.' . $1 . '_' . $monthhash{$2};
}
if ($dbtype1 eq 'generic') {
defined $genericdbfile or pod2usage ("Error in argument: please specify --genericdbfile for the --dbtype of 'generic'");
}
if ($dbtype eq 'vcf') {
defined $vcfdbfile or pod2usage ("Error in argument: please specify --vcfdbfile for the --dbtype of 'vcf'");
}
} elsif ($downdb) {
defined $dbtype and pod2usage ("Error in argument: please do not specify --dbtype for the --downdb operation");
$dbtype1 = $dbtype1{$queryfile} || $queryfile;
if ($queryfile =~ m/^mce(\d+)way/) { #added 2013may08
$dbtype1 = "phastConsElements$1way";
}
}
if (not $buildver) {
$buildver = 'hg18';
printerr "NOTICE: The --buildver is set as 'hg18' by default\n";
}
if (defined $score_threshold) {
#$score_threshold >= 0 or pod2usage ("Error in argument: the --score_threshold must be a positive number or zero (you specified $score_threshold)"); #20130208: score_threshold can be anything now as long as it is a number
$geneanno || $downdb and pod2usage ("Error in argument: the --score_threshold is not useful for --geneanno or --downdb operations");
}
if ($normscore_threshold) {
$normscore_threshold >= 0 and $normscore_threshold <= 1000 or pod2usage ("Error in argument: the --normscore_threshold must be between 0 and 1000 (you specified $normscore_threshold)");
$regionanno or pod2usage ("Error in argument: the --normscore_threshold is supported only for the --regionanno operation");
}
if ($zerostart) {
pod2usage ("Error: the -zerostart argument is now obselete and will no longer be supported in ANNOVAR");
}
if (defined $sift_threshold) {
$filter or pod2usage ("Error in argument: the --sift_threshold is supported only for the --filter operation");
$dbtype1 eq 'avsift' or pod2usage ("Error in argument: the --sift_threshold argument can be used only if '--dbtype avsift' is used");
$sift_threshold >= 0 and $sift_threshold <= 1 or pod2usage ("Error in argument: the --sift_threshold must be between 0 and 1 inclusive");
} else {
$dbtype1 eq 'avsift' and printerr "NOTICE: The --sift_threshold is set as 0.05 by default\n";
$sift_threshold = 0.05;
}
if (defined $indexfilter_threshold) {
$filter or pod2usage ("Error in argument: the --indexfilter_threshold is supported only for the --filter operation");
$indexfilter_threshold >= 0 and $indexfilter_threshold <= 1 or pod2usage ("Error in argument: the --indexfilter_threshold must be between 0 and 1 inclusive");
} else {
$indexfilter_threshold = 0.9;
}
#operation-specific argument
if (defined $splicing_threshold) {
$geneanno or pod2usage ("Error in argument: the --splicing_threshold is supported only for the --geneanno operation");
} else {
$splicing_threshold = 2; #for splicing annotation, specify the distance threshold between variants and exon/intron boundaries
}
if (defined $indel_splicing_threshold) {
$geneanno or pod2usage ("Error: the --indel_splicing_threshold is supported only for the --geneanno operation");
}
else {
$indel_splicing_threshold = $splicing_threshold; #if not set, preserve original behavior;
}
if (defined $maf_threshold) {
$filter or pod2usage ("Error in argument: the --maf_threshold is supported only for the --filter operation");
$dbtype =~ m/^1000g/ or pod2usage ("Error in argument: the --maf_threshold is supported only for 1000 Genomes Project data set (try -score_threshold instead)");
} else {
$maf_threshold = 0; #for filter-based annotations on 1000 Genomes Project data, specify the MAF threshold to be used in filtering
}
if (defined $minqueryfrac) {
$regionanno or pod2usage ("Error in argument: the --minqueryfrac is supported only for the --regionanno operation");
} else {
$minqueryfrac = 0; #minimum query overlap to declare a "match" with database records
}
if (defined $gff3dbfile) {
$dbtype eq 'gff3' or pod2usage ("Error in argument: the --gff3dbfile argument can be used only if '--dbtype gff3' is used");
$geneanno or $regionanno or pod2usage ("Error in argument: the --gff3dbfile argument is supported only for the --geneanno or --regionanno operation");
}
if (defined $bedfile) {
$dbtype eq 'bed' or pod2usage ("Error in argument: the --bedfile argument can be used only if '--dbtype bed' is used");
$regionanno or pod2usage ("Error in argument: the --bedfile argument is supported only for the --regionanno operation");
}
if (defined $genericdbfile) {
$filter or pod2usage ("Error in argument: the --genericdbfile argument is supported only for the --filter operation");
}
if (defined $wget) {
$downdb or pod2usage ("Error in argument: the --wget argument is supported only for the --downdb operation");
} else {
$wget = 1; #by default, use wget for downloading files from Internet
}
if (defined $precedence) {
$geneanno or pod2usage ("Error in argument: the --precedence argument is supported only for the --geneanno operation");
if ($precedence =~ m/#/) {
@precedence = split (/#/, $precedence);
} else {
@precedence = split (/,/, $precedence);
}
@precedence >= 2 or pod2usage ("Error in argument: the --precedence argument should be comma delimited");
for my $i (0 .. @precedence-1) {
$precedence[$i] =~ m/^(exonic|intronic|splicing|utr5|utr3|upstream|downstream|splicing|ncrna)$/ or pod2usage ("Error in argument: the --precedence argument contains invalid keywords (valid ones are exonic|intronic|splicing|utr5|utr3|upstream|downstream|splicing)");
}
}
if (defined $colsWanted) {
$regionanno or $filter or pod2usage ("Error in argument: the --colWanted argument is supported only for the --regionanno and --filter operation");
if (lc $colsWanted eq 'all') {
@colsWanted = ('all');
} elsif (lc $colsWanted eq 'none') {
@colsWanted = ('none');
} else {
if ($colsWanted =~ m/#/) { #by default, -colsWanted are comma separated, but occasionally (for example, when use in table_annovar.pl, the user may want to use # to separate the columns)
@colsWanted = split (/#/, $colsWanted);
} else {
@colsWanted = split (/,/, $colsWanted);
}
for my $i (0 .. @colsWanted-1) {
$colsWanted[$i]=~m/^\d+$/ or pod2usage ("Error in argument: the --colsWanted argument ($colsWanted) must be a list of comma delimited numbers or be 'all' or be 'none'");
}
}
}
if (defined $scorecolumn) {
$regionanno or pod2usage ("Error in argument: the --scorecolumn argument is supported only for the --regionanno operation");
}
if (defined $poscolumn) {
$regionanno or pod2usage ("Error in argument: the --poscolumn argument is supported only for the --regionanno operation");
$poscolumn =~ m/^\d+,\d+,\d+$/ or pod2usage ("Error in argument: the --poscolumn must be three integers separated by comma");
}
if ($exonsort) {
$geneanno or pod2usage ("Error in argument: the --exonsort argument is supported only for the --geneanno operation");
}
if (defined $avcolumn) {
if ($avcolumn =~ m/^\d+,\d+,\d+,\d+,\d+$/) {
@avcolumn = split (/,/, $avcolumn);
} elsif ($avcolumn =~ m/^\d+#\d+#\d+#\d+#\d+$/) {
@avcolumn = split (/#/, $avcolumn);
} else {
pod2usage ("Error in argument: the --avcolumn argument must be five integer numbers separated by comma or #");
}
@avcolumn = map {$_-1} @avcolumn;
} else {
@avcolumn = (0..4); #by default, the first five columns are the required AVINPUT information
}
if (defined $webfrom) {
if ($webfrom ne 'ucsc' and $webfrom ne 'annovar') {
$webfrom =~ m#^(http://|ftp://)# or pod2usage ("Error: the --webfrom argument needs to be 'ucsc', 'annovar', or a URL");
}
} else {
$webfrom = 'ucsc';
}
# Padded output
if ($seq_padding) {
open $pad_fh, ">$outfile.seqpad" or die "Error: cannot write to output file $outfile.seqpad: $!\n";
$cDNA_pad = $seq_padding * 3;
}
if ($infoasscore and $idasscore) {
pod2usage ("Error in argument: you can specify either -infoasscore or -idasscore but not both");
}
$maf_threshold >= 0 and $maf_threshold <= 0.5 or pod2usage ("Error in argument: the --maf_threshold must be between 0 and 0.5 (you specified $maf_threshold)");
$minqueryfrac >= 0 and $minqueryfrac <= 1 or pod2usage ("Error in argument: the --minqueryfrac must be between 0 and 1 (you specified $minqueryfrac)");
$memfree and $memfree >= 100_000 || pod2usage ("Error in argument: the --memfree argument must be at least 100000 (in the order of kilobytes)");
$memtotal and $memtotal >= 100_000 || pod2usage ("Error in argument: the --memtotal argument must be at least 100000 (in the order of kilobytes)");
if ($chromosome) {
my @chr;
if ($chromosome =~ m/#/) {
@chr = split (/#/, $chromosome);
} else {
@chr = split (/,/, $chromosome);
}
for my $i (0 .. @chr-1) {
if ($chr[$i] =~ m/^(\d+)-(\d+)$/) {
for my $j ($1 .. $2) {
$valichr{$j}++;
}
} else {
$valichr{$chr[$i]}++;
}
}
printerr "NOTICE: These chromosomes in database will be examined: ", join (",", sort keys %valichr), "\n";
}
if (not defined $firstcodondel) {
$firstcodondel = 1;
}
if (defined $aamatrixfile) {
$geneanno or pod2usage ("Error in argument: the --aamatrix argument can be used only for gene-based annotation");
$aamatrix = readAAMatrixFile ($aamatrixfile);
}
if ($gff3attr) {
$dbtype eq 'gff3' or pod2usage ("Error in argument: the --gff3attr argument can be used only if '--dbtype gff3' is used");
}
if ($thread) { #enable multi-threaded analysis (currently only filter-based annotation is supported)
$downdb and pod2usage ("Error in argument: the --thread argument is not supported for --downdb operation");
$thread > 0 or pod2usage ("Error in argument: the --thread argument must be a positive integer");
if (not eval 'use threads; 1') {
die "Error: your system does not support multi-threaded analysis. Please remove the --thread argument\n";
}
}
if (not defined $maxgenethread) {
$maxgenethread = 6;
} else {
$thread or pod2usage ("Error in argument: the --maxgenethread argument is supported only if --thread is set");
}
if (not defined $mingenelinecount) {
$mingenelinecount = 1_000_000;
} else {
$thread or pod2usage ("Error in argument: the --mingenelinecount argument is supported only if --thread is set");
}
}
# Read the AA matrxi file that scores the functional deleteriousness of a change from one amino acid to another. This is a simple tab-delimited file, with each row/column represent one amino acid
sub readAAMatrixFile {
my ($aamatrixfile) = @_;
my %aamatrix;
open (MATRIX, $aamatrixfile) or die "Error: cannot read from aamatrixfile $aamatrixfile: $!\n";
$_ = <MATRIX>;
s/[\r\n]+$//;
my @aa1 = split (/\t/, uc $_);
shift @aa1;
@aa1 == 20 or die "Error: invalid first line found in aamatrixfile (21 tab-delimited fields expected): <$_>\n";
for my $i (0 .. @aa1-1) {
$aa1[$i] =~ m/^[SRLPTAVGIFYCHQNKDEMW]$/ or die "Error: invalid amino acid identifier found in aamatrixfile (SRLPTAVGIFYCHQNKDEMW expected): <$aa1[$i]>\n";
}
while (<MATRIX>) {
s/[\r\n]+$//;
my @aa2 = split (/\t/, uc $_);
my $aa2 = shift @aa2;
@aa2 == 20 or die "Error: invalid line found in aamatrixfile (21 tab-delimited fields expected): <$_>\n";
$aa2 =~ m/^[SRLPTAVGIFYCHQNKDEMW]$/ or die "Error: invalid amino acid identifier found in aamatrixfile (SRLPTAVGIFYCHQNKDEMW expected): <$_>\n";
for my $j (0 .. @aa2-1) {
$aamatrix{$aa2.$aa1[$j]} = $aa2[$j];
}
}
close (MATRIX);
return (\%aamatrix);
}
# Parse ANNOVAR input lines and detect whether invalid input is present, if not, returns the five ANNOVAR input columns
sub detectInvalidInput {
my ($line) = @_;
my $invalid = 0;
my @nextline = split (/\s+/, $line);
my ($chr, $start, $end, $ref, $obs) = @nextline[@avcolumn];
if ( not (defined $chr and defined $start and defined $end and defined $ref and defined $obs)) {
$invalid++;
} else {
($ref, $obs) = (uc $ref, uc $obs);
$zerostart and $start++;
$chr =~ s/^chr//;
if ($chr =~ m/[^\w\.]/ or $start =~ m/[^\d]/ or $end =~ m/[^\d]/) { #chr name could contain . (example: GL000212.1, or Zv9_NA###
$invalid++;
} elsif ($ref eq '-' and $obs eq '-' #both are empty allele
or $ref =~ m/[^ACTG0\-]/ #non-standard nucleotide code
or $obs =~ m/[^ACGT0\-]/ #non-standard nucleotide code
or $start =~ m/[^\d]/ #start is not a number
or $end =~ m/[^\d]/ #end is not a number
or $start > $end #start is more than end
or $ref ne '0' and $end-$start+1 != length ($ref) #length mismatch with ref
or $ref eq '-' and $start != $end #length mismatch for insertion
) {
$invalid++;
}
}
return ($invalid, $chr, $start, $end, $ref, $obs);
}
# Control gene-based annotation by submitting a batch request to processNextQueryBatchByGeneThread(), with the support for multi-threading if $start_line, $end_line and $cur_thread is specified
sub annotateQueryByGeneThread {
my ($varfun_file, $exvarfun_file, $invalid_file, $start_line, $end_line, $cur_thread) = @_;
my ($VARFUN, $EXVARFUN, $INVALID, $QUERY);
open ($VARFUN, ">$varfun_file") or die "Error: cannot write to output file $varfun_file: $!\n";
open ($EXVARFUN, ">$exvarfun_file") or die "Error: cannot write to output file $exvarfun_file: $!\n";
open ($INVALID, ">$invalid_file") or die "Error: cannot write to output file $invalid_file: $!\n";
#my ($totalquerycount, $totalinvalidcount, $batchcount) = qw/0 0 1/;
open ($QUERY, $queryfile) or die "Error: cannot read from --queryfile ($queryfile): $!\n";
my ($genedb, $geneidmap, $cdslen, $mrnalen) = readUCSCGeneAnnotation ($dbloc);
my (@variant, $filedone, $batchdone);
my ($linecount, $batchlinecount, $invalid, $invalidcount) = (0, 0);
my ($chr, $start, $end, $ref, $obs, $info);
if ($thread and $start_line) {
for (1 .. $start_line-1) {
$_ = <$QUERY>;
}
}
while (1) {
$_ = <$QUERY>;
if ($thread) {
if ($. > $end_line) {
$_ = undef; #do not read any line any more
}
}
if (not defined $_) {
$filedone++;
} else {
$_ =~ s/[\r\n]+$//;
if ($_ =~ m/^#/ and $comment) { #comment line start with #, do not include this is $linecount
print $VARFUN "#comment\t$comment\t$_\n";
next;
}
($invalid, $chr, $start, $end, $ref, $obs) = detectInvalidInput ($_);
if ($invalid) {
print $INVALID $_, "\n"; #invalid record found
$invalidcount++;
next;
}
push @variant, [$chr, $start, $end, $ref, $obs, $., $_];
$linecount++; #does not include comment line or invalid line
$batchlinecount++;
if ($batchlinecount == $batchsize) {
$batchdone++;
}
}
if ($filedone or $batchdone) {
@variant and printerr "NOTICE: Processing next batch with ${\(scalar @variant)} unique variants in $batchlinecount input lines\n";
processNextQueryBatchByGeneThread (\@variant, $VARFUN, $EXVARFUN, $INVALID, $genedb, $geneidmap, $cdslen, $mrnalen);
@variant = ();
$batchlinecount = 0;
$batchdone = 0;
}
if ($filedone) {
close ($VARFUN);
close ($EXVARFUN);
close ($INVALID);
close ($QUERY);
last;
}
}
}
# Perform the actual gene-based annotation on a batch of input variants
sub processNextQueryBatchByGeneThread {
my ($variant, $VARFUN, $EXVARFUN, $INVALID, $genedb, $geneidmap, $cdslen, $mrnalen) = @_;
my (%refseqvar);
my ($name, $dbstrand, $txstart, $txend, $cdsstart, $cdsend, $exonstart, $exonend, $name2);
for my $i (0 .. @$variant-1) {
my ($chr, $start, $end, $ref, $obs, $curlinecount, $nextline) = @{$variant->[$i]};
my (%intronic, %utr5, %utr3, %exonic, %upstream, %downstream, %ncrna, %intergenic, %splicing, %splicing_anno, %newutr5, %newutr3);
my $foundgenic; #variant found in genic region (between start and end position of a gene in genome)
my ($distl, $distr, $genel, $gener); #for intergenic variant, the distance and gene name to the left and right side of gene
my ($distup, $distdown); #for upstream/downstream variants, the distance to the transcript
my $bin1 = int ($start/$genomebinsize)-1; #start bin
$bin1 < 0 and $bin1=0;
my $bin2 = int ($end/$genomebinsize)+1; #end bin (usually same as start bin, unless the query is really big that spans multiple megabases)
while (not exists $genedb->{$chr, $bin1} and $bin1 > int ($start/$genomebinsize)-$expandbin) { #examine at least 5 bins (by default 5Mb) to the left to make sure that a gene is found in the bin
$bin1 > 0 or last;
$bin1--;
}
while (not exists $genedb->{$chr, $bin2} and $bin2 < int ($end/$genomebinsize)+$expandbin) { #examine at least 5 bins (by default 5Mb) to the right to make sure that a gene is found in the bin
$bin2++;
}
my (%seen);
for my $nextbin ($bin1 .. $bin2) {
exists $genedb->{$chr, $nextbin} or next; #this genome bin has no annotated gene (a complete intergenic region)
for my $nextgene (@{$genedb->{$chr, $nextbin}}) { #when $genedb->{$chr, $nextbin} is undefined, this automatically create an array!!!
($name, $dbstrand, $txstart, $txend, $cdsstart, $cdsend, $exonstart, $exonend, $name2) = @$nextgene;
defined $name2 or printerr "WARNING: name2 field is not provided for transcript $name (start=$txstart end=$txend)\n" and $name2='';
$seen{$name, $txstart} and next; #name and txstart uniquely identify a transcript and chromosome position (sometimes same transcript may map to two nearby positions, such as nearby segmental duplications)
$seen{$name, $txstart}++; #a transcript may be in two adjacent bins, so if one is already scanned, there is no need to work on it again
my $current_ncRNA;
if ($transfun) { #variant_function output contains transcript name, rather than gene name
$name2 = $name;
$name2 =~ s/#\w+#\d+//; #20140711 (issue:multimap) multimap issue
}
if (not $foundgenic or $separate) { #this variant has not hit a genic region yet (and -separate is not specified, 20170712)
if ($start > $txend) {
defined $distl or $distl = $start-$txend and $genel=$name2;
$distl > $start-$txend and $distl = $start-$txend and $genel=$name2; #identify left closest gene
}
if ($end < $txstart) {
defined $distr or $distr = $txstart-$end and $gener=$name2;
$distr > $txstart-$end and $distr = $txstart-$end and $gener=$name2; #identify right closest gene
}
}
if ($end < $txstart) {
#query ---
#gene <-*----*->
not $separate and $foundgenic and last; #if found a genic annotation already, end the search of the bins (however, if -separate is specified, we should not end the search)
if ($end > $txstart - $neargene) {
if ($dbstrand eq '+') {
$upstream{$name2}++;
$distup = $distr;
} else {
$downstream{$name2}++;
$distdown = $distr;
}
} else {
last; #if transcript is too far away from end, end the search of the bins
}
} elsif ($start > $txend) {
#query ---
#gene <-*----*->
if ($separate || not $foundgenic and $start < $txend + $neargene) { #similar to above, if separate is specified, we still record up/downstream regardless of foundgenic
if ($dbstrand eq '+') {
$downstream{$name2}++;
$distdown = $distl;
} else {
$upstream{$name2}++;
$distup = $distl;
}
}
#} elsif ($cdsstart == $cdsend+1) { #non-coding RNA (could be microRNA, or could be due to lack of CDS annotation for mRNA such as NR_026730 or BC039000). Previously we already did cdsstart++ so here the cdsstart is more than cdsend
# if ($start >= $txstart and $start <= $txend or $end >= $txstart and $end <= $txend or $start <= $txstart and $end >= $txend) {
# $ncrna{$name2}++;
# $foundgenic++;
# }
} else { #query overlaps with coding region of gene
#change in 2011jul24: handle ncRNA and protein coding gene together but with the ncRNA flag when printing out results in the future
if ($cdsstart == $cdsend+1) { #non-coding RNA (could be microRNA, or could be due to lack of CDS annotation for mRNA such as NR_026730 or BC039000). Previously we already did cdsstart++ so here the cdsstart is more than cdsend
if ($start >= $txstart and $start <= $txend or $end >= $txstart and $end <= $txend or $start <= $txstart and $end >= $txend) {
$ncrna{$name2}++;
$foundgenic++;
}
#now treat this ncRNA as if it is a protein-coding gene
($cdsstart, $cdsend) = ($txstart, $txend);
$current_ncRNA++; #current transcript is a noncoding transcript
}
my ($lenintron, $lenexon) = (0, 0); #cumulative intron/exon length at a given exon
my ($rcdsstart, $rcdsend, $rvarstart, $rvarend); #start of coding and variant in reference mRNA sequence
my @exonstart = @$exonstart;
my @exonend = @$exonend;
my $foundexonic;
my $utr_canno; #cDNA level annotation for UTR variants
if ($dbstrand eq '+') { #forward strand, search from left to right (first exon to last exon)
for my $k (0 .. @exonstart-1) {
$k and $lenintron += ($exonstart[$k]-$exonend[$k-1]-1); #calculate cumulative intron length
$lenexon += ($exonend[$k]-$exonstart[$k]+1);
if ($cdsstart >= $exonstart[$k]) { #calculate CDS start accurately by considering intron length
$rcdsstart = $cdsstart-$txstart-$lenintron+1;
if ($cdsstart <= $exonend[$k]) { #CDS start is within this exon
$lenexon = ($exonend[$k]-$cdsstart+1);
} else { #CDS start is in previous exon
#$lenexon += ($exonend[$k]-$exonstart[$k]+1);
}
}
if ($cdsend >= $exonstart[$k]) {
$rcdsend = $cdsend-$txstart-$lenintron+1;
}
if ($exonicsplicing) { #when -exonicsplicing argument is set, exonic variants near exon/intron boundary is reported as "exonic;splicing" variant
#splicing calculation (changed 2012may24)
if (@exonstart != 1) {
if ( $k == 0 and $start >= $exonend[$k]-$splicing_threshold+1 and $start <= $exonend[$k]+$splicing_threshold) {
$splicing{$name2}++;
} elsif ($k == @exonstart-1 and $start >= $exonstart[$k]-$splicing_threshold and $start <= $exonstart[$k]+$splicing_threshold-1) {
$splicing{$name2}++;
} elsif ($k and $k < @exonstart-1) {
if ($start >= $exonstart[$k]-$splicing_threshold and $start <= $exonstart[$k]+$splicing_threshold-1 or $start >= $exonend[$k]-$splicing_threshold+1 and $start <= $exonend[$k]+$splicing_threshold) {
$splicing{$name2}++; #when query start site is close to exon start or exon end
}
}
if ($k == 0 and $end >= $exonend[$k]-$splicing_threshold+1 and $end <= $exonend[$k]+$splicing_threshold) {
$splicing{$name2}++;
} elsif ($k == @exonstart-1 and $end >= $exonstart[$k]-$splicing_threshold and $end <= $exonstart[$k]+$splicing_threshold-1) {
$splicing{$name2}++;
} elsif ($k and $k < @exonstart-1) {
if ($end >= $exonstart[$k]-$splicing_threshold and $end <= $exonstart[$k]+$splicing_threshold-1 or $end >= $exonend[$k]-$splicing_threshold+1 and $end <= $exonend[$k]+$splicing_threshold) {
$splicing{$name2}++; #when query end site is close to exon start or exon end
}
}
if ($k == 0 and $start <= $exonend[$k] and $end >= $exonend[$k]) {
$splicing{$name2}++;
} elsif ($k == @exonstart-1 and $start <= $exonstart[$k] and $end>=$exonstart[$k]) {
$splicing{$name2}++;
} elsif ($k and $k < @exonstart-1) {
if ($start <= $exonstart[$k] and $end>=$exonstart[$k] or $start <= $exonend[$k] and $end >= $exonend[$k]) {
$splicing{$name2}++; #when query encompass the exon/intron boundary
}
}
}
} else {
#splicing calculation (changed 2013feb10)
#splicing calculation (changed again 2013feb21 per Mitsuhiro Komura suggestion)
if (@exonstart != 1) {
if ( $k == 0 and $start >= $exonend[$k]+1 and $start <= $exonend[$k]+$splicing_threshold) { #first exon
$splicing{$name2}++;
} elsif ($k == @exonstart-1 and $end >= $exonstart[$k]-$splicing_threshold and $end <= $exonstart[$k]-1) { #last exon
$splicing{$name2}++;
} elsif ($k and $k < @exonstart-1) { #middle exon
if ($end >= $exonstart[$k]-$splicing_threshold and $end <= $exonstart[$k]-1 or $start >= $exonend[$k]+1 and $start <= $exonend[$k]+$splicing_threshold) {
$splicing{$name2}++; #when query start site is close to exon start or exon end
}
}
}
}
#if name2 is already a splicing variant, but its detailed annotation (like c150-2A>G) is not available, and if this splicing leads to amino acid change (rather than UTR change)
if ($splicing{$name2} and $start==$end and $start>=$cdsstart) {
if ($start >= $exonstart[$k]-$splicing_threshold and $start < $exonstart[$k]) {
#------*-<---->---------<-->-------<------>----
$lenexon -= ($exonend[$k]-$exonstart[$k]); #formerly "$exonend[$k]-$exonstart[$k]+1"; changed this line 2011oct01 given German's comments. The coding portion for acceptor site should be added by one, but for donor site should be fine
$splicing_anno{$name2} .= "$name:exon${\($k+1)}:c.$lenexon-" . ($exonstart[$k]-$start) . "$ref>$obs,";
} elsif ($start > $exonend[$k] and $start <= $exonend[$k]+$splicing_threshold) {
#-------<---->-*--------<-->-------<------>----
$splicing_anno{$name2} .= "$name:exon${\($k+1)}:c.$lenexon+" . ($start-$exonend[$k]) . "$ref>$obs,";
}
}
#if name2 is already a splicing variant, but its detailed annotation (like c150-2A>G) is not available, and if it leads to UTR change
if ($splicing{$name2} and $start==$end and $start<$cdsstart) {
if ($start >= $exonstart[$k]-$splicing_threshold and $start < $exonstart[$k]) {
#------*-<---->---------<-->-------<------>----
$splicing_anno{$name2} .= "$name:exon${\($k+1)}:UTR5,";
} elsif ($start > $exonend[$k] and $start <= $exonend[$k]+$splicing_threshold) {
#-------<---->-*--------<-->-------<------>----
$splicing_anno{$name2} .= "$name:exon${\($k+1)}:UTR5,";
}
} elsif ($splicing{$name2} and $start==$end and $start>$cdsend) {
if ($start >= $exonstart[$k]-$splicing_threshold and $start < $exonstart[$k]) {
#------*-<---->---------<-->-------<------>----
$splicing_anno{$name2} .= "$name:exon${\($k+1)}:UTR3,";
} elsif ($start > $exonend[$k] and $start <= $exonend[$k]+$splicing_threshold) {
#-------<---->-*--------<-->-------<------>----
$splicing_anno{$name2} .= "$name:exon${\($k+1)}:UTR3,";
}
}
if ($start < $exonstart[$k]) {
if ($end >= $exonstart[$k]) { #exonic
$rvarstart = $exonstart[$k]-$txstart-$lenintron+1;
for my $m ($k .. @exonstart-1) {
$m > $k and $lenintron += ($exonstart[$m]-$exonend[$m-1]-1);
if ($end < $exonstart[$m]) {
#query --------
#gene <--**---******---****---->
$rvarend = $exonend[$m-1]-$txstart-$lenintron+1 + ($exonstart[$m]-$exonend[$m-1]-1);
last;
} elsif ($end <= $exonend[$m]) {
#query -----------
#gene <--**---******---****---->
$rvarend = $end-$txstart-$lenintron+1;
last;
}
}
if (not defined $rvarend) {
$rvarend = $txend-$txstart-$lenintron+1; #if this value is longer than transcript length, it suggest whole gene deletion
}
#here the trick begins to differentiate UTR versus coding exonic
if ($end < $cdsstart) { #usually disrupt/change 5' UTR region, unless the UTR per se is also separated by introns
#query ----
#gene <--*---*->
#$utr5{$name2}++; #positive strand for UTR5
$utr_canno = "$name:c." . ($rvarstart-$rcdsstart) . '_' . ($rvarend-$rcdsstart) . "delins$obs"; #this is an indel
$utr5{$name2} .= "$utr_canno,";
} elsif ($start > $cdsend) {
#query ----
#gene <--*---*->
#$utr3{$name2}++; #positive strand for UTR3
$utr_canno = "$name:c.*" . ($rvarstart-$rcdsend) . '_*' . ($rvarend-$rcdsend) . "delins$obs";
$utr3{$name2} .= "$utr_canno,";
} else {
$exonic{$name2}++;
not $current_ncRNA and $obs and push @{$refseqvar{$name}}, [$rcdsstart, $rvarstart, $rvarend, '+', $curlinecount, $k+1, $nextline]; #refseq CDS start, refseq variant start. obs is non-zero (obs is specified by user)
}
$foundgenic++;
last;
} elsif ($k and $start > $exonend[$k-1]) { #intronic
$intronic{$name2}++;
$foundgenic++;
last;
}
} elsif ($start <= $exonend[$k]) { #exonic
$rvarstart = $start-$txstart-$lenintron+1;
for my $m ($k .. @exonstart-1) {
$m > $k and $lenintron += ($exonstart[$m]-$exonend[$m-1]-1);
if ($end < $exonstart[$m]) {
#query ------
#gene <--**---******---****---->
$rvarend = $exonend[$m-1]-$txstart-$lenintron+1 + ($exonstart[$m]-$exonend[$m-1]-1);
last;
} elsif ($end <= $exonend[$m]) {
#query -----------
#gene <--**---******---****---->
$rvarend = $end-$txstart-$lenintron+1;
last;
}
}
if (not defined $rvarend) {
$rvarend = $txend-$txstart-$lenintron+1; #if this value is longer than transcript length, it suggest whole gene deletion
}
#here is the trick begins to differentiate UTR versus coding exonic
if ($end < $cdsstart) { #usually disrupt/change 5' UTR region, unless the UTR per se is also separated by introns
#query ----
#gene <--*---*->
#$utr5{$name2}++; #positive strand for UTR5
if ($start==$end and $ref ne '-' and $obs ne '-') { #SNV
$utr_canno = "$name:c." . ($rvarstart-$rcdsstart) . "$ref>$obs";
} elsif ($start==$end and $ref eq '-' and $obs ne '-') { #insertion
$utr_canno = "$name:c." . ($rvarstart-$rcdsstart) . '_' . ($rvarstart-$rcdsstart+1) . "ins$obs";
} elsif ($start==$end and $ref ne '-' and $obs eq '-') { #single-base deletion
$utr_canno = "$name:c." . ($rvarstart-$rcdsstart) . "del$obs";
} elsif ($ref ne '-' and $obs eq '-') { #multi-base deletion
$utr_canno = "$name:c." . ($rvarstart-$rcdsstart) . '_' . ($rvarend-$rcdsstart) . "del$obs";
} else {
$utr_canno = "$name:c." . ($rvarstart-$rcdsstart) . '_' . ($rvarend-$rcdsstart) . "delins$obs";
}
$utr5{$name2} .= "$utr_canno,";
} elsif ($start > $cdsend) {
#query ----
#gene <--*---*->
#$utr3{$name2}++; #positive strand for UTR3
if ($start==$end and $ref ne '-' and $obs ne '-') { #SNV
$utr_canno = "$name:c.*" . ($rvarstart-$rcdsend) . "$ref>$obs";
} elsif ($start==$end and $ref eq '-' and $obs ne '-') { #insertion
$utr_canno = "$name:c.*" . ($rvarstart-$rcdsend) . '_*' . ($rvarstart-$rcdsend+1) . "ins$obs";
} elsif ($start==$end and $ref ne '-' and $obs eq '-') { #single-base deletion
$utr_canno = "$name:c.*" . ($rvarstart-$rcdsend) . "del$ref";
} elsif ($ref ne '-' and $obs eq '-') { #multi-base deletion
$utr_canno = "$name:c.*" . ($rvarstart-$rcdsend) . '_*' . ($rvarend-$rcdsend) . "del$ref";
} else {
$utr_canno = "$name:c.*" . ($rvarstart-$rcdsend) . '_*' . ($rvarend-$rcdsend) . "delins$obs";
}
$utr3{$name2} .= "$utr_canno,";
} else {
$exonic{$name2}++;
not $current_ncRNA and $obs and push @{$refseqvar{$name}}, [$rcdsstart, $rvarstart, $rvarend, '+', $curlinecount, $k+1, $nextline]; #queryindex, refseq CDS start, refseq variant start
}
$foundgenic++;
last;
}
}
} elsif ($dbstrand eq '-') { #process negative strand (in the future, this should be fused to the paragraph above for positive strands; for now, I keep them separate for easier debugging)
for (my $k = @exonstart-1; $k>=0; $k--) {
$k < @exonstart-1 and $lenintron += ($exonstart[$k+1]-$exonend[$k]-1); #length of intron before current exon
$lenexon += ($exonend[$k]-$exonstart[$k]+1); #current cDNA position (for splicing calculation)
if ($cdsend <= $exonend[$k]) { #calculate CDS start accurately by considering intron length
$rcdsstart = $txend-$cdsend-$lenintron+1;
if ($cdsend >= $exonstart[$k]) { #CDS start within this exon
$lenexon = ($cdsend-$exonstart[$k]+1); #reset lenexon since start is found
} else { #CDS start in prevous exon
#$lenexon += ($exonend[$k]-$exonstart[$k]+1);
}
}
if ($cdsstart <= $exonend[$k]) {
#$rcdsend = $cdsend-$txstart-$lenintron+1;
$rcdsend = $txend-$cdsstart-$lenintron+1;
}
if ($exonicsplicing) { #when -exonicsplicing argument is set, exonic variants near exon/intron boundary is reported as "exonic;splicing" variant
#splicing calculation (changed 2012may24)
if (@exonstart != 1) {
if ( $k == 0 and $start >= $exonend[$k]-$splicing_threshold+1 and $start <= $exonend[$k]+$splicing_threshold) {
$splicing{$name2}++;
} elsif ($k == @exonstart-1 and $start >= $exonstart[$k]-$splicing_threshold and $start <= $exonstart[$k]+$splicing_threshold-1) {
$splicing{$name2}++;