-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcoding_change.pl
636 lines (522 loc) · 25 KB
/
coding_change.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
#!/usr/bin/perl
use warnings;
use strict;
use Pod::Usage;
use Getopt::Long;
our $REVISION = '$Revision: 599af129dbcfd4e85a2da9832c4ae59898e2f3a9 $';
our $DATE = '$Date: 2017-07-17 01:17:05 -0400 (Mon, 17 Jul 2017) $';
our $AUTHOR = '$Author: Kai Wang <kaichop@gmail.com> $';
our ($verbose, $help, $man, $includesnp, $mrnaseq, $onlyAltering, $codingseq, $alltranscript, $newevf, $outfile);
our ($evffile, $genefile, $fastafile);
our (%newevf_p, %newevf_function);
our %codon1 = (TTT=>"F", TTC=>"F", TCT=>"S", TCC=>"S", TAT=>"Y", TAC=>"Y", TGT=>"C", TGC=>"C", TTA=>"L", TCA=>"S", TAA=>"*", TGA=>"*", TTG=>"L", TCG=>"S", TAG=>"*", TGG=>"W", CTT=>"L", CTC=>"L", CCT=>"P", CCC=>"P", CAT=>"H", CAC=>"H", CGT=>"R", CGC=>"R", CTA=>"L", CTG=>"L", CCA=>"P", CCG=>"P", CAA=>"Q", CAG=>"Q", CGA=>"R", CGG=>"R", ATT=>"I", ATC=>"I", ACT=>"T", ACC=>"T", AAT=>"N", AAC=>"N", AGT=>"S", AGC=>"S", ATA=>"I", ACA=>"T", AAA=>"K", AGA=>"R", ATG=>"M", ACG=>"T", AAG=>"K", AGG=>"R", GTT=>"V", GTC=>"V", GCT=>"A", GCC=>"A", GAT=>"D", GAC=>"D", GGT=>"G", GGC=>"G", GTA=>"V", GTG=>"V", GCA=>"A", GCG=>"A", GAA=>"E", GAG=>"E", GGA=>"G", GGG=>"G");
GetOptions('verbose|v'=>\$verbose, 'help|h'=>\$help, 'man|m'=>\$man, 'includesnp'=>\$includesnp, 'mrnaseq'=>\$mrnaseq, 'onlyAltering'=>\$onlyAltering,
'codingseq'=>\$codingseq, 'alltranscript'=>\$alltranscript, 'newevf=s'=>\$newevf, 'outfile=s'=>\$outfile) or pod2usage ();
$help and pod2usage (-verbose=>1, -exitval=>1, -output=>\*STDOUT);
$man and pod2usage (-verbose=>2, -exitval=>1, -output=>\*STDOUT);
@ARGV or pod2usage (-verbose=>0, -exitval=>1, -output=>\*STDOUT);
@ARGV == 3 or pod2usage ("Syntax error");
($evffile, $genefile, $fastafile) = @ARGV;
$codingseq and $mrnaseq || pod2usage ("Error in argument: --mrnaseq is required when --codingseq is specified");
if ($newevf) {
defined $alltranscript or pod2usage ("Error in argument: --alltranscript arguments are required when you specify -newevf");
}
if ($outfile) {
open (STDOUT, ">$outfile") or die "Error: cannot write to output file $outfile: $!";
}
open (EVF, $evffile) or die "Error: cannot read from evffile $evffile: $!\n";
open (GENE, $genefile) or die "Error: cannot read from genefile $genefile: $!\n";
open (FASTA, $fastafile) or die "Error: cannot read from fastafile $fastafile: $!\n";
my (@queue, %need_trans);
my (%flagged_transcript); #transcript with errors (premature stop codon, or no stop codon)
while (<EVF>) {
s/[\r\n]+$//;
m/^line\d+/ or die "Error: invalid record found in exonic_variant_function file $evffile (line number expected): <$_>\n";
my @field = split (/\t/, $_);
$field[2] =~ m/^[\w\-\.]+:([\w\.]+):wholegene/ and next;
$field[2] =~ m/unknown/i and next;
#ensembl gene name may contain - and ., where as UCSC transcript name may contain '.'.
#to handle this situation: "SNAPC1:NM_003082:exon1:c.-4_3AGGCGTGC"
$field[2] =~ m/^[\w\-\.]+?:([\w\.]+?):exon\d+:c.([\w\->]+)(:p.([\w\*]+))?/ or die "Error: invalid record found in exonic_variant_function file (exonic format error): <$_>";
while ($field[2] =~ m/[\w\-\.]+?:([\w\.]+?):exon\d+:c.([\w>]+)(:p.([\w\*]+))?/g) {
my ($transcript, $cchange, $pchange) = ($1, $2, $4);
my ($start, $end, $ref, $obs);
if ($cchange =~ m/^([ACGTacgt])(\d+)([ACGTacgt])$/) {
if ($includesnp) {
($start, $end, $ref, $obs) = ($2, $2, $1, $3);
} else {
next;
}
} elsif ($cchange =~ m/^(\d+)([ACGTacgt])>([ACGTacgt])$/) {
if ($includesnp) {
($start, $end, $ref, $obs) = ($1, $1, $2, $3);
} else {
next;
}
} elsif ($cchange =~ m/^(\d+)_(\d+)delins(\w+)/) { #block substitution
($start, $end, $obs) = ($1, $2, $3);
} elsif ($cchange =~ m/^(\d+)delins(\w+)/) { #block substitution for a single nucleotide such as c.3delinsGT
($start, $end, $ref, $obs) = ($1, $1, 'REF', $2);
} elsif ($cchange =~ m/^(\d+)del(\w+)/) { #single base deletion
($start, $end, $ref, $obs) = ($1, $1, $2, '');
} elsif ($cchange =~ m/^(\d+)_(\d+)del(\w*)/) { #multi-base deletion
($start, $end, $ref, $obs) = ($1, $2, $3, '');
} elsif ($cchange =~ m/^(\d+)_(\d+)ins(\w+)/) { #insertion
($start, $end, $ref, $obs) = ($1, $1, 0, $3); #if end is equal to start, this is an insertion
} elsif ($cchange =~ m/^(\d+)dup(\w+)/) { #insertion
($start, $end, $ref, $obs) = ($1, $1, 0, $2);
} elsif ($cchange =~ m/^(\d+)_(\d+)(\w+)/) { #non-frameshift substitution (example: c.1825_1826TT, now the ref is not included in the string)
($start, $end, $ref, $obs) = ($1, $2, 'REF', $3);
} else {
die "Error: invalid coding change format: <$cchange> within <$_>\n";
}
push @queue, [$field[0], $transcript, $start, $end, $ref, $obs, $cchange, $pchange];
$need_trans{$transcript}++;
$alltranscript or last; #if -alltranscript is set, process all record in this line; otherwise process the first transcript
}
}
my (%mrnastart, %mrnaend);
while (<GENE>) {
s/[\r\n]+$//;
my @field = split (/\t/, $_);
@field >= 11 or die "Error: invalid record found in gene file (>=11 fields expected): <$_>\n";
$field[0] =~ m/^\d+$/ and shift @field; #refGene and ensGene has bin as the first column
my ($name, $strand, $txstart, $txend, $cdsstart, $cdsend, $exonstart, $exonend) = @field[0, 2, 3, 4, 5, 6, 8, 9];
$need_trans{$name} or next;
my ($mrnastart, $mrnaend);
#next we need to make sure that there is no intron between transcription start and translation start (this is rare but it happens when cdsstart is not in the first exon)
my @exonstart = split (/,/, $exonstart);
my @exonend = split (/,/, $exonend);
$txstart++;
$cdsstart++;
@exonstart = map {$_+1} @exonstart;
if ($strand eq '+') {
#<---->-----<--->----<------>----<----->-----<--->
# **********
my $intron = 0;
for my $i (0 .. @exonstart-1) {
$i and $intron += ($exonstart[$i]-$exonend[$i-1]-1);
if ($cdsstart >= $exonstart[$i] and $cdsstart <= $exonend[$i]) {
$mrnastart = $cdsstart-$txstart+1-$intron;
}
if ($cdsend >= $exonstart[$i] and $cdsend <= $exonend[$i]) {
$mrnaend = $cdsend-$txstart+1-$intron;
}
}
} elsif ($strand eq '-') {
#<---->-----<--->----<------>----<----->-----<--->
# **********
my $intron = 0;
for (my $i=@exonstart-1; $i>=0; $i--) {
$i<@exonstart-1 and $intron += ($exonstart[$i+1]-$exonend[$i]-1);
if ($cdsend >= $exonstart[$i] and $cdsend <= $exonend[$i]) {
$mrnastart = $txend-$cdsend+1-$intron;
}
if ($cdsstart >= $exonstart[$i] and $cdsstart <= $exonend[$i]) {
$mrnaend = $txend-$cdsstart+1-$intron;
}
}
}
$mrnastart{$name} = $mrnastart;
$mrnaend{$name} = $mrnaend;
}
my (%mrnaseq);
my ($curname, $curseq, $curstatus); #current name, current sequence, current status (whether flagged as bad transcripts
while (<FASTA>) {
s/[\r\n]+$//;
if (m/^>([\w\.]+)/) {
if ($curseq) {
#some transcripts (such as NM_001075) occur multiple times in a file, sometimes with bad ORF annotation
if ($curstatus) {
1;
} else {
if ($mrnaseq{$curname}) { #if it already exists, the new one must be longer to replace the old one
length ($curseq) > length ($mrnaseq{$curname}) and $mrnaseq{$curname} = $curseq;
} else {
$mrnaseq{$curname} = $curseq;
}
}
}
$curname = $1;
$curseq = '';
$curstatus = 0;
if (m/does not have correct ORF annotation/) {
$curstatus = 1;
}
} else {
$curseq .= $_;
}
if ($curseq) {
$curstatus or $mrnaseq{$curname} = $curseq; #process the last sequence
}
}
#process each element in the queue
for my $i (0 .. @queue-1) {
my ($line, $transcript, $start, $end, $ref, $obs, $cchange, $pchange) = @{$queue[$i]};
$verbose and print STDERR "NOTICE: Processing $line with $transcript, $start, $end, $obs, $cchange\n";
if (not defined $mrnaseq{$transcript}) {
print STDERR "WARNING: cannot find mRNA sequence for $transcript in the fastafile $fastafile\n";
next;
}
if (not defined $mrnastart{$transcript}) {
print STDERR "WARNING: cannot find annotation for $transcript in the genefile $genefile or cannot infer the transcription start site\n";
next;
}
if (not defined $mrnaend{$transcript}) {
print STDERR "WARNING: cannot find annotation for $transcript in the genefile $genefile or cannot infer the transcription end site\n";
next;
}
if ($end > length ($mrnaseq{$transcript})) {
print STDERR "ERROR: transcript end ($mrnaend{$transcript}) for $transcript is longer than transcript length ${\(length ($mrnaseq{$transcript}))}, skipping this transcript\n";
next;
}
if ($mrnaseq) {
my $utr5 = substr ($mrnaseq{$transcript}, 0, $mrnastart{$transcript}-1);
my $utr3 = substr ($mrnaseq{$transcript}, $mrnaend{$transcript});
my ($mrna1, $mrna2);
$mrna1 = $mrnaseq{$transcript};
my $dna = substr ($mrnaseq{$transcript}, $mrnastart{$transcript}-1, $mrnaend{$transcript}-$mrnastart{$transcript}+1);
my @dna = split (//, $dna);
my ($protein1, $protein2);
my $warning = '';
my $cds = $dna;
if ($end > @dna) {
print STDERR "ERROR in $line: end position of variant ($end) in $transcript is longer than coding portion length ${\(scalar @dna)}, skipping this transcript\n";
next;
}
$protein1 = translateDNA ($dna);
$protein1 =~ m/\*\w/ and $warning = '(WARNING: Potential FASTA sequence error!!!)';
if ($start == $end and not $ref and $obs) { #this is an insertion
splice (@dna, $start, 0, $obs);
} else { #this is a substitution
splice (@dna, $start-1, $end-$start+1, $obs);
}
$dna = join ('', @dna);
$mrna2 = $utr5 . $dna . $utr3;
if ($codingseq) { #only print out the coding portion of the sequence
$mrna1 =~ s/(.{100})/$1\n/g;
print ">$line $transcript WILDTYPE CODING SEQUENCE $warning\n";
print "$cds\n";
$mrna2 =~ s/(.{100})/$1\n/g;
print ">$line $transcript CODING SEQUENCE c.$cchange\n";
print "$dna\n";
} else {
$mrna1 =~ s/(.{100})/$1\n/g;
print ">$line $transcript WILDTYPE $warning\n";
print "$mrna1\n";
$mrna2 =~ s/(.{100})/$1\n/g;
print ">$line $transcript c.$cchange\n";
print "$mrna2\n";
}
} else { #process protein sequence change
my $dna = substr ($mrnaseq{$transcript}, $mrnastart{$transcript}-1, $mrnaend{$transcript}-$mrnastart{$transcript}+1);
#my $dna = substr ($mrnaseq{$transcript}, $mrnastart{$transcript}-1);
my @dna = split (//, substr ($mrnaseq{$transcript}, $mrnastart{$transcript}-1)); #now DNA includes the UTR region (to handle stop loss scenario)
my ($protein1, $protein2);
my $warning = '';
my $inframe;
if ($end > @dna) {
print STDERR "WARNING in $line: end position of variant ($end) in $transcript is longer than coding portion length ${\(scalar @dna)}, possibly due to a deletion on a stop codon\n";
#next; #this is feasible to be observed (in ANNOVAR, if a deletion spans stop codon, we artificially increase the c. notation), and we will have to process it
}
$verbose and print STDERR "NOTICE: wild-type DNA sequence is $dna\n";
$protein1 = translateDNA ($dna);
#20170712: I decided to impose a stringent criteria here to make sure that amino acid change calculation is correct
#if ($protein1 =~ m/\*.+/ or $protein1 !~ m/\*$/) {
if ($protein1 =~ m/\*.+/) {
$flagged_transcript{$transcript}++;
$warning = '(WARNING: Potential FASTA sequence error!!!)';
next;
}
#$protein1 =~ m/\*\w/ and $warning = '(WARNING: Potential FASTA sequence error!!!)';
if ($start == $end and not $ref and $obs) { #this is an insertion (ref is treated as 0 when we read the EVF file previously)
splice (@dna, $start, 0, $obs);
$start++;
$end++; #after insertion of $obs, the position of start/end increase by one
if (length($obs) % 3 == 0) {
$inframe++;
}
} else { #this is a substitution
if ($obs eq '') { #deletion
splice (@dna, $start-1, $end-$start+1);
if ($end-$start+1 % 3 == 0) {
$inframe++;
}
} else {
splice (@dna, $start-1, $end-$start+1, split(//, $obs)); #20170712: absolutely important here to split obs into individual elements in an array!!!
if ($end-$start+1-length($obs) % 3 == 0) {
$inframe++;
}
}
}
my $aastart = int(($start-1)/3)+1;
my $aaend1 = int (($end-1)/3)+1;
my $aaend2 = int (($start+length($obs)-1-1)/3)+1;
my ($function, $aachange);
$aachange = ''; #assign default value, since synonymous variants do not have aachange
$dna = join ('', @dna);
$protein2 = translateDNA ($dna);
$verbose and print STDERR "NOTICE: mutated DNA sequence is $dna\n";
if (substr ($protein2, $aastart-1, $aaend2-$aastart+1) =~ m/\*/) {
$function = 'immediate-stopgain';
} elsif ($end>=$mrnaend{$transcript}-2) {
$function = 'immediate-stoploss';
} else {
if ($protein1 eq $protein2) {
$function = 'silent';
$onlyAltering and next;
} else {
$function = 'protein-altering'; #change nonsynonymous to protein-altering to reduce confusion among users 20150604
}
}
#I changed the paragraph below 20150604 to reduce confusion among users: for insertions, I now identify the exact position that has change (rather thant the position where nucleotide changes)
#$aachange = " (position $aastart-$aaend1 changed from " . substr ($protein1, $aastart-1, $aaend1-$aastart+1) . ' to ' . substr ($protein2, $aastart-1, $aaend2-$aastart+1). ')';
#for my $i (0 .. length($protein1)-1) {
# if (substr($protein1, $i, 1) ne substr($protein2, $i, 1)) {
# $aachange = " (position ${\($i+1)} changed from " . substr($protein1, $i, 1) . ' to ' . substr($protein2, $i, 1) . ")";
# last;
# }
#}
#Per user requests, I further changed the above paragraph to the updated one below.
#The goal is to find the exact change of all amino acids (not just the first one), until a stop codon is found
my ($pos1, $pos2, $aa1, $aa2); #pos1: first position with difference aa1:peptide with difference
my $len_max = length($protein1);
$len_max < length($protein2) and $len_max = length($protein2);
my @protein1 = split (//, $protein1);
my @protein2 = split (//, $protein2);
#print STDERR "line 287 Process next variant ($line, $transcript, $start, $end, $ref, $obs, $cchange)\nprotein1=$protein1\nprotein2=$protein2\n";
for my $i (0 .. $len_max-1) {
if ($pos1) { #already in difference zone
#$i < length ($protein1) or die "Error: ($line, $transcript, $start, $end, $ref, $obs, $cchange)\n$protein1\n$protein2";
#$i < length ($protein1) and $aa1 .= substr($protein1, $i, 1);
#$i < length ($protein2) and $aa2 .= substr($protein2, $i, 1);
$i < length ($protein1) and $aa1 .= $protein1[$i];
$i < length ($protein2) and $aa2 .= $protein2[$i];
} else {
#if (substr($protein1, $i, 1) ne substr($protein2, $i, 1)) {
# $pos1 = $i+1;
# $aa1 = substr($protein1, $i, 1);
# $aa2 = substr($protein2, $i, 1);
#}
$i > length($protein1) and next;
$i > length($protein2) and next;
if ($i == length($protein1)) { #sometimes a mutation cause stop loss so the protein2 is much longer (with UTR sequence); however, clearly the transcript has problems since * is not found
$pos1 = $i+1;
$aa1='';
$aa2=$protein2[$i];
} elsif ($i == length($protein2)) { #protein2 is shorter and now it reached
$pos1 = $i+1;
$aa1=$protein1[$i];
$aa2='';
} elsif ($protein1[$i] ne $protein2[$i]) {
$pos1 = $i+1;
$aa1 = $protein1[$i];
$aa2 = $protein2[$i];
}
}
if ($i >= length($protein2)) {
next; #do not consider protein2 now, but we still need to get all the aa1
}
#if (substr($protein2, $i, 1) eq '*') { #stop codon found in mutated protein
if ($protein2[$i] eq '*') { #stop codon found in mutated protein
if ($i < length ($protein1)) {
$pos2 = $i+1;
} else {
$pos2 = length ($protein1);
}
#$pos2 = $i+1;
last;
}
}
#special case: in hg19, chr6:30558477->A is a silent mutation on the stop codon, but previous version of the tool cannot recognize it correctly.
if (not $pos1) { #unable to find the first position with difference with mutated protein, so this is not a protein-altering mutation
#next; #process next mutation in the queue instead (I think even if this is not protein-altering, we should still print it out as exonic_variant_function did not think so)
}
#maximum of pos2 is the end of the wildtype protein
if (not $pos2) {
$pos2 = length($protein1);
}
while (1) { #sometimes aa1 and aa2 may have identical tails
if ($pos1 and $aa1 ne '*' and substr($aa1, -1) eq substr($aa2, -1)) {
$aa1 =~ s/.$//;
$aa2 =~ s/.$//;
$pos2--;
} else {
last;
}
}
if (not $pos1) {
$aachange = " (no amino acid change)";
if (not defined $pchange) {
#it is actually not that simple to handle this situation of silent change. We have to calculate it from c change instead
if ($cchange =~ m/(\d+)/) {
my $pchange_pos = int(($1-1)/3);
if ($pchange_pos >= length($protein1)) { #something is wrong here, the wildtype sequence is not complete possibly
$flagged_transcript{$transcript}++;
next;
}
my $pchange_aa = substr($protein1, $pchange_pos-1, 1);
$pchange = "$pchange_aa$pchange_pos$pchange_aa";
}
} elsif ($pchange =~ m/([\w\*]+?)(\d+)fs/) {
$pchange = "$1$2$1";
}
$function = 'silent';
} elsif ($pos1 == $pos2) { #a simple non-synonymous change
$aachange = " (position $pos1 changed from $aa1 to $aa2)";
$pchange = $aa1 . $pos1 . $aa2;
if ($aa1 eq '*') {
$function = 'immediate-stoploss';
}
if ($aa2 eq '*') {
$function = 'immediate-stopgain';
}
} elsif ($pos1 > $pos2) { #when there is an insertion, pos2 will be 1 less than pos1 (example: 1 1647893 1647893 - TTTCTT)
$aachange = " (position $pos2-$pos1 has insertion $aa2)";
$pchange = substr($protein1, $pos2-1, 1) . $pos2 . '_' . substr($protein1, $pos1-1, 1) . $pos1 . 'ins'. $aa2;
} else {
if ($inframe) { #inframe substitution
$aachange = " (position $pos1-$pos2 changed from $aa1 to $aa2)";
$pchange = substr($aa1, 0, 1) . $pos1. '_' . substr($aa1, length($aa1)-1, 1) . $pos2 . 'delins' . $aa2;
} else {
$aachange = " (position $pos1-$pos2 changed from $aa1 to $aa2)";
$pchange = substr($aa1, 0, 1) . $pos1. substr($aa2, 0, 1) . 'fs*' . length($aa2);
}
}
$protein2 =~ s/\*.+/\*/; #delete anything after the STOP codon
$protein1 =~ s/(.{100})/$1\n/g;
print ">$line $transcript WILDTYPE $warning\n";
print "$protein1\n";
$protein2 =~ s/(.{100})/$1\n/g;
print ">$line $transcript c.$cchange p.$pchange $function $aachange\n";
print "$protein2\n";
if ($newevf) { #when -newevf is specified we need to update the p. notation for the mutation, and sometimes even the exonic-variant-function
$newevf_p{$line, $transcript} = $pchange;
$newevf_function{$line, $transcript} = $function;
}
}
}
#process the new EVF file
if ($newevf) {
open (NEWEVF, ">$newevf") or die "Error: cannot write to new EVF file: $!";
open (EVF, $evffile) or die "Error: cannot read from evffile $evffile: $!\n"; #open EVF again since we will work from the begainning
while (<EVF>) {
#example: line78 nonsynonymous SNV TNFRSF14:NM_001297605:exon1:c.A50G:p.K17R,TNFRSF14:NM_003820:exon1:c.A50G:p.K17R, 1 2488153 2488153 A G
s/[\r\n]+$//;
m/^line\d+/ or die "Error: invalid record found in exonic_variant_function file $evffile (line number expected): <$_>\n";
my @field = split (/\t/, $_);
#if ($field[2] =~ m/^\w+:(\w+):wholegene/ or $field[2] =~ m/unknown/i) { #the problem is that sometimes the 2nd or 3rd entry is whole gene, so I commented this line 20170712 and handle this situation later
# print NEWEVF $_, "\n";
# next;
#}
$field[2] =~ s/,$//; #delete the last comma
my @notation = split (/,/, $field[2]);
my $new_notation;
for my $i (0 .. @notation-1) {
if ($notation[$i] =~ m/^[\w\-\.]+:(\w+):wholegene/ or $field[2] =~ m/unknown/i) {
$new_notation .= $notation[$i] . ',';
next;
}
$notation[$i] =~ m/^([\w\-\.]+?):([\w\.]+?):(exon\d+):(c.[\w\->]+)/ or die "Error: invalid notation found: <$notation[$i]>";
my ($genename, $transcript, $exon, $cdot) = ($1, $2, $3, $4);
#$flagged_transcript{$transcript} and next; #this transcript is flagged for potential error so we simply skip it
if ($newevf_function{$field[0], $transcript}) {
#two special situation to handle: (1) silent mutation that was previously treated as frameshift (2) stopgain mutation
if ($newevf_function{$field[0], $transcript} eq 'silent') { #switch nonframeshift to frameshift, example is 8 8887543 8887545 AAC -
if ($field[1] =~ m/^frameshift/) {
$field[1] = 'non'.$field[1];
}
}
if ($newevf_function{$field[0], $transcript} eq 'immediate-stoploss') { #switch nonframeshift to frameshift, example is 8 8887543 8887545 AAC -
$field[1] = 'stoploss';
}
if ($newevf_function{$field[0], $transcript} eq 'immediate-stopgain') { #switch nonframeshift to frameshift, example is 8 8887543 8887545 AAC -
$field[1] = 'stopgain';
}
$new_notation .= join (":", $genename, $transcript, $exon, $cdot, 'p.'.$newevf_p{$field[0], $transcript}) . ',';
} else {
$new_notation .= $notation[$i] . ',';
}
}
$field[2] = $new_notation; #replace old by new
print NEWEVF join("\t", @field), "\n";
}
%flagged_transcript and print STDERR "Warning: ${\(scalar (keys %flagged_transcript))} transcripts are flagged as having potential ORF issues (premature stopcodon or lack of stop codon)\n";
}
sub translateDNA {
my ($seq) = @_;
my ($nt3, $protein);
$seq = uc $seq;
#length ($seq) % 3 == 0 or printerr "WARNING: length of DNA sequence to be translated is not multiples of 3: <length=${\(length $seq)}>\n";
while ($seq =~ m/(...)/g) {
defined $codon1{$1} or print "WARNING: invalid triplets found in DNA sequence to be translated: <$1> in <$seq>\n" and die;
$protein .= $codon1{$1};
}
return $protein;
}
=head1 SYNOPSIS
coding_change.pl [arguments] <exonic-variant-function-file> <gene-def-file> <fasta-file>
Optional arguments:
-h, --help print help message
-m, --man print complete documentation
-v, --verbose use verbose output
--includesnp process SNPs (default is to process indels only)
--mrnaseq print out mRNA sequence (including UTR and coding)
--onlyAltering ignore synonymous SNPs (when -includesnp is on)
--codingseq only print coding sequence without UTR (when -mrnaseq is specified)
--alltranscript process all transcript rather than the first one (default)
--newevf <file> generate a new exonic-variant-function file with updated c. and p.
Function: infer the translated protein sequence (or mRNA sequence) for exonic
frameshift mutations (or SNPs) identified by ANNOVAR
Example: coding_change.pl ex1.avinput.exonic_variant_function humandb/hg19_refGene.txt humandb/hg19_refGeneMrna.fa
coding_change.pl ex1.avinput.exonic_variant_function humandb/hg19_refGene.txt humandb/hg19_refGeneMrna.fa -includesnp -onlyAltering
Version: $Date: 2017-07-17 01:17:05 -0400 (Mon, 17 Jul 2017) $
=head1 OPTIONS
=over 8
=item B<--help>
print a brief usage message and detailed explanation of options.
=item B<--man>
print the complete manual of the program.
=item B<--verbose>
use verbose output.
=item B<--outfile>
specify the output file name (by default output is written to STDOUT)
=item B<--includesnp>
include sequences affected by SNPs in the output. By default, only sequences
affected by exonic indels are printed out.
=item B<--mrnaseq>
print out mRNA sequences rather than protein sequences.
=item B<--onlyAltering>
when -includesnp is set, do not process synonymous variants and only print out
protein-coding alterations
=item B<--codingseq>
when -mrnaseq is specified, only print coding sequence without UTR sequence
=item B<--alltranscript>
process all transcripts for a mutation and print out separate coding changes. by
default only the first transcript in the line in the exonic_variant_function
file is processed.
=item B<--newevf>
generate an updated exonic-variant-function file with updated c. and p.
notation, based on newly calculated protein sequence. The -alltranscript
arguments are required for this argument.
=back
=head1 DESCRIPTION
This program will infer the protein sequence for frameshift mutations identified
by ANNOVAR. Typically, for non-synonymous mutations, the annotate_variation.pl
program in ANNOVAR will report the amino acid change at the position; however,
for frameshift mutations which may affect a long stretch of amino acid, ANNOVAR
will only give a frameshift annotation without printing out the new protein
sequence. This program can take ANNOVAR exonic_variant_function file and attempt
to infer the new protein sequence.
The <exonic-variant-function-file> is the one generated by
annotate_variation.pl. The <gene-def-file> is the gene definition file that is
used by annotate_variation.pl (which can be examined by looking at the LOG
message. The <fasta-file> is also used by annotate_variation.pl (which can be
examined by looking at the LOG message
=over 8
=item * B<Known Bug>
This program does not handle mitochondria mutations correctly yet.
=back
For questions, comments or bug reports, please contact me at
$Author: Kai Wang <kaichop@gmail.com> $.
=cut