forked from mailong25/self-supervised-speech-recognition
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgen_dict.py
46 lines (34 loc) · 1.34 KB
/
gen_dict.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import argparse
import os
from os.path import join as join_path
import torch
import multiprocessing
import sys
from collections import Counter
from tqdm import tqdm
from sklearn.utils import shuffle
import ntpath
import soundfile
def _count_symbols(lines):
    """Count the letter symbols found in the transcript lines.

    Each line is split on tabs and its second field is used as the
    transcript text. The text is upper-cased, spaces become the word
    boundary symbol ``|``, and one extra ``|`` is counted at the end of
    every transcript.

    Args:
        lines: Iterable of transcript lines without line terminators.

    Returns:
        A ``Counter`` mapping each symbol to its number of occurrences,
        with keys in order of first appearance.

    Raises:
        ValueError: If a non-blank line has no tab-separated second field.
    """
    counts = Counter()
    for line_no, line in enumerate(lines, start=1):
        fields = line.split('\t')
        if len(fields) < 2:
            if not line.strip():
                # Blank lines (e.g. a trailing empty line) carry no
                # transcript; skip them instead of failing.
                continue
            raise ValueError(
                "line {}: expected at least two tab-separated fields, "
                "got {!r}".format(line_no, line)
            )
        text = fields[1].upper().replace(' ', '|')
        # Joining with spaces and re-splitting yields one token per
        # character while dropping any remaining whitespace characters.
        counts.update((' '.join(text) + ' |').split())
    return counts


def main():
    """Generate a letter dictionary file from a transcript file.

    Reads ``--transcript_file``, counts the letter symbols of every
    transcript and writes ``dict.ltr.txt`` into ``--save_dir`` (created if
    missing). The output has one ``<symbol> <count>`` entry per line,
    sorted by descending count, with no trailing newline.

    Raises:
        ValueError: If a non-blank transcript line has no tab-separated
            second field.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--transcript_file", default=None, type=str,
                        required=True, help="Path to transcript file")
    parser.add_argument("--save_dir", default=None, required=True,
                        type=str, help="Directory to save dictionary file")
    args = parser.parse_args()

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(args.save_dir, exist_ok=True)
    dictionary = os.path.join(args.save_dir, 'dict.ltr.txt')

    # Explicit UTF-8 keeps the result independent of the platform locale.
    with open(args.transcript_file, encoding='utf-8') as f:
        counts = _count_symbols(f.read().splitlines())

    # sorted() is stable, so symbols with equal counts keep their
    # first-appearance order.
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    entries = ['{} {}'.format(symbol, count) for symbol, count in ranked]

    with open(dictionary, 'w', encoding='utf-8') as f:
        f.write('\n'.join(entries))
# Run only when executed as a script, so importing this module does not
# trigger argument parsing and file I/O.
if __name__ == "__main__":
    main()