-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcaption_generator.py
66 lines (55 loc) · 2.21 KB
/
caption_generator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import whisper
import cv2
def generate_captions(audio_file):
    """Transcribe *audio_file* with the Whisper "tiny" model.

    The transcription is requested with ``language="en"`` and
    ``verbose=False``. Progress messages are printed before and after the
    transcription.

    Args:
        audio_file: The audio input handed straight to ``model.transcribe``.

    Returns:
        Whatever ``model.transcribe`` returns, unmodified. Elsewhere in this
        file the result is read as ``result["segments"]``, where each segment
        provides ``"text"``, ``"start"`` and ``"end"``.
    """
    print("Transcribing audio...")
    transcription = whisper.load_model("tiny").transcribe(
        audio_file,
        verbose=False,
        language="en",
    )
    print("Audio transcribed")
    return transcription
def generate_lines(text, start, end, fps, width, char_width):
    """Break one transcript segment into caption lines that fit *width*.

    Words are packed greedily: a word joins the current line while the line's
    estimated pixel length (``(len(word) + 1) * char_width`` per word, the
    ``+ 1`` accounting for the separating space) stays within *width*. A word
    that is wider than *width* on its own still gets a line to itself instead
    of producing an empty caption.

    The segment's frame range is shared between the lines in proportion to
    their character counts relative to ``len(text)``; the last line always
    ends on the segment's final frame.

    Args:
        text: Segment text to split.
        start: Segment start time in seconds.
        end: Segment end time in seconds.
        fps: Frames per second used to convert times to frame indices.
        width: Maximum line width in pixels.
        char_width: Estimated pixel width of a single character.

    Returns:
        A list of ``[line_text, start_frame, end_frame]`` lists, in order.
        Empty or whitespace-only *text* yields an empty list.
    """
    print("Generating lines...")
    lines = []
    total_frames = int((end - start) * fps)
    start_frame = int(start * fps)
    total_chars = len(text)

    current_words = []
    line_start_frame = start_frame
    line_length = 0  # estimated pixel length of the current line
    line_chars = 0   # character count (words + one space each) of the current line

    for word in text.split():
        word_length = (len(word) + 1) * char_width
        # Only flush when there is something to flush; otherwise an over-wide
        # first word would emit an empty, zero-duration caption.
        if current_words and line_length + word_length > width:
            # Counting characters directly avoids dividing by char_width.
            line_end_frame = line_start_frame + int(line_chars * total_frames / total_chars)
            lines.append([" ".join(current_words), line_start_frame, line_end_frame])
            current_words = []
            line_start_frame = line_end_frame
            line_length = 0
            line_chars = 0
        current_words.append(word)
        line_length += word_length
        line_chars += len(word) + 1

    if current_words:
        # The last line absorbs any rounding remainder of the segment.
        line_end_frame = start_frame + total_frames
        lines.append([" ".join(current_words), line_start_frame, line_end_frame])

    print("Lines generated")
    return lines
def generate_text_lines(result, video_file):
    """Turn a transcription result into frame-timed caption lines for a video.

    The usable caption width is derived from the video: the frame is trimmed
    by equal margins on both sides so that roughly ``height * 9 / 16`` columns
    remain, and 90% of that remaining width is used. The average character
    width is estimated by measuring the first non-empty segment text with
    ``cv2.getTextSize``.

    Args:
        result: Mapping with a ``"segments"`` sequence; each segment provides
            ``"text"``, ``"start"`` and ``"end"``.
        video_file: Video source passed to ``cv2.VideoCapture``.

    Returns:
        The concatenated output of ``generate_lines`` for every segment, or an
        empty list when no segment contains any text.

    Raises:
        ValueError: If no frame can be read from *video_file*.
    """
    FONT = cv2.FONT_HERSHEY_SIMPLEX
    FONT_SCALE = 0.8
    FONT_THICKNESS = 2
    asp = 16 / 9

    segments = result["segments"]
    # Measure against the first segment that actually has text, so an empty
    # segment list or an empty leading segment cannot cause an IndexError or
    # a division by zero below.
    sample_text = next((seg["text"] for seg in segments if seg["text"]), "")
    if not sample_text:
        return []

    cap = cv2.VideoCapture(video_file)
    try:
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        fps = cap.get(cv2.CAP_PROP_FPS)
        ret, frame = cap.read()
    finally:
        # Release the capture even if reading its properties or frame fails.
        cap.release()

    if not ret or frame is None:
        raise ValueError(f"Could not read a frame from video: {video_file!r}")

    # 1 / asp * height == height * 9 / 16; the difference to the full width is
    # split into a left and a right margin that are sliced away.
    crop_left = int(int(width - 1 / asp * height) / 2)
    crop_right = width - int((width - 1 / asp * height) / 2)
    width = frame[:, crop_left:crop_right].shape[1]
    # Keep a 10% safety margin so text does not touch the edges.
    width = width - (width * 0.1)

    sample_size = cv2.getTextSize(sample_text, FONT, FONT_SCALE, FONT_THICKNESS)[0]
    char_width = int(sample_size[0] / len(sample_text))

    text_lines = []
    for segment in segments:
        text_lines.extend(
            generate_lines(
                segment["text"],
                segment["start"],
                segment["end"],
                fps,
                width,
                char_width,
            )
        )
    return text_lines