marquesafonso committed on
Commit
8d799e6
·
1 Parent(s): b9806d2

add device_type; add char overflow and long pause heuristics

Browse files
Files changed (3) hide show
  1. Dockerfile +2 -0
  2. main.py +4 -2
  3. src/transcriber.py +63 -26
Dockerfile CHANGED
@@ -1,6 +1,8 @@
1
  # Use an official Python runtime as a parent image
2
  FROM python:3.11-slim-bullseye
3
 
 
 
4
  RUN useradd -m -u 1000 user
5
 
6
  # Set the working directory in the container to /app
 
1
  # Use an official Python runtime as a parent image
2
  FROM python:3.11-slim-bullseye
3
 
4
+ # For local setup use:
5
+ # USER root
6
  RUN useradd -m -u 1000 user
7
 
8
  # Set the working directory in the container to /app
main.py CHANGED
@@ -14,13 +14,14 @@ def main():
14
  model_version = gr.Radio(choices=["deepdml/faster-whisper-large-v3-turbo-ct2",
15
  "turbo",
16
  "large-v3"], value="deepdml/faster-whisper-large-v3-turbo-ct2", label="Select Model")
 
17
  text_output = gr.Textbox(label="SRT Text transcription", show_copy_button=True)
18
  srt_file = gr.File(file_count="single", type="filepath", file_types=[".srt"], label="SRT file")
19
  text_clean_output = gr.Textbox(label="Text transcription", show_copy_button=True)
20
  json_output = gr.JSON(label="JSON Transcription")
21
  gr.Interface(
22
  fn=transcriber,
23
- inputs=[file, file_type, max_words_per_line, task, model_version],
24
  outputs=[text_output, srt_file, text_clean_output, json_output],
25
  allow_flagging="never"
26
  )
@@ -32,13 +33,14 @@ def main():
32
  model_version = gr.Radio(choices=["deepdml/faster-whisper-large-v3-turbo-ct2",
33
  "turbo",
34
  "large-v3"], value="deepdml/faster-whisper-large-v3-turbo-ct2", label="Select Model")
 
35
  text_output = gr.Textbox(label="SRT Text transcription", show_copy_button=True)
36
  srt_file = gr.File(file_count="single", type="filepath", file_types=[".srt"], label="SRT file")
37
  text_clean_output = gr.Textbox(label="Text transcription", show_copy_button=True)
38
  json_output = gr.JSON(label="JSON Transcription")
39
  gr.Interface(
40
  fn=transcriber,
41
- inputs=[file, file_type, max_words_per_line, task, model_version],
42
  outputs=[text_output, srt_file, text_clean_output, json_output],
43
  allow_flagging="never"
44
  )
 
14
  model_version = gr.Radio(choices=["deepdml/faster-whisper-large-v3-turbo-ct2",
15
  "turbo",
16
  "large-v3"], value="deepdml/faster-whisper-large-v3-turbo-ct2", label="Select Model")
17
+ device_type = gr.Radio(choices=["desktop", "mobile"], value="desktop", label="Select Device")
18
  text_output = gr.Textbox(label="SRT Text transcription", show_copy_button=True)
19
  srt_file = gr.File(file_count="single", type="filepath", file_types=[".srt"], label="SRT file")
20
  text_clean_output = gr.Textbox(label="Text transcription", show_copy_button=True)
21
  json_output = gr.JSON(label="JSON Transcription")
22
  gr.Interface(
23
  fn=transcriber,
24
+ inputs=[file, file_type, max_words_per_line, task, model_version, device_type],
25
  outputs=[text_output, srt_file, text_clean_output, json_output],
26
  allow_flagging="never"
27
  )
 
33
  model_version = gr.Radio(choices=["deepdml/faster-whisper-large-v3-turbo-ct2",
34
  "turbo",
35
  "large-v3"], value="deepdml/faster-whisper-large-v3-turbo-ct2", label="Select Model")
36
+ device_type = gr.Radio(choices=["desktop", "mobile"], value="desktop", label="Select Device")
37
  text_output = gr.Textbox(label="SRT Text transcription", show_copy_button=True)
38
  srt_file = gr.File(file_count="single", type="filepath", file_types=[".srt"], label="SRT file")
39
  text_clean_output = gr.Textbox(label="Text transcription", show_copy_button=True)
40
  json_output = gr.JSON(label="JSON Transcription")
41
  gr.Interface(
42
  fn=transcriber,
43
+ inputs=[file, file_type, max_words_per_line, task, model_version, device_type],
44
  outputs=[text_output, srt_file, text_clean_output, json_output],
45
  allow_flagging="never"
46
  )
src/transcriber.py CHANGED
@@ -20,48 +20,84 @@ def convert_seconds_to_time(seconds):
20
  milliseconds = int((remainder - whole_seconds) * 1000)
21
  return f"{int(hours):02}:{int(minutes):02}:{whole_seconds:02},{milliseconds:03}"
22
 
23
- def write_srt(segments, max_words_per_line, srt_path):
24
- with open(srt_path, "w", encoding='utf-8') as file:
25
- result = ''
26
- result_clean = []
27
- json_output = {
28
- "lines": []
29
- }
30
 
 
 
 
 
31
  line_counter = 1
32
- for _, segment in enumerate(segments):
33
- words_in_line = []
34
- for w, word in enumerate(segment.words):
35
- words_in_line.append(word)
36
 
37
- # Write the line if max words limit reached or it's the last word in the segment
38
- if len(words_in_line) == max_words_per_line or w == len(segment.words) - 1:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  if words_in_line:
40
  start_time = convert_seconds_to_time(words_in_line[0].start)
41
  end_time = convert_seconds_to_time(words_in_line[-1].end)
42
- line_text = ' '.join([w.word.strip() for w in words_in_line])
43
 
44
- # SRT format
45
  result += f"{line_counter}\n{start_time} --> {end_time}\n{line_text}\n\n"
46
- result_clean += [line_text]
47
 
48
- # JSON format
49
  json_output["lines"].append({
50
  "line_index": line_counter,
51
  "start": words_in_line[0].start,
52
  "end": words_in_line[-1].end,
53
  "text": line_text,
54
  "words": [
55
- {
56
- "word": w.word.strip(),
57
- "start": w.start,
58
- "end": w.end
59
- } for w in words_in_line
60
  ]
61
  })
62
-
63
  line_counter += 1
64
- words_in_line = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
 
66
  file.write(result)
67
  return result, srt_path, " ".join(result_clean), json.dumps(json_output)
@@ -71,7 +107,8 @@ def transcriber(file_input:gr.File,
71
  file_type: str,
72
  max_words_per_line:int,
73
  task:str,
74
- model_version:str):
 
75
  srt_filepath = os.path.normpath(f"{file_input.split('.')[0]}.srt")
76
  if file_type == "video" :
77
  audio_input = convert_video_to_audio(file_input)
@@ -86,4 +123,4 @@ def transcriber(file_input:gr.File,
86
  vad_parameters=dict(min_silence_duration_ms=500),
87
  word_timestamps=True
88
  )
89
- return write_srt(segments=segments, max_words_per_line=max_words_per_line, srt_path=srt_filepath)
 
20
  milliseconds = int((remainder - whole_seconds) * 1000)
21
  return f"{int(hours):02}:{int(minutes):02}:{whole_seconds:02},{milliseconds:03}"
22
 
23
def write_srt(segments, max_words_per_line, srt_path, device_type):
    """Group word-level timestamps into subtitle lines and write an SRT file.

    Words are accumulated into the current line until adding the next word
    would trigger one of the break heuristics:

    * word overflow - the line already holds ``max_words_per_line`` words;
    * char overflow - the line plus the next word would be longer than the
      per-device character budget (26 for ``"mobile"``, 42 otherwise);
    * long pause - the gap between the previous word's end and the next
      word's start is at least 2.0 seconds.

    Lines are not reset at segment boundaries, so one subtitle line may
    contain words from consecutive segments.

    Args:
        segments: Iterable of segment objects exposing a ``words`` sequence.
            Each word must expose ``word`` (text), ``start`` and ``end``.
            Segments whose ``words`` is ``None`` or empty are skipped.
        max_words_per_line: Maximum number of words allowed on one line.
        srt_path: Destination path of the SRT file (written as UTF-8).
        device_type: ``"mobile"`` selects the narrow character budget; any
            other value selects the desktop budget.

    Returns:
        A 4-tuple ``(srt_text, srt_path, clean_text, json_text)`` where
        ``srt_text`` is the full SRT document, ``clean_text`` is all line
        texts joined by single spaces, and ``json_text`` is a JSON string
        of the form ``{"lines": [...]}`` with per-line and per-word timing.
    """
    # Pause and char heuristics
    max_chars = 26 if device_type == "mobile" else 42
    pause_threshold = 2.0

    srt_blocks = []
    result_clean = []
    json_output = {"lines": []}

    def emit(words):
        # Line numbers are 1-based and follow emission order.
        block, line_text, entry = _format_srt_line(len(srt_blocks) + 1, words)
        srt_blocks.append(block)
        result_clean.append(line_text)
        json_output["lines"].append(entry)

    words_in_line = []
    for segment in segments:
        # ``words`` may be None when word timestamps are unavailable.
        for word in segment.words or ():
            tentative_line = " ".join(
                w.word.strip() for w in [*words_in_line, word]
            )
            word_overflow = len(words_in_line) >= max_words_per_line
            char_overflow = len(tentative_line) > max_chars
            long_pause = bool(words_in_line) and (
                word.start - words_in_line[-1].end >= pause_threshold
            )

            # Close the current line before placing the word that broke it.
            # An empty line is never emitted, so a single over-long word
            # simply becomes a line of its own.
            if words_in_line and (word_overflow or char_overflow or long_pause):
                emit(words_in_line)
                words_in_line = []
            words_in_line.append(word)

    # Flush last line
    if words_in_line:
        emit(words_in_line)

    result = "".join(srt_blocks)

    # Write only after everything was built so a failure while processing
    # the segments does not leave a truncated file behind.
    with open(srt_path, "w", encoding="utf-8") as file:
        file.write(result)
    return result, srt_path, " ".join(result_clean), json.dumps(json_output)


def _format_srt_line(line_index, words):
    """Render one subtitle line as (SRT block, plain text, JSON entry)."""
    first, last = words[0], words[-1]
    start_time = convert_seconds_to_time(first.start)
    end_time = convert_seconds_to_time(last.end)
    line_text = " ".join(w.word.strip() for w in words)

    # SRT
    srt_block = f"{line_index}\n{start_time} --> {end_time}\n{line_text}\n\n"

    # JSON
    json_entry = {
        "line_index": line_index,
        "start": first.start,
        "end": last.end,
        "text": line_text,
        "words": [
            {"word": w.word.strip(), "start": w.start, "end": w.end}
            for w in words
        ],
    }
    return srt_block, line_text, json_entry
 
107
  file_type: str,
108
  max_words_per_line:int,
109
  task:str,
110
+ model_version:str,
111
+ device_type: str):
112
  srt_filepath = os.path.normpath(f"{file_input.split('.')[0]}.srt")
113
  if file_type == "video" :
114
  audio_input = convert_video_to_audio(file_input)
 
123
  vad_parameters=dict(min_silence_duration_ms=500),
124
  word_timestamps=True
125
  )
126
+ return write_srt(segments=segments, max_words_per_line=max_words_per_line, srt_path=srt_filepath, device_type=device_type)