Spaces:

marquesafonso
/

multilang-asr-transcriber

Running

App Files Community

marquesafonso committed 17 days ago

Commit

8d799e6

1 Parent(s): b9806d2

add device_type; add char overflow and long pause heuristics

Browse files

Files changed (3) hide show

Dockerfile +2 -0
main.py +4 -2
src/transcriber.py +63 -26

Dockerfile CHANGED Viewed

@@ -1,6 +1,8 @@
 # Use an official Python runtime as a parent image
 FROM python:3.11-slim-bullseye
 RUN useradd -m -u 1000 user
 # Set the working directory in the container to /app

 # Use an official Python runtime as a parent image
 FROM python:3.11-slim-bullseye
+# For local setup use:
+# USER root
 RUN useradd -m -u 1000 user
 # Set the working directory in the container to /app

main.py CHANGED Viewed

@@ -14,13 +14,14 @@ def main():
                 model_version = gr.Radio(choices=["deepdml/faster-whisper-large-v3-turbo-ct2",
                                                 "turbo",
                                                 "large-v3"], value="deepdml/faster-whisper-large-v3-turbo-ct2", label="Select Model")
                 text_output = gr.Textbox(label="SRT Text transcription", show_copy_button=True)
                 srt_file = gr.File(file_count="single", type="filepath", file_types=[".srt"], label="SRT file")
                 text_clean_output = gr.Textbox(label="Text transcription", show_copy_button=True)
                 json_output = gr.JSON(label="JSON Transcription")
                 gr.Interface(
                     fn=transcriber,
-                    inputs=[file, file_type, max_words_per_line, task, model_version],
                     outputs=[text_output, srt_file, text_clean_output, json_output],
                     allow_flagging="never"
                 )
@@ -32,13 +33,14 @@ def main():
                 model_version = gr.Radio(choices=["deepdml/faster-whisper-large-v3-turbo-ct2",
                                                 "turbo",
                                                 "large-v3"], value="deepdml/faster-whisper-large-v3-turbo-ct2", label="Select Model")
                 text_output = gr.Textbox(label="SRT Text transcription", show_copy_button=True)
                 srt_file = gr.File(file_count="single", type="filepath", file_types=[".srt"], label="SRT file")
                 text_clean_output = gr.Textbox(label="Text transcription", show_copy_button=True)
                 json_output = gr.JSON(label="JSON Transcription")
                 gr.Interface(
                     fn=transcriber,
-                    inputs=[file, file_type, max_words_per_line, task, model_version],
                     outputs=[text_output, srt_file, text_clean_output, json_output],
                     allow_flagging="never"
                 )

                 model_version = gr.Radio(choices=["deepdml/faster-whisper-large-v3-turbo-ct2",
                                                 "turbo",
                                                 "large-v3"], value="deepdml/faster-whisper-large-v3-turbo-ct2", label="Select Model")
+                device_type = gr.Radio(choices=["desktop", "mobile"], value="desktop", label="Select Device")
                 text_output = gr.Textbox(label="SRT Text transcription", show_copy_button=True)
                 srt_file = gr.File(file_count="single", type="filepath", file_types=[".srt"], label="SRT file")
                 text_clean_output = gr.Textbox(label="Text transcription", show_copy_button=True)
                 json_output = gr.JSON(label="JSON Transcription")
                 gr.Interface(
                     fn=transcriber,
+                    inputs=[file, file_type, max_words_per_line, task, model_version, device_type],
                     outputs=[text_output, srt_file, text_clean_output, json_output],
                     allow_flagging="never"
                 )
                 model_version = gr.Radio(choices=["deepdml/faster-whisper-large-v3-turbo-ct2",
                                                 "turbo",
                                                 "large-v3"], value="deepdml/faster-whisper-large-v3-turbo-ct2", label="Select Model")
+                device_type = gr.Radio(choices=["desktop", "mobile"], value="desktop", label="Select Device")
                 text_output = gr.Textbox(label="SRT Text transcription", show_copy_button=True)
                 srt_file = gr.File(file_count="single", type="filepath", file_types=[".srt"], label="SRT file")
                 text_clean_output = gr.Textbox(label="Text transcription", show_copy_button=True)
                 json_output = gr.JSON(label="JSON Transcription")
                 gr.Interface(
                     fn=transcriber,
+                    inputs=[file, file_type, max_words_per_line, task, model_version, device_type],
                     outputs=[text_output, srt_file, text_clean_output, json_output],
                     allow_flagging="never"
                 )

src/transcriber.py CHANGED Viewed

@@ -20,48 +20,84 @@ def convert_seconds_to_time(seconds):
     milliseconds = int((remainder - whole_seconds) * 1000)
     return f"{int(hours):02}:{int(minutes):02}:{whole_seconds:02},{milliseconds:03}"
-def write_srt(segments, max_words_per_line, srt_path):
-    with open(srt_path, "w", encoding='utf-8') as file:
-        result = ''
-        result_clean = []
-        json_output = {
-            "lines": []
-        }
         line_counter = 1
-        for _, segment in enumerate(segments):
-            words_in_line = []
-            for w, word in enumerate(segment.words):
-                words_in_line.append(word)
-                # Write the line if max words limit reached or it's the last word in the segment
-                if len(words_in_line) == max_words_per_line or w == len(segment.words) - 1:
                     if words_in_line:
                         start_time = convert_seconds_to_time(words_in_line[0].start)
                         end_time = convert_seconds_to_time(words_in_line[-1].end)
-                        line_text = ' '.join([w.word.strip() for w in words_in_line])
-                        # SRT format
                         result += f"{line_counter}\n{start_time} --> {end_time}\n{line_text}\n\n"
-                        result_clean += [line_text]
-                        # JSON format
                         json_output["lines"].append({
                             "line_index": line_counter,
                             "start": words_in_line[0].start,
                             "end": words_in_line[-1].end,
                             "text": line_text,
                             "words": [
-                                {
-                                    "word": w.word.strip(),
-                                    "start": w.start,
-                                    "end": w.end
-                                } for w in words_in_line
                             ]
                         })
                         line_counter += 1
-                    words_in_line = []
         file.write(result)
         return result, srt_path, " ".join(result_clean), json.dumps(json_output)
@@ -71,7 +107,8 @@ def transcriber(file_input:gr.File,
                 file_type: str,
                 max_words_per_line:int,
                 task:str,
-                model_version:str):
     srt_filepath = os.path.normpath(f"{file_input.split('.')[0]}.srt")
     if file_type == "video" :
         audio_input = convert_video_to_audio(file_input)
@@ -86,4 +123,4 @@ def transcriber(file_input:gr.File,
         vad_parameters=dict(min_silence_duration_ms=500),
         word_timestamps=True
     )
-    return write_srt(segments=segments, max_words_per_line=max_words_per_line, srt_path=srt_filepath)

     milliseconds = int((remainder - whole_seconds) * 1000)
     return f"{int(hours):02}:{int(minutes):02}:{whole_seconds:02},{milliseconds:03}"
+def write_srt(segments, max_words_per_line, srt_path, device_type):
+    # Pause and char heuristics
+    max_chars = 26 if device_type == "mobile" else 42
+    pause_threshold = 2.0
+    with open(srt_path, "w", encoding="utf-8") as file:
+        result = ""
+        result_clean = []
+        json_output = {"lines": []}
         line_counter = 1
+        words_in_line = []
+        for segment in segments:
+            for word in segment.words:
+                # Check if adding this word breaks char limit
+                tentative_line = " ".join([w.word.strip() for w in words_in_line + [word]])
+                # Detect pause (gap from previous word)
+                long_pause = False
+                if words_in_line:
+                    prev_word = words_in_line[-1]
+                    if word.start - prev_word.end >= pause_threshold:
+                        long_pause = True
+                word_overflow = len(words_in_line) >= max_words_per_line
+                char_overflow = len(tentative_line) > max_chars
+                # Break conditions
+                if (word_overflow or char_overflow or long_pause):
+                    # Finalize current line
                     if words_in_line:
                         start_time = convert_seconds_to_time(words_in_line[0].start)
                         end_time = convert_seconds_to_time(words_in_line[-1].end)
+                        line_text = " ".join([w.word.strip() for w in words_in_line])
+                        # SRT
                         result += f"{line_counter}\n{start_time} --> {end_time}\n{line_text}\n\n"
+                        result_clean.append(line_text)
+                        # JSON
                         json_output["lines"].append({
                             "line_index": line_counter,
                             "start": words_in_line[0].start,
                             "end": words_in_line[-1].end,
                             "text": line_text,
                             "words": [
+                                {"word": w.word.strip(), "start": w.start, "end": w.end}
+                                for w in words_in_line
                             ]
                         })
                         line_counter += 1
+                    # Start a fresh line with the current word
+                    words_in_line = [word]
+                else:
+                    # keep adding words
+                    words_in_line.append(word)
+        # Flush last line
+        if words_in_line:
+            start_time = convert_seconds_to_time(words_in_line[0].start)
+            end_time = convert_seconds_to_time(words_in_line[-1].end)
+            line_text = " ".join([w.word.strip() for w in words_in_line])
+            result += f"{line_counter}\n{start_time} --> {end_time}\n{line_text}\n\n"
+            result_clean.append(line_text)
+            json_output["lines"].append({
+                "line_index": line_counter,
+                "start": words_in_line[0].start,
+                "end": words_in_line[-1].end,
+                "text": line_text,
+                "words": [
+                    {"word": w.word.strip(), "start": w.start, "end": w.end}
+                    for w in words_in_line
+                ]
+            })
         file.write(result)
         return result, srt_path, " ".join(result_clean), json.dumps(json_output)
                 file_type: str,
                 max_words_per_line:int,
                 task:str,
+                model_version:str,
+                device_type: str):
     srt_filepath = os.path.normpath(f"{file_input.split('.')[0]}.srt")
     if file_type == "video" :
         audio_input = convert_video_to_audio(file_input)
         vad_parameters=dict(min_silence_duration_ms=500),
         word_timestamps=True
     )
+    return write_srt(segments=segments, max_words_per_line=max_words_per_line, srt_path=srt_filepath, device_type=device_type)