#! /usr/bin/env -S uv run
# /// script
# requires-python = ">=3.10"
# dependencies = ["yt-dlp", "whisper-cpp-pybind", "faster-whisper", "gradio"]
# ///
# Alternative dependency sets tried with other ASR backends:
# dependencies = ["yt-dlp", "mlx-whisper", "sherpa-onnx"]
# dependencies = ["yt-dlp", "soundfile", "pywhispercpp", "mlx-whisper"]

from pathlib import Path

# import mlx_whisper
import gradio as gr
import yt_dlp
from faster_whisper import WhisperModel

# from pywhispercpp.model import Model as whisper_cpp
# from whispercpp import Whisper


def download(url):
    """Download the audio track of a video and convert it to 16 kHz mono WAV."""
    ydl_opts = {
        "format": "m4a/bestaudio/best",
        "postprocessors": [
            {
                "key": "FFmpegExtractAudio",
                "preferredcodec": "wav",
            }
        ],
        "outtmpl": "%(id)s.%(ext)s",
        # ffmpeg arguments: 16 kHz sample rate, mono channel (what Whisper expects)
        "postprocessor_args": [
            "-ar",
            "16000",
            "-ac",
            "1",
        ],
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        error_code = ydl.download([url])
        if error_code != 0:
            raise RuntimeError("Download failed")
        info = ydl.extract_info(url, download=False)
        # print(info["id"])
        return info["id"] + ".wav"


'''
def asr_mlx(wav_file):
    result = mlx_whisper.transcribe(
        wav_file,
        path_or_hf_repo="mlx-community/whisper-large-v3-turbo",
    )
    for seg in result["segments"]:
        print(seg["text"])


def asr_cpp(wav_file):
    from whisper_cpp import Whisper

    whisper = Whisper("/home/ubuntu/code/whisper.cpp/models/ggml-large-v3-turbo.bin")
    # print(wav_file)
    whisper.transcribe(wav_file, diarize=True)
    whisper.output(
        output_csv=True,
        output_jsn=True,
        output_lrc=True,
        output_srt=True,
        output_txt=True,
        output_vtt=True,
        log_score=True,
    )
'''


def asr_fast(wav_file):
    """Transcribe a WAV file with faster-whisper; return the text, one segment per line."""
    model = WhisperModel("large-v3-turbo", device="cpu", compute_type="float32")
    # model = Whisper.from_pretrained(
    #     "ggml-large-v3-turbo.bin",
    #     basedir="/home/ubuntu/code/whisper.cpp/models/",  # for dev
    # )
    segments, info = model.transcribe(
        wav_file,
        language="zh",
        # Prompt meaning: "The following are sentences in Mandarin."
        initial_prompt="以下是普通话的句子。",
    )
    # print(info)
    # Optionally drop segments containing "请不吝点赞" ("please give a like"):
    # result = [s.text for s in segments if '请不吝点赞' not in s.text]
    result = [s.text for s in segments]
    print(result)
    return "\n".join(result)


def handleURL(url):
    """Gradio handler: download the audio from the URL, then transcribe it."""
    f = download(url)
    return asr_fast(f)
    # return "Hello, " + name + "!" * int(intensity)


# demo = gr.Interface(fn=handleURL, inputs=["text"], outputs=["text"])

with gr.Blocks() as demo:
    gr.Markdown(
        """
        # Fetch a video from a URL and transcribe it to text
        ---
        This demo runs on a very low-powered server, so transcription may take
        roughly 1.5x the original video's duration. Try it locally if you have
        an Apple Silicon MacBook or a high-performance GPU.
        """
    )
    url_input = gr.Textbox(label="Video URL to transcribe")
    output = gr.Textbox(label="Output")
    submit_btn = gr.Button("Submit")
    submit_btn.click(fn=handleURL, inputs=url_input, outputs=output)


if __name__ == "__main__":
    import sys

    u = (
        sys.argv[1]
        if len(sys.argv) > 1
        else "https://www.bilibili.com/video/BV1ZMNrejEnH/"
    )
    # w = download(u)
    # w = "/tmp/BV1ZMNrejEnH.wav"
    # w = "/tmp/1746897004.wav"
    # w = "BV1ZMNrejEnH.wav"
    # w = "abc.wav"
    # asr_fast(w)
    # asr_mlx(w)
    # print(r)
    demo.launch(share=True)
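
# --- Usage sketch (assumptions: `uv` is installed; the filename `transcribe.py`
# is hypothetical, use whatever this file is saved as) ---
#
#   uv run transcribe.py
#       Starts the Gradio app; paste a video URL into the textbox.
#
#   uv run transcribe.py "https://www.bilibili.com/video/BV1ZMNrejEnH/"
#       A URL passed on the command line is currently only stored in `u`
#       (the direct download/asr_fast calls above are commented out), so the
#       app still expects the URL via the textbox.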