Spaces:
Running
Running
#! /usr/bin/env -S uv run | |
# /// script | |
# requires-python = ">=3.10" | |
# dependencies = ["yt-dlp", "whisper-cpp-pybind", "faster-whisper", "gradio"] | |
# /// | |
# dependencies = ["yt-dlp", "mlx-whisper", "sherpa-onnx"] | |
# dependencies = ["yt-dlp", "soundfile", "pywhispercpp", "mlx-whisper"] | |
from pathlib import Path | |
# import mlx_whisper | |
import yt_dlp | |
from faster_whisper import WhisperModel | |
# from pywhispercpp.model import Model as whisper_cpp | |
# from whispercpp import Whisper | |
def download(url):
    """Download the audio track of *url* and convert it to 16 kHz mono WAV.

    The converted file is written to the current working directory and is
    named after the video id reported by yt-dlp (``<id>.wav``).

    Args:
        url: Address of the video page to fetch.

    Returns:
        The file name of the converted WAV file (``<id>.wav``).

    Raises:
        RuntimeError: If yt-dlp yields no metadata for *url*.
        Exception: Errors raised by yt-dlp itself propagate unchanged.
    """
    ydl_opts = {
        "format": "m4a/bestaudio/best",
        "postprocessors": [
            {
                "key": "FFmpegExtractAudio",
                "preferredcodec": "wav",
            }
        ],
        "outtmpl": "%(id)s.%(ext)s",
        # Passed through to ffmpeg: resample to 16 kHz, downmix to one channel.
        "postprocessor_args": [
            "-ar",
            "16000",
            "-ac",
            "1",
        ],
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        # A single call both downloads the media and returns its metadata,
        # so the page is not fetched a second time just to learn the id.
        info = ydl.extract_info(url, download=True)
    if not info:
        raise RuntimeError("Download failed")
    return info["id"] + ".wav"
''' | |
def asr_mlx(wav_file): | |
result = mlx_whisper.transcribe( | |
wav_file, | |
path_or_hf_repo="mlx-community/whisper-large-v3-turbo", | |
) | |
for seg in result["segments"]: | |
print(seg["text"]) | |
def asr_cpp(wav_file): | |
from whisper_cpp import Whisper | |
whisper = Whisper("/home/ubuntu/code/whisper.cpp/models/ggml-large-v3-turbo.bin") | |
# print(wav_file) | |
whisper.transcribe(wav_file, diarize=True) | |
whisper.output(output_csv=True, output_jsn=True, output_lrc=True, output_srt=True, output_txt=True, output_vtt=True, log_score=True) | |
''' | |
# Lazily created, process-wide model instance (see _get_whisper_model).
_WHISPER_MODEL = None


def _get_whisper_model():
    """Return the shared WhisperModel, constructing it on first use."""
    global _WHISPER_MODEL
    if _WHISPER_MODEL is None:
        _WHISPER_MODEL = WhisperModel(
            "large-v3-turbo", device="cpu", compute_type="float32"
        )
    return _WHISPER_MODEL


def asr_fast(wav_file):
    """Transcribe *wav_file* with faster-whisper and return the text.

    The model is built once and reused across calls, so only the first
    request pays the model-loading cost.

    Args:
        wav_file: Path of the audio file to transcribe.

    Returns:
        The text of every recognised segment, joined with newlines.
    """
    model = _get_whisper_model()
    # The initial prompt reads "The following are Mandarin sentences."
    segments, _info = model.transcribe(
        wav_file, language="zh", initial_prompt="以下是普通话的句子。"
    )
    # Optional filter kept from the original for reference:
    # result = [s.text for s in segments if '请不吝点赞' not in s.text]
    result = [s.text for s in segments]
    print(result)
    return "\n".join(result)
import gradio as gr | |
def handleURL(url):
    """Gradio callback: download the audio behind *url* and transcribe it.

    Args:
        url: Video page address entered by the user.

    Returns:
        The transcript produced by ``asr_fast`` for the downloaded audio.
    """
    wav_path = download(url)
    transcript = asr_fast(wav_path)
    return transcript
# demo = gr.Interface( fn=handleURL, inputs=["text"], outputs=["text"]) | |
# Introductory text rendered at the top of the page.
_INTRO_MARKDOWN = """
# fetch video from url, and transcibe to text
---
running very poor server with performance, maybe 1.5x time cost with orignal video processing.
try yourself locally if your have macbook with silicon or high performance GPU.
<https://gist.github.com/yanyaoer/5cc7b0dd6729f306ad3cb740d501cabd#file-0-video2text-py>
"""

# Web UI: one input box for the video URL, one output box for the
# transcript, and a button that runs handleURL on click.
with gr.Blocks() as demo:
    gr.Markdown(_INTRO_MARKDOWN)
    name = gr.Textbox(label="video url for transcibe")
    output = gr.Textbox(label="Output")
    greet_btn = gr.Button("submit")
    greet_btn.click(handleURL, inputs=name, outputs=output)
if __name__ == "__main__":
    # Start the Gradio app. share=True requests a public share link in
    # addition to the local server (see the Gradio launch() documentation).
    #
    # Sample video that was used for manual testing:
    #   https://www.bilibili.com/video/BV1ZMNrejEnH/
    # To transcribe without the UI, call: asr_fast(download(<url>))
    demo.launch(share=True)