#! /usr/bin/env -S uv run
# /// script
# requires-python = ">=3.10"
# dependencies = ["yt-dlp", "whisper-cpp-pybind", "faster-whisper", "gradio"]
# ///

# Alternative dependency sets for other ASR backends (kept for reference):
# dependencies = ["yt-dlp", "mlx-whisper", "sherpa-onnx"]
# dependencies = ["yt-dlp", "soundfile", "pywhispercpp", "mlx-whisper"]

from pathlib import Path

# import mlx_whisper
import yt_dlp
from faster_whisper import WhisperModel
# from pywhispercpp.model import Model as whisper_cpp
# from whispercpp import Whisper

def download(url):
  ydl_opts = {
    "format": "m4a/bestaudio/best",
    "postprocessors": [
      {
        "key": "FFmpegExtractAudio",
        "preferredcodec": "wav",
      }
    ],
    "outtmpl": "%(id)s.%(ext)s",
    "postprocessor_args": [
      "-ar",
      "16000",
      "-ac",
      "1",
    ],
  }

  with yt_dlp.YoutubeDL(ydl_opts) as ydl:
    error_code = ydl.download([url])
    if error_code != 0:
      raise Exception("Download failed")

    info = ydl.extract_info(url, download=False)
    # print(info["id"])
    return info["id"] + ".wav"

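# A minimal usage sketch for download() (an illustration only, not called
# anywhere; it assumes ffmpeg is available on PATH, which the
# FFmpegExtractAudio postprocessor above needs to produce the 16 kHz mono wav):
#
#   wav_path = download("https://www.bilibili.com/video/BV1ZMNrejEnH/")
#   print(wav_path)  # e.g. "BV1ZMNrejEnH.wav" in the current working directory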

'''
def asr_mlx(wav_file):
  result = mlx_whisper.transcribe(
    wav_file,
    path_or_hf_repo="mlx-community/whisper-large-v3-turbo",
  )

  for seg in result["segments"]:
    print(seg["text"])


def asr_cpp(wav_file):
  from whisper_cpp import Whisper
  whisper = Whisper("/home/ubuntu/code/whisper.cpp/models/ggml-large-v3-turbo.bin")
  # print(wav_file)
  whisper.transcribe(wav_file, diarize=True)
  whisper.output(output_csv=True, output_jsn=True, output_lrc=True, output_srt=True, output_txt=True, output_vtt=True, log_score=True)
'''


def asr_fast(wav_file):
  model = WhisperModel('large-v3-turbo', device="cpu", compute_type="float32")
  # model = Whisper.from_pretrained(
  #   "ggml-large-v3-turbo.bin",
  #   basedir="/home/ubuntu/code/whisper.cpp/models/",  # for dev
  # )
  # language and initial_prompt bias Whisper toward Mandarin / Simplified
  # Chinese output; the prompt means "The following are sentences in Mandarin."
  segments, info = model.transcribe(
    wav_file, language="zh", initial_prompt="以下是普通话的句子。"
  )
  # print(info)

  # result = [s.text for s in segments if '请不吝点赞' not in s.text]  # drop "please like and subscribe" filler
  result = [s.text for s in segments]
  print(result)
  return '\n'.join(result)

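# Note: faster-whisper's transcribe() returns a lazy generator, so the list
# comprehension above is where the actual decoding happens. A hedged sketch of
# streaming segments with timestamps instead of joining plain text (Segment
# exposes .start, .end and .text):
#
#   segments, _ = model.transcribe(wav_file, language="zh")
#   for seg in segments:
#     print(f"[{seg.start:7.2f}s -> {seg.end:7.2f}s] {seg.text}")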

import gradio as gr

def handleURL(url):
  f = download(url)
  return asr_fast(f)

# demo = gr.Interface(fn=handleURL, inputs=["text"], outputs=["text"])

with gr.Blocks() as demo:
  gr.Markdown(
    """
    # Fetch a video from a URL and transcribe it to text
    ---

    Running on a server with very limited performance; transcription may take roughly 1.5x the length of the original video.  
    Try it locally if you have an Apple Silicon MacBook or a high-performance GPU.  
    <https://gist.github.com/yanyaoer/5cc7b0dd6729f306ad3cb740d501cabd#file-0-video2text-py>

    """)
  name = gr.Textbox(label="video url to transcribe")
  output = gr.Textbox(label="Output")
  greet_btn = gr.Button("submit")
  greet_btn.click(fn=handleURL, inputs=name, outputs=output)

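# Running it (a sketch, assuming the file is saved as 0_video2text.py per the
# gist link above; `uv run` resolves the inline dependencies declared at the
# top of this file):
#
#   uv run 0_video2text.py
#
# then open the local or share URL that Gradio prints.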

if __name__ == "__main__":
  import sys

  u = (
    sys.argv[1] if len(sys.argv) > 1 else "https://www.bilibili.com/video/BV1ZMNrejEnH/"
  )
  # w = download(u)
  # w = "/tmp/BV1ZMNrejEnH.wav"
  # w = "/tmp/1746897004.wav"
  # w = "BV1ZMNrejEnH.wav"
  # w = "abc.wav"
  # asr_fast(w)
  # asr_mlx(w)
  # print(r)
  demo.launch(share=True)