# video2text / main.py
# (Hugging Face Space page residue: uploaded via huggingface_hub,
#  commit 55f7ace, verified — kept as comments so the file parses.)
#! /usr/bin/env -S uv run
# /// script
# requires-python = ">=3.10"
# dependencies = ["yt-dlp", "whisper-cpp-pybind", "faster-whisper", "gradio"]
# ///
# dependencies = ["yt-dlp", "mlx-whisper", "sherpa-onnx"]
# dependencies = ["yt-dlp", "soundfile", "pywhispercpp", "mlx-whisper"]
from pathlib import Path
# import mlx_whisper
import yt_dlp
from faster_whisper import WhisperModel
# from pywhispercpp.model import Model as whisper_cpp
# from whispercpp import Whisper
def download(url):
    """Download the audio track of *url* with yt-dlp as 16 kHz mono WAV.

    The postprocessor extracts audio and resamples it to the format the
    whisper models expect. The file is named after the stable video id.

    Args:
        url: Video page URL understood by yt-dlp (e.g. bilibili/YouTube).

    Returns:
        The local WAV filename, ``"<video id>.wav"``.

    Raises:
        RuntimeError: if yt-dlp reports a non-zero error code.
    """
    ydl_opts = {
        "format": "m4a/bestaudio/best",
        "postprocessors": [
            {
                "key": "FFmpegExtractAudio",
                "preferredcodec": "wav",
            }
        ],
        # Predictable output path keyed on the video id.
        "outtmpl": "%(id)s.%(ext)s",
        # 16 kHz mono is the sample rate whisper models are trained on.
        "postprocessor_args": [
            "-ar",
            "16000",
            "-ac",
            "1",
        ],
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        # Fetch metadata first; the original called extract_info *after*
        # download, costing a second network round-trip for the same info.
        info = ydl.extract_info(url, download=False)
        error_code = ydl.download([url])
        if error_code != 0:
            # RuntimeError is more specific than bare Exception and is
            # still caught by any existing `except Exception` caller.
            raise RuntimeError("Download failed")
    return info["id"] + ".wav"
# NOTE(review): the block below is dead code deliberately parked inside a
# module-level string literal — two alternative ASR backends (mlx-whisper
# and whisper.cpp) that are not wired into the app. Kept verbatim.
'''
def asr_mlx(wav_file):
    result = mlx_whisper.transcribe(
        wav_file,
        path_or_hf_repo="mlx-community/whisper-large-v3-turbo",
    )
    for seg in result["segments"]:
        print(seg["text"])

def asr_cpp(wav_file):
    from whisper_cpp import Whisper
    whisper = Whisper("/home/ubuntu/code/whisper.cpp/models/ggml-large-v3-turbo.bin")
    # print(wav_file)
    whisper.transcribe(wav_file, diarize=True)
    whisper.output(output_csv=True, output_jsn=True, output_lrc=True, output_srt=True, output_txt=True, output_vtt=True, log_score=True)
'''
def asr_fast(wav_file):
    """Transcribe *wav_file* with faster-whisper and return the transcript.

    Mandarin is forced via ``language="zh"`` plus a Chinese initial prompt;
    the transcript is returned with one segment per line. The segment list
    is also printed for server-side logging.
    """
    # CPU + float32 keeps this runnable on hosts without a GPU.
    model = WhisperModel('large-v3-turbo', device="cpu", compute_type="float32")
    segments, _info = model.transcribe(
        wav_file, language="zh", initial_prompt="以下是普通话的句子。"
    )
    # `segments` is a lazy generator — materialize it so we can both log
    # the pieces and join them.
    texts = [segment.text for segment in segments]
    print(texts)
    return '\n'.join(texts)
import gradio as gr
def handleURL(url):
    """Gradio callback: download the video at *url* and return its transcript."""
    wav_path = download(url)
    return asr_fast(wav_path)
# Minimal one-box UI: paste a video URL, press submit, read the transcript.
with gr.Blocks() as demo:
    gr.Markdown(
        """
    # fetch video from url, and transcribe to text
    ---
    running on a very low-performance server; processing may take about 1.5x
    the original video's duration.
    try it yourself locally if you have a MacBook with Apple silicon or a
    high-performance GPU.
    <https://gist.github.com/yanyaoer/5cc7b0dd6729f306ad3cb740d501cabd#file-0-video2text-py>
    """)
    name = gr.Textbox(label="video url to transcribe")
    output = gr.Textbox(label="Output")
    greet_btn = gr.Button("submit")
    # Wire the button to the download-then-transcribe pipeline.
    greet_btn.click(fn=handleURL, inputs=name, outputs=output)
if __name__ == "__main__":
    # The CLI-argument handling and local test paths that used to live here
    # were dead (every consumer was commented out), so they were removed.
    # Launch the Gradio app; share=True also exposes a public tunnel URL.
    demo.launch(share=True)