import gradio as gr
import torch
from diffusers import StableDiffusionPipeline
from TTS.api import TTS
import moviepy.editor as mp
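# Dependency note (an assumption, not part of the original file): a Hugging
# Face Space running this app would need roughly these packages in its
# requirements.txt; exact pins were not given in the source.
#   gradio
#   torch
#   diffusers
#   transformers   # pulled in for the Stable Diffusion text encoder
#   TTS            # Coqui TTS
#   moviepy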
def estimate_chunk_durations(text, words_per_second=2.5, min_sec=5, max_sec=10):
    """Split text into chunks that each narrate for roughly min_sec..max_sec."""
    words = text.split()
    chunks = []
    current_chunk = []
    current_duration = 0
    for word in words:
        current_chunk.append(word)
        current_duration += 1 / words_per_second  # each word adds ~0.4 s of speech
        # Close the chunk once it is long enough: at least min_sec of speech,
        # and either max_sec reached or more than 20 words accumulated.
        if current_duration >= min_sec:
            if current_duration >= max_sec or len(current_chunk) > 20:
                chunks.append(" ".join(current_chunk))
                current_chunk = []
                current_duration = 0
    if current_chunk:  # flush any trailing words as a final, possibly short, chunk
        chunks.append(" ".join(current_chunk))
    total_time = sum(min(max(len(chunk.split()) / words_per_second, min_sec), max_sec)
                     for chunk in chunks)
    print(f"Total estimated time for video: {total_time:.2f} seconds")
    return chunks
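# Worked example of the chunking arithmetic (hypothetical input): at
# words_per_second=2.5 each word contributes 0.4 s, so min_sec=5 is reached
# after 13 words and the "len > 20" guard fires at 21 words (~8.4 s). A
# 60-word paragraph therefore splits into chunks of 21, 21, and 18 words.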
def generate_speech(text):
    # Note: the model is (re)loaded on every call; fine for a demo, but slow
    # if the button is pressed repeatedly.
    tts = TTS("tts_models/en/ljspeech/tacotron2-DDC")
    wav_path = "speech.wav"
    tts.tts_to_file(text=text, file_path=wav_path)
    return wav_path
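# A minimal caching sketch (an assumption, not in the original app): loading
# Tacotron2 and Stable Diffusion once per process instead of once per request
# removes most of the per-click latency. generate_speech / generate_images
# could call these helpers instead of constructing the models inline.
from functools import lru_cache

@lru_cache(maxsize=1)
def get_tts_model():
    return TTS("tts_models/en/ljspeech/tacotron2-DDC")

@lru_cache(maxsize=1)
def get_sd_pipeline():
    pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
    return pipe.to("cuda" if torch.cuda.is_available() else "cpu")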
def generate_images(chunks, image_size=(640, 480)):
    pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
    pipe.to("cuda" if torch.cuda.is_available() else "cpu")
    image_paths = []
    for i, chunk in enumerate(chunks):
        # Log progress with a preview of the prompt.
        print(f"Generating image for chunk {i + 1} of {len(chunks)}: {chunk[:50]}...")
        image = pipe(chunk).images[0]  # the chunk text is used directly as the prompt
        image = image.resize(image_size)
        img_path = f"image_{i}.png"
        image.save(img_path)
        image_paths.append(img_path)
    return image_paths
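# Speed and memory note (an assumption about the deployment, not stated in the
# source): on a GPU Space, half precision and fewer denoising steps cut VRAM
# use and latency considerably, e.g.:
#
#   pipe = StableDiffusionPipeline.from_pretrained(
#       "CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16)
#   image = pipe(chunk, num_inference_steps=25).images[0]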
def create_video(images, durations, speech_path, image_size=(640, 480)):
    clips = [mp.ImageClip(img).set_duration(dur).resize(image_size)
             for img, dur in zip(images, durations)]
    # Pad with 1 s of black at the start and 2 s at the end.
    black_start = mp.ColorClip(image_size, color=(0, 0, 0), duration=1)
    black_end = mp.ColorClip(image_size, color=(0, 0, 0), duration=2)
    video = mp.concatenate_videoclips([black_start] + clips + [black_end])
    audio = mp.AudioFileClip(speech_path)
    # The narration is attached as-is; its length may not exactly match the
    # summed image durations, since both are estimated independently.
    final_video = video.set_audio(audio)
    final_video.write_videofile("output.mp4", fps=24)
    return "output.mp4"
def process_text(text, image_size):
    chunks = estimate_chunk_durations(text)
    speech_path = generate_speech(text)
    image_paths = generate_images(chunks, image_size)
    # Same per-chunk estimate as estimate_chunk_durations: word count at
    # 2.5 words/sec, clamped to the 5-10 s range.
    durations = [min(10, max(5, len(chunk.split()) / 2.5)) for chunk in chunks]
    video_path = create_video(image_paths, durations, speech_path, image_size)
    return video_path
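# Example call (hypothetical input): process_text("Once upon a time ...",
# (640, 480)) narrates the text, renders one Stable Diffusion image per
# 5-10 s chunk, and returns the path "output.mp4".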
with gr.Blocks() as demo:
    gr.Markdown("# Text-to-Video Generator using AI 🎥")
    text_input = gr.Textbox(label="Enter your text")
    file_input = gr.File(label="Or upload a .txt file")
    image_size_input = gr.Radio(choices=["640x480", "800x600", "1024x768"],
                                label="Select Image Size", value="640x480")
    process_btn = gr.Button("Generate Video")
    output_video = gr.Video()

    def handle_request(text, file, image_size):
        # An uploaded .txt file takes precedence over the textbox.
        if file is not None:
            with open(file.name, "r") as f:
                text = f.read()
        image_size_dict = {"640x480": (640, 480), "800x600": (800, 600), "1024x768": (1024, 768)}
        return process_text(text, image_size_dict[image_size])

    process_btn.click(handle_request,
                      inputs=[text_input, file_input, image_size_input],
                      outputs=output_video)

demo.launch()