# text-to-video / app.py

import gradio as gr
import torch
from diffusers import StableDiffusionPipeline
from TTS.api import TTS
import moviepy.editor as mp
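
# Pipeline: split the input text into ~5-10 s chunks, synthesize the full narration with
# Coqui TTS, generate one Stable Diffusion image per chunk, then stitch the images and
# the audio into an MP4 with moviepy.
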
def estimate_chunk_durations(text, words_per_second=2.5, min_sec=5, max_sec=10):
    """Split text into chunks whose estimated spoken length is between min_sec and max_sec."""
    words = text.split()
    chunks = []
    current_chunk = []
    current_duration = 0
    for word in words:
        current_chunk.append(word)
        current_duration += 1 / words_per_second
        if current_duration >= min_sec:
            # Flush the chunk once it hits the maximum duration or grows past 20 words
            if current_duration >= max_sec or len(current_chunk) > 20:
                chunks.append(" ".join(current_chunk))
                current_chunk = []
                current_duration = 0
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    total_time = sum(min(max(len(chunk.split()) / words_per_second, min_sec), max_sec) for chunk in chunks)
    print(f"Total estimated time for video: {total_time:.2f} seconds")
    return chunks
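
# Worked example of the chunking above: at 2.5 words/sec, the 5 s minimum is reached at
# word 13, and the chunk is flushed once it exceeds 20 words, so a 30-word input yields
# one 21-word chunk plus a 9-word remainder chunk.
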
def generate_speech(text):
    """Synthesize the full narration to speech.wav with Coqui TTS (Tacotron2-DDC, LJSpeech)."""
    tts = TTS("tts_models/en/ljspeech/tacotron2-DDC")
    wav_path = "speech.wav"
    tts.tts_to_file(text=text, file_path=wav_path)
    return wav_path
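
# Note (untested assumption): recent Coqui TTS releases also accept a device move, e.g.
# TTS("tts_models/en/ljspeech/tacotron2-DDC").to("cuda"), which would speed up synthesis
# when a GPU is available; the CPU default above works either way.
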
def generate_images(chunks, image_size=(640, 480)):
    """Generate one Stable Diffusion image per text chunk and save it as image_<i>.png."""
    pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
    pipe.to("cuda" if torch.cuda.is_available() else "cpu")
    image_paths = []
    for i, chunk in enumerate(chunks):
        print(f"Generating image for chunk {i + 1} of {len(chunks)}: {chunk[:50]}...")  # print part of the chunk
        image = pipe(chunk).images[0]
        image = image.resize(image_size)
        img_path = f"image_{i}.png"
        image.save(img_path)
        image_paths.append(img_path)
    return image_paths
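
# Optional sketch (not enabled above): on CUDA the pipeline can be loaded in half
# precision to roughly halve VRAM use, e.g.
#   pipe = StableDiffusionPipeline.from_pretrained(
#       "CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16
#   ).to("cuda")
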
def create_video(images, durations, speech_path, image_size=(640, 480)):
    """Stitch the images into a video (with black lead-in/lead-out) and attach the narration."""
    clips = [mp.ImageClip(img).set_duration(dur).resize(image_size) for img, dur in zip(images, durations)]
    black_start = mp.ColorClip(image_size, color=(0, 0, 0), duration=1)
    black_end = mp.ColorClip(image_size, color=(0, 0, 0), duration=2)
    video = mp.concatenate_videoclips([black_start] + clips + [black_end])
    audio = mp.AudioFileClip(speech_path)
    final_video = video.set_audio(audio)
    final_video.write_videofile("output.mp4", fps=24)
    return "output.mp4"
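
# Caveat: the per-image durations are word-count estimates, so the image track can drift
# from the narration. A possible fix (a sketch, not used here) is to rescale them to the
# real audio length before building the clips:
#   scale = mp.AudioFileClip(speech_path).duration / sum(durations)
#   durations = [d * scale for d in durations]
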
def process_text(text, image_size):
    chunks = estimate_chunk_durations(text)
    speech_path = generate_speech(text)
    image_paths = generate_images(chunks, image_size)
    # Clamp each chunk's duration to 5-10 s at 2.5 words/sec, matching estimate_chunk_durations
    durations = [min(10, max(5, len(chunk.split()) / 2.5)) for chunk in chunks]
    video_path = create_video(image_paths, durations, speech_path, image_size)
    return video_path

with gr.Blocks() as demo:
    gr.Markdown("# Text-to-Video Generator using AI 🎥")
    text_input = gr.Textbox(label="Enter your text")
    file_input = gr.File(label="Or upload a .txt file")
    image_size_input = gr.Radio(choices=["640x480", "800x600", "1024x768"], label="Select Image Size", value="640x480")
    process_btn = gr.Button("Generate Video")
    output_video = gr.Video()

    def handle_request(text, file, image_size):
        # An uploaded .txt file takes precedence over the textbox contents
        if file is not None:
            with open(file.name, "r") as f:
                text = f.read()
        image_size_dict = {"640x480": (640, 480), "800x600": (800, 600), "1024x768": (1024, 768)}
        return process_text(text, image_size_dict[image_size])

    process_btn.click(handle_request, inputs=[text_input, file_input, image_size_input], outputs=output_video)

demo.launch()