import gradio as gr
import numpy as np
from PIL import Image, ImageDraw
from gradio_client import Client, handle_file
import random
import tempfile
import os
import logging
import torch
from diffusers import AutoencoderKL, TCDScheduler
from diffusers.models.model_loading_utils import load_state_dict
from huggingface_hub import hf_hub_download
# Spaces GPU
try:
    import spaces
except ImportError:
    # Dummy decorator for environments without the Spaces GPU decorator
    class spaces:
        @staticmethod
        def GPU(duration=None):
            def decorator(func):
                return func
            return decorator
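# With this fallback in place, `@spaces.GPU(duration=...)` becomes a no-op
# outside Hugging Face Spaces, so the same script runs in local development.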
# Environment variables
os.environ["TRANSFORMERS_ALLOW_UNSAFE_DESERIALIZATION"] = "1"
# MMAudio imports
try:
    import mmaudio
    from mmaudio.eval_utils import (ModelConfig, all_model_cfg, generate, load_video, make_video,
                                    setup_eval_logging)
    from mmaudio.model.flow_matching import FlowMatching
    from mmaudio.model.networks import MMAudio, get_my_mmaudio
    from mmaudio.model.sequence_config import SequenceConfig
    from mmaudio.model.utils.features_utils import FeaturesUtils
    MMAUDIO_AVAILABLE = True
except ImportError:
    MMAUDIO_AVAILABLE = False
    logging.warning("MMAudio not available. Sound generation will be disabled.")
# Load the ControlNet-based outpainting models
try:
    from controlnet_union import ControlNetModel_Union
    from pipeline_fill_sd_xl import StableDiffusionXLFillPipeline

    # ControlNet-Union (promax variant): build from config, then load weights
    config_file = hf_hub_download(
        "xinsir/controlnet-union-sdxl-1.0",
        filename="config_promax.json",
    )
    config = ControlNetModel_Union.load_config(config_file)
    controlnet_model = ControlNetModel_Union.from_config(config)

    model_file = hf_hub_download(
        "xinsir/controlnet-union-sdxl-1.0",
        filename="diffusion_pytorch_model_promax.safetensors",
    )
    state_dict = load_state_dict(model_file)
    loaded_keys = list(state_dict.keys())

    result = ControlNetModel_Union._load_pretrained_model(
        controlnet_model, state_dict, model_file, "xinsir/controlnet-union-sdxl-1.0", loaded_keys
    )
    model = result[0]
    model = model.to(device="cuda", dtype=torch.float16)

    # fp16-safe SDXL VAE
    vae = AutoencoderKL.from_pretrained(
        "madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16
    ).to("cuda")

    # Fill (outpainting) pipeline
    pipe = StableDiffusionXLFillPipeline.from_pretrained(
        "SG161222/RealVisXL_V5.0_Lightning",
        torch_dtype=torch.float16,
        vae=vae,
        controlnet=model,
        variant="fp16",
    ).to("cuda")

    pipe.scheduler = TCDScheduler.from_config(pipe.scheduler.config)
    OUTPAINT_MODEL_LOADED = True
except Exception as e:
    logging.error(f"Failed to load outpainting models: {str(e)}")
    OUTPAINT_MODEL_LOADED = False
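# OUTPAINT_MODEL_LOADED gates the outpainting tab: if any component above
# failed to load, outpaint_image() returns a flat gray placeholder instead.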
# Configure and load the MMAudio models
if MMAUDIO_AVAILABLE:
    try:
        # CUDA setup
        if torch.cuda.is_available():
            device = torch.device("cuda")
            torch.backends.cuda.matmul.allow_tf32 = True
            torch.backends.cudnn.allow_tf32 = True
            torch.backends.cudnn.benchmark = True
        else:
            device = torch.device("cpu")
        dtype = torch.bfloat16

        # Model configuration
        model_cfg: ModelConfig = all_model_cfg['large_44k_v2']
        model_cfg.download_if_needed()
        setup_eval_logging()

        # Model loading
        def get_mmaudio_model():
            with torch.cuda.device(device):
                seq_cfg = model_cfg.seq_cfg
                net: MMAudio = get_my_mmaudio(model_cfg.model_name).to(device, dtype).eval()
                net.load_weights(torch.load(model_cfg.model_path, map_location=device, weights_only=True))
                logging.info(f'Loaded MMAudio weights from {model_cfg.model_path}')

                feature_utils = FeaturesUtils(
                    tod_vae_ckpt=model_cfg.vae_path,
                    synchformer_ckpt=model_cfg.synchformer_ckpt,
                    enable_conditions=True,
                    mode=model_cfg.mode,
                    bigvgan_vocoder_ckpt=model_cfg.bigvgan_16k_path,
                    need_vae_encoder=False
                ).to(device, dtype).eval()

                return net, feature_utils, seq_cfg

        mmaudio_net, mmaudio_feature_utils, mmaudio_seq_cfg = get_mmaudio_model()
        MMAUDIO_LOADED = True
    except Exception as e:
        logging.error(f"Failed to load MMAudio models: {str(e)}")
        MMAUDIO_LOADED = False
else:
    MMAUDIO_LOADED = False
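# MMAUDIO_LOADED gates the sound features: when False, video_to_audio() and
# add_sound_to_video() return the input video path unchanged.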
# API URLs
TEXT2IMG_API_URL = "http://211.233.58.201:7896"
VIDEO_API_URL = "http://211.233.58.201:7875"

# Logging setup
logging.basicConfig(level=logging.INFO)
# Image size presets
IMAGE_PRESETS = {
    "Custom": {"width": 1024, "height": 1024},
    "1:1 Square": {"width": 1024, "height": 1024},
    "4:3 Standard": {"width": 1024, "height": 768},
    "16:9 Widescreen": {"width": 1024, "height": 576},
    "9:16 Portrait": {"width": 576, "height": 1024},
    "6:19 Tall Portrait": {"width": 324, "height": 1024},
    "Instagram Square": {"width": 1080, "height": 1080},
    "Instagram Story": {"width": 1080, "height": 1920},
    "Instagram Landscape": {"width": 1080, "height": 566},
    "Facebook Cover": {"width": 820, "height": 312},
    "Twitter Header": {"width": 1500, "height": 500},
    "YouTube Thumbnail": {"width": 1280, "height": 720},
    "LinkedIn Banner": {"width": 1584, "height": 396},
}
def update_dimensions(preset):
    if preset in IMAGE_PRESETS:
        return IMAGE_PRESETS[preset]["width"], IMAGE_PRESETS[preset]["height"]
    return 1024, 1024
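# Example: update_dimensions("16:9 Widescreen") returns (1024, 576); any
# preset name not in IMAGE_PRESETS falls back to the 1024x1024 default.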
def generate_text_to_image(prompt, width, height, guidance, inference_steps, seed):
    if not prompt:
        return None, "Please enter a prompt"
    try:
        client = Client(TEXT2IMG_API_URL)
        if seed == -1:
            seed = random.randint(0, 9999999)
        result = client.predict(
            prompt=prompt,
            width=int(width),
            height=int(height),
            guidance=float(guidance),
            inference_steps=int(inference_steps),
            seed=int(seed),
            do_img2img=False,
            init_image=None,
            image2image_strength=0.8,
            resize_img=True,
            api_name="/generate_image"
        )
        return result[0], f"Seed used: {result[1]}"
    except Exception as e:
        logging.error(f"Image generation error: {str(e)}")
        return None, f"Error: {str(e)}"
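# MMAudio flow: sample audio conditioned on CLIP and Synchformer video features
# plus the text prompt using a flow-matching Euler sampler, then mux the
# resulting waveform back onto the original video with make_video().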
def video_to_audio(video_path, prompt, negative_prompt="music", seed=0, num_steps=25, cfg_strength=4.5, target_duration=8.0):
    """Generate a soundtrack for a video and mux it in."""
    if not MMAUDIO_LOADED:
        logging.error("MMAudio model not loaded")
        return video_path
    try:
        rng = torch.Generator(device=device)
        rng.manual_seed(seed)
        fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=num_steps)

        # Load the video, using target_duration
        clip_frames, sync_frames, actual_duration = load_video(video_path, target_duration)
        clip_frames = clip_frames.unsqueeze(0)
        sync_frames = sync_frames.unsqueeze(0)
        mmaudio_seq_cfg.duration = actual_duration
        mmaudio_net.update_seq_lengths(mmaudio_seq_cfg.latent_seq_len, mmaudio_seq_cfg.clip_seq_len, mmaudio_seq_cfg.sync_seq_len)

        # Generate the audio
        audios = generate(clip_frames,
                          sync_frames, [prompt],
                          negative_text=[negative_prompt],
                          feature_utils=mmaudio_feature_utils,
                          net=mmaudio_net,
                          fm=fm,
                          rng=rng,
                          cfg_strength=cfg_strength)
        audio = audios.float().cpu()[0]

        # Mux the audio into the video
        video_save_path = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4').name
        make_video(video_path,
                   video_save_path,
                   audio,
                   sampling_rate=mmaudio_seq_cfg.sampling_rate,
                   duration_sec=mmaudio_seq_cfg.duration)
        return video_save_path
    except Exception as e:
        logging.error(f"Video to audio error: {str(e)}")
        import traceback
        traceback.print_exc()
        return video_path
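# Image-to-video runs on a separate remote Gradio app: the frame is saved to a
# temporary PNG, uploaded with gradio_client's handle_file(), and the remote
# /process endpoint returns a list whose first element contains the video path.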
def generate_video_from_image(image, prompt="", length=4.0):
    if image is None:
        return None
    try:
        # Save the frame to a temporary file
        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as fp:
            temp_path = fp.name
            Image.fromarray(image).save(temp_path)

        # Call the remote API
        client = Client(VIDEO_API_URL)
        result = client.predict(
            input_image=handle_file(temp_path),
            prompt=prompt if prompt else "Generate natural motion",
            n_prompt="",
            seed=random.randint(0, 9999999),
            use_teacache=True,
            video_length=float(length),
            api_name="/process"
        )
        os.unlink(temp_path)
        if result and len(result) > 0:
            video_dict = result[0]
            return video_dict.get("video") if isinstance(video_dict, dict) else None
    except Exception as e:
        logging.error(f"Video generation error: {str(e)}")
        return None
def add_sound_to_video(video_path, sound_prompt, sound_negative_prompt="music"):
    if not video_path or not MMAUDIO_LOADED:
        return video_path
    try:
        return video_to_audio(
            video_path=video_path,
            prompt=sound_prompt,
            negative_prompt=sound_negative_prompt,
            seed=random.randint(0, 9999999),
            num_steps=25,
            cfg_strength=4.5,
            target_duration=8.0  # default duration
        )
    except Exception as e:
        logging.error(f"Sound addition error: {str(e)}")
        return video_path
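# Mask convention for the fill pipeline below: white (255) marks pixels the
# model should generate, black (0) marks pixels to keep. The overlap band
# around the pasted source is left white so the model can blend the seam.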
def prepare_image_and_mask(image, width, height, overlap_percentage, alignment):
    """Prepare the padded background image and the outpainting mask."""
    if image is None:
        return None, None

    # Convert to a PIL image
    if isinstance(image, np.ndarray):
        image = Image.fromarray(image).convert('RGB')

    target_size = (width, height)

    # Scale the image to fit inside the target size
    scale_factor = min(target_size[0] / image.width, target_size[1] / image.height)
    new_width = int(image.width * scale_factor)
    new_height = int(image.height * scale_factor)

    # Resize
    source = image.resize((new_width, new_height), Image.LANCZOS)

    # Overlap in pixels (at least 1)
    overlap_x = int(new_width * (overlap_percentage / 100))
    overlap_y = int(new_height * (overlap_percentage / 100))
    overlap_x = max(overlap_x, 1)
    overlap_y = max(overlap_y, 1)

    # Margins according to alignment
    if alignment == "Center":
        margin_x = (target_size[0] - new_width) // 2
        margin_y = (target_size[1] - new_height) // 2
    elif alignment == "Left":
        margin_x = 0
        margin_y = (target_size[1] - new_height) // 2
    elif alignment == "Right":
        margin_x = target_size[0] - new_width
        margin_y = (target_size[1] - new_height) // 2
    elif alignment == "Top":
        margin_x = (target_size[0] - new_width) // 2
        margin_y = 0
    elif alignment == "Bottom":
        margin_x = (target_size[0] - new_width) // 2
        margin_y = target_size[1] - new_height
    else:
        # Fall back to centered placement for unrecognized values
        margin_x = (target_size[0] - new_width) // 2
        margin_y = (target_size[1] - new_height) // 2

    # Background image
    background = Image.new('RGB', target_size, (255, 255, 255))
    background.paste(source, (margin_x, margin_y))

    # Mask: start fully white (generate everywhere)
    mask = Image.new('L', target_size, 255)
    mask_draw = ImageDraw.Draw(mask)

    # Black out the kept region, inset by the overlap band
    left_overlap = margin_x + overlap_x if alignment != "Left" else margin_x
    right_overlap = margin_x + new_width - overlap_x if alignment != "Right" else margin_x + new_width
    top_overlap = margin_y + overlap_y if alignment != "Top" else margin_y
    bottom_overlap = margin_y + new_height - overlap_y if alignment != "Bottom" else margin_y + new_height

    mask_draw.rectangle([
        (left_overlap, top_overlap),
        (right_overlap, bottom_overlap)
    ], fill=0)

    return background, mask
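# The fill pipeline is iterative: each call to pipe(...) yields one image per
# denoising step; outpaint_image() keeps only the last yield and composites it
# into the masked (white) region of the ControlNet input image.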
def outpaint_image(image, prompt, width, height, overlap_percentage, alignment, num_steps=8):
    """Run image outpainting."""
    if image is None:
        return None
    if not OUTPAINT_MODEL_LOADED:
        return Image.new('RGB', (width, height), (200, 200, 200))
    try:
        # Prepare the background image and mask
        background, mask = prepare_image_and_mask(image, width, height, overlap_percentage, alignment)
        if background is None:
            return None

        # Build the ControlNet input: the masked (to-be-generated) area painted black
        cnet_image = background.copy()
        cnet_image.paste(0, (0, 0), mask)

        # Prepare the prompt
        final_prompt = f"{prompt}, high quality, 4k" if prompt else "high quality, 4k"

        # Run on the GPU
        with torch.autocast(device_type="cuda", dtype=torch.float16):
            (
                prompt_embeds,
                negative_prompt_embeds,
                pooled_prompt_embeds,
                negative_pooled_prompt_embeds,
            ) = pipe.encode_prompt(final_prompt, "cuda", True)

            # Generation loop
            for generated_image in pipe(
                prompt_embeds=prompt_embeds,
                negative_prompt_embeds=negative_prompt_embeds,
                pooled_prompt_embeds=pooled_prompt_embeds,
                negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
                image=cnet_image,
                num_inference_steps=num_steps
            ):
                # Intermediate results (available here if needed)
                pass

            # Final image
            final_image = generated_image

        # Convert to RGBA and composite into the masked region
        final_image = final_image.convert("RGBA")
        cnet_image.paste(final_image, (0, 0), mask)

        return cnet_image
    except Exception as e:
        logging.error(f"Outpainting error: {str(e)}")
        return background if 'background' in locals() else None
# CSS
css = """
:root {
    --primary-color: #f8c3cd;
    --secondary-color: #b3e5fc;
    --background-color: #f5f5f7;
    --card-background: #ffffff;
    --text-color: #424242;
    --accent-color: #ffb6c1;
    --success-color: #c8e6c9;
    --warning-color: #fff9c4;
    --shadow-color: rgba(0, 0, 0, 0.1);
    --border-radius: 12px;
}

.gradio-container {
    max-width: 1200px !important;
    margin: 0 auto !important;
}

.panel-box {
    border-radius: var(--border-radius) !important;
    box-shadow: 0 8px 16px var(--shadow-color) !important;
    background-color: var(--card-background) !important;
    padding: 20px !important;
    margin-bottom: 20px !important;
}

#generate-btn, #video-btn, #outpaint-btn {
    background: linear-gradient(135deg, #ff9a9e, #fad0c4) !important;
    font-size: 1.1rem !important;
    padding: 12px 24px !important;
    margin-top: 10px !important;
    width: 100% !important;
}

.tabitem {
    min-height: 700px !important;
}
"""
# Gradio Interface
demo = gr.Blocks(css=css, title="AI Image & Video Generator")

with demo:
    gr.Markdown("# 🎨 Ginigen Studio")

    with gr.Tabs() as tabs:
        # First tab: text to image to video
        with gr.Tab("Text→Image→Video", elem_classes="tabitem"):
            with gr.Row(equal_height=True):
                # Input column
                with gr.Column(scale=1):
                    with gr.Group(elem_classes="panel-box"):
                        gr.Markdown("### 📝 Image Generation Settings")

                        prompt = gr.Textbox(
                            label="Prompt (Korean/English)",
                            placeholder="Describe the image you want to generate...",
                            lines=3
                        )

                        size_preset = gr.Dropdown(
                            choices=list(IMAGE_PRESETS.keys()),
                            value="1:1 Square",
                            label="Size preset"
                        )

                        with gr.Row():
                            width = gr.Slider(256, 2048, 1024, step=64, label="Width")
                            height = gr.Slider(256, 2048, 1024, step=64, label="Height")

                        with gr.Row():
                            guidance = gr.Slider(1.0, 20.0, 3.5, step=0.1, label="Guidance")
                            steps = gr.Slider(1, 50, 30, step=1, label="Steps")

                        seed = gr.Number(label="Seed (-1 = random)", value=-1)

                        generate_btn = gr.Button("🎨 Generate Image", variant="primary", elem_id="generate-btn")

                    with gr.Group(elem_classes="panel-box"):
                        gr.Markdown("### 🎬 Video Generation Settings")

                        video_prompt = gr.Textbox(
                            label="(Optional) Video prompt (in English)",
                            placeholder="Describe the motion for the video... (leave empty for default motion)",
                            lines=2
                        )

                        video_length = gr.Slider(
                            minimum=1,
                            maximum=60,
                            value=4,
                            step=0.5,
                            label="Video length (seconds)",
                            info="Choose between 1 and 60 seconds"
                        )

                        # Sound generation option
                        sound_generation = gr.Radio(
                            choices=["No sound", "Generate sound"],
                            value="No sound",
                            label="Sound option",
                            info="Choose whether to add sound to the video"
                        )

                        # Sound-related inputs (shown conditionally)
                        with gr.Column(visible=False) as sound_options:
                            sound_prompt = gr.Textbox(
                                label="Sound prompt (optional)",
                                placeholder="Describe the sound to generate... (leave empty to use the video prompt)",
                                lines=2
                            )
                            sound_negative_prompt = gr.Textbox(
                                label="Sound negative prompt",
                                value="music",
                                lines=1
                            )

                        video_btn = gr.Button("🎬 Convert to Video", variant="secondary", elem_id="video-btn")

                # Output column
                with gr.Column(scale=1):
                    with gr.Group(elem_classes="panel-box"):
                        gr.Markdown("### 🖼️ Results")

                        output_image = gr.Image(label="Generated image", type="numpy")
                        output_seed = gr.Textbox(label="Seed info")
                        output_video = gr.Video(label="Generated video")
        # Second tab: image outpainting
        with gr.Tab("Change Image Ratio / Extend", elem_classes="tabitem"):
            with gr.Row(equal_height=True):
                # Input column
                with gr.Column(scale=1):
                    with gr.Group(elem_classes="panel-box"):
                        gr.Markdown("### 🖼️ Image Upload")

                        input_image = gr.Image(
                            label="Source image",
                            type="numpy"
                        )

                        outpaint_prompt = gr.Textbox(
                            label="Prompt (optional)",
                            placeholder="Describe what the extended area should contain...",
                            lines=2
                        )

                    with gr.Group(elem_classes="panel-box"):
                        gr.Markdown("### ⚙️ Outpainting Settings")

                        outpaint_size_preset = gr.Dropdown(
                            choices=list(IMAGE_PRESETS.keys()),
                            value="16:9 Widescreen",
                            label="Target size preset"
                        )

                        with gr.Row():
                            outpaint_width = gr.Slider(256, 2048, 1280, step=64, label="Target width")
                            outpaint_height = gr.Slider(256, 2048, 720, step=64, label="Target height")

                        alignment = gr.Dropdown(
                            choices=["Center", "Left", "Right", "Top", "Bottom"],
                            value="Center",
                            label="Alignment"
                        )

                        overlap_percentage = gr.Slider(
                            minimum=1,
                            maximum=50,
                            value=10,
                            step=1,
                            label="Mask overlap (%)"
                        )

                        outpaint_steps = gr.Slider(
                            minimum=4,
                            maximum=12,
                            value=8,
                            step=1,
                            label="Inference steps"
                        )

                        outpaint_btn = gr.Button("🎨 Run Outpainting", variant="primary", elem_id="outpaint-btn")

                # Output column
                with gr.Column(scale=1):
                    with gr.Group(elem_classes="panel-box"):
                        gr.Markdown("### 🖼️ Result")

                        outpaint_result = gr.Image(label="Outpainting result")
    # Event wiring - first tab
    size_preset.change(update_dimensions, [size_preset], [width, height])

    generate_btn.click(
        generate_text_to_image,
        [prompt, width, height, guidance, steps, seed],
        [output_image, output_seed]
    )

    # Sound is added via a separate button
    sound_btn = gr.Button("🔊 Add Sound to Video", visible=False)

    # Show/hide the sound options and the sound button together
    def toggle_sound_options(choice):
        show = (choice == "Generate sound")
        return gr.update(visible=show), gr.update(visible=show)

    sound_generation.change(
        toggle_sound_options,
        [sound_generation],
        [sound_options, sound_btn]
    )

    video_btn.click(
        generate_video_from_image,
        [output_image, video_prompt, video_length],  # intentionally only these three inputs
        [output_video]
    )

    sound_btn.click(
        add_sound_to_video,
        [output_video, sound_prompt, sound_negative_prompt],
        [output_video]
    )

    # Event wiring - second tab
    outpaint_size_preset.change(update_dimensions, [outpaint_size_preset], [outpaint_width, outpaint_height])

    outpaint_btn.click(
        outpaint_image,
        [input_image, outpaint_prompt, outpaint_width, outpaint_height, overlap_percentage, alignment, outpaint_steps],
        [outpaint_result]
    )

demo.launch()