# Ghibli-Multilingual-Text-rendering
#
# Running on Zero
#
# File size: 26,645 Bytes

import spaces
import os
import re
import json
import time
import torch
import tempfile
import io
import random
import string
import logging
from typing import Tuple, Optional, List, Dict, Any, Union

from PIL import Image, ImageDraw, ImageFont
from tqdm import tqdm
import gradio as gr

from safetensors.torch import save_file
from src.pipeline import FluxPipeline
from src.transformer_flux import FluxTransformer2DModel
from src.lora_helper import set_single_lora, set_multi_lora, unset_lora

# Google Gemini API client
from google import genai
from google.genai import types

# Identifier handed to `from_pretrained` for the base FLUX checkpoint, and the
# local directory that the LoRA weight file is read from.
base_path = "black-forest-labs/FLUX.1-dev"
lora_base_path = "./models"

# System prompt that will be hidden from users but automatically added to their input
SYSTEM_PROMPT = "Ghibli Studio style, Charming hand-drawn anime-style illustration"

# Logging configuration
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

# Load the model.
# NOTE: this runs at import time. The pipeline is created first, then its
# transformer is replaced by the project's FluxTransformer2DModel loaded from
# the "transformer" subfolder, and finally the whole pipeline is moved to CUDA.
pipe = FluxPipeline.from_pretrained(base_path, torch_dtype=torch.bfloat16)
transformer = FluxTransformer2DModel.from_pretrained(base_path, subfolder="transformer", torch_dtype=torch.bfloat16)
pipe.transformer = transformer
pipe.to("cuda")

def clear_cache(transformer):
    """Empty the ``bank_kv`` store of every attention processor on *transformer*."""
    for processor in transformer.attn_processors.values():
        processor.bank_kv.clear()

#######################################
# Utility Functions
#######################################

# Simple Timer Class
class timer:
    """Context manager that prints when a block starts and how long it took.

    Usage::

        with timer("generation") as t:
            ...

    Attributes:
        method: Label used in the printed messages.
        start: Wall-clock time (``time.time()``) recorded on entry, or ``None``
            before the context has been entered.
    """

    def __init__(self, method_name="timed process"):
        self.method = method_name
        # Defined up front so the attribute exists even before __enter__ runs.
        self.start = None

    def __enter__(self):
        self.start = time.time()
        print(f"[TIMER] {self.method} starts")
        # Return the instance so `with timer(...) as t` binds something useful.
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        end = time.time()
        print(f"[TIMER] {self.method} took {round(end - self.start, 2)}s")
        # Implicitly returns None: exceptions raised in the block propagate.

# Minimal translation helper (Korean -> English)
def maybe_translate_to_english(text: str) -> str:
    """
    Replace known Korean phrases in *text* with their English equivalents.

    Text that is empty, or that contains no Hangul syllables, is returned
    unchanged. Korean words missing from the phrase table are left as they are.
    """
    contains_hangul = bool(text) and re.search("[가-힣]", text) is not None
    if not contains_hangul:
        return text

    try:
        # Simple phrase table (using a translation API is recommended in production).
        # Pairs are applied in order, e.g. "안녕하세요" is handled before "안녕".
        phrase_table = (
            ("안녕하세요", "Hello"),
            ("환영합니다", "Welcome"),
            ("아름다운 당신", "Beautiful You"),
            ("안녕", "Hello"),
            ("고양이", "Cat"),
            ("배너", "Banner"),
            ("썬글라스", "Sunglasses"),
            ("착용한", "wearing"),
            ("흰색", "white"),
        )

        # Rough, phrase-by-phrase translation of the whole sentence
        for korean, english in phrase_table:
            text = text.replace(korean, english)

        print(f"[TRANSLATE] Translated Korean text: '{text}'")
        return text
    except Exception as err:
        print(f"[WARNING] Translation failed: {err}")
        return text

def save_binary_file(file_name, data):
    """Write the bytes in *data* to *file_name*, replacing any existing content."""
    with open(file_name, mode="wb") as out:
        out.write(data)

#######################################
# Gemini API Functions
#######################################

def generate_by_google_genai(text, file_name, model="gemini-2.0-flash-exp"):
    """
    Send an image plus an instruction to Gemini and collect the streamed reply.

    The image at *file_name* is uploaded, then streamed generation is requested
    with both image and text response modalities. The first chunk that carries
    inline image data is written to a temporary ``.png`` file and streaming
    stops; text from chunks before that is accumulated.

    Args:
        text: Instruction sent alongside the uploaded image.
        file_name: Path of the image file to upload.
        model: Name of the Gemini model to call.

    Returns:
        ``(image_path, text_response)``. ``image_path`` is the path of the saved
        image, or ``None`` when no image was returned. ``text_response`` holds
        the accumulated text, one chunk per line. The caller owns the returned
        file and should delete it when done.

    Raises:
        ValueError: If the ``GAPI_TOKEN`` environment variable is unset or empty.
    """
    # Read the API key from the GAPI_TOKEN environment variable
    api_key = os.getenv("GAPI_TOKEN", None)
    if not api_key:
        raise ValueError("GAPI_TOKEN is missing. Please set an API key.")

    client = genai.Client(api_key=api_key)
    uploaded = client.files.upload(file=file_name)

    contents = [
        types.Content(
            role="user",
            parts=[
                types.Part.from_uri(
                    file_uri=uploaded.uri,
                    mime_type=uploaded.mime_type,
                ),
                types.Part.from_text(text=text),
            ],
        ),
    ]

    generate_content_config = types.GenerateContentConfig(
        temperature=1,
        top_p=0.95,
        top_k=40,
        max_output_tokens=8192,
        response_modalities=["image", "text"],
        response_mime_type="text/plain",
    )

    text_chunks = []
    image_path = None

    # Reserve a temp path only. The handle is closed right away so the image
    # can later be written through a fresh handle on every platform.
    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
        temp_path = tmp.name

    try:
        for chunk in client.models.generate_content_stream(
            model=model,
            contents=contents,
            config=generate_content_config,
        ):
            if not chunk.candidates or not chunk.candidates[0].content or not chunk.candidates[0].content.parts:
                continue

            parts = chunk.candidates[0].content.parts
            # Inline data means an edited image; look at every part, not just
            # the first, in case text precedes the image within one chunk.
            image_part = next((part for part in parts if part.inline_data), None)
            if image_part is not None:
                save_binary_file(temp_path, image_part.inline_data.data)
                print(f"File of mime type {image_part.inline_data.mime_type} saved to: {temp_path}")
                image_path = temp_path
                # One image is enough; stop streaming
                break

            # No inline data: accumulate text. `chunk.text` may be None when the
            # chunk carries no text part, which must not be concatenated.
            if chunk.text is not None:
                text_chunks.append(chunk.text + "\n")
    finally:
        # Do not leave an empty temp file behind when no image was produced
        # (text-only reply or an exception while streaming).
        if image_path is None:
            try:
                os.remove(temp_path)
            except OSError:
                pass

    return image_path, "".join(text_chunks)

def _remove_temp_file(path):
    """Delete the temporary file at *path*; do nothing if *path* is unset or already gone."""
    if not path:
        return
    try:
        os.remove(path)
    except OSError as err:
        logging.debug(f"Could not remove temporary file {path}: {err}")


def _save_image_to_temp_png(image):
    """Write *image* to a new temporary PNG file and return that file's path."""
    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
        path = tmp.name
    try:
        image.save(path, format="PNG")
    except Exception:
        # Do not leave a half-written file behind if encoding fails
        _remove_temp_file(path)
        raise
    return path


def change_text_in_image_two_times(original_image, instruction):
    """
    Call the text-modification API twice (Google Gemini), returning 2 final variations.

    The instruction is sent once with the suffix ``(A)`` and once with ``(B)``.
    Any failure for a variation (wrong input type, API error, no image in the
    reply, unreadable image) is logged and the original image is used for that
    slot instead, so the result always has two entries.

    Args:
        original_image: PIL image to edit.
        instruction: Editing instruction forwarded to ``generate_by_google_genai``.

    Returns:
        A list of two images, in ``(A)``, ``(B)`` order.

    Raises:
        gr.Error: If *original_image* is ``None``.
    """
    if original_image is None:
        raise gr.Error("처리할 이미지가 없습니다. 먼저 이미지를 생성해주세요.")

    results = []
    # Temp PNG of the source image: written lazily on first use and shared by
    # both API calls instead of being re-encoded for each variation.
    original_path = None
    try:
        for version_tag in ["(A)", "(B)"]:
            mod_instruction = f"{instruction} {version_tag}"
            image_path = None
            try:
                if original_path is None:
                    print(f"[DEBUG] Saving image of type {type(original_image)} to temporary file")
                    # Validate before creating any file so nothing leaks on rejection
                    if not isinstance(original_image, Image.Image):
                        raise gr.Error(f"예상된 PIL Image가 아닌 {type(original_image)} 타입이 제공되었습니다.")
                    original_path = _save_image_to_temp_png(original_image)
                    print(f"[DEBUG] Saved image to temporary file: {original_path}")

                print(f"[DEBUG] Google Gemini API에 보내는 지시사항: {mod_instruction}")
                image_path, text_response = generate_by_google_genai(
                    text=mod_instruction,
                    file_name=original_path
                )

                if image_path:
                    print(f"[DEBUG] Received image from Gemini API: {image_path}")
                    try:
                        with open(image_path, "rb") as f:
                            image_data = f.read()
                        new_img = Image.open(io.BytesIO(image_data))
                        # Decode now, so a corrupt payload falls back to the
                        # original here instead of failing later in the UI.
                        new_img.load()
                        results.append(new_img)
                    except Exception as img_err:
                        print(f"[ERROR] Failed to process Gemini image: {img_err}")
                        results.append(original_image)
                else:
                    # No image in the reply, only text
                    print(f"[WARNING] 이미지가 반환되지 않았습니다. 텍스트 응답: {text_response}")
                    results.append(original_image)

            except Exception as e:
                logging.exception(f"Text modification error: {e}")
                # Even on error, still return the original image for this slot
                print(f"[ERROR] 텍스트 수정 중 오류 발생: {e}")
                results.append(original_image)
            finally:
                # The result was read fully into memory, so its file can go
                _remove_temp_file(image_path)
    finally:
        _remove_temp_file(original_path)

    return results

#######################################
# Image Generation Functions
#######################################

@spaces.GPU()
def single_condition_generate_image(user_prompt, spatial_img, height, width, seed):
    """
    Generate one Ghibli-style image with the FLUX pipeline and the Ghibli LoRA.

    Args:
        user_prompt: User description; appended to ``SYSTEM_PROMPT`` when non-empty.
        spatial_img: Optional reference image, passed as the single entry of
            ``spatial_images``.
        height: Output height; coerced to ``int``.
        width: Output width; coerced to ``int``.
        seed: Seed for the CPU random generator; coerced to ``int``.

    Returns:
        The first image produced by the pipeline.
    """
    # Combine the system prompt with user prompt
    full_prompt = f"{SYSTEM_PROMPT}, {user_prompt}" if user_prompt else SYSTEM_PROMPT

    # Set the Ghibli LoRA
    lora_path = os.path.join(lora_base_path, "Ghibli.safetensors")
    set_single_lora(pipe.transformer, lora_path, lora_weights=[1], cond_size=512)

    # Process the image
    spatial_imgs = [spatial_img] if spatial_img else []
    try:
        image = pipe(
            full_prompt,
            height=int(height),
            width=int(width),
            guidance_scale=3.5,
            num_inference_steps=25,
            max_sequence_length=512,
            # manual_seed() only accepts integers; coerce like height/width
            generator=torch.Generator("cpu").manual_seed(int(seed)),
            subject_images=[],
            spatial_images=spatial_imgs,
            cond_size=512,
        ).images[0]
    finally:
        # Clear the processors' bank_kv even when generation fails, so a
        # failed request cannot leave stale entries for the next one.
        clear_cache(pipe.transformer)
    return image

def _build_text_instruction(input_text, text_color, text_size, text_position):
    """Compose the instruction asking Gemini to draw *input_text* on an image."""
    instruction = f"Add the text '{input_text}' to this image in {text_color} color"

    # Add position information; anything other than top/bottom means center
    if text_position == "top":
        instruction += " at the top of the image"
    elif text_position == "bottom":
        instruction += " at the bottom of the image"
    else:
        instruction += " at the center of the image"

    # Add size information: map the numeric size onto a coarse size word
    if text_size <= 40:
        instruction += " in small size"
    elif text_size <= 120:
        instruction += " in medium size"
    else:
        instruction += " in large size"

    instruction += ". Make sure the text is clearly visible and readable."
    return instruction


def _error_placeholder_image(width, height, message):
    """Return a light-red image carrying *message*; never raises for bad sizes or text."""
    try:
        size = (int(width), int(height))
    except (TypeError, ValueError):
        # Fall back to the sliders' default size if the inputs are unusable
        size = (768, 768)

    placeholder = Image.new('RGB', size, color=(255, 200, 200))
    try:
        ImageDraw.Draw(placeholder).text(
            (size[0] // 2, size[1] // 2), message, fill="black", anchor="mm"
        )
    except Exception as draw_err:
        # Last-resort handler: e.g. a default font that cannot render the
        # message must not turn the error fallback into a second failure.
        logging.warning(f"Could not draw error message on placeholder: {draw_err}")
    return placeholder


@spaces.GPU()
def text_rendering_generate_image(user_prompt, input_text, text_color, text_size, text_position, spatial_img, height, width, seed):
    """
    Generate image with Ghibli style and then send to Gemini API for multilingual text rendering

    Args:
        user_prompt: User description; appended to ``SYSTEM_PROMPT`` when non-empty.
        input_text: Text to render on the image. When empty or whitespace-only,
            the base image is returned twice and Gemini is not called.
        text_color: Colour description inserted into the Gemini instruction.
        text_size: Numeric size; <= 40 is "small", <= 120 "medium", else "large".
        text_position: "top", "bottom", or anything else for center.
        spatial_img: Optional reference image for the FLUX pipeline.
        height: Output height; coerced to ``int``.
        width: Output width; coerced to ``int``.
        seed: Seed for the CPU random generator; coerced to ``int``.

    Returns:
        A list of two images. On any error, both entries are the same
        light-red placeholder image showing the error message.
    """
    # NOTE(review): the Gemini round-trips below run inside the @spaces.GPU()
    # scope; presumably the GPU stays reserved during them - confirm.
    try:
        # Step 1: Generate the base image using FLUX
        print("[DEBUG] Generating base image with FLUX")
        full_prompt = f"{SYSTEM_PROMPT}, {user_prompt}" if user_prompt else SYSTEM_PROMPT

        # Set the Ghibli LoRA
        lora_path = os.path.join(lora_base_path, "Ghibli.safetensors")
        set_single_lora(pipe.transformer, lora_path, lora_weights=[1], cond_size=512)

        # Process the image
        spatial_imgs = [spatial_img] if spatial_img else []
        try:
            base_image = pipe(
                full_prompt,
                height=int(height),
                width=int(width),
                guidance_scale=3.5,
                num_inference_steps=25,
                max_sequence_length=512,
                # manual_seed() only accepts integers; coerce like height/width
                generator=torch.Generator("cpu").manual_seed(int(seed)),
                subject_images=[],
                spatial_images=spatial_imgs,
                cond_size=512,
            ).images[0]
        finally:
            # Clear the processors' bank_kv even when generation fails
            clear_cache(pipe.transformer)

        # If no text is provided, return the base image
        if not input_text or not input_text.strip():
            return [base_image, base_image]

        # Step 2: Build the instruction for Gemini API
        instruction = _build_text_instruction(input_text, text_color, text_size, text_position)

        # Step 3: Call Gemini API to generate two variations
        print(f"[DEBUG] Sending to Gemini API with instruction: {instruction}")
        return change_text_in_image_two_times(base_image, instruction)

    except Exception as e:
        logging.exception(f"Text rendering error: {e}")
        # Create a dummy image in case of error
        dummy_img = _error_placeholder_image(width, height, f"Error: {str(e)}")
        return [dummy_img, dummy_img]

# Load example images
def load_examples():
    """
    Build the example rows for the "Create Ghibli Art" tab.

    Each row is ``[prompt, reference image, height, width, seed]``. Rows are
    produced only for example files that exist under ``./test_imgs``; the seed
    is the 1-based position of the file in the catalogue, whether or not
    earlier files were found.
    """
    image_dir = "./test_imgs"
    # (file name, user prompt without system prompt, height, width)
    catalogue = (
        ("00.jpg", " ", 680, 1024),
        ("02.jpg", "saying 'HELLO' in 'speech bubble'", 560, 1024),
        ("03.jpg", "background 'alps'", 1024, 768),
    )

    rows = []
    for seed, (file_name, prompt, img_height, img_width) in enumerate(catalogue, start=1):
        path = os.path.join(image_dir, file_name)
        if not os.path.exists(path):
            continue
        rows.append([prompt, Image.open(path), img_height, img_width, seed])

    return rows

# Load examples for text rendering tab
def load_text_examples():
    """
    Build the example rows for the text-rendering tab.

    Each row is ``[prompt, text, color, size, position, reference image,
    height, width, seed]``. Rows are produced only for example files that
    exist under ``./test_imgs``.
    """
    image_dir = "./test_imgs"

    specs = [
        {
            "prompt": "cute character with speech bubble",
            "text": "Hello World!",
            "color": "#ffffff",
            "size": 72,
            "position": "center",
            "filename": "00.jpg",
            "height": 680,
            "width": 1024,
            "seed": 123,
        },
        {
            "prompt": "landscape with message",
            "text": "안녕하세요!",
            "color": "#ffff00",
            "size": 100,
            "position": "top",
            "filename": "03.jpg",
            "height": 1024,
            "width": 768,
            "seed": 456,
        },
        {
            "prompt": "character with subtitles",
            "text": "こんにちは世界!",
            "color": "#00ffff",
            "size": 90,
            "position": "bottom",
            "filename": "02.jpg",
            "height": 560,
            "width": 1024,
            "seed": 789,
        },
    ]

    # Column order of a row, on either side of the reference image
    leading_keys = ("prompt", "text", "color", "size", "position")
    trailing_keys = ("height", "width", "seed")

    rows = []
    for spec in specs:
        path = os.path.join(image_dir, spec["filename"])
        if not os.path.exists(path):
            continue
        row = [spec[key] for key in leading_keys]
        row.append(Image.open(path))
        row.extend(spec[key] for key in trailing_keys)
        rows.append(row)

    return rows

# Function to check API availability - modified to work directly
def check_api_status():
    """
    Return an HTML status banner saying whether a Gemini API key is configured.

    Only the presence of a non-empty ``GAPI_TOKEN`` environment variable is
    checked; no request is made. An empty value counts as missing, matching
    the check performed before the Gemini client is created.
    """
    # Truthiness test: an empty token would be rejected when the API is called
    gemini_available = bool(os.getenv("GAPI_TOKEN"))

    if gemini_available:
        return """<div class="api-status api-connected">✓ Connected to FLUX.1 and Gemini API</div>"""
    return """<div class="api-status api-disconnected">✗ Gemini API connection issue. Please check GAPI_TOKEN environment variable.</div>"""

# Custom stylesheet handed to gr.Blocks(css=...). It defines the theme
# variables and the classes referenced by this file's HTML snippets and
# elem_classes (e.g. gr-header, gr-box, gr-footer, api-status,
# text-rendering-options).
css = """
:root {
    --primary-color: #4a6670;
    --accent-color: #ff8a65;
    --background-color: #f5f5f5;
    --card-background: #ffffff;
    --text-color: #333333;
    --border-radius: 10px;
    --shadow: 0 4px 6px rgba(0,0,0,0.1);
}

body {
    background-color: var(--background-color);
    color: var(--text-color);
    font-family: 'Helvetica Neue', Arial, sans-serif;
}

.container {
    max-width: 1200px;
    margin: 0 auto;
    padding: 20px;
}

.gr-header {
    background: linear-gradient(135deg, #668796 0%, #4a6670 100%);
    padding: 24px;
    border-radius: var(--border-radius);
    margin-bottom: 24px;
    box-shadow: var(--shadow);
    text-align: center;
}

.gr-header h1 {
    color: white;
    font-size: 2.5rem;
    margin: 0;
    font-weight: 700;
}

.gr-header p {
    color: rgba(255, 255, 255, 0.9);
    font-size: 1.1rem;
    margin-top: 8px;
}

.gr-panel {
    background-color: var(--card-background);
    border-radius: var(--border-radius);
    padding: 16px;
    box-shadow: var(--shadow);
}

.gr-button {
    background-color: var(--accent-color);
    border: none;
    color: white;
    padding: 10px 20px;
    border-radius: 5px;
    font-size: 16px;
    font-weight: bold;
    cursor: pointer;
    transition: transform 0.1s, background-color 0.3s;
}

.gr-button:hover {
    background-color: #ff7043;
    transform: translateY(-2px);
}

.gr-input, .gr-select {
    border-radius: 5px;
    border: 1px solid #ddd;
    padding: 10px;
    width: 100%;
}

.gr-form {
    display: grid;
    gap: 16px;
}

.gr-box {
    background-color: var(--card-background);
    border-radius: var(--border-radius);
    padding: 20px;
    box-shadow: var(--shadow);
    margin-bottom: 20px;
}

.gr-gallery {
    display: grid;
    grid-template-columns: repeat(auto-fill, minmax(200px, 1fr));
    gap: 16px;
}

.gr-gallery-item {
    overflow: hidden;
    border-radius: var(--border-radius);
    box-shadow: var(--shadow);
    transition: transform 0.3s;
}

.gr-gallery-item:hover {
    transform: scale(1.02);
}

.gr-image {
    width: 100%;
    height: auto;
    object-fit: cover;
}

.gr-footer {
    text-align: center;
    margin-top: 40px;
    padding: 20px;
    color: #666;
    font-size: 14px;
}

.gr-examples-gallery {
    margin-top: 20px;
}

/* Responsive adjustments */
@media (max-width: 768px) {
    .gr-header h1 {
        font-size: 1.8rem;
    }
    
    .gr-panel {
        padding: 12px;
    }
}

/* Ghibli-inspired accent colors */
.gr-accent-1 {
    background-color: #95ccd9;
}

.gr-accent-2 {
    background-color: #74ad8c;
}

.gr-accent-3 {
    background-color: #f9c06b;
}

.text-rendering-options {
    background-color: #f0f8ff;
    padding: 16px;
    border-radius: var(--border-radius);
    margin-top: 16px;
}

.api-status {
    font-size: 14px;
    color: #666;
    text-align: center;
    margin-bottom: 10px;
}

.api-connected {
    color: green;
}

.api-disconnected {
    color: red;
}
"""

# Create the Gradio Blocks interface.
# Layout: header, API status line, two tabs (plain generation and generation
# with text rendered by Gemini), then a footer. Components are created in
# display order, so statement order here is significant.
with gr.Blocks(css=css) as demo:
    gr.HTML("""
    <div class="gr-header">
        <h1>✨ Ghibli Multilingual Text-Rendering ✨</h1>
        <p>Transform your ideas into magical Ghibli-inspired artwork with multilingual text</p>
    </div>
    """)

    # API status: call the checker directly to set the initial state
    # (evaluated once, while the interface is being built)
    api_status = gr.Markdown(check_api_status(), visible=True)

    with gr.Tabs():
        # First tab: FLUX + Ghibli LoRA generation only
        with gr.Tab("Create Ghibli Art"):
            with gr.Row():
                # Left column: inputs
                with gr.Column(scale=1):
                    gr.HTML("""
                    <div class="gr-box">
                        <h3>🎨 Your Creative Input</h3>
                        <p>Describe what you want to see in your Ghibli-inspired image</p>
                    </div>
                    """)

                    user_prompt = gr.Textbox(
                        label="Your description",
                        placeholder="Describe what you want to see (e.g., a cat sitting by the window)",
                        lines=2
                    )

                    spatial_img = gr.Image(
                        label="Reference Image (Optional)",
                        type="pil",
                        elem_classes="gr-image-upload"
                    )

                    with gr.Group():
                        with gr.Row():
                            height = gr.Slider(minimum=256, maximum=1024, step=64, label="Height", value=768)
                            width = gr.Slider(minimum=256, maximum=1024, step=64, label="Width", value=768)

                        seed = gr.Slider(minimum=1, maximum=9999, step=1, label="Seed", value=42,
                                        info="Change for different variations")

                    generate_btn = gr.Button("✨ Generate Ghibli Art", variant="primary", elem_classes=["generate-btn"])

                # Right column: generated result
                with gr.Column(scale=1):
                    gr.HTML("""
                    <div class="gr-box">
                        <h3>✨ Your Magical Creation</h3>
                        <p>Your Ghibli-inspired artwork will appear here</p>
                    </div>
                    """)
                    output_image = gr.Image(label="Generated Image", elem_classes="gr-output-image")

            gr.HTML("""
            <div class="gr-box gr-examples-gallery">
                <h3>✨ Inspiration Gallery</h3>
                <p>Click on any example to try it out</p>
            </div>
            """)

            # Add examples (results are not pre-computed: cache_examples=False)
            examples = load_examples()
            gr.Examples(
                examples=examples,
                inputs=[user_prompt, spatial_img, height, width, seed],
                outputs=output_image,
                fn=single_condition_generate_image,
                cache_examples=False,
                examples_per_page=4
            )

            # Link the button to the function
            generate_btn.click(
                single_condition_generate_image,
                inputs=[user_prompt, spatial_img, height, width, seed],
                outputs=output_image
            )

        # Second tab for Image & Multilingual Text Rendering with Gemini API
        with gr.Tab("Image & Multilingual Text Rendering"):
            with gr.Row():
                # Left column: image description, text options and size/seed controls
                with gr.Column(scale=1):
                    gr.HTML("""
                    <div class="gr-box">
                        <h3>🌈 Art with Multilingual Text</h3>
                        <p>Create Ghibli-style images with beautiful text in any language using Gemini AI</p>
                    </div>
                    """)

                    text_user_prompt = gr.Textbox(
                        label="Image Description",
                        placeholder="Describe what you want to see (e.g., a character with speech bubble)",
                        lines=2
                    )

                    with gr.Group(elem_classes="text-rendering-options"):
                        input_text = gr.Textbox(
                            label="Multilingual Text to Add",
                            placeholder="Enter text in any language (Korean, Japanese, English, etc.)",
                            lines=1
                        )

                        with gr.Row():
                            text_color = gr.ColorPicker(
                                label="Text Color",
                                value="#FFFFFF"
                            )

                            text_size = gr.Slider(
                                minimum=24,
                                maximum=200,
                                step=4,
                                label="Text Size",
                                value=72
                            )

                        text_position = gr.Radio(
                            ["top", "center", "bottom"],
                            label="Text Position",
                            value="center"
                        )

                    text_spatial_img = gr.Image(
                        label="Reference Image (Optional)",
                        type="pil",
                        elem_classes="gr-image-upload"
                    )

                    with gr.Group():
                        with gr.Row():
                            text_height = gr.Slider(minimum=256, maximum=1024, step=64, label="Height", value=768)
                            text_width = gr.Slider(minimum=256, maximum=1024, step=64, label="Width", value=768)

                        text_seed = gr.Slider(minimum=1, maximum=9999, step=1, label="Seed", value=42,
                                           info="Change for different variations")

                    text_generate_btn = gr.Button("✨ Generate Art with Multilingual Text", variant="primary", elem_classes=["generate-btn"])

                # Right column: the two variations returned by the handler
                with gr.Column(scale=1):
                    gr.HTML("""
                    <div class="gr-box">
                        <h3>✨ Your Text Creations (Two Variations)</h3>
                        <p>Two versions of your Ghibli-inspired artwork with text will appear here</p>
                    </div>
                    """)

                    with gr.Row():
                        text_output_image1 = gr.Image(
                            label="Variation A",
                            type="pil",
                            elem_classes="gr-output-image"
                        )
                        text_output_image2 = gr.Image(
                            label="Variation B",
                            type="pil",
                            elem_classes="gr-output-image"
                        )

            gr.HTML("""
            <div class="gr-box gr-examples-gallery">
                <h3>✨ Multilingual Text Examples</h3>
                <p>Click on any example to try it out</p>
            </div>
            """)

            # Add text rendering examples; the input order must match the
            # parameter order of text_rendering_generate_image
            text_examples = load_text_examples()
            gr.Examples(
                examples=text_examples,
                inputs=[text_user_prompt, input_text, text_color, text_size, text_position,
                        text_spatial_img, text_height, text_width, text_seed],
                outputs=[text_output_image1, text_output_image2],
                fn=text_rendering_generate_image,
                cache_examples=False,
                examples_per_page=3
            )

            # Link the text render button to the function
            text_generate_btn.click(
                text_rendering_generate_image,
                inputs=[text_user_prompt, input_text, text_color, text_size, text_position,
                        text_spatial_img, text_height, text_width, text_seed],
                outputs=[text_output_image1, text_output_image2]
            )

    gr.HTML("""
    <div class="gr-footer">
        <p>Powered by FLUX.1, Ghibli LoRA, and Google Gemini API • Created with ❤️</p>
    </div>
    """)

# Launch the Gradio app (runs at import time, with request queueing enabled)
demo.queue().launch()