Spaces:

sohanAI
/

df-gan-text-to-image

Sleeping

App Files Files Community

sohanAI committed on Mar 25

Commit

76d118b

verified ·

1 Parent(s): 65125c9

Upload 9 files

Browse files

Files changed (9) hide show

.gitignore +36 -0
.huggingface-space +9 -0
README-HF.md +31 -0
README.md +34 -13
app.py +239 -0
demo.ipynb +1 -0
download_models.py +56 -0
requirements.txt +16 -0
startup.sh +10 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,36 @@

+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+env/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib64/
+parts/
+sdist/
+var/
+*.egg-info/
+.installed.cfg
+*.egg
+# Jupyter Notebook
+.ipynb_checkpoints
+# Data directories
+data/
+DF-GAN/
+# Model files
+*.pth
+*.pickle
+*.npz
+# Generated images
+samples/
+.DS_Store

.huggingface-space ADDED Viewed

	@@ -0,0 +1,9 @@

+title: DF-GAN Bird Image Generator
+emoji: 🐦
+colorFrom: blue
+colorTo: purple
+sdk: gradio
+sdk_version: 3.50.0
+app_file: app.py
+pinned: false
+license: cc-by-nc-sa-4.0

README-HF.md ADDED Viewed

	@@ -0,0 +1,31 @@

+# DF-GAN Bird Image Generator 🐦
+This Hugging Face Space demonstrates the [DF-GAN model](https://arxiv.org/abs/2008.05865) for generating bird images from text descriptions.
+## How It Works
+1. Enter a text description of a bird
+2. Select how many images you want to generate (1-4)
+3. Optionally add a random seed for reproducible results
+4. Click "Generate Image"
+5. The model will generate realistic bird images based on your description
+## Example Descriptions
+Try these example descriptions:
+- "this bird has an orange bill, a white belly and white eyebrows"
+- "a small bird with a red head, breast, and belly and black wings"
+- "this bird is yellow with black and has a long, pointy beak"
+- "this is a grey bodied bird with light grey wings and a white breast"
+## About the Model
+The DF-GAN (Deep Fusion GAN) model is a text-to-image synthesis model introduced in the paper "DF-GAN: A Simple and Effective Baseline for Text-to-Image Synthesis" (CVPR 2022). This demo uses the pre-trained bird model that was trained on the CUB-200-2011 dataset.
+This demo runs on CPU, so image generation may take a few seconds.
+## Credits
+This Space uses the official implementation of DF-GAN from [tobran/DF-GAN](https://github.com/tobran/DF-GAN).
+Made with ❤️ by sohanAI

README.md CHANGED Viewed

@@ -1,13 +1,34 @@
----
-title: Df Gan Text To Image
-emoji: 🐨
-colorFrom: pink
-colorTo: purple
-sdk: gradio
-sdk_version: 5.23.0
-app_file: app.py
-pinned: false
-short_description: DF-GAN Text to Image Generation
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

+# DF-GAN Bird Image Generator
+This application uses the DF-GAN (Deep Fusion GAN) model to generate bird images based on text descriptions. Just enter a description of a bird, and the model will generate a realistic image that matches your description.
+## About the Model
+This application uses the pre-trained bird model from the [DF-GAN: A Simple and Effective Baseline for Text-to-Image Synthesis](https://arxiv.org/abs/2008.05865) paper (CVPR 2022). DF-GAN is a text-to-image synthesis model that can generate high-quality images from textual descriptions.
+## How to Use
+1. Enter a description of a bird in the text box (e.g., "a yellow bird with a black head")
+2. Choose how many images you want to generate (1-4)
+3. Optionally, set a random seed for reproducible results
+4. Click the "Generate Image" button
+5. View the generated bird images that match your description
+## Examples
+Try these example descriptions:
+- "this bird has an orange bill, a white belly and white eyebrows"
+- "a small bird with a red head, breast, and belly and black wings"
+- "this bird is yellow with black and has a long, pointy beak"
+- "this bird is white in color, and has a orange beak"
+## Implementation Details
+This application uses the following components:
+- DF-GAN architecture for text-to-image synthesis
+- DAMSM text encoder for embedding text descriptions
+- Gradio for the web interface
+## Credits
+This implementation is based on the official DF-GAN repository: [tobran/DF-GAN](https://github.com/tobran/DF-GAN)

app.py ADDED Viewed

	@@ -0,0 +1,239 @@

+import os
+import sys
+import random
+import torch
+import pickle
+import numpy as np
+from PIL import Image
+import torch.nn.functional as F
+import gradio as gr
+from omegaconf import OmegaConf
+from scipy.stats import truncnorm
+import subprocess
+# First run the download_models.py script if models haven't been downloaded
+if not os.path.exists('data/state_epoch_1220.pth') or not os.path.exists('data/text_encoder200.pth'):
+    print("Downloading necessary model files...")
+    try:
+        subprocess.check_call([sys.executable, "download_models.py"])
+    except subprocess.CalledProcessError as e:
+        print(f"Error downloading models: {e}")
+        print("Please run download_models.py manually before starting the app.")
+# Add the code directory to the Python path
+sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "DF-GAN/code"))
+# Import necessary modules from the DF-GAN code
+from models.DAMSM import RNN_ENCODER
+from models.GAN import NetG
+# Utility functions
+def load_model_weights(model, weights, multi_gpus=False, train=False):
+    """Load model weights with proper handling of module prefix"""
+    if list(weights.keys())[0].find('module')==-1:
+        pretrained_with_multi_gpu = False
+    else:
+        pretrained_with_multi_gpu = True
+    if (multi_gpus==False) or (train==False):
+        if pretrained_with_multi_gpu:
+            state_dict = {
+                key[7:]: value
+                for key, value in weights.items()
+            }
+        else:
+            state_dict = weights
+    else:
+        state_dict = weights
+    model.load_state_dict(state_dict)
+    return model
+def get_tokenizer():
+    """Get NLTK tokenizer"""
+    from nltk.tokenize import RegexpTokenizer
+    tokenizer = RegexpTokenizer(r'\w+')
+    return tokenizer
+def truncated_noise(batch_size=1, dim_z=100, truncation=1.0, seed=None):
+    """Generate truncated noise"""
+    state = None if seed is None else np.random.RandomState(seed)
+    values = truncnorm.rvs(-2, 2, size=(batch_size, dim_z), random_state=state).astype(np.float32)
+    return truncation * values
+def tokenize_and_build_captions(input_text, wordtoix):
+    """Tokenize text and convert to indices using wordtoix mapping"""
+    tokenizer = get_tokenizer()
+    tokens = tokenizer.tokenize(input_text.lower())
+    cap = []
+    for t in tokens:
+        t = t.encode('ascii', 'ignore').decode('ascii')
+        if len(t) > 0 and t in wordtoix:
+            cap.append(wordtoix[t])
+    # Create padded array for the caption
+    max_len = 18  # As defined in the bird.yml
+    cap_array = np.zeros(max_len, dtype='int64')
+    cap_len = len(cap)
+    if cap_len <= max_len:
+        cap_array[:cap_len] = cap
+    else:
+        # Truncate if too long
+        cap_array = cap[:max_len]
+        cap_len = max_len
+    return cap_array, cap_len
+def encode_caption(caption, caption_len, text_encoder, device):
+    """Encode caption using text encoder"""
+    with torch.no_grad():
+        caption = torch.tensor([caption]).to(device)
+        caption_len = torch.tensor([caption_len]).to(device)
+        hidden = text_encoder.init_hidden(1)
+        _, sent_emb = text_encoder(caption, caption_len, hidden)
+    return sent_emb
+def save_img(img_tensor):
+    """Convert image tensor to PIL Image"""
+    im = img_tensor.data.cpu().numpy()
+    # [-1, 1] --> [0, 255]
+    im = (im + 1.0) * 127.5
+    im = im.astype(np.uint8)
+    im = np.transpose(im, (1, 2, 0))
+    im = Image.fromarray(im)
+    return im
+# Load configuration
+config = {
+    'z_dim': 100,
+    'cond_dim': 256,
+    'imsize': 256,
+    'nf': 32,
+    'ch_size': 3,
+    'truncation': True,
+    'trunc_rate': 0.88,
+}
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+print(f"Using device: {device}")
+# Load vocab and models
+def load_models():
+    # Load vocabulary
+    with open('data/captions_DAMSM.pickle', 'rb') as f:
+        x = pickle.load(f)
+        wordtoix = x[3]
+        ixtoword = x[2]
+        del x
+    # Initialize text encoder
+    text_encoder = RNN_ENCODER(len(wordtoix), nhidden=config['cond_dim'])
+    text_encoder_path = 'data/text_encoder200.pth'
+    state_dict = torch.load(text_encoder_path, map_location='cpu')
+    text_encoder = load_model_weights(text_encoder, state_dict)
+    text_encoder.to(device)
+    for p in text_encoder.parameters():
+        p.requires_grad = False
+    text_encoder.eval()
+    # Initialize generator
+    netG = NetG(config['nf'], config['z_dim'], config['cond_dim'], config['imsize'], config['ch_size'])
+    netG_path = 'data/state_epoch_1220.pth'
+    state_dict = torch.load(netG_path, map_location='cpu')
+    netG = load_model_weights(netG, state_dict['model']['netG'])
+    netG.to(device)
+    netG.eval()
+    return wordtoix, ixtoword, text_encoder, netG
+wordtoix, ixtoword, text_encoder, netG = load_models()
+def generate_image(text_input, num_images=1, seed=None):
+    """Generate images from text description"""
+    if not text_input.strip():
+        return [None] * num_images
+    cap_array, cap_len = tokenize_and_build_captions(text_input, wordtoix)
+    if cap_len == 0:
+        return [Image.new('RGB', (256, 256), color='red')] * num_images
+    sent_emb = encode_caption(cap_array, cap_len, text_encoder, device)
+    # Set random seed if provided
+    if seed is not None:
+        random.seed(seed)
+        np.random.seed(seed)
+        torch.manual_seed(seed)
+        if torch.cuda.is_available():
+            torch.cuda.manual_seed_all(seed)
+    # Generate multiple images if requested
+    result_images = []
+    with torch.no_grad():
+        for _ in range(num_images):
+            # Generate noise
+            if config['truncation']:
+                noise = truncated_noise(1, config['z_dim'], config['trunc_rate'])
+                noise = torch.tensor(noise, dtype=torch.float).to(device)
+            else:
+                noise = torch.randn(1, config['z_dim']).to(device)
+            # Generate image
+            fake_img = netG(noise, sent_emb)
+            img = save_img(fake_img[0])
+            result_images.append(img)
+    return result_images
+# Create Gradio interface
+def generate_images_interface(text, num_images, random_seed):
+    seed = int(random_seed) if random_seed else None
+    return generate_image(text, num_images, seed)
+with gr.Blocks(title="Bird Image Generator") as demo:
+    gr.Markdown("# Bird Image Generator using DF-GAN")
+    gr.Markdown("Enter a description of a bird and the model will generate corresponding images.")
+    with gr.Row():
+        with gr.Column():
+            text_input = gr.Textbox(
+                label="Bird Description",
+                placeholder="Enter a description of a bird (e.g., 'a small bird with a red head and black wings')",
+                lines=3
+            )
+            num_images = gr.Slider(minimum=1, maximum=4, value=1, step=1, label="Number of Images")
+            seed = gr.Textbox(label="Random Seed (optional)", placeholder="Leave empty for random results")
+            submit_btn = gr.Button("Generate Image")
+        with gr.Column():
+            image_output = gr.Gallery(label="Generated Images").style(grid=2, height="auto")
+    submit_btn.click(
+        fn=generate_images_interface,
+        inputs=[text_input, num_images, seed],
+        outputs=image_output
+    )
+    gr.Markdown("## Example Descriptions")
+    example_descriptions = [
+        "this bird has an orange bill, a white belly and white eyebrows",
+        "a small bird with a red head, breast, and belly and black wings",
+        "this bird is yellow with black and has a long, pointy beak",
+        "this bird is white in color, and has a orange beak"
+    ]
+    gr.Examples(
+        examples=[[desc, 1, ""] for desc in example_descriptions],
+        inputs=[text_input, num_images, seed],
+        outputs=image_output,
+        fn=generate_images_interface
+    )
+# Launch the app with appropriate configurations for Hugging Face Spaces
+if __name__ == "__main__":
+    demo.launch(
+        server_name="0.0.0.0",  # Bind to all network interfaces
+        share=False,            # Don't use share links
+        favicon_path="https://raw.githubusercontent.com/tobran/DF-GAN/main/framework.png"
+    )

demo.ipynb ADDED Viewed

	@@ -0,0 +1 @@


1	+

download_models.py ADDED Viewed

	@@ -0,0 +1,56 @@

+import os
+import sys
+import subprocess
+import gdown
+import shutil
+import nltk
+from pathlib import Path
+# Install NLTK data
+nltk.download('punkt')
+# Create directories
+os.makedirs('DF-GAN/code/models', exist_ok=True)
+os.makedirs('data', exist_ok=True)
+# Clone the DF-GAN repository
+if not os.path.exists('DF-GAN/.git'):
+    print("Cloning DF-GAN repository...")
+    subprocess.run(["git", "clone", "https://github.com/tobran/DF-GAN.git", "DF-GAN_temp"])
+    # Move only necessary files to avoid duplicates
+    shutil.copytree('DF-GAN_temp/code/models', 'DF-GAN/code/models', dirs_exist_ok=True)
+    shutil.copytree('DF-GAN_temp/code/lib', 'DF-GAN/code/lib', dirs_exist_ok=True)
+    # Clean up
+    shutil.rmtree('DF-GAN_temp')
+    print("Repository cloned and organized.")
+# Download model files
+# DF-GAN pretrained bird model
+bird_model_url = 'https://drive.google.com/uc?id=1rzfcCvGwU8vLCrn5reWxmrAMms6WQGA6'
+bird_model_path = 'data/state_epoch_1220.pth'
+# Text encoder for birds
+text_encoder_url = 'https://drive.google.com/uc?id=1xwIyLPYtYn9YGPIcRuWXxaxcw_oPGQK4'
+text_encoder_path = 'data/text_encoder200.pth'
+# Captions DAMSM pickle file
+captions_pickle_url = 'https://drive.google.com/uc?id=1FfNMRpOZGaO3mKYyj2VDVEW1ChZ12lJp'
+captions_pickle_path = 'data/captions_DAMSM.pickle'
+# Download if files don't exist
+if not os.path.exists(bird_model_path):
+    print(f"Downloading bird model to {bird_model_path}...")
+    gdown.download(bird_model_url, bird_model_path, quiet=False)
+if not os.path.exists(text_encoder_path):
+    print(f"Downloading text encoder to {text_encoder_path}...")
+    gdown.download(text_encoder_url, text_encoder_path, quiet=False)
+if not os.path.exists(captions_pickle_path):
+    print(f"Downloading captions pickle to {captions_pickle_path}...")
+    gdown.download(captions_pickle_url, captions_pickle_path, quiet=False)
+print("All model files downloaded and prepared successfully!")

requirements.txt ADDED Viewed

	@@ -0,0 +1,16 @@

+flask==2.0.1
+torch>=1.9.0
+torchvision>=0.10.0
+Pillow>=9.0.0
+nltk>=3.6.0
+gunicorn==20.1.0
+python-dotenv==0.19.0
+requests==2.26.0
+matplotlib==3.5.1
+tqdm>=4.62.0
+numpy>=1.20.0
+scipy>=1.7.0
+omegaconf>=2.1.0
+gradio>=3.50.0
+easydict>=1.9
+gdown>=4.6.0

startup.sh ADDED Viewed

	@@ -0,0 +1,10 @@

+#!/bin/bash
+# Stop at the first failing step so the app is never started without its models
+set -euo pipefail
+# Install NLTK data
+python -c "import nltk; nltk.download('punkt')"
+# Run the download_models.py script to get the models
+python download_models.py
+# Start the Gradio app; exec replaces the shell so the app receives signals directly
+exec python app.py