import subprocess
# Install the local package in editable mode so the glide_text2im imports below resolve.
subprocess.run('pip install -e .', shell=True, check=True)
print("Installed the repo!")
# GLIDE imports
from typing import Tuple
from IPython.display import display
from PIL import Image
import PIL
import PIL.ImageOps
import numpy as np
import torch as th
import torch.nn.functional as F
from glide_text2im.download import load_checkpoint
from glide_text2im.model_creation import (
    create_model_and_diffusion,
    model_and_diffusion_defaults,
    model_and_diffusion_defaults_upsampler,
)
# gradio app imports
import gradio as gr
from torchvision.transforms import ToTensor, ToPILImage
image_to_tensor = ToTensor()
tensor_to_image = ToPILImage()
# This notebook supports both CPU and GPU.
# On CPU, generating one sample may take on the order of 20 minutes.
# On a GPU, it should be under a minute.
has_cuda = th.cuda.is_available()
device = th.device('cpu' if not has_cuda else 'cuda')
# Create base model.
options = model_and_diffusion_defaults()
options['inpaint'] = True
options['use_fp16'] = has_cuda
options['timestep_respacing'] = '100' # use 100 diffusion steps for fast sampling
model, diffusion = create_model_and_diffusion(**options)
model.eval()
if has_cuda:
    model.convert_to_fp16()
model.to(device)
model.load_state_dict(load_checkpoint('base-inpaint', device))
print('total base parameters', sum(x.numel() for x in model.parameters()))
# Create upsampler model.
options_up = model_and_diffusion_defaults_upsampler()
options_up['inpaint'] = True
options_up['use_fp16'] = has_cuda
options_up['timestep_respacing'] = 'fast27' # use 27 diffusion steps for very fast sampling
model_up, diffusion_up = create_model_and_diffusion(**options_up)
model_up.eval()
if has_cuda:
    model_up.convert_to_fp16()
model_up.to(device)
model_up.load_state_dict(load_checkpoint('upsample-inpaint', device))
print('total upsampler parameters', sum(x.numel() for x in model_up.parameters()))
# Sampling parameters
batch_size = 1
guidance_scale = 5.0
# Tune this parameter to control the sharpness of 256x256 images.
# A value of 1.0 is sharper, but sometimes results in grainy artifacts.
upsample_temp = 0.997
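# upsample_temp scales the Gaussian noise the upsampler starts from (see the
# `noise=` argument in the upsampler call further down); loosely speaking,
# values a bit below 1.0 trade some sample diversity for smoother, less grainy
# 256x256 outputs.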
# Create a classifier-free guidance sampling function
def model_fn(x_t, ts, **kwargs):
    half = x_t[: len(x_t) // 2]
    combined = th.cat([half, half], dim=0)
    model_out = model(combined, ts, **kwargs)
    eps, rest = model_out[:, :3], model_out[:, 3:]
    cond_eps, uncond_eps = th.split(eps, len(eps) // 2, dim=0)
    half_eps = uncond_eps + guidance_scale * (cond_eps - uncond_eps)
    eps = th.cat([half_eps, half_eps], dim=0)
    return th.cat([eps, rest], dim=1)
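# How the classifier-free guidance wrapper above works: the sampling loop sees a
# batch of 2 * batch_size inputs, where the first half is conditioned on the
# prompt tokens and the second half on empty tokens. model_fn runs the model on
# the shared x_t, splits the predicted noise into conditional and unconditional
# halves, and combines them as
#     eps = uncond_eps + guidance_scale * (cond_eps - uncond_eps)
# before duplicating the result so the batch shape stays unchanged.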
def denoised_fn(x_start):
    # Force the model to have the exact right x_start predictions
    # for the part of the image which is known.
    return (
        x_start * (1 - model_kwargs['inpaint_mask'])
        + model_kwargs['inpaint_image'] * model_kwargs['inpaint_mask']
    )
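# denoised_fn is called on the model's x_start prediction at every diffusion
# step: wherever inpaint_mask is 1 the known source pixels are copied back in,
# so the sampler only generates content for the masked (transparent) region.
# It reads inpaint_image/inpaint_mask from the module-level model_kwargs dict,
# which inpaint() below overwrites for each stage.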
def show_images(batch: th.Tensor):
    """Display a batch of images inline."""
    scaled = ((batch + 1) * 127.5).round().clamp(0, 255).to(th.uint8).cpu()
    reshaped = scaled.permute(2, 0, 3, 1).reshape([batch.shape[2], -1, 3])
    return Image.fromarray(reshaped.numpy())
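# show_images maps samples from [-1, 1] back to 8-bit RGB and tiles the batch
# horizontally into a single PIL image (with batch_size = 1 this is simply the
# one inpainted result that gets returned to gradio).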
def read_image(pil_img, size: int = 256) -> th.Tensor:
    pil_img = pil_img.convert('RGB')
    pil_img = pil_img.resize((size, size), resample=Image.BICUBIC)
    img = np.array(pil_img)
    return th.from_numpy(img)[None].permute(0, 3, 1, 2).float() / 127.5 - 1
def read_mask(pil_img_full, size: int = 256) -> th.Tensor:
    pil_img_full = pil_img_full.convert('RGBA')
    pil_img = pil_img_full.getchannel('A')  # Mode 'L'
    pil_img = pil_img.resize((size, size), resample=PIL.Image.BILINEAR)
    img = np.array(pil_img)[..., np.newaxis]
    img[img < 255] = 0  # extremely aggressive transparency removal. maybe too aggressive. slider?
    return th.from_numpy(img)[None].permute(0, 3, 1, 2).float() / 255.0
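# Mask convention used throughout: 1.0 marks pixels to keep from the source
# image, 0.0 marks pixels to inpaint. Because of the thresholding above, any
# pixel that is not fully opaque (alpha < 255) is treated as a region to fill.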
def pil_to_numpy(pil_img: Image.Image) -> th.Tensor:
    img = np.array(pil_img)
    return th.from_numpy(img)[None].permute(0, 3, 1, 2).float() / 127.5 - 1
model_kwargs = dict()
def inpaint(input_img, prompt):
    print(prompt)
    # Save as png for later mask detection :)
    # input_img_256 = input_img.convert('RGB').resize((256, 256), resample=Image.BICUBIC)
    # input_img_64 = input_img.convert('RGB').resize((64, 64), resample=Image.BICUBIC)
    # input_img_with_mask_converted = input_img.convert('RGBA').getchannel( 'A' ) # Mode 'L'
    # input_img_with_mask_64 = input_img_with_mask_converted.resize((64, 64), resample=Image.BICUBIC)
    # TODO: make 256x256 mask more accurate when upscaling?
    # input_img_with_mask_256 = input_img_with_mask_converted.resize((256, 256), resample=Image.BICUBIC)
    # return input_img, input_img_with_mask_64
    # Source image we are inpainting
    source_image_256 = read_image(input_img, size=256)
    source_image_64 = read_image(input_img, size=64)
    source_mask_64 = read_mask(input_img, size=64)
    source_mask_256 = F.interpolate(source_mask_64, (256, 256), mode='nearest')
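    # The mask is extracted once at 64x64 and nearest-upsampled to 256x256 so
    # that the base model and the upsampler agree on exactly which pixels are
    # being inpainted; per the notes below, reading the mask directly at
    # 256x256 is sharper but tends to leave a visible mark.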
    # source_mask_64 = (source_mask_64>0.5).float()
    # 0.75 = transparency threshold
    # pixels below 75% opacity will be deleted
    # add as an option in the future?
    # these are better but will leave a "mark"
    # source_mask_256 = read_mask(input_img, size=256)
    # source_mask_256 = (source_mask_256>0.5).float()
    # source_mask_256 = F.interpolate(source_mask_64, (256, 256), mode='nearest')
    # source_image_256 = pil_to_numpy(input_img_256)
    # source_image_64 = pil_to_numpy(input_img_64)
    # Since gradio doesn't supply which pixels were drawn, we need to find it ourselves!
    # Assuming that all black pixels are meant for inpainting.
    # input_img_with_mask_64 = input_img_with_mask.convert('L').resize((64, 64), resample=Image.BICUBIC)
    # gray_scale_source_image_64 = image_to_tensor(input_img_with_mask_64)
    # gray_scale_source_image_256 = image_to_tensor(input_img_with_mask_256)
    # source_mask_64 = (gray_scale_source_image_64!=0).float()
    # source_mask_256 = (gray_scale_source_image_256!=0).float()
    # source_mask_64_img = tensor_to_image(source_mask_64)
    # The mask should always be a boolean 64x64 mask, and then we
    # can upsample it for the second stage.
    # source_mask_64 = source_mask_64.unsqueeze(0)
    # source_mask_256 = source_mask_256.unsqueeze(0)
    # source_mask_256 = F.interpolate(source_mask_64, (256, 256), mode='nearest')
    ##############################
    # Sample from the base model #
    ##############################
    # Create the text tokens to feed to the model.
    tokens = model.tokenizer.encode(prompt)
    tokens, mask = model.tokenizer.padded_tokens_and_mask(
        tokens, options['text_ctx']
    )
    # Create the classifier-free guidance tokens (empty)
    full_batch_size = batch_size * 2
    uncond_tokens, uncond_mask = model.tokenizer.padded_tokens_and_mask(
        [], options['text_ctx']
    )
    # Pack the tokens together into model kwargs.
    global model_kwargs
    model_kwargs = dict(
        tokens=th.tensor(
            [tokens] * batch_size + [uncond_tokens] * batch_size, device=device
        ),
        mask=th.tensor(
            [mask] * batch_size + [uncond_mask] * batch_size,
            dtype=th.bool,
            device=device,
        ),
        # Masked inpainting image
        inpaint_image=(source_image_64 * source_mask_64).repeat(full_batch_size, 1, 1, 1).to(device),
        inpaint_mask=source_mask_64.repeat(full_batch_size, 1, 1, 1).to(device),
    )
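    # The first batch_size rows are conditioned on the prompt and the last
    # batch_size rows on empty tokens (for classifier-free guidance); the
    # masked source image and mask are repeated so both halves see them.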
    # Sample from the base model.
    model.del_cache()
    samples = diffusion.p_sample_loop(
        model_fn,
        (full_batch_size, 3, options["image_size"], options["image_size"]),
        device=device,
        clip_denoised=True,
        progress=True,
        model_kwargs=model_kwargs,
        cond_fn=None,
        denoised_fn=denoised_fn,
    )[:batch_size]
    model.del_cache()
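    # `samples` now holds only the conditioned half of the doubled batch
    # (the [:batch_size] slice above); the unconditional half is discarded.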
    ##############################
    # Upsample the 64x64 samples #
    ##############################
    tokens = model_up.tokenizer.encode(prompt)
    tokens, mask = model_up.tokenizer.padded_tokens_and_mask(
        tokens, options_up['text_ctx']
    )
    # Create the model conditioning dict.
    model_kwargs = dict(
        # Low-res image to upsample.
        low_res=((samples + 1) * 127.5).round() / 127.5 - 1,
        # Text tokens
        tokens=th.tensor(
            [tokens] * batch_size, device=device
        ),
        mask=th.tensor(
            [mask] * batch_size,
            dtype=th.bool,
            device=device,
        ),
        # Masked inpainting image.
        inpaint_image=(source_image_256 * source_mask_256).repeat(batch_size, 1, 1, 1).to(device),
        inpaint_mask=source_mask_256.repeat(batch_size, 1, 1, 1).to(device),
    )
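    # Note: the rounding in `low_res` quantizes the 64x64 samples to 8-bit
    # levels before rescaling to [-1, 1], presumably so the conditioning
    # matches the 8-bit images the upsampler saw during training.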
    # Sample from the upsampler model.
    model_up.del_cache()
    up_shape = (batch_size, 3, options_up["image_size"], options_up["image_size"])
    up_samples = diffusion_up.p_sample_loop(
        model_up,
        up_shape,
        noise=th.randn(up_shape, device=device) * upsample_temp,
        device=device,
        clip_denoised=True,
        progress=True,
        model_kwargs=model_kwargs,
        cond_fn=None,
        denoised_fn=denoised_fn,
    )[:batch_size]
    model_up.del_cache()
    return show_images(up_samples)
gradio_inputs = [gr.inputs.Image(type='pil',
                                 label="Input Image",
                                 image_mode="RGBA"),
                 gr.inputs.Textbox(label='Text Prompt')]
# gradio_outputs = [gr.outputs.Image(label='Auto-Detected Mask (From drawn black pixels)')]
gradio_outputs = [gr.outputs.Image(label='Inpainted Image')]
examples = [['desert_full.png', 'a car in the desert'],
            ['pumpkin.png', 'a pumpkin growing in a spooky forest'],
            ['windows.png', 'foggy california forest outside the window']]
title = "Transparency inpainting using GLIDE"
# description = "[WARNING: Queue times may take 4-6 minutes per person if there's no GPU! If there is a GPU, it'll take around 60 seconds] Using GLIDE to inpaint black regions of an input image! Instructions: 1) For the 'Input Image', upload an image. 2) For the 'Input Image with Mask', draw a black-colored mask (either manually with something like Paint, or by using gradio's built-in image editor & add a black-colored shape) IT MUST BE BLACK COLOR, but doesn't have to be rectangular! This is because it auto-detects the mask based on 0 (black) pixel values! 3) For the Conditional Text, type something you'd like to see the black region get filled in with :)"
description = "An implementation of OpenAI's GLIDE model that inpaints transparency with image data based on a text prompt.\n\n*NOTE:* Since this space currently runs on the CPU, queue times are relatively long - expect to wait 3-5 minutes for each image."
article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2112.10741' target='_blank'>GLIDE: Towards Photorealistic Image Generation and Editing with Text-Guided Diffusion Models</a> | <a href='https://github.com/openai/glide-text2im' target='_blank'>Github Repo</a></p> "
iface = gr.Interface(fn=inpaint, inputs=gradio_inputs,
                     outputs=gradio_outputs,
                     examples=examples, title=title,
                     description=description, article=article,
                     enable_queue=True)
iface.launch(enable_queue=True)