Spaces:

siijiawei
/

gorgeous

Sleeping

gorgeous / app.py

Kam-Woh Ng

push to hf

5ba0490 5 months ago

13.9 kB

	import glob
	import os
	import random

	import gradio as gr
	import numpy as np
	import torch
	import torch.utils.checkpoint
	from PIL import Image
	from diffusers import (
	AutoencoderKL,
	UNet2DConditionModel,
	UniPCMultistepScheduler,
	)
	from diffusers import StableDiffusionControlNetInpaintPipeline, ControlNetModel
	from torchvision.transforms import transforms
	from transformers import AutoTokenizer, PretrainedConfig

	from face_parsing import inference as face_parsing_inference

	# ----------------------------------------------------------------

	# Define model paths and other parameters

	# sd 1.5
	# pretrained_model_name_or_path = "stable-diffusion-v1-5/stable-diffusion-v1-5"
	# controlnet_path = "siijiawei/gorgeous-mafor-sd1-5"

	# sd 2.1
	pretrained_model_name_or_path = "stabilityai/stable-diffusion-2-1-base"
	controlnet_path = "siijiawei/gorgeous-mafor-sd2-1"

	image_sets = sorted(glob.glob("makeup_assets/*"))
	textual_inversion_paths = sorted(glob.glob("makeup_assets/*"))

	prompt_template = "A woman with {} makeup on face"
	MAX_SEED = np.iinfo(np.int32).max
	device = "cuda" if torch.cuda.is_available() else "cpu"
	dtype = torch.float16 if torch.cuda.is_available() else torch.float32


	# ----------------------------------------------------------------


	def import_model_class_from_model_name_or_path(
	pretrained_model_name_or_path: str, revision: str
	):
	text_encoder_config = PretrainedConfig.from_pretrained(
	pretrained_model_name_or_path,
	subfolder="text_encoder",
	revision=revision,
	)
	model_class = text_encoder_config.architectures[0]

	if model_class == "CLIPTextModel":
	from transformers import CLIPTextModel

	return CLIPTextModel
	elif model_class == "RobertaSeriesModelWithTransformation":
	from diffusers.pipelines.alt_diffusion.modeling_roberta_series import (
	RobertaSeriesModelWithTransformation,
	)

	return RobertaSeriesModelWithTransformation
	else:
	raise ValueError(f"{model_class} is not supported.")


	# ----------------------------------------------------------------

	# Initialize components
	tokenizer = AutoTokenizer.from_pretrained(
	pretrained_model_name_or_path,
	subfolder="tokenizer",
	use_fast=False,
	)
	text_encoder_cls = import_model_class_from_model_name_or_path(
	pretrained_model_name_or_path, "main"
	)
	text_encoder = text_encoder_cls.from_pretrained(
	pretrained_model_name_or_path, subfolder="text_encoder"
	)
	vae = AutoencoderKL.from_pretrained(pretrained_model_name_or_path, subfolder="vae")
	unet = UNet2DConditionModel.from_pretrained(
	pretrained_model_name_or_path, subfolder="unet"
	)
	controlnet = ControlNetModel.from_pretrained(
	controlnet_path,
	use_safetensors=True,
	torch_dtype=torch.float16,
	# subfolder="controlnet",
	).to(device)

	vae.to(device, dtype=dtype)
	unet.to(device, dtype=dtype)
	text_encoder.to(device, dtype=dtype)
	controlnet.to(device, dtype=dtype)

	pipeline = StableDiffusionControlNetInpaintPipeline.from_pretrained(
	pretrained_model_name_or_path,
	vae=vae,
	text_encoder=text_encoder,
	tokenizer=tokenizer,
	unet=unet,
	controlnet=controlnet,
	safety_checker=None,
	torch_dtype=dtype,
	use_safetensors=True,
	)
	pipeline.scheduler = UniPCMultistepScheduler.from_config(pipeline.scheduler.config)
	pipeline.to(device)

	textual_inversion_tokens = [f"<v{i}>" for i in range(len(textual_inversion_paths))]
	pipeline.load_textual_inversion(textual_inversion_paths, token=textual_inversion_tokens)

	generator = torch.Generator(device=device).manual_seed(42)

	preprocess_transform = transforms.Compose(
	[transforms.Resize(512), transforms.CenterCrop(512)]
	)


	# ----------------------------------------------------------------


	# Helper functions
	def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
	if randomize_seed:
	seed = random.randint(0, MAX_SEED)
	return seed


	def make_inpaint_condition(image, image_mask):
	image = np.array(image.convert("RGB")).astype(np.float32) / 255.0
	image_mask = np.array(image_mask.convert("L")).astype(np.float32) / 255.0
	assert image.shape[0:1] == image_mask.shape[0:1]
	image[image_mask > 0.5] = -1.0 # set as masked pixel
	image = np.expand_dims(image, 0).transpose(0, 3, 1, 2)
	image = torch.from_numpy(image)
	return image


	# ----------------------------------------------------------------


	def create_image(
	idea_set_target,
	input_image,
	prompt,
	n_prompt,
	control_scale,
	guidance_scale,
	num_inference_steps,
	seed,
	):
	if input_image is not None:
	# Generate mask
	input_image_path = "input_image.png"
	input_image.save(input_image_path)

	input_image = preprocess_transform(input_image)
	mask_image = face_parsing_inference.get_face_mask(input_image).convert("L")

	print("idea_set_target", idea_set_target)

	set_index = int(idea_set_target.split(":")[0].replace("Set ", "")) - 1 # start from 1

	# Prepare prompt
	token = textual_inversion_tokens[set_index]
	prompt = prompt.replace("{}", token)
	print(prompt)

	# Generate image
	blurred_mask = pipeline.mask_processor.blur(mask_image, blur_factor=10)
	masked_image = make_inpaint_condition(input_image, blurred_mask)

	generator = torch.Generator(device=device).manual_seed(seed)
	with torch.autocast("cuda"):
	output = pipeline(
	prompt=prompt,
	image=input_image,
	mask_image=blurred_mask,
	control_image=input_image,
	num_inference_steps=int(num_inference_steps),
	generator=generator,
	negative_prompt=n_prompt,
	controlnet_conditioning_scale=float(control_scale),
	guidance_scale=float(guidance_scale),
	)

	output_image = output.images[0]
	return output_image
	return None


	# ----------------------------------------------------------------


	def read_image_from_dirpath(dirpath):
	img_paths = sorted(
	glob.glob(dirpath + "/*.png")
	+ glob.glob(dirpath + "/*.jpeg")
	+ glob.glob(dirpath + "/*.jpg")
	)
	imgs = [Image.open(p) for p in img_paths[:5]]

	if len(imgs) < 5:
	imgs += [Image.new(mode="RGB", size=(200, 200)) for _ in range(5 - len(imgs))]

	return imgs




	image_sets = [
	{
	"label": f"Set {i + 1}: {os.path.basename(image_sets[i])}",
	"images": read_image_from_dirpath(image_sets[i]),
	}
	for i in range(len(image_sets))
	]

	labels = [image_set["label"] for image_set in image_sets]

	def display_images(set_label):
	print("?")
	set_index = int(set_label.split(":")[0].replace("Set ", "")) - 1 # start from 1
	image_set = image_sets[set_index]
	return [image_set["label"]] + image_set["images"]


	# ----------------------------------------------------------------

	# Gradio UI setup
	block = gr.Blocks(
	css="""
	footer {visibility: hidden}
	.title-background {
	background-color: #f7e4da; /* Light brown background */
	color: #1d1d1d; /* Dark text color */
	padding: 20px; /* Padding for top and bottom */
	text-align: center;
	width: 100%; /* Set width to 100% */
	margin: 0 auto; /* Center alignment */
	max-width: 1200px; /* Max width to keep content centered */
	box-sizing: border-box; /* Ensure padding is inside the box model */
	}
	.gr-button {
	background-color: #c2410c !important; /* Brown color for buttons */
	color: white !important; /* Text color */
	}
	.gr-dropdown, .gr-slider, .gr-textbox {
	border-color: #c2410c !important; /* Brown color for borders */
	}
	.gr-label, .gr-markdown {
	color: #c2410c !important; /* Brown color for text */
	}
	.content-description {
	text-align: center;
	max-width: 1200px; /* Ensure same max width as title */
	margin: 0 auto; /* Center alignment */
	box-sizing: border-box;
	}
	"""
	).queue(max_size=10, api_open=False)

	with block:
	# Title with background
	gr.Markdown(
	"""
	<div class="title-background">
	<h1 style='font-weight: 10px; font-size: 40px;'>💄<b>Gorgeous</b>: Creating Narrative-Driven Makeup Ideas via Image Prompt 💡</h1>
	</div>
	"""
	)
	# Description with center alignment
	gr.Markdown(
	"""
	<div class="content-description">Introducing \( \textbf{Gorgeous} \), a diffusion-based generative method that revolutionizes
	the makeup industry by empowering user creativity via image prompts. Unlike
	traditional makeup transfer methods that focus on replicating existing make-
	ups, Gorgeous, for the first time, empowers users to integrate narrative elements
	into makeup ideation using image prompts. The result is a makeup concept
	that vividly reflects user’s expression via images, offering imaginative makeup
	ideas for physical makeup applications. To achieve this, Gorgeous establishes a
	foundational framework, ensuring the model learns “what makeup is” before inte-
	grating narrative elements. A pseudo-pairing strategy, utilizing a face parsing and
	content-style disentangling network, addresses unpaired data challenges, enabling
	the model to do makeup training on bare faces. Users can input images repre-
	senting their ideas (e.g., fire), from which Gorgeous extracts context embeddings
	to guide our proposed makeup inpainting algorithm, conceptualizing creative,
	narrative-driven makeup ideas for targeted facial regions. Comprehensive exper-
	iments underscore the effectiveness of Gorgeous, paving a way for a
	new dimension in digital makeup artistry and application!</div>
	"""
	)

	with gr.Tabs():
	with gr.Row():
	with gr.Column():
	with gr.Row():
	image_pil = gr.Image(
	label="Targeted face (e.g., your face)", type="pil", height=256
	)
	generated_image = gr.Image(
	label="Generated Image", type="pil", height=256
	)

	with gr.Row():
	set_dropdown = gr.Dropdown(
	choices=[
	labels[i]
	for i in range(len(image_sets))
	],
	label="Select Image Set",
	value=labels[0],
	)
	image_label = gr.Label()
	image_boxes = [gr.Image() for _ in range(5)]

	set_dropdown.change(
	display_images,
	set_dropdown,
	outputs=[image_label] + image_boxes,
	)

	with gr.Row():
	scale = gr.Slider(
	minimum=0,
	maximum=30,
	step=0.01,
	value=20.0,
	label="Guidance scale (Adjust the slider to steer the influence of the idea chosen on the generation.)",
	)
	control_scale = gr.Slider(
	minimum=0,
	maximum=1,
	step=0.01,
	value=1,
	label="Control scale (Adjust the slider to control face fidelity.)",
	)
	num_inference_steps = gr.Slider(
	minimum=20,
	maximum=100,
	step=1,
	value=50,
	label="Number of inference steps",
	)

	# prompt_template = "A woman with {} makeup on face"

	with gr.Row():
	prompt = gr.Textbox(
	label='Prompt (the set is represented by "{}")',
	value="A photo of a woman with {} on face",
	)

	with gr.Row():
	n_prompt = gr.Textbox(
	label="Negative Prompt",
	value="worst quality, normal quality, low quality, low res, blurry, text, watermark, logo, banner, extra digits, cropped, jpeg artifacts, signature, username, error, sketch ,duplicate, ugly, monochrome, horror, geometry, mutation, disgusting",
	)

	with gr.Row():
	seed = gr.Slider(
	minimum=0, maximum=MAX_SEED, value=1, step=1, label="Seed Value"
	)
	randomize_seed = gr.Checkbox(label="Randomize seed", value=True)

	generate_button = gr.Button("Generate Image")

	generate_button.click(
	fn=randomize_seed_fn,
	inputs=[seed, randomize_seed],
	outputs=seed,
	queue=False,
	api_name=False,
	).then(
	fn=create_image,
	inputs=[
	set_dropdown,
	image_pil,
	prompt,
	n_prompt,
	control_scale,
	scale,
	num_inference_steps,
	seed,
	],
	outputs=generated_image,
	)

	gr.Markdown("### Article")


	block.launch(debug=True)