Spaces:

Qwen
/

Qwen-Image-Edit

Running on Zero

App Files Files Community

Qwen-Image-Edit / app.py

littlebird13

Add AoTI + FA3 (#14)

7825c36 verified 7 days ago

raw

history blame

11.9 kB

	import gradio as gr
	import numpy as np
	import random
	import torch
	import spaces

	from PIL import Image

	from optimization import optimize_pipeline_
	from qwenimage.pipeline_qwen_image_edit import QwenImageEditPipeline
	from qwenimage.transformer_qwenimage import QwenImageTransformer2DModel
	from qwenimage.qwen_fa3_processor import QwenDoubleStreamAttnProcessorFA3

	import os
	import base64
	import json

	# Instruction template sent to the vision-language model ahead of the user's
	# edit request: polish_prompt() interpolates it at the start of the prompt it
	# builds. The template asks for a JSON object whose "Rewritten" key holds the
	# rewritten instruction, which is the key polish_prompt() reads back.
	# NOTE(review): the ```json fence opened in the output-format example is never
	# closed before the string ends - confirm whether that is intentional.
	SYSTEM_PROMPT = '''
	# Edit Instruction Rewriter
	You are a professional edit instruction rewriter. Your task is to generate a precise, concise, and visually achievable professional-level edit instruction based on the user-provided instruction and the image to be edited.

	Please strictly follow the rewriting rules below:

	## 1. General Principles
	- Keep the rewritten prompt concise. Avoid overly long sentences and reduce unnecessary descriptive language.
	- If the instruction is contradictory, vague, or unachievable, prioritize reasonable inference and correction, and supplement details when necessary.
	- Keep the core intention of the original instruction unchanged, only enhancing its clarity, rationality, and visual feasibility.
	- All added objects or modifications must align with the logic and style of the edited input image’s overall scene.

	## 2. Task Type Handling Rules
	### 1. Add, Delete, Replace Tasks
	- If the instruction is clear (already includes task type, target entity, position, quantity, attributes), preserve the original intent and only refine the grammar.
	- If the description is vague, supplement with minimal but sufficient details (category, color, size, orientation, position, etc.). For example:
	> Original: "Add an animal"
	> Rewritten: "Add a light-gray cat in the bottom-right corner, sitting and facing the camera"
	- Remove meaningless instructions: e.g., "Add 0 objects" should be ignored or flagged as invalid.
	- For replacement tasks, specify "Replace Y with X" and briefly describe the key visual features of X.

	### 2. Text Editing Tasks
	- All text content must be enclosed in English double quotes `" "`. Do not translate or alter the original language of the text, and do not change the capitalization.
	- For text replacement tasks, always use the fixed template:
	- `Replace "xx" to "yy"`.
	- `Replace the xx bounding box to "yy"`.
	- If the user does not specify text content, infer and add concise text based on the instruction and the input image’s context. For example:
	> Original: "Add a line of text" (poster)
	> Rewritten: "Add text \"LIMITED EDITION\" at the top center with slight shadow"
	- Specify text position, color, and layout in a concise way.

	### 3. Human Editing Tasks
	- Maintain the person’s core visual consistency (ethnicity, gender, age, hairstyle, expression, outfit, etc.).
	- If modifying appearance (e.g., clothes, hairstyle), ensure the new element is consistent with the original style.
	- For expression changes, they must be natural and subtle, never exaggerated.
	- If deletion is not specifically emphasized, the most important subject in the original image (e.g., a person, an animal) should be preserved.
	- For background change tasks, emphasize maintaining subject consistency at first.
	- Example:
	> Original: "Change the person’s hat"
	> Rewritten: "Replace the man’s hat with a dark brown beret; keep smile, short hair, and gray jacket unchanged"

	### 4. Style Transformation or Enhancement Tasks
	- If a style is specified, describe it concisely with key visual traits. For example:
	> Original: "Disco style"
	> Rewritten: "1970s disco: flashing lights, disco ball, mirrored walls, colorful tones"
	- If the instruction says "use reference style" or "keep current style," analyze the input image, extract main features (color, composition, texture, lighting, art style), and integrate them concisely.
	- For coloring tasks, including restoring old photos, always use the fixed template: "Restore old photograph, remove scratches, reduce noise, enhance details, high resolution, realistic, natural skin tones, clear facial features, no distortion, vintage photo restoration"
	- If there are other changes, place the style description at the end.

	## 3. Rationality and Logic Checks
	- Resolve contradictory instructions: e.g., "Remove all trees but keep all trees" should be logically corrected.
	- Add missing key information: if position is unspecified, choose a reasonable area based on composition (near subject, empty space, center/edges).

	# Output Format Example
	```json
	{
	"Rewritten": "..."
	}
	'''

	def polish_prompt(prompt, img, max_attempts=3):
	    """Rewrite a user edit instruction via the vision-language model.

	    The request is SYSTEM_PROMPT followed by the user's instruction; it is
	    sent together with the image through api(). The reply is expected to be
	    JSON (optionally wrapped in a Markdown code fence) with a "Rewritten" key.

	    Args:
	        prompt: The user's raw edit instruction.
	        img: The image being edited; forwarded to api() as a one-item list.
	        max_attempts: How many times to call the model before giving up.

	    Returns:
	        The rewritten instruction as a single line of text. If every attempt
	        fails (API error, missing credentials, unparsable reply), the
	        original ``prompt`` is returned unchanged so the edit can still run.
	    """
	    request = f"{SYSTEM_PROMPT}\n\nUser Input: {prompt}\n\nRewritten Prompt:"

	    # Bounded retries: an unbounded loop never terminates when the failure is
	    # permanent (e.g. the API key is not configured).
	    for _ in range(max_attempts):
	        try:
	            result = api(request, [img])
	            if isinstance(result, str):
	                # Strip a surrounding Markdown fence before parsing.
	                result = result.replace('```json', '').replace('```', '')
	            result = json.loads(result)
	            polished_prompt = result['Rewritten'].strip()
	            return polished_prompt.replace("\n", " ")
	        except Exception as e:
	            # Best effort: rewriting is optional, so log and try again.
	            print(f"[Warning] Error during API call: {e}")

	    print("[Warning] Prompt rewriting failed; using the original prompt.")
	    return prompt


	def encode_image(pil_image):
	    """Return the image's PNG encoding as a base64 UTF-8 string.

	    ``pil_image`` only needs a ``save(file_obj, format=...)`` method; the
	    bytes it writes are base64-encoded and decoded to text.
	    """
	    import io

	    with io.BytesIO() as png_buffer:
	        pil_image.save(png_buffer, format="PNG")
	        png_bytes = png_buffer.getvalue()
	    encoded = base64.b64encode(png_bytes)
	    return encoded.decode("utf-8")




	def api(prompt, img_list, model="qwen-vl-max-latest", kwargs=None):
	    """Send a text prompt plus images to the DashScope multimodal chat API.

	    Args:
	        prompt: Text placed after the images in the user message.
	        img_list: Images to attach; each is PNG/base64-encoded with
	            encode_image() and sent as a ``data:`` URL.
	        model: Model name; only "qwen-vl-max-latest" is accepted.
	        kwargs: Optional dict of extra options. Only ``response_format`` is
	            read and forwarded to the API call.

	    Returns:
	        The ``text`` field of the first content item of the first choice.

	    Raises:
	        EnvironmentError: If the DASH_API_KEY environment variable is unset.
	        ValueError: If ``model`` is not a supported model name.
	        RuntimeError: If the API responds with a status code other than 200.
	    """
	    # Validate configuration first so problems are reported without needing
	    # the SDK import or a network round trip.
	    api_key = os.environ.get('DASH_API_KEY')
	    if not api_key:
	        raise EnvironmentError("DASH_API_KEY is not set")
	    if model not in ("qwen-vl-max-latest",):
	        raise ValueError(f"Not implemented model {model}")

	    import dashscope

	    # A None default avoids sharing one dict object across calls.
	    options = {} if kwargs is None else kwargs

	    system_prompt = "you are a helpful assistant, you should provide useful answers to users."
	    # Images come first, followed by a single text item.
	    user_content = [
	        {"image": f"data:image/png;base64,{encode_image(img)}"}
	        for img in img_list
	    ]
	    user_content.append({"text": f"{prompt}"})
	    messages = [
	        {"role": "system", "content": system_prompt},
	        {"role": "user", "content": user_content},
	    ]

	    response = dashscope.MultiModalConversation.call(
	        api_key=api_key,
	        model=model,
	        messages=messages,
	        result_format='message',
	        response_format=options.get('response_format', None),
	    )

	    if response.status_code != 200:
	        raise RuntimeError(f'Failed to post: {response}')
	    return response.output.choices[0].message.content[0]['text']

	# --- Model Loading ---
	# Everything in this section runs at import time, before the UI is built.
	# Weights are loaded in bfloat16.
	dtype = torch.bfloat16
	# Use the GPU when torch can see one, otherwise fall back to the CPU.
	device = "cuda" if torch.cuda.is_available() else "cpu"

	# Load the model pipeline
	pipe = QwenImageEditPipeline.from_pretrained("Qwen/Qwen-Image-Edit", torch_dtype=dtype).to(device)
	# Rebind the already-loaded transformer instance to the local
	# QwenImageTransformer2DModel class (imported from qwenimage.transformer_qwenimage),
	# so method lookups resolve to that implementation.
	pipe.transformer.__class__ = QwenImageTransformer2DModel
	# Install the attention processor from qwenimage.qwen_fa3_processor.
	# NOTE(review): "FA3" presumably means FlashAttention-3 - confirm in that module.
	pipe.transformer.set_attn_processor(QwenDoubleStreamAttnProcessorFA3())

	# --- Ahead-of-time compilation ---
	# Called with a blank 1024x1024 RGB image and a placeholder prompt.
	# NOTE(review): presumably these act as example inputs for the compilation
	# step and the pipeline is modified in place - verify in optimization.py.
	optimize_pipeline_(pipe, image=Image.new("RGB", (1024, 1024)), prompt="prompt")

	# --- UI Constants and Helpers ---
	# Largest signed 32-bit integer; upper bound for both the seed slider and
	# the random seed drawn in infer().
	MAX_SEED = np.iinfo(np.int32).max

	# --- Main Inference Function (with hardcoded negative prompt) ---
	@spaces.GPU(duration=120)
	def infer(
	    image,
	    prompt,
	    seed=120,
	    randomize_seed=False,
	    true_guidance_scale=4.0,
	    num_inference_steps=50,
	    rewrite_prompt=True,
	    progress=gr.Progress(track_tqdm=True),
	):
	    """Edit ``image`` according to ``prompt`` with the loaded pipeline.

	    Args:
	        image: The input image to edit (the UI supplies a PIL image).
	        prompt: The user's edit instruction.
	        seed: Seed for the random generator; ignored when
	            ``randomize_seed`` is true.
	        randomize_seed: Draw a fresh seed in [0, MAX_SEED] instead of
	            using ``seed``.
	        true_guidance_scale: Passed to the pipeline as ``true_cfg_scale``.
	        num_inference_steps: Number of inference steps for the pipeline.
	        rewrite_prompt: When true, the instruction is first rewritten by
	            polish_prompt().
	        progress: Gradio progress tracker (tracks tqdm output).

	    Returns:
	        A tuple ``(edited_image, seed)`` holding the first generated image
	        and the seed that was actually used.

	    Raises:
	        gr.Error: If no input image was provided.
	    """
	    # Fail with a readable UI message instead of an opaque pipeline error.
	    if image is None:
	        raise gr.Error("Please upload an image to edit.")

	    # The negative prompt is fixed; it is not exposed in the UI.
	    negative_prompt = " "

	    if randomize_seed:
	        seed = random.randint(0, MAX_SEED)

	    # UI sliders may deliver numbers as floats; the generator seed and the
	    # step count must be integers.
	    seed = int(seed)
	    num_inference_steps = int(num_inference_steps)

	    # Seeded generator so a given seed reproduces the same result.
	    generator = torch.Generator(device=device).manual_seed(seed)

	    print(f"Calling pipeline with prompt: '{prompt}'")
	    print(f"Negative Prompt: '{negative_prompt}'")
	    print(f"Seed: {seed}, Steps: {num_inference_steps}, Guidance: {true_guidance_scale}")
	    if rewrite_prompt:
	        prompt = polish_prompt(prompt, image)
	        print(f"Rewritten Prompt: {prompt}")

	    # Generate the image
	    images = pipe(
	        image,
	        prompt=prompt,
	        negative_prompt=negative_prompt,
	        num_inference_steps=num_inference_steps,
	        generator=generator,
	        true_cfg_scale=true_guidance_scale,
	        num_images_per_prompt=1,
	    ).images

	    return images[0], seed

	# --- Examples and UI Layout ---
	# NOTE(review): this list is empty and is not referenced by any code visible
	# in this file; the gr.Examples call below defines its own rows inline.
	examples = []

	# Stylesheet handed to gr.Blocks(css=...). "#col-container" matches the
	# elem_id of the main column: it centers the column and caps its width.
	# NOTE(review): no component in the visible code sets elem_id="edit_text",
	# so the second rule may be unused - confirm.
	css = """
	#col-container {
	margin: 0 auto;
	max-width: 1024px;
	}
	#edit_text{
	margin-top: -62px !important
	}
	"""

	# Build the Gradio interface and wire it to infer().
	# NOTE(review): the indentation of this section is flattened in this copy of
	# the file, so the nesting of the layout contexts below cannot be read off
	# the code; it must be restored before the module can run.
	with gr.Blocks(css=css) as demo:
	with gr.Column(elem_id="col-container"):
	# Header: logo image followed by links to the project, chat, and model page.
	gr.HTML('<img src="https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Image/qwen_image_edit_logo.png" alt="Qwen-Image Logo" width="400" style="display: block; margin: 0 auto;">')
	gr.Markdown("[Learn more](https://github.com/QwenLM/Qwen-Image) about the Qwen-Image series. Try on [Qwen Chat](https://chat.qwen.ai/), or [download model](https://huggingface.co/Qwen/Qwen-Image-Edit) to run locally with ComfyUI or diffusers.")
	with gr.Row():
	with gr.Column():
	# Input image is delivered to infer() as a PIL image (type="pil").
	input_image = gr.Image(label="Input Image", show_label=False, type="pil")

	# Output component that receives the edited image.
	result = gr.Image(label="Result", show_label=False, type="pil")
	with gr.Row():
	# Single-line edit instruction; submitting it also triggers infer (see gr.on).
	prompt = gr.Text(
	label="Prompt",
	show_label=False,
	placeholder="describe the edit instruction",
	container=False,
	)
	run_button = gr.Button("Edit!", variant="primary")

	with gr.Accordion("Advanced Settings", open=False):
	# Negative prompt UI element is removed here

	# The slider starts at 0, but "Randomize seed" is checked by default, in
	# which case infer() replaces the slider value with a random seed.
	seed = gr.Slider(
	label="Seed",
	minimum=0,
	maximum=MAX_SEED,
	step=1,
	value=0,
	)

	randomize_seed = gr.Checkbox(label="Randomize seed", value=True)

	with gr.Row():

	# Forwarded to the pipeline as true_cfg_scale by infer().
	true_guidance_scale = gr.Slider(
	label="True guidance scale",
	minimum=1.0,
	maximum=10.0,
	step=0.1,
	value=4.0
	)

	num_inference_steps = gr.Slider(
	label="Number of inference steps",
	minimum=1,
	maximum=50,
	step=1,
	value=50,
	)

	# When checked, infer() rewrites the instruction via polish_prompt().
	rewrite_prompt = gr.Checkbox(label="Rewrite prompt", value=True)

	# Example rows fill only [input_image, prompt]; when they are run through
	# fn=infer the remaining arguments take infer()'s own defaults (seed=120,
	# randomize_seed=False, ...), not the widget defaults above.
	# NOTE(review): the image file names are presumably resolved relative to the
	# app's working directory - confirm the files ship with the Space.
	gr.Examples(examples=[
	["neon_sign.png", "change the text to read 'Qwen Image Edit is here'"],
	["cat_sitting.jpg", "make the cat floating in the air and holding a sign that reads 'this is fun' written with a blue crayon"],
	["pie.png", "turn the style of the photo to vintage comic book"]],
	inputs=[input_image, prompt],
	outputs=[result, seed],
	fn=infer,
	cache_examples="lazy")

	# Run infer() when the button is clicked or the prompt is submitted. The
	# returned image goes to `result` and the seed actually used is written
	# back into the seed slider.
	gr.on(
	triggers=[run_button.click, prompt.submit],
	fn=infer,
	inputs=[
	input_image,
	prompt,
	seed,
	randomize_seed,
	true_guidance_scale,
	num_inference_steps,
	rewrite_prompt,
	],
	outputs=[result, seed],
	)

	# Start the Gradio server only when this file is executed as a script.
	if __name__ == "__main__":
	demo.launch()