Commit c7eeeda (verified) · Parent: 64740c0
Committed by multimodalart (HF Staff)

Update app.py

Files changed (1):
  1. app.py +261 -52
app.py CHANGED
@@ -3,42 +3,201 @@ import numpy as np
 import random
 import torch
 import spaces
+import os
+import json
 
 from PIL import Image
-from diffusers import QwenImageEditPipeline
+from diffusers import QwenImageEditPipeline, FlowMatchEulerDiscreteScheduler
+from huggingface_hub import InferenceClient
+import math
+
+# --- Prompt Enhancement using Hugging Face InferenceClient ---
+def polish_prompt_hf(original_prompt, system_prompt):
+    """
+    Rewrites the prompt using a Hugging Face InferenceClient.
+    """
+    # Ensure HF_TOKEN is set
+    api_key = os.environ.get("HF_TOKEN")
+    if not api_key:
+        print("Warning: HF_TOKEN not set. Falling back to original prompt.")
+        return original_prompt
+
+    try:
+        # Initialize the client
+        client = InferenceClient(
+            provider="cerebras",
+            api_key=api_key,
+        )
+
+        # Format the messages for the chat completions API
+        messages = [
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": original_prompt}
+        ]
+
+        # Call the API
+        completion = client.chat.completions.create(
+            model="Qwen/Qwen3-235B-A22B-Instruct-2507",
+            messages=messages,
+        )
+
+        # Parse the response
+        result = completion.choices[0].message.content
+
+        # Try to extract JSON if present
+        if '{"Rewritten"' in result:
+            try:
+                # Clean up the response
+                result = result.replace('```json', '').replace('```', '')
+                result_json = json.loads(result)
+                polished_prompt = result_json.get('Rewritten', result)
+            except:
+                polished_prompt = result
+        else:
+            polished_prompt = result
+
+        polished_prompt = polished_prompt.strip().replace("\n", " ")
+        return polished_prompt
+
+    except Exception as e:
+        print(f"Error during API call to Hugging Face: {e}")
+        # Fallback to original prompt if enhancement fails
+        return original_prompt
+
+
+def polish_prompt(prompt, img):
+    """
+    Main function to polish prompts for image editing using HF inference.
+    """
+    SYSTEM_PROMPT = '''
+# Edit Instruction Rewriter
+You are a professional edit instruction rewriter. Your task is to generate a precise, concise, and visually achievable professional-level edit instruction based on the user-provided instruction and the image to be edited.
+
+Please strictly follow the rewriting rules below:
+
+## 1. General Principles
+- Keep the rewritten prompt **concise**. Avoid overly long sentences and reduce unnecessary descriptive language.
+- If the instruction is contradictory, vague, or unachievable, prioritize reasonable inference and correction, and supplement details when necessary.
+- Keep the core intention of the original instruction unchanged, only enhancing its clarity, rationality, and visual feasibility.
+- All added objects or modifications must align with the logic and style of the edited input image's overall scene.
+
+## 2. Task Type Handling Rules
+### 1. Add, Delete, Replace Tasks
+- If the instruction is clear (already includes task type, target entity, position, quantity, attributes), preserve the original intent and only refine the grammar.
+- If the description is vague, supplement with minimal but sufficient details (category, color, size, orientation, position, etc.). For example:
+    > Original: "Add an animal"
+    > Rewritten: "Add a light-gray cat in the bottom-right corner, sitting and facing the camera"
+- Remove meaningless instructions: e.g., "Add 0 objects" should be ignored or flagged as invalid.
+- For replacement tasks, specify "Replace Y with X" and briefly describe the key visual features of X.
+
+### 2. Text Editing Tasks
+- All text content must be enclosed in English double quotes " ". Do not translate or alter the original language of the text, and do not change the capitalization.
+- **For text replacement tasks, always use the fixed template:**
+    - Replace "xx" to "yy".
+    - Replace the xx bounding box to "yy".
+- If the user does not specify text content, infer and add concise text based on the instruction and the input image's context. For example:
+    > Original: "Add a line of text" (poster)
+    > Rewritten: "Add text "LIMITED EDITION" at the top center with slight shadow"
+- Specify text position, color, and layout in a concise way.
+
+### 3. Human Editing Tasks
+- Maintain the person's core visual consistency (ethnicity, gender, age, hairstyle, expression, outfit, etc.).
+- If modifying appearance (e.g., clothes, hairstyle), ensure the new element is consistent with the original style.
+- **For expression changes, they must be natural and subtle, never exaggerated.**
+- If deletion is not specifically emphasized, the most important subject in the original image (e.g., a person, an animal) should be preserved.
+- For background change tasks, emphasize maintaining subject consistency at first.
+- Example:
+    > Original: "Change the person's hat"
+    > Rewritten: "Replace the man's hat with a dark brown beret; keep smile, short hair, and gray jacket unchanged"
+
+### 4. Style Transformation or Enhancement Tasks
+- If a style is specified, describe it concisely with key visual traits. For example:
+    > Original: "Disco style"
+    > Rewritten: "1970s disco: flashing lights, disco ball, mirrored walls, colorful tones"
+- If the instruction says "use reference style" or "keep current style," analyze the input image, extract main features (color, composition, texture, lighting, art style), and integrate them concisely.
+- **For coloring tasks, including restoring old photos, always use the fixed template:** "Restore old photograph, remove scratches, reduce noise, enhance details, high resolution, realistic, natural skin tones, clear facial features, no distortion, vintage photo restoration"
+- If there are other changes, place the style description at the end.
+
+## 3. Rationality and Logic Checks
+- Resolve contradictory instructions: e.g., "Remove all trees but keep all trees" should be logically corrected.
+- Add missing key information: if position is unspecified, choose a reasonable area based on composition (near subject, empty space, center/edges).
+
+# Output Format
+Return only the rewritten instruction text directly, without JSON formatting or any other wrapper.
+'''
+
+    # Note: We're not actually using the image in the HF version,
+    # but keeping the interface consistent
+    full_prompt = f"{SYSTEM_PROMPT}\n\nUser Input: {prompt}\n\nRewritten Prompt:"
+
+    return polish_prompt_hf(full_prompt, SYSTEM_PROMPT)
 
-import os
 
 # --- Model Loading ---
 dtype = torch.bfloat16
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
-# Load the model pipeline
-pipe = QwenImageEditPipeline.from_pretrained("Qwen/Qwen-Image-Edit", torch_dtype=dtype).to(device)
-pipe.load_lora_weights(
-    "lightx2v/Qwen-Image-Lightning", weight_name="Qwen-Image-Lightning-8steps-V1.1.safetensors"
-)
-pipe.fuse_lora()
+# Scheduler configuration for Lightning
+scheduler_config = {
+    "base_image_seq_len": 256,
+    "base_shift": math.log(3),
+    "invert_sigmas": False,
+    "max_image_seq_len": 8192,
+    "max_shift": math.log(3),
+    "num_train_timesteps": 1000,
+    "shift": 1.0,
+    "shift_terminal": None,
+    "stochastic_sampling": False,
+    "time_shift_type": "exponential",
+    "use_beta_sigmas": False,
+    "use_dynamic_shifting": True,
+    "use_exponential_sigmas": False,
+    "use_karras_sigmas": False,
+}
+
+# Initialize scheduler with Lightning config
+scheduler = FlowMatchEulerDiscreteScheduler.from_config(scheduler_config)
+
+# Load the edit pipeline with Lightning scheduler
+pipe = QwenImageEditPipeline.from_pretrained(
+    "Qwen/Qwen-Image-Edit",
+    scheduler=scheduler,
+    torch_dtype=dtype
+).to(device)
+
+# Load Lightning LoRA weights for acceleration
+try:
+    pipe.load_lora_weights(
+        "lightx2v/Qwen-Image-Lightning",
+        weight_name="Qwen-Image-Lightning-8steps-V1.1.safetensors"
+    )
+    pipe.fuse_lora()
+    print("Successfully loaded Lightning LoRA weights")
+except Exception as e:
+    print(f"Warning: Could not load Lightning LoRA weights: {e}")
+    print("Continuing with base model...")
 
 # --- UI Constants and Helpers ---
 MAX_SEED = np.iinfo(np.int32).max
 
-# --- Main Inference Function (with hardcoded negative prompt) ---
-@spaces.GPU(duration=120)
+# --- Main Inference Function ---
+@spaces.GPU(duration=60)
 def infer(
     image,
     prompt,
     seed=42,
     randomize_seed=False,
-    guidance_scale=4.0,
-    num_inference_steps=50,
+    true_guidance_scale=1.0,
+    num_inference_steps=8,  # Default to 8 steps for fast inference
+    rewrite_prompt=True,
+    num_images_per_prompt=1,
    progress=gr.Progress(track_tqdm=True),
 ):
     """
-    Generates an image using the local Qwen-Image diffusers pipeline.
+    Generates an edited image using the Qwen-Image-Edit pipeline with Lightning acceleration.
     """
-    # Hardcode the negative prompt as requested
-    negative_prompt = "text, watermark, copyright, blurry, low resolution"
+    # Hardcode the negative prompt as in the original
+    negative_prompt = " "
 
     if randomize_seed:
         seed = random.randint(0, MAX_SEED)
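
For reference, the prompt-enhancement path added in this hunk reduces to a single chat-completion call through huggingface_hub. A minimal standalone sketch, assuming HF_TOKEN is exported and a huggingface_hub release recent enough to accept the provider argument; the system and user strings are illustrative stand-ins, not the app's actual SYSTEM_PROMPT:

import os
from huggingface_hub import InferenceClient

# Same client setup, model, and message layout as polish_prompt_hf above
client = InferenceClient(provider="cerebras", api_key=os.environ["HF_TOKEN"])
completion = client.chat.completions.create(
    model="Qwen/Qwen3-235B-A22B-Instruct-2507",
    messages=[
        {"role": "system", "content": "Rewrite the edit instruction concisely."},  # stand-in system prompt
        {"role": "user", "content": "Add an animal"},                              # stand-in user instruction
    ],
)
print(completion.choices[0].message.content)

Note that the helper degrades gracefully: when HF_TOKEN is missing or the API call raises, it returns the original prompt unchanged.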
@@ -46,55 +205,89 @@ def infer(
     # Set up the generator for reproducibility
     generator = torch.Generator(device=device).manual_seed(seed)
 
-    print(f"Calling pipeline with prompt: '{prompt}'")
+    print(f"Original prompt: '{prompt}'")
     print(f"Negative Prompt: '{negative_prompt}'")
-    print(f"Seed: {seed}, Steps: {num_inference_steps}, Guidance: {guidance_scale}")
+    print(f"Seed: {seed}, Steps: {num_inference_steps}, Guidance: {true_guidance_scale}")
+
+    if rewrite_prompt:
+        prompt = polish_prompt(prompt, image)
+        print(f"Rewritten Prompt: {prompt}")
 
-    # Generate the image
-    image = pipe(
-        image,
-        prompt=prompt,
-        negative_prompt=negative_prompt,
-        num_inference_steps=num_inference_steps,
-        generator=generator,
-        true_cfg_scale=1.0,
-        guidance_scale=guidance_scale  # Use a fixed default for distilled guidance
-    ).images[0]
+    # Generate the edited image
+    try:
+        images = pipe(
+            image,
+            prompt=prompt,
+            negative_prompt=negative_prompt,
+            num_inference_steps=num_inference_steps,
+            generator=generator,
+            true_cfg_scale=true_guidance_scale,
+            num_images_per_prompt=num_images_per_prompt
+        ).images
+    except Exception as e:
+        print(f"Error during inference: {e}")
+        raise e
 
-    return image, seed
+    return images, seed
 
 # --- Examples and UI Layout ---
-examples = []
+examples = [
+    # You can add example pairs of [image_path, prompt] here
+    # ["path/to/image1.jpg", "Replace the background with a beach scene"],
+    # ["path/to/image2.jpg", "Add a red hat to the person"],
+]
 
 css = """
 #col-container {
     margin: 0 auto;
     max-width: 1024px;
 }
+#logo-title {
+    text-align: center;
+}
+#logo-title img {
+    width: 400px;
+}
 #edit_text{margin-top: -62px !important}
 """
 
 with gr.Blocks(css=css) as demo:
     with gr.Column(elem_id="col-container"):
-        gr.HTML('<img src="https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Image/qwen_image_logo.png" alt="Qwen-Image Logo" width="400" style="display: block; margin: 0 auto;">')
-        gr.HTML('<h1 style="text-align: center;margin-left: 80px;color: #5b47d1;font-style: italic;">Edit Fast</h1>', elem_id="edit_text")
-        gr.Markdown("[Learn more](https://github.com/QwenLM/Qwen-Image) about the Qwen-Image series. Try on [Qwen Chat](https://chat.qwen.ai/), or [download model](https://huggingface.co/Qwen/Qwen-Image-Edit) to run locally with ComfyUI or diffusers.")
+        gr.HTML("""
+        <div id="logo-title">
+            <img src="https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Image/qwen_image_edit_logo.png" alt="Qwen-Image Edit Logo" width="400" style="display: block; margin: 0 auto;">
+            <h2 style="font-style: italic;color: #5b47d1;margin-top: -33px !important;margin-left: 133px;">Fast, 8-steps with Lightning LoRA</h2>
+        </div>
+        """)
+        gr.Markdown("""
+        [Learn more](https://github.com/QwenLM/Qwen-Image) about the Qwen-Image series.
+        This demo uses the [Qwen-Image-Lightning](https://huggingface.co/lightx2v/Qwen-Image-Lightning) LoRA for accelerated inference.
+        Try on [Qwen Chat](https://chat.qwen.ai/), or [download model](https://huggingface.co/Qwen/Qwen-Image-Edit) to run locally with ComfyUI or diffusers.
+        """)
+
         with gr.Row():
             with gr.Column():
-                input_image = gr.Image(label="Input Image", show_label=False, type="pil")
-                prompt = gr.Text(
-                    label="Prompt",
-                    show_label=False,
-                    placeholder="describe the edit instruction",
-                    container=False,
+                input_image = gr.Image(
+                    label="Input Image",
+                    show_label=True,
+                    type="pil"
                 )
-            run_button = gr.Button("Edit!", variant="primary")
-
-        result = gr.Image(label="Result", show_label=False, type="pil")
+            result = gr.Gallery(
+                label="Result",
+                show_label=True,
+                type="pil"
+            )
+
+        with gr.Row():
+            prompt = gr.Text(
+                label="Edit Instruction",
+                show_label=False,
+                placeholder="Describe the edit instruction (e.g., 'Replace the background with a sunset', 'Add a red hat', 'Remove the person')",
+                container=False,
+            )
+            run_button = gr.Button("Edit!", variant="primary")
 
         with gr.Accordion("Advanced Settings", open=False):
-            # Negative prompt UI element is removed here
-
             seed = gr.Slider(
                 label="Seed",
                 minimum=0,

@@ -106,23 +299,38 @@ with gr.Blocks(css=css) as demo:
             randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
 
             with gr.Row():
-                guidance_scale = gr.Slider(
-                    label="Guidance scale",
-                    minimum=0.0,
+                true_guidance_scale = gr.Slider(
+                    label="True guidance scale",
+                    minimum=1.0,
                     maximum=10.0,
                     step=0.1,
-                    value=4.0,
+                    value=1.0
                 )
 
                 num_inference_steps = gr.Slider(
                     label="Number of inference steps",
+                    minimum=4,
+                    maximum=28,
+                    step=1,
+                    value=8
+                )
+
+            with gr.Row():
+                num_images_per_prompt = gr.Slider(
+                    label="Number of images per prompt",
                     minimum=1,
-                    maximum=50,
+                    maximum=4,
                     step=1,
-                    value=8,
+                    value=1,
+                    visible=False
+                )
+
+                rewrite_prompt = gr.Checkbox(
+                    label="Enhance prompt (using HF Inference)",
+                    value=True
                 )
 
-    # gr.Examples(examples=examples, inputs=[prompt], outputs=[result, seed], fn=infer, cache_examples=False)
+    # gr.Examples(examples=examples, inputs=[input_image, prompt], outputs=[result, seed], fn=infer, cache_examples=False)
 
     gr.on(
         triggers=[run_button.click, prompt.submit],

@@ -130,11 +338,12 @@ with gr.Blocks(css=css) as demo:
         inputs=[
             input_image,
             prompt,
-            # negative_prompt is no longer an input from the UI
             seed,
             randomize_seed,
-            guidance_scale,
+            true_guidance_scale,
             num_inference_steps,
+            rewrite_prompt,
+            num_images_per_prompt,
         ],
         outputs=[result, seed],
     )
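
Outside of Gradio, the pieces this commit adds combine into the following local-usage pattern. A minimal sketch, assuming a CUDA device, a diffusers build that ships QwenImageEditPipeline, and a placeholder input file input.png:

import math
import torch
from PIL import Image
from diffusers import QwenImageEditPipeline, FlowMatchEulerDiscreteScheduler

# Lightning scheduler config from the commit (base_shift == max_shift == log 3)
scheduler = FlowMatchEulerDiscreteScheduler.from_config({
    "base_image_seq_len": 256,
    "base_shift": math.log(3),
    "invert_sigmas": False,
    "max_image_seq_len": 8192,
    "max_shift": math.log(3),
    "num_train_timesteps": 1000,
    "shift": 1.0,
    "shift_terminal": None,
    "stochastic_sampling": False,
    "time_shift_type": "exponential",
    "use_beta_sigmas": False,
    "use_dynamic_shifting": True,
    "use_exponential_sigmas": False,
    "use_karras_sigmas": False,
})

pipe = QwenImageEditPipeline.from_pretrained(
    "Qwen/Qwen-Image-Edit", scheduler=scheduler, torch_dtype=torch.bfloat16
).to("cuda")

# Fuse the 8-step Lightning LoRA so inference needs no extra adapter pass
pipe.load_lora_weights(
    "lightx2v/Qwen-Image-Lightning",
    weight_name="Qwen-Image-Lightning-8steps-V1.1.safetensors",
)
pipe.fuse_lora()

image = pipe(
    Image.open("input.png"),   # placeholder input image
    prompt="Add a light-gray cat in the bottom-right corner",
    negative_prompt=" ",
    num_inference_steps=8,     # Lightning default from the commit
    true_cfg_scale=1.0,
    generator=torch.Generator("cuda").manual_seed(42),
).images[0]
image.save("edited.png")

With true_cfg_scale left at 1.0 for the distilled Lightning weights, classifier-free guidance is effectively off, which is presumably why the app hardcodes the negative prompt to a single space.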