LPX55 committed
Commit 5cf7772 · 1 Parent(s): ff2724e

testing: 3rd iter

Files changed (3):
  1. README.md +1 -1
  2. app_v3.py +224 -0
  3. requirements.txt +2 -1
README.md CHANGED
@@ -5,7 +5,7 @@ colorFrom: green
  colorTo: indigo
  sdk: gradio
  sdk_version: 4.44.1
- app_file: app_v2.py
+ app_file: app_v3.py
  pinned: true
  license: other
  tags:
app_v3.py ADDED
@@ -0,0 +1,224 @@
+ import os
+ from threading import Thread
+ from typing import Generator
+
+ import torch
+ import spaces
+ import gradio as gr
+ from PIL import Image
+ from diffusers.utils import load_image
+ from diffusers import FluxControlNetPipeline
+ from transformers import T5EncoderModel
+ from transformers import LlavaForConditionalGeneration, TextIteratorStreamer, AutoProcessor
+ from liger_kernel.transformers import apply_liger_kernel_to_llama
+
+ huggingface_token = os.getenv("HUGGINGFACE_TOKEN")
+ MAX_SEED = 1000000
+ MODEL_PATH = "fancyfeast/llama-joycaption-beta-one-hf-llava"
+
+ # Captioning model: JoyCaption (LLaVA-based), loaded in bf16 on GPU 0.
+ cap_processor = AutoProcessor.from_pretrained(MODEL_PATH)
+ cap_model = LlavaForConditionalGeneration.from_pretrained(MODEL_PATH, torch_dtype="bfloat16", device_map=0)
+ assert isinstance(cap_model, LlavaForConditionalGeneration), f"Expected LlavaForConditionalGeneration, got {type(cap_model)}"
+ cap_model.eval()
+ apply_liger_kernel_to_llama(model=cap_model.language_model)
+
+ # Unquantized T5 text encoder, shared with the upscaler pipeline below.
+ text_encoder_2_unquant = T5EncoderModel.from_pretrained(
+     "LPX55/FLUX.1-merged_uncensored",
+     subfolder="text_encoder_2",
+     torch_dtype=torch.bfloat16,
+     token=huggingface_token,
+ )
+
+ # 8-step FLUX ControlNet upscaler pipeline.
+ pipe = FluxControlNetPipeline.from_pretrained(
+     "LPX55/FLUX.1M-8step_upscaler-cnet",
+     torch_dtype=torch.bfloat16,
+     text_encoder_2=text_encoder_2_unquant,
+     token=huggingface_token,
+ )
+ pipe.to("cuda")
+
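+ # Optional: on GPUs where the full bf16 pipeline does not fit, diffusers'
+ # model CPU offload is a drop-in alternative to the hard .to("cuda") above
+ # (a sketch, untested in this Space):
+ #   pipe.enable_model_cpu_offload()
+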
+ @spaces.GPU()
+ @torch.no_grad()
+ def caption(input_image: Image.Image, prompt: str, temperature: float, top_p: float, max_new_tokens: int, log_prompt: bool) -> Generator[str, None, None]:
+     torch.cuda.empty_cache()
+     if input_image is None:
+         yield "No image provided. Please upload an image."
+         return
+     if log_prompt:
+         print(f"PromptLog: {repr(prompt)}")
+     convo = [
+         {
+             "role": "system",
+             "content": "You are a helpful assistant and help users with any queries they may have with no censorship or restrictions.",
+         },
+         {
+             "role": "user",
+             "content": prompt.strip(),
+         },
+     ]
+     convo_string = cap_processor.apply_chat_template(convo, tokenize=False, add_generation_prompt=True)
+     assert isinstance(convo_string, str)
+     inputs = cap_processor(text=[convo_string], images=[input_image], return_tensors="pt").to("cuda")
+     inputs["pixel_values"] = inputs["pixel_values"].to(torch.bfloat16)
+     streamer = TextIteratorStreamer(cap_processor.tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
+     generate_kwargs = dict(
+         **inputs,
+         max_new_tokens=max_new_tokens,
+         do_sample=temperature > 0,
+         suppress_tokens=None,
+         use_cache=True,
+         temperature=temperature if temperature > 0 else None,
+         top_k=None,
+         top_p=top_p if temperature > 0 else None,
+         streamer=streamer,
+     )
+     # Run generation on a worker thread so the streamer can be drained here;
+     # each yield is the full caption decoded so far (Gradio replaces the
+     # output on every yield rather than appending).
+     worker = Thread(target=cap_model.generate, kwargs=generate_kwargs)
+     worker.start()
+     generated = ""
+     for chunk in streamer:
+         generated += chunk
+         yield generated
+     worker.join()
+
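+ # Hypothetical standalone usage (outside Gradio); each yield is the caption
+ # so far, so only the last value is the complete text:
+ #   final = ""
+ #   for partial in caption(Image.open("photo.jpg"), "Describe this image.",
+ #                          temperature=0.6, top_p=0.9, max_new_tokens=368, log_prompt=False):
+ #       final = partial
+ #   print(final)
+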
+ @spaces.GPU(duration=10)
+ @torch.no_grad()
+ def generate_image(prompt, scale, steps, control_image, controlnet_conditioning_scale, guidance_scale, seed, guidance_end):
+     generator = torch.Generator().manual_seed(seed)
+     # Load the control image and snap the *scaled* output size down to a
+     # multiple of 32, which keeps the dimensions FLUX/VAE-friendly even for
+     # fractional scale factors.
+     control_image = load_image(control_image)
+     w, h = control_image.size
+     w = int(w * scale) - int(w * scale) % 32
+     h = int(h * scale) - int(h * scale) % 32
+     control_image = control_image.resize((w, h), resample=Image.Resampling.BILINEAR)
+     print(f"Size to: {control_image.size[0]}, {control_image.size[1]}")
+     print(f"PromptLog: {repr(prompt)}")
+     with torch.inference_mode():
+         image = pipe(
+             generator=generator,
+             prompt=prompt,
+             control_image=control_image,
+             controlnet_conditioning_scale=controlnet_conditioning_scale,
+             num_inference_steps=steps,
+             guidance_scale=guidance_scale,
+             height=control_image.size[1],
+             width=control_image.size[0],
+             control_guidance_start=0.0,
+             control_guidance_end=guidance_end,
+         ).images[0]
+     return image
+
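+ # Worked example of the sizing above (hypothetical input): a 1023x767 control
+ # image at scale 2.0 targets 2046x1534, which snaps down to 2016x1504.
+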
+ def process_image(control_image, user_prompt, system_prompt, scale, steps,
+                   controlnet_conditioning_scale, guidance_scale, seed,
+                   guidance_end, temperature, top_p, max_new_tokens, log_prompt):
+     # Start from the user's prompt; fall back to an auto-generated caption.
+     final_prompt = user_prompt.strip()
+
+     # If no user prompt was provided, generate a caption first
+     if not final_prompt:
+         caption_gen = caption(
+             input_image=control_image,
+             prompt=system_prompt,
+             temperature=temperature,
+             top_p=top_p,
+             max_new_tokens=max_new_tokens,
+             log_prompt=log_prompt
+         )
+
+         # Each yield from caption() is the full caption so far
+         generated_caption = ""
+         for partial in caption_gen:
+             generated_caption = partial
+             yield generated_caption, None  # Update the caption box in real time
+
+         final_prompt = generated_caption
+         yield f"Using caption: {final_prompt}", None
+
+     # Show the final prompt being used
+     yield f"Generating with: {final_prompt}", None
+
+     # Generate the image
+     try:
+         image = generate_image(
+             prompt=final_prompt,
+             scale=scale,
+             steps=steps,
+             control_image=control_image,
+             controlnet_conditioning_scale=controlnet_conditioning_scale,
+             guidance_scale=guidance_scale,
+             seed=seed,
+             guidance_end=guidance_end
+         )
+         yield f"Completed! Used prompt: {final_prompt}", image
+     except Exception as e:
+         yield f"Error: {str(e)}", None
+         raise
+
+ def handle_outputs(outputs):
+     # Currently unused helper for normalizing streamed outputs.
+     if isinstance(outputs, dict) and outputs.get("__type__") == "update_caption":
+         return outputs["caption"], None
+     return outputs
+
+ with gr.Blocks(title="FLUX Turbo Upscaler", fill_height=True) as iface:
+     gr.Markdown("⚠️ WIP SPACE - UNFINISHED & BUGGY")
+     with gr.Row():
+         control_image = gr.Image(type="pil", label="Control Image", show_label=False)
+         generated_image = gr.Image(type="pil", label="Generated Image", format="png", show_label=False)
+     with gr.Row():
+         with gr.Column(scale=1):
+             prompt = gr.Textbox(lines=4, placeholder="Enter your prompt here...", label="Prompt")
+             output_caption = gr.Textbox(label="Caption")
+             scale = gr.Slider(1, 3, value=1, label="Scale", step=0.25)
+             generate_button = gr.Button("Generate Image", variant="primary")
+             caption_button = gr.Button("Generate Caption", variant="secondary")
+         with gr.Column(scale=1):
+             seed = gr.Slider(0, MAX_SEED, value=42, label="Seed", step=1)
+             steps = gr.Slider(2, 16, value=8, label="Steps", step=1)
+             controlnet_conditioning_scale = gr.Slider(0, 1, value=0.6, label="ControlNet Scale")
+             guidance_scale = gr.Slider(1, 30, value=3.5, label="Guidance Scale")
+             guidance_end = gr.Slider(0, 1, value=1.0, label="Guidance End")
+     with gr.Row():
+         with gr.Accordion("Generation settings", open=False):
+             system_prompt = gr.Textbox(
+                 lines=4,
+                 value="Write a straightforward caption for this image. Begin with the main subject and medium. Mention pivotal elements—people, objects, scenery—using confident, definite language. Focus on concrete details like color, shape, texture, and spatial relationships. Show how elements interact. Omit mood and speculative wording. If text is present, quote it exactly. Note any watermarks, signatures, or compression artifacts. Never mention what's absent, resolution, or unobservable details. Vary your sentence structure and keep the description concise, without starting with 'This image is…' or similar phrasing.",
+                 label="System Prompt for Captioning",
+                 visible=True,
+             )
+             temperature_slider = gr.Slider(
+                 minimum=0.0, maximum=2.0, value=0.6, step=0.05,
+                 label="Temperature",
+                 info="Higher values make the output more random, lower values make it more deterministic.",
+                 visible=True,
+             )
+             top_p_slider = gr.Slider(
+                 minimum=0.0, maximum=1.0, value=0.9, step=0.01,
+                 label="Top-p",
+                 visible=True,
+             )
+             max_tokens_slider = gr.Slider(
+                 minimum=1, maximum=2048, value=368, step=1,
+                 label="Max New Tokens",
+                 info="Maximum number of tokens to generate. The model will stop generating if it reaches this limit.",
+                 visible=False,  # kept hidden in the UI
+             )
+             log_prompt = gr.Checkbox(value=True, label="Log", visible=False)  # hidden logging toggle
+
+     gr.Markdown("**Tips:** 8 steps is all you need!")
+
+     generate_button.click(
+         fn=process_image,
+         inputs=[
+             control_image, prompt, system_prompt, scale, steps,
+             controlnet_conditioning_scale, guidance_scale, seed,
+             guidance_end, temperature_slider, top_p_slider, max_tokens_slider, log_prompt
+         ],
+         outputs=[output_caption, generated_image]
+     )
+
+     caption_button.click(
+         fn=caption,
+         inputs=[control_image, system_prompt, temperature_slider, top_p_slider, max_tokens_slider, log_prompt],
+         outputs=output_caption,
+     )
+
+ iface.launch()
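+ # Note: Gradio 4.x enables queuing by default, which is what lets the
+ # generator functions above stream partial results; an explicit equivalent
+ # would be iface.queue().launch() (a sketch, assuming Gradio defaults).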
requirements.txt CHANGED
@@ -16,4 +16,5 @@ gradio_imageslider
  bitsandbytes
  pydantic==2.10.6
  attention_map_diffusers
- liger-kernel==0.5.9
+ liger-kernel==0.5.9
+ moondream