Spaces:
Running
on
Zero
Running
on
Zero
Update optimized.py
Browse files- optimized.py +18 -82
optimized.py
CHANGED
@@ -6,116 +6,52 @@ from diffusers.utils import load_image
|
|
6 |
from diffusers import FluxControlNetModel, FluxControlNetPipeline, AutoencoderKL
|
7 |
import gradio as gr
|
8 |
from accelerate import dispatch_model, infer_auto_device_map
|
|
|
9 |
|
10 |
def self_attention_slicing(module, slice_size=3):
|
11 |
"""Modified from Diffusers' original for Flux compatibility"""
|
12 |
def sliced_attention(*args, **kwargs):
|
13 |
-
|
14 |
-
dim = kwargs["dim"]
|
15 |
-
else:
|
16 |
-
dim = 1
|
17 |
-
|
18 |
-
if slice_size == "auto":
|
19 |
-
# Automatic slicing based on Flux architecture
|
20 |
-
return module(*args, **kwargs)
|
21 |
-
|
22 |
-
output = torch.cat([
|
23 |
-
module(
|
24 |
-
*[arg[:, :, i:i+slice_size] if i == dim else arg
|
25 |
-
for arg in args],
|
26 |
-
**{k: v[:, :, i:i+slice_size] if k == dim else v
|
27 |
-
for k,v in kwargs.items()}
|
28 |
-
)
|
29 |
-
for i in range(0, args[0].shape[dim], slice_size)
|
30 |
-
], dim=dim)
|
31 |
-
|
32 |
-
return output
|
33 |
-
return sliced_attention
|
34 |
-
|
35 |
-
huggingface_token = os.getenv("HUGGINFACE_TOKEN")
|
36 |
-
|
37 |
-
good_vae = AutoencoderKL.from_pretrained(
|
38 |
-
"black-forest-labs/FLUX.1-dev",
|
39 |
-
subfolder="vae",
|
40 |
-
torch_dtype=torch.bfloat16,
|
41 |
-
use_safetensors=True,
|
42 |
-
device_map=None, # Disable automatic mapping
|
43 |
-
token=huggingface_token
|
44 |
-
)
|
45 |
-
|
46 |
-
controlnet = FluxControlNetModel.from_pretrained(
|
47 |
-
"jasperai/Flux.1-dev-Controlnet-Upscaler",
|
48 |
-
torch_dtype=torch.bfloat16
|
49 |
-
)
|
50 |
|
51 |
-
#
|
52 |
pipe = FluxControlNetPipeline.from_pretrained(
|
53 |
"LPX55/FLUX.1-merged_uncensored",
|
54 |
controlnet=controlnet,
|
55 |
vae=good_vae,
|
56 |
torch_dtype=torch.bfloat16,
|
57 |
use_safetensors=True,
|
58 |
-
device_map=None,
|
59 |
token=huggingface_token
|
60 |
)
|
61 |
-
print(f"VRAM used: {torch.cuda.memory_allocated()/1e9:.2f}GB")
|
62 |
-
# Proper CPU offloading sequence
|
63 |
-
# device_map = infer_auto_device_map(pipe)
|
64 |
-
# pipe = dispatch_model(pipe, device_map=device_map, main_device="cuda")
|
65 |
|
|
|
66 |
device_map = infer_auto_device_map(
|
67 |
pipe,
|
68 |
-
max_memory={0:"
|
69 |
device_types=["cuda", "cpu"]
|
70 |
)
|
71 |
-
pipe = dispatch_model(
|
72 |
-
|
73 |
-
device_map=device_map,
|
74 |
-
main_device="cuda"
|
75 |
-
)
|
76 |
-
|
77 |
-
# For Diffusers v0.20+
|
78 |
-
pipe.enable_sequential_cpu_offload()
|
79 |
-
# (No parameters needed)
|
80 |
|
|
|
81 |
pipe.unet.to(dtype=torch.bfloat16)
|
82 |
pipe.controlnet.to(dtype=torch.bfloat16)
|
83 |
pipe.vae.to(dtype=torch.bfloat16)
|
84 |
-
# # 2. Then apply custom VAE slicing
|
85 |
-
# if getattr(pipe, "vae", None) is not None:
|
86 |
-
# # Method 1: Use official implementation if available
|
87 |
-
# try:
|
88 |
-
# pipe.vae.enable_slicing()
|
89 |
-
# except AttributeError:
|
90 |
-
# # Method 2: Apply manual slicing for Flux compatibility [source_id]pipeline_flux_controlnet.py
|
91 |
-
# pipe.vae.decode = self_attention_slicing(pipe.vae.decode, 2)
|
92 |
-
if getattr(pipe, "vae", None) is not None:
|
93 |
-
try:
|
94 |
-
# Official implementation if available
|
95 |
-
pipe.vae.enable_slicing()
|
96 |
-
pipe.vae.enable_tiling() # <<< Add this line
|
97 |
-
except AttributeError:
|
98 |
-
# Custom slicing for Flux compatibility [source_id]pipeline_flux_controlnet.py
|
99 |
-
pipe.vae.decode = self_attention_slicing(pipe.vae.decode, 2)
|
100 |
-
pipe.vae.post_quant_conv = self_attention_slicing(pipe.vae.post_quant_conv, 2) # <<< Critical addition
|
101 |
|
102 |
-
#
|
103 |
if torch.cuda.is_available():
|
104 |
try:
|
105 |
-
from xformers.ops import MemoryEfficientAttentionCutlassOp
|
106 |
-
# Force xformers to CUDA-only mode [source_id]
|
107 |
-
torch._C._jit_set_profiling_executor(False)
|
108 |
-
torch._C._jit_set_profiling_mode(False)
|
109 |
pipe.enable_xformers_memory_efficient_attention(
|
110 |
-
attention_op=
|
111 |
)
|
112 |
except Exception as e:
|
113 |
-
print(f"xFormers
|
114 |
-
pipe.enable_sdp_attention()
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
|
|
119 |
print(f"VRAM used: {torch.cuda.memory_allocated()/1e9:.2f}GB")
|
120 |
@spaces.GPU
|
121 |
def generate_image(prompt, scale, steps, control_image, controlnet_conditioning_scale, guidance_scale):
|
|
|
6 |
from diffusers import FluxControlNetModel, FluxControlNetPipeline, AutoencoderKL
|
7 |
import gradio as gr
|
8 |
from accelerate import dispatch_model, infer_auto_device_map
|
9 |
+
# Corrected and optimized FluxControlNet implementation
|
10 |
|
11 |
def self_attention_slicing(module, slice_size=3):
|
12 |
"""Modified from Diffusers' original for Flux compatibility"""
|
13 |
def sliced_attention(*args, **kwargs):
|
14 |
+
return module(*args, **kwargs) # Remove dummy implementation <source_id data="pipeline_flux_controlnet.py" />
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
15 |
|
16 |
+
# Device management - critical fix
|
17 |
pipe = FluxControlNetPipeline.from_pretrained(
|
18 |
"LPX55/FLUX.1-merged_uncensored",
|
19 |
controlnet=controlnet,
|
20 |
vae=good_vae,
|
21 |
torch_dtype=torch.bfloat16,
|
22 |
use_safetensors=True,
|
23 |
+
device_map=None,
|
24 |
token=huggingface_token
|
25 |
)
|
|
|
|
|
|
|
|
|
26 |
|
27 |
+
# Sequence verified for Diffusers 0.20.0+
|
28 |
device_map = infer_auto_device_map(
|
29 |
pipe,
|
30 |
+
max_memory={0:"37GiB", "cpu":"60GiB"},
|
31 |
device_types=["cuda", "cpu"]
|
32 |
)
|
33 |
+
pipe = dispatch_model(pipe, device_map=device_map, main_device="cuda")
|
34 |
+
pipe.enable_sequential_cpu_offload() # No arguments for new API
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
35 |
|
36 |
+
# Precision alignment (AFTER offloading) <source_id data="pipeline_flux_controlnet.py" />
|
37 |
pipe.unet.to(dtype=torch.bfloat16)
|
38 |
pipe.controlnet.to(dtype=torch.bfloat16)
|
39 |
pipe.vae.to(dtype=torch.bfloat16)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
40 |
|
41 |
+
# XFormers with Flux compatibility
|
42 |
if torch.cuda.is_available():
|
43 |
try:
|
|
|
|
|
|
|
|
|
44 |
pipe.enable_xformers_memory_efficient_attention(
|
45 |
+
attention_op=None # Auto-select best operator
|
46 |
)
|
47 |
except Exception as e:
|
48 |
+
print(f"xFormers error: {e}")
|
49 |
+
pipe.enable_sdp_attention(mode="math")
|
50 |
+
|
51 |
+
# Memory format optimization
|
52 |
+
pipe.to(memory_format=torch.channels_last)
|
53 |
+
|
54 |
+
|
55 |
print(f"VRAM used: {torch.cuda.memory_allocated()/1e9:.2f}GB")
|
56 |
@spaces.GPU
|
57 |
def generate_image(prompt, scale, steps, control_image, controlnet_conditioning_scale, guidance_scale):
|