# Page banner captured with this file (not code): "Spaces: Running on Zero"
import nodes | |
import node_helpers | |
import torch | |
import comfy.model_management | |
import comfy.utils | |
class WanImageToVideo:
    """Node that prepares conditioning and an empty latent for image-to-video.

    ``encode`` allocates a zero-filled video latent and, when a start image is
    supplied, VAE-encodes a padded frame stack and attaches it (with a frame
    mask) to both conditionings. An optional CLIP-vision output is attached to
    both conditionings as well.
    """

    @classmethod
    def INPUT_TYPES(s):
        """Return the node's input specification.

        Declared as a classmethod so it can be called on the class itself
        (``WanImageToVideo.INPUT_TYPES()``) without creating an instance.
        """
        return {
            "required": {
                "positive": ("CONDITIONING", ),
                "negative": ("CONDITIONING", ),
                "vae": ("VAE", ),
                "width": ("INT", {"default": 832, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
                "height": ("INT", {"default": 480, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
                "length": ("INT", {"default": 81, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 4}),
                "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}),
            },
            "optional": {
                "clip_vision_output": ("CLIP_VISION_OUTPUT", ),
                "start_image": ("IMAGE", ),
            },
        }

    RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT")
    RETURN_NAMES = ("positive", "negative", "latent")
    FUNCTION = "encode"

    CATEGORY = "conditioning/video_models"

    def encode(self, positive, negative, vae, width, height, length, batch_size, start_image=None, clip_vision_output=None):
        """Build the conditionings and the empty latent for sampling.

        Args:
            positive: Positive conditioning; extended via
                ``node_helpers.conditioning_set_values``.
            negative: Negative conditioning; extended the same way.
            vae: Object whose ``encode`` method turns the frame stack into a
                latent.
            width: Target frame width in pixels.
            height: Target frame height in pixels.
            length: Number of video frames.
            batch_size: Size of the latent's first dimension.
            start_image: Optional image tensor with channels in the last
                dimension; at most ``length`` frames are used.
                NOTE(review): presumably (frames, H, W, C) -- confirm.
            clip_vision_output: Optional value stored under the
                ``"clip_vision_output"`` key of both conditionings.

        Returns:
            Tuple ``(positive, negative, {"samples": latent})`` where
            ``latent`` is a zero tensor of shape
            ``[batch_size, 16, (length - 1) // 4 + 1, height // 8, width // 8]``.
        """
        latent_channels = 16
        temporal_stride = 4  # one latent frame per 4 video frames, plus the first
        spatial_stride = 8   # latent is 1/8 of the pixel resolution per side

        latent = torch.zeros(
            [
                batch_size,
                latent_channels,
                ((length - 1) // temporal_stride) + 1,
                height // spatial_stride,
                width // spatial_stride,
            ],
            device=comfy.model_management.intermediate_device(),
        )

        if start_image is not None:
            # Resize with channels moved to dim 1 for the upscaler, then move
            # them back to the last dimension.
            start_image = comfy.utils.common_upscale(
                start_image[:length].movedim(-1, 1), width, height, "bilinear", "center"
            ).movedim(1, -1)

            # Frame stack filled with 0.5; the leading frames are overwritten
            # by the provided image(s).
            # NOTE(review): 0.5 looks like a neutral gray for 0..1 images -- confirm.
            image = torch.ones(
                (length, height, width, start_image.shape[-1]),
                device=start_image.device,
                dtype=start_image.dtype,
            ) * 0.5
            image[:start_image.shape[0]] = start_image

            # Only the first three channels are encoded (drops e.g. alpha).
            concat_latent_image = vae.encode(image[:, :, :, :3])

            # Mask over latent frames: 0.0 on the frames covered by
            # start_image, 1.0 on the remaining ones.
            mask = torch.ones(
                (1, 1, latent.shape[2], concat_latent_image.shape[-2], concat_latent_image.shape[-1]),
                device=start_image.device,
                dtype=start_image.dtype,
            )
            mask[:, :, :((start_image.shape[0] - 1) // temporal_stride) + 1] = 0.0

            positive = node_helpers.conditioning_set_values(positive, {"concat_latent_image": concat_latent_image, "concat_mask": mask})
            negative = node_helpers.conditioning_set_values(negative, {"concat_latent_image": concat_latent_image, "concat_mask": mask})

        if clip_vision_output is not None:
            positive = node_helpers.conditioning_set_values(positive, {"clip_vision_output": clip_vision_output})
            negative = node_helpers.conditioning_set_values(negative, {"clip_vision_output": clip_vision_output})

        out_latent = {"samples": latent}
        return (positive, negative, out_latent)
# Maps each node identifier string to the class implementing it.
NODE_CLASS_MAPPINGS = {"WanImageToVideo": WanImageToVideo}