diff --git a/.gitignore b/.gitignore
index 0850089305657382152fbd25dd190ed3307c33c9..9bc430eb447ad26c675cffeccb4ac7fc1f804741 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,6 +7,8 @@ __pycache__/
 venv/
 env/
 .venv/
+build/
+dist/
 
 # Jupyter Notebook checkpoints
 .ipynb_checkpoints/
@@ -25,4 +27,4 @@ env/
 .vscode/
 
 # Hugging Face cache (optional)
-/content/huggingface/
+~/.cache/huggingface/
diff --git a/dist/hy3dgen-2.0.0-py3.12.egg b/dist/hy3dgen-2.0.0-py3.12.egg
deleted file mode 100644
index 31ccfc3573626346a5da66f2afb8405d256c18db..0000000000000000000000000000000000000000
Binary files a/dist/hy3dgen-2.0.0-py3.12.egg and /dev/null differ
diff --git a/hy3dgen/__init__.py b/hy3dgen/__init__.py
deleted file mode 100644
index e307c3f8c1292da02f308e4b59ef0bcd6fe7305e..0000000000000000000000000000000000000000
--- a/hy3dgen/__init__.py
+++ /dev/null
@@ -1,23 +0,0 @@
-# Open Source Model Licensed under the Apache License Version 2.0
-# and Other Licenses of the Third-Party Components therein:
-# The below Model in this distribution may have been modified by THL A29 Limited
-# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
-
-# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
-# The below software and/or models in this distribution may have been
-# modified by THL A29 Limited ("Tencent Modifications").
-# All Tencent Modifications are Copyright (C) THL A29 Limited.
-
-# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
-# except for the third-party components listed below.
-# Hunyuan 3D does not impose any additional limitations beyond what is outlined
-# in the repsective licenses of these third-party components.
-# Users must comply with all terms and conditions of original licenses of these third-party
-# components and must ensure that the usage of the third party components adheres to
-# all relevant laws and regulations.
-
-# For avoidance of doubts, Hunyuan 3D means the large language models and
-# their software and algorithms, including trained model weights, parameters (including
-# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
-# fine-tuning enabling code and other elements of the foregoing made publicly available
-# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
diff --git a/hy3dgen/rembg.py b/hy3dgen/rembg.py
deleted file mode 100644
index c0d99483c8354fc10c6689b5cf12ebcd44368d92..0000000000000000000000000000000000000000
--- a/hy3dgen/rembg.py
+++ /dev/null
@@ -1,36 +0,0 @@
-# Open Source Model Licensed under the Apache License Version 2.0
-# and Other Licenses of the Third-Party Components therein:
-# The below Model in this distribution may have been modified by THL A29 Limited
-# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
-
-# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
-# The below software and/or models in this distribution may have been
-# modified by THL A29 Limited ("Tencent Modifications").
-# All Tencent Modifications are Copyright (C) THL A29 Limited.
-
-# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
-# except for the third-party components listed below.
-# Hunyuan 3D does not impose any additional limitations beyond what is outlined
-# in the repsective licenses of these third-party components.
-# Users must comply with all terms and conditions of original licenses of these third-party
-# components and must ensure that the usage of the third party components adheres to
-# all relevant laws and regulations.
-
-# For avoidance of doubts, Hunyuan 3D means the large language models and
-# their software and algorithms, including trained model weights, parameters (including
-# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
-# fine-tuning enabling code and other elements of the foregoing made publicly available
-# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
-
-
-from PIL import Image
-from rembg import remove, new_session
-
-
-class BackgroundRemover():
-    def __init__(self):
-        self.session = new_session()
-
-    def __call__(self, image: Image.Image):
-        output = remove(image, session=self.session, bgcolor=[255, 255, 255, 0])
-        return output
diff --git a/hy3dgen/shapegen/__init__.py b/hy3dgen/shapegen/__init__.py
deleted file mode 100644
index d1f9534c15d029511d910d29e45da5ba7b8c8714..0000000000000000000000000000000000000000
--- a/hy3dgen/shapegen/__init__.py
+++ /dev/null
@@ -1,27 +0,0 @@
-# Open Source Model Licensed under the Apache License Version 2.0
-# and Other Licenses of the Third-Party Components therein:
-# The below Model in this distribution may have been modified by THL A29 Limited
-# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
-
-# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
-# The below software and/or models in this distribution may have been
-# modified by THL A29 Limited ("Tencent Modifications").
-# All Tencent Modifications are Copyright (C) THL A29 Limited.
-
-# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
-# except for the third-party components listed below.
-# Hunyuan 3D does not impose any additional limitations beyond what is outlined
-# in the repsective licenses of these third-party components.
-# Users must comply with all terms and conditions of original licenses of these third-party
-# components and must ensure that the usage of the third party components adheres to
-# all relevant laws and regulations.
-
-# For avoidance of doubts, Hunyuan 3D means the large language models and
-# their software and algorithms, including trained model weights, parameters (including
-# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
-# fine-tuning enabling code and other elements of the foregoing made publicly available
-# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
-
-from .pipelines import Hunyuan3DDiTPipeline, Hunyuan3DDiTFlowMatchingPipeline
-from .postprocessors import FaceReducer, FloaterRemover, DegenerateFaceRemover
-from .preprocessors import ImageProcessorV2, IMAGE_PROCESSORS, DEFAULT_IMAGEPROCESSOR
diff --git a/hy3dgen/shapegen/models/__init__.py b/hy3dgen/shapegen/models/__init__.py
deleted file mode 100644
index 684b3e389737fb988f5e363e777c34f6cd1fe4ea..0000000000000000000000000000000000000000
--- a/hy3dgen/shapegen/models/__init__.py
+++ /dev/null
@@ -1,28 +0,0 @@
-# Open Source Model Licensed under the Apache License Version 2.0
-# and Other Licenses of the Third-Party Components therein:
-# The below Model in this distribution may have been modified by THL A29 Limited
-# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
-
-# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
-# The below software and/or models in this distribution may have been
-# modified by THL A29 Limited ("Tencent Modifications").
-# All Tencent Modifications are Copyright (C) THL A29 Limited.
-
-# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
-# except for the third-party components listed below.
-# Hunyuan 3D does not impose any additional limitations beyond what is outlined
-# in the repsective licenses of these third-party components.
-# Users must comply with all terms and conditions of original licenses of these third-party
-# components and must ensure that the usage of the third party components adheres to
-# all relevant laws and regulations.
-
-# For avoidance of doubts, Hunyuan 3D means the large language models and
-# their software and algorithms, including trained model weights, parameters (including
-# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
-# fine-tuning enabling code and other elements of the foregoing made publicly available
-# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
-
-
-from .conditioner import DualImageEncoder, SingleImageEncoder, DinoImageEncoder, CLIPImageEncoder
-from .hunyuan3ddit import Hunyuan3DDiT
-from .vae import ShapeVAE
diff --git a/hy3dgen/shapegen/models/conditioner.py b/hy3dgen/shapegen/models/conditioner.py
deleted file mode 100644
index 1af4c0cc440a193167c0837621c3494242b95f3d..0000000000000000000000000000000000000000
--- a/hy3dgen/shapegen/models/conditioner.py
+++ /dev/null
@@ -1,165 +0,0 @@
-# Open Source Model Licensed under the Apache License Version 2.0
-# and Other Licenses of the Third-Party Components therein:
-# The below Model in this distribution may have been modified by THL A29 Limited
-# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
-
-# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
-# The below software and/or models in this distribution may have been
-# modified by THL A29 Limited ("Tencent Modifications").
-# All Tencent Modifications are Copyright (C) THL A29 Limited.
-
-# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
-# except for the third-party components listed below.
-# Hunyuan 3D does not impose any additional limitations beyond what is outlined
-# in the repsective licenses of these third-party components.
-# Users must comply with all terms and conditions of original licenses of these third-party
-# components and must ensure that the usage of the third party components adheres to
-# all relevant laws and regulations.
-
-# For avoidance of doubts, Hunyuan 3D means the large language models and
-# their software and algorithms, including trained model weights, parameters (including
-# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
-# fine-tuning enabling code and other elements of the foregoing made publicly available
-# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
-
-import torch
-import torch.nn as nn
-from torchvision import transforms
-from transformers import (
-    CLIPVisionModelWithProjection,
-    CLIPVisionConfig,
-    Dinov2Model,
-    Dinov2Config,
-)
-
-
-class ImageEncoder(nn.Module):
-    def __init__(
-        self,
-        version=None,
-        config=None,
-        use_cls_token=True,
-        image_size=224,
-        **kwargs,
-    ):
-        super().__init__()
-
-        if config is None:
-            self.model = self.MODEL_CLASS.from_pretrained(version)
-        else:
-            self.model = self.MODEL_CLASS(self.MODEL_CONFIG_CLASS.from_dict(config))
-        self.model.eval()
-        self.model.requires_grad_(False)
-        self.use_cls_token = use_cls_token
-        self.size = image_size // 14
-        self.num_patches = (image_size // 14) ** 2
-        if self.use_cls_token:
-            self.num_patches += 1
-
-        self.transform = transforms.Compose(
-            [
-                transforms.Resize(image_size, transforms.InterpolationMode.BILINEAR, antialias=True),
-                transforms.CenterCrop(image_size),
-                transforms.Normalize(
-                    mean=self.mean,
-                    std=self.std,
-                ),
-            ]
-        )
-
-    def forward(self, image, mask=None, value_range=(-1, 1)):
-        if value_range is not None:
-            low, high = value_range
-            image = (image - low) / (high - low)
-
-        image = image.to(self.model.device, dtype=self.model.dtype)
-        inputs = self.transform(image)
-        outputs = self.model(inputs)
-
-        last_hidden_state = outputs.last_hidden_state
-        if not self.use_cls_token:
-            last_hidden_state = last_hidden_state[:, 1:, :]
-
-        return last_hidden_state
-
-    def unconditional_embedding(self, batch_size):
-        device = next(self.model.parameters()).device
-        dtype = next(self.model.parameters()).dtype
-        zero = torch.zeros(
-            batch_size,
-            self.num_patches,
-            self.model.config.hidden_size,
-            device=device,
-            dtype=dtype,
-        )
-
-        return zero
-
-
-class CLIPImageEncoder(ImageEncoder):
-    MODEL_CLASS = CLIPVisionModelWithProjection
-    MODEL_CONFIG_CLASS = CLIPVisionConfig
-    mean = [0.48145466, 0.4578275, 0.40821073]
-    std = [0.26862954, 0.26130258, 0.27577711]
-
-
-class DinoImageEncoder(ImageEncoder):
-    MODEL_CLASS = Dinov2Model
-    MODEL_CONFIG_CLASS = Dinov2Config
-    mean = [0.485, 0.456, 0.406]
-    std = [0.229, 0.224, 0.225]
-
-
-def build_image_encoder(config):
-    if config['type'] == 'CLIPImageEncoder':
-        return CLIPImageEncoder(**config['kwargs'])
-    elif config['type'] == 'DinoImageEncoder':
-        return DinoImageEncoder(**config['kwargs'])
-    else:
-        raise ValueError(f'Unknown image encoder type: {config["type"]}')
-
-
-class DualImageEncoder(nn.Module):
-    def __init__(
-        self,
-        main_image_encoder,
-        additional_image_encoder,
-    ):
-        super().__init__()
-        self.main_image_encoder = build_image_encoder(main_image_encoder)
-        self.additional_image_encoder = build_image_encoder(additional_image_encoder)
-
-    def forward(self, image, mask=None):
-        outputs = {
-            'main': self.main_image_encoder(image, mask=mask),
-            'additional': self.additional_image_encoder(image, mask=mask),
-        }
-        return outputs
-
-    def unconditional_embedding(self, batch_size):
-        outputs = {
-            'main': self.main_image_encoder.unconditional_embedding(batch_size),
-            'additional': self.additional_image_encoder.unconditional_embedding(batch_size),
-        }
-        return outputs
-
-
-class SingleImageEncoder(nn.Module):
-    def __init__(
-        self,
-        main_image_encoder,
-    ):
-        super().__init__()
-        self.main_image_encoder = build_image_encoder(main_image_encoder)
-
-    def forward(self, image, mask=None):
-        outputs = {
-            'main': self.main_image_encoder(image, mask=mask),
-        }
-        return outputs
-
-    def unconditional_embedding(self, batch_size):
-        outputs = {
-            'main': self.main_image_encoder.unconditional_embedding(batch_size),
-        }
-        return outputs
diff --git a/hy3dgen/shapegen/models/hunyuan3ddit.py b/hy3dgen/shapegen/models/hunyuan3ddit.py
deleted file mode 100644
index d1c778666890cb13538eba15460cf0c05c7f9130..0000000000000000000000000000000000000000
--- a/hy3dgen/shapegen/models/hunyuan3ddit.py
+++ /dev/null
@@ -1,390 +0,0 @@
-# Open Source Model Licensed under the Apache License Version 2.0
-# and Other Licenses of the Third-Party Components therein:
-# The below Model in this distribution may have been modified by THL A29 Limited
-# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
-
-# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
-# The below software and/or models in this distribution may have been
-# modified by THL A29 Limited ("Tencent Modifications").
-# All Tencent Modifications are Copyright (C) THL A29 Limited.
-
-# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
-# except for the third-party components listed below.
-# Hunyuan 3D does not impose any additional limitations beyond what is outlined
-# in the repsective licenses of these third-party components.
-# Users must comply with all terms and conditions of original licenses of these third-party
-# components and must ensure that the usage of the third party components adheres to
-# all relevant laws and regulations.
-
-# For avoidance of doubts, Hunyuan 3D means the large language models and
-# their software and algorithms, including trained model weights, parameters (including
-# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
-# fine-tuning enabling code and other elements of the foregoing made publicly available
-# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
-
-import math
-from dataclasses import dataclass
-from typing import List, Tuple, Optional
-
-import torch
-from einops import rearrange
-from torch import Tensor, nn
-
-
-def attention(q: Tensor, k: Tensor, v: Tensor, **kwargs) -> Tensor:
-    x = torch.nn.functional.scaled_dot_product_attention(q, k, v)
-    x = rearrange(x, "B H L D -> B L (H D)")
-    return x
-
-
-def timestep_embedding(t: Tensor, dim, max_period=10000, time_factor: float = 1000.0):
-    """
-    Create sinusoidal timestep embeddings.
-    :param t: a 1-D Tensor of N indices, one per batch element.
-        These may be fractional.
-    :param dim: the dimension of the output.
-    :param max_period: controls the minimum frequency of the embeddings.
-    :return: an (N, D) Tensor of positional embeddings.
- """ - t = time_factor * t - half = dim // 2 - freqs = torch.exp(-math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half).to( - t.device - ) - - args = t[:, None].float() * freqs[None] - embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1) - if dim % 2: - embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1) - if torch.is_floating_point(t): - embedding = embedding.to(t) - return embedding - - -class MLPEmbedder(nn.Module): - def __init__(self, in_dim: int, hidden_dim: int): - super().__init__() - self.in_layer = nn.Linear(in_dim, hidden_dim, bias=True) - self.silu = nn.SiLU() - self.out_layer = nn.Linear(hidden_dim, hidden_dim, bias=True) - - def forward(self, x: Tensor) -> Tensor: - return self.out_layer(self.silu(self.in_layer(x))) - - -class RMSNorm(torch.nn.Module): - def __init__(self, dim: int): - super().__init__() - self.scale = nn.Parameter(torch.ones(dim)) - - def forward(self, x: Tensor): - x_dtype = x.dtype - x = x.float() - rrms = torch.rsqrt(torch.mean(x ** 2, dim=-1, keepdim=True) + 1e-6) - return (x * rrms).to(dtype=x_dtype) * self.scale - - -class QKNorm(torch.nn.Module): - def __init__(self, dim: int): - super().__init__() - self.query_norm = RMSNorm(dim) - self.key_norm = RMSNorm(dim) - - def forward(self, q: Tensor, k: Tensor, v: Tensor) -> Tuple[Tensor, Tensor]: - q = self.query_norm(q) - k = self.key_norm(k) - return q.to(v), k.to(v) - - -class SelfAttention(nn.Module): - def __init__( - self, - dim: int, - num_heads: int = 8, - qkv_bias: bool = False, - ): - super().__init__() - self.num_heads = num_heads - head_dim = dim // num_heads - - self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) - self.norm = QKNorm(head_dim) - self.proj = nn.Linear(dim, dim) - - def forward(self, x: Tensor, pe: Tensor) -> Tensor: - qkv = self.qkv(x) - q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads) - q, k = self.norm(q, k, v) - x = attention(q, k, v, pe=pe) - x = self.proj(x) - return x - - -@dataclass -class ModulationOut: - shift: Tensor - scale: Tensor - gate: Tensor - - -class Modulation(nn.Module): - def __init__(self, dim: int, double: bool): - super().__init__() - self.is_double = double - self.multiplier = 6 if double else 3 - self.lin = nn.Linear(dim, self.multiplier * dim, bias=True) - - def forward(self, vec: Tensor) -> Tuple[ModulationOut, Optional[ModulationOut]]: - out = self.lin(nn.functional.silu(vec))[:, None, :] - out = out.chunk(self.multiplier, dim=-1) - - return ( - ModulationOut(*out[:3]), - ModulationOut(*out[3:]) if self.is_double else None, - ) - - -class DoubleStreamBlock(nn.Module): - def __init__( - self, - hidden_size: int, - num_heads: int, - mlp_ratio: float, - qkv_bias: bool = False, - ): - super().__init__() - mlp_hidden_dim = int(hidden_size * mlp_ratio) - self.num_heads = num_heads - self.hidden_size = hidden_size - self.img_mod = Modulation(hidden_size, double=True) - self.img_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) - self.img_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias) - - self.img_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) - self.img_mlp = nn.Sequential( - nn.Linear(hidden_size, mlp_hidden_dim, bias=True), - nn.GELU(approximate="tanh"), - nn.Linear(mlp_hidden_dim, hidden_size, bias=True), - ) - - self.txt_mod = Modulation(hidden_size, double=True) - self.txt_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) - self.txt_attn = 
SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias) - - self.txt_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) - self.txt_mlp = nn.Sequential( - nn.Linear(hidden_size, mlp_hidden_dim, bias=True), - nn.GELU(approximate="tanh"), - nn.Linear(mlp_hidden_dim, hidden_size, bias=True), - ) - - def forward(self, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor) -> Tuple[Tensor, Tensor]: - img_mod1, img_mod2 = self.img_mod(vec) - txt_mod1, txt_mod2 = self.txt_mod(vec) - - img_modulated = self.img_norm1(img) - img_modulated = (1 + img_mod1.scale) * img_modulated + img_mod1.shift - img_qkv = self.img_attn.qkv(img_modulated) - img_q, img_k, img_v = rearrange(img_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads) - img_q, img_k = self.img_attn.norm(img_q, img_k, img_v) - - txt_modulated = self.txt_norm1(txt) - txt_modulated = (1 + txt_mod1.scale) * txt_modulated + txt_mod1.shift - txt_qkv = self.txt_attn.qkv(txt_modulated) - txt_q, txt_k, txt_v = rearrange(txt_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads) - txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k, txt_v) - - q = torch.cat((txt_q, img_q), dim=2) - k = torch.cat((txt_k, img_k), dim=2) - v = torch.cat((txt_v, img_v), dim=2) - - attn = attention(q, k, v, pe=pe) - txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1]:] - - img = img + img_mod1.gate * self.img_attn.proj(img_attn) - img = img + img_mod2.gate * self.img_mlp((1 + img_mod2.scale) * self.img_norm2(img) + img_mod2.shift) - - txt = txt + txt_mod1.gate * self.txt_attn.proj(txt_attn) - txt = txt + txt_mod2.gate * self.txt_mlp((1 + txt_mod2.scale) * self.txt_norm2(txt) + txt_mod2.shift) - return img, txt - - -class SingleStreamBlock(nn.Module): - """ - A DiT block with parallel linear layers as described in - https://arxiv.org/abs/2302.05442 and adapted modulation interface. 
- """ - - def __init__( - self, - hidden_size: int, - num_heads: int, - mlp_ratio: float = 4.0, - qk_scale: Optional[float] = None, - ): - super().__init__() - - self.hidden_dim = hidden_size - self.num_heads = num_heads - head_dim = hidden_size // num_heads - self.scale = qk_scale or head_dim ** -0.5 - - self.mlp_hidden_dim = int(hidden_size * mlp_ratio) - # qkv and mlp_in - self.linear1 = nn.Linear(hidden_size, hidden_size * 3 + self.mlp_hidden_dim) - # proj and mlp_out - self.linear2 = nn.Linear(hidden_size + self.mlp_hidden_dim, hidden_size) - - self.norm = QKNorm(head_dim) - - self.hidden_size = hidden_size - self.pre_norm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) - - self.mlp_act = nn.GELU(approximate="tanh") - self.modulation = Modulation(hidden_size, double=False) - - def forward(self, x: Tensor, vec: Tensor, pe: Tensor) -> Tensor: - mod, _ = self.modulation(vec) - - x_mod = (1 + mod.scale) * self.pre_norm(x) + mod.shift - qkv, mlp = torch.split(self.linear1(x_mod), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1) - - q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads) - q, k = self.norm(q, k, v) - - # compute attention - attn = attention(q, k, v, pe=pe) - # compute activation in mlp stream, cat again and run second linear layer - output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2)) - return x + mod.gate * output - - -class LastLayer(nn.Module): - def __init__(self, hidden_size: int, patch_size: int, out_channels: int): - super().__init__() - self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) - self.linear = nn.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True) - self.adaLN_modulation = nn.Sequential(nn.SiLU(), nn.Linear(hidden_size, 2 * hidden_size, bias=True)) - - def forward(self, x: Tensor, vec: Tensor) -> Tensor: - shift, scale = self.adaLN_modulation(vec).chunk(2, dim=1) - x = (1 + scale[:, None, :]) * self.norm_final(x) + shift[:, None, :] - x = self.linear(x) - return x - - -class Hunyuan3DDiT(nn.Module): - def __init__( - self, - in_channels: int = 64, - context_in_dim: int = 1536, - hidden_size: int = 1024, - mlp_ratio: float = 4.0, - num_heads: int = 16, - depth: int = 16, - depth_single_blocks: int = 32, - axes_dim: List[int] = [64], - theta: int = 10_000, - qkv_bias: bool = True, - time_factor: float = 1000, - ckpt_path: Optional[str] = None, - **kwargs, - ): - super().__init__() - self.in_channels = in_channels - self.context_in_dim = context_in_dim - self.hidden_size = hidden_size - self.mlp_ratio = mlp_ratio - self.num_heads = num_heads - self.depth = depth - self.depth_single_blocks = depth_single_blocks - self.axes_dim = axes_dim - self.theta = theta - self.qkv_bias = qkv_bias - self.time_factor = time_factor - self.out_channels = self.in_channels - - if hidden_size % num_heads != 0: - raise ValueError( - f"Hidden size {hidden_size} must be divisible by num_heads {num_heads}" - ) - pe_dim = hidden_size // num_heads - if sum(axes_dim) != pe_dim: - raise ValueError(f"Got {axes_dim} but expected positional dim {pe_dim}") - self.hidden_size = hidden_size - self.num_heads = num_heads - self.latent_in = nn.Linear(self.in_channels, self.hidden_size, bias=True) - self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size) - self.cond_in = nn.Linear(context_in_dim, self.hidden_size) - - self.double_blocks = nn.ModuleList( - [ - DoubleStreamBlock( - self.hidden_size, - self.num_heads, - mlp_ratio=mlp_ratio, - qkv_bias=qkv_bias, - ) - for _ in range(depth) - ] 
-        )
-
-        self.single_blocks = nn.ModuleList(
-            [
-                SingleStreamBlock(
-                    self.hidden_size,
-                    self.num_heads,
-                    mlp_ratio=mlp_ratio,
-                )
-                for _ in range(depth_single_blocks)
-            ]
-        )
-
-        self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels)
-
-        if ckpt_path is not None:
-            print('restored denoiser ckpt', ckpt_path)
-
-            ckpt = torch.load(ckpt_path, map_location="cpu")
-            if 'state_dict' not in ckpt:
-                # deepspeed ckpt
-                state_dict = {}
-                for k in ckpt.keys():
-                    new_k = k.replace('_forward_module.', '')
-                    state_dict[new_k] = ckpt[k]
-            else:
-                state_dict = ckpt["state_dict"]
-
-            final_state_dict = {}
-            for k, v in state_dict.items():
-                if k.startswith('model.'):
-                    final_state_dict[k.replace('model.', '')] = v
-                else:
-                    final_state_dict[k] = v
-            missing, unexpected = self.load_state_dict(final_state_dict, strict=False)
-            print('unexpected keys:', unexpected)
-            print('missing keys:', missing)
-
-    def forward(
-        self,
-        x,
-        t,
-        contexts,
-        **kwargs,
-    ) -> Tensor:
-        cond = contexts['main']
-        latent = self.latent_in(x)
-        vec = self.time_in(timestep_embedding(t, 256, self.time_factor).to(dtype=latent.dtype))
-        cond = self.cond_in(cond)
-        pe = None
-
-        for block in self.double_blocks:
-            latent, cond = block(img=latent, txt=cond, vec=vec, pe=pe)
-
-        latent = torch.cat((cond, latent), 1)
-        for block in self.single_blocks:
-            latent = block(latent, vec=vec, pe=pe)
-
-        latent = latent[:, cond.shape[1]:, ...]
-        latent = self.final_layer(latent, vec)
-        return latent
diff --git a/hy3dgen/shapegen/models/vae.py b/hy3dgen/shapegen/models/vae.py
deleted file mode 100644
index aef2784ac0db653714e711d12697eafc962c2aa3..0000000000000000000000000000000000000000
--- a/hy3dgen/shapegen/models/vae.py
+++ /dev/null
@@ -1,636 +0,0 @@
-# Open Source Model Licensed under the Apache License Version 2.0
-# and Other Licenses of the Third-Party Components therein:
-# The below Model in this distribution may have been modified by THL A29 Limited
-# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
-
-# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
-# The below software and/or models in this distribution may have been
-# modified by THL A29 Limited ("Tencent Modifications").
-# All Tencent Modifications are Copyright (C) THL A29 Limited.
-
-# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
-# except for the third-party components listed below.
-# Hunyuan 3D does not impose any additional limitations beyond what is outlined
-# in the repsective licenses of these third-party components.
-# Users must comply with all terms and conditions of original licenses of these third-party
-# components and must ensure that the usage of the third party components adheres to
-# all relevant laws and regulations.
-
-# For avoidance of doubts, Hunyuan 3D means the large language models and
-# their software and algorithms, including trained model weights, parameters (including
-# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
-# fine-tuning enabling code and other elements of the foregoing made publicly available
-# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
-
-from typing import Tuple, List, Union, Optional
-
-import numpy as np
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from einops import rearrange, repeat
-from skimage import measure
-from tqdm import tqdm
-
-
-class FourierEmbedder(nn.Module):
-    """The sin/cosine positional embedding. Given an input tensor `x` of shape [n_batch, ..., c_dim], it converts
-    each feature dimension of `x[..., i]` into:
-        [
-            sin(x[..., i]),
-            sin(f_1*x[..., i]),
-            sin(f_2*x[..., i]),
-            ...
-            sin(f_N * x[..., i]),
-            cos(x[..., i]),
-            cos(f_1*x[..., i]),
-            cos(f_2*x[..., i]),
-            ...
-            cos(f_N * x[..., i]),
-            x[..., i]     # only present if include_input is True.
-        ], here f_i is the frequency.
-
-    Denote the space is [0 / num_freqs, 1 / num_freqs, 2 / num_freqs, 3 / num_freqs, ..., (num_freqs - 1) / num_freqs].
-    If logspace is True, then the frequency f_i is [2^(0 / num_freqs), ..., 2^(i / num_freqs), ...];
-    Otherwise, the frequencies are linearly spaced between [1.0, 2^(num_freqs - 1)].
-
-    Args:
-        num_freqs (int): the number of frequencies, default is 6;
-        logspace (bool): If logspace is True, then the frequency f_i is [..., 2^(i / num_freqs), ...],
-            otherwise, the frequencies are linearly spaced between [1.0, 2^(num_freqs - 1)];
-        input_dim (int): the input dimension, default is 3;
-        include_input (bool): include the input tensor or not, default is True.
-
-    Attributes:
-        frequencies (torch.Tensor): If logspace is True, then the frequency f_i is [..., 2^(i / num_freqs), ...],
-            otherwise, the frequencies are linearly spaced between [1.0, 2^(num_freqs - 1);
-
-        out_dim (int): the embedding size, if include_input is True, it is input_dim * (num_freqs * 2 + 1),
-            otherwise, it is input_dim * num_freqs * 2.
-
-    """
-
-    def __init__(self,
-                 num_freqs: int = 6,
-                 logspace: bool = True,
-                 input_dim: int = 3,
-                 include_input: bool = True,
-                 include_pi: bool = True) -> None:
-
-        """The initialization"""
-
-        super().__init__()
-
-        if logspace:
-            frequencies = 2.0 ** torch.arange(
-                num_freqs,
-                dtype=torch.float32
-            )
-        else:
-            frequencies = torch.linspace(
-                1.0,
-                2.0 ** (num_freqs - 1),
-                num_freqs,
-                dtype=torch.float32
-            )
-
-        if include_pi:
-            frequencies *= torch.pi
-
-        self.register_buffer("frequencies", frequencies, persistent=False)
-        self.include_input = include_input
-        self.num_freqs = num_freqs
-
-        self.out_dim = self.get_dims(input_dim)
-
-    def get_dims(self, input_dim):
-        temp = 1 if self.include_input or self.num_freqs == 0 else 0
-        out_dim = input_dim * (self.num_freqs * 2 + temp)
-
-        return out_dim
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        """ Forward process.
-
-        Args:
-            x: tensor of shape [..., dim]
-
-        Returns:
-            embedding: an embedding of `x` of shape [..., dim * (num_freqs * 2 + temp)]
-                where temp is 1 if include_input is True and 0 otherwise.
-        """
-
-        if self.num_freqs > 0:
-            embed = (x[..., None].contiguous() * self.frequencies).view(*x.shape[:-1], -1)
-            if self.include_input:
-                return torch.cat((x, embed.sin(), embed.cos()), dim=-1)
-            else:
-                return torch.cat((embed.sin(), embed.cos()), dim=-1)
-        else:
-            return x
-
-
-class DropPath(nn.Module):
-    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
-    """
-
-    def __init__(self, drop_prob: float = 0., scale_by_keep: bool = True):
-        super(DropPath, self).__init__()
-        self.drop_prob = drop_prob
-        self.scale_by_keep = scale_by_keep
-
-    def forward(self, x):
-        """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
-
-        This is the same as the DropConnect impl I created for EfficientNet, etc networks, however,
-        the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
-        See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for
-        changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
-        'survival rate' as the argument.
-
-        """
-        if self.drop_prob == 0. or not self.training:
-            return x
-        keep_prob = 1 - self.drop_prob
-        shape = (x.shape[0],) + (1,) * (x.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
-        random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
-        if keep_prob > 0.0 and self.scale_by_keep:
-            random_tensor.div_(keep_prob)
-        return x * random_tensor
-
-    def extra_repr(self):
-        return f'drop_prob={round(self.drop_prob, 3):0.3f}'
-
-
-class MLP(nn.Module):
-    def __init__(
-        self, *,
-        width: int,
-        output_width: int = None,
-        drop_path_rate: float = 0.0
-    ):
-        super().__init__()
-        self.width = width
-        self.c_fc = nn.Linear(width, width * 4)
-        self.c_proj = nn.Linear(width * 4, output_width if output_width is not None else width)
-        self.gelu = nn.GELU()
-        self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
-
-    def forward(self, x):
-        return self.drop_path(self.c_proj(self.gelu(self.c_fc(x))))
-
-
-class QKVMultiheadCrossAttention(nn.Module):
-    def __init__(
-        self,
-        *,
-        heads: int,
-        n_data: Optional[int] = None,
-        width=None,
-        qk_norm=False,
-        norm_layer=nn.LayerNorm
-    ):
-        super().__init__()
-        self.heads = heads
-        self.n_data = n_data
-        self.q_norm = norm_layer(width // heads, elementwise_affine=True, eps=1e-6) if qk_norm else nn.Identity()
-        self.k_norm = norm_layer(width // heads, elementwise_affine=True, eps=1e-6) if qk_norm else nn.Identity()
-
-    def forward(self, q, kv):
-        _, n_ctx, _ = q.shape
-        bs, n_data, width = kv.shape
-        attn_ch = width // self.heads // 2
-        q = q.view(bs, n_ctx, self.heads, -1)
-        kv = kv.view(bs, n_data, self.heads, -1)
-        k, v = torch.split(kv, attn_ch, dim=-1)
-
-        q = self.q_norm(q)
-        k = self.k_norm(k)
-
-        q, k, v = map(lambda t: rearrange(t, 'b n h d -> b h n d', h=self.heads), (q, k, v))
-        out = F.scaled_dot_product_attention(q, k, v).transpose(1, 2).reshape(bs, n_ctx, -1)
-
-        return out
-
-
-class MultiheadCrossAttention(nn.Module):
-    def __init__(
-        self,
-        *,
-        width: int,
-        heads: int,
-        qkv_bias: bool = True,
-        n_data: Optional[int] = None,
-        data_width: Optional[int] = None,
-        norm_layer=nn.LayerNorm,
-        qk_norm: bool = False
-    ):
-        super().__init__()
-        self.n_data = n_data
-        self.width = width
-        self.heads = heads
-        self.data_width = width if data_width is None else data_width
-        self.c_q = nn.Linear(width, width, bias=qkv_bias)
-        self.c_kv = nn.Linear(self.data_width, width * 2, bias=qkv_bias)
-        self.c_proj = nn.Linear(width, width)
-        self.attention = QKVMultiheadCrossAttention(
-            heads=heads,
-            n_data=n_data,
-            width=width,
-            norm_layer=norm_layer,
-            qk_norm=qk_norm
-        )
-
-    def forward(self, x, data):
-        x = self.c_q(x)
-        data = self.c_kv(data)
-        x = self.attention(x, data)
-        x = self.c_proj(x)
-        return x
-
-
-class ResidualCrossAttentionBlock(nn.Module):
-    def __init__(
-        self,
-        *,
-        n_data: Optional[int] = None,
-        width: int,
-        heads: int,
-        data_width: Optional[int] = None,
-        qkv_bias: bool = True,
-        norm_layer=nn.LayerNorm,
-        qk_norm: bool = False
-    ):
-        super().__init__()
-
-        if data_width is None:
-            data_width = width
-
-        self.attn = MultiheadCrossAttention(
-            n_data=n_data,
-            width=width,
-            heads=heads,
-            data_width=data_width,
-            qkv_bias=qkv_bias,
-            norm_layer=norm_layer,
-            qk_norm=qk_norm
-        )
-        self.ln_1 = norm_layer(width, elementwise_affine=True, eps=1e-6)
-        self.ln_2 = norm_layer(data_width, elementwise_affine=True, eps=1e-6)
-        self.ln_3 = norm_layer(width, elementwise_affine=True, eps=1e-6)
-        self.mlp = MLP(width=width)
-
-    def forward(self, x: torch.Tensor, data: torch.Tensor):
-        x = x + self.attn(self.ln_1(x), self.ln_2(data))
-        x = x + self.mlp(self.ln_3(x))
-        return x
-
-
-class QKVMultiheadAttention(nn.Module):
-    def __init__(
-        self,
-        *,
-        heads: int,
-        n_ctx: int,
-        width=None,
-        qk_norm=False,
-        norm_layer=nn.LayerNorm
-    ):
-        super().__init__()
-        self.heads = heads
-        self.n_ctx = n_ctx
-        self.q_norm = norm_layer(width // heads, elementwise_affine=True, eps=1e-6) if qk_norm else nn.Identity()
-        self.k_norm = norm_layer(width // heads, elementwise_affine=True, eps=1e-6) if qk_norm else nn.Identity()
-
-    def forward(self, qkv):
-        bs, n_ctx, width = qkv.shape
-        attn_ch = width // self.heads // 3
-        qkv = qkv.view(bs, n_ctx, self.heads, -1)
-        q, k, v = torch.split(qkv, attn_ch, dim=-1)
-
-        q = self.q_norm(q)
-        k = self.k_norm(k)
-
-        q, k, v = map(lambda t: rearrange(t, 'b n h d -> b h n d', h=self.heads), (q, k, v))
-        out = F.scaled_dot_product_attention(q, k, v).transpose(1, 2).reshape(bs, n_ctx, -1)
-        return out
-
-
-class MultiheadAttention(nn.Module):
-    def __init__(
-        self,
-        *,
-        n_ctx: int,
-        width: int,
-        heads: int,
-        qkv_bias: bool,
-        norm_layer=nn.LayerNorm,
-        qk_norm: bool = False,
-        drop_path_rate: float = 0.0
-    ):
-        super().__init__()
-        self.n_ctx = n_ctx
-        self.width = width
-        self.heads = heads
-        self.c_qkv = nn.Linear(width, width * 3, bias=qkv_bias)
-        self.c_proj = nn.Linear(width, width)
-        self.attention = QKVMultiheadAttention(
-            heads=heads,
-            n_ctx=n_ctx,
-            width=width,
-            norm_layer=norm_layer,
-            qk_norm=qk_norm
-        )
-        self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
-
-    def forward(self, x):
-        x = self.c_qkv(x)
-        x = self.attention(x)
-        x = self.drop_path(self.c_proj(x))
-        return x
-
-
-class ResidualAttentionBlock(nn.Module):
-    def __init__(
-        self,
-        *,
-        n_ctx: int,
-        width: int,
-        heads: int,
-        qkv_bias: bool = True,
-        norm_layer=nn.LayerNorm,
-        qk_norm: bool = False,
-        drop_path_rate: float = 0.0,
-    ):
-        super().__init__()
-        self.attn = MultiheadAttention(
-            n_ctx=n_ctx,
-            width=width,
-            heads=heads,
-            qkv_bias=qkv_bias,
-            norm_layer=norm_layer,
-            qk_norm=qk_norm,
-            drop_path_rate=drop_path_rate
-        )
-        self.ln_1 = norm_layer(width, elementwise_affine=True, eps=1e-6)
-        self.mlp = MLP(width=width, drop_path_rate=drop_path_rate)
-        self.ln_2 = norm_layer(width, elementwise_affine=True, eps=1e-6)
-
-    def forward(self, x: torch.Tensor):
-        x = x + self.attn(self.ln_1(x))
-        x = x + self.mlp(self.ln_2(x))
-        return x
-
-
-class Transformer(nn.Module):
-    def __init__(
-        self,
-        *,
-        n_ctx: int,
-        width: int,
-        layers: int,
-        heads: int,
-        qkv_bias: bool = True,
-        norm_layer=nn.LayerNorm,
-        qk_norm: bool = False,
-        drop_path_rate: float = 0.0
-    ):
-        super().__init__()
-        self.n_ctx = n_ctx
-        self.width = width
-        self.layers = layers
-        self.resblocks = nn.ModuleList(
-            [
-                ResidualAttentionBlock(
-                    n_ctx=n_ctx,
-                    width=width,
-                    heads=heads,
-                    qkv_bias=qkv_bias,
-                    norm_layer=norm_layer,
-                    qk_norm=qk_norm,
-                    drop_path_rate=drop_path_rate
-                )
-                for _ in range(layers)
-            ]
-        )
-
-    def forward(self, x: torch.Tensor):
-        for block in self.resblocks:
-            x = block(x)
-        return x
-
-
-class CrossAttentionDecoder(nn.Module):
-
-    def __init__(
-        self,
-        *,
-        num_latents: int,
-        out_channels: int,
-        fourier_embedder: FourierEmbedder,
-        width: int,
-        heads: int,
-        qkv_bias: bool = True,
-        qk_norm: bool = False,
-        label_type: str = "binary"
-    ):
-        super().__init__()
-
-        self.fourier_embedder = fourier_embedder
-
-        self.query_proj = nn.Linear(self.fourier_embedder.out_dim, width)
-
-        self.cross_attn_decoder = ResidualCrossAttentionBlock(
-            n_data=num_latents,
-            width=width,
-            heads=heads,
-            qkv_bias=qkv_bias,
-            qk_norm=qk_norm
-        )
-
-        self.ln_post = nn.LayerNorm(width)
-        self.output_proj = nn.Linear(width, out_channels)
-        self.label_type = label_type
-
-    def forward(self, queries: torch.FloatTensor, latents: torch.FloatTensor):
-        queries = self.query_proj(self.fourier_embedder(queries).to(latents.dtype))
-        x = self.cross_attn_decoder(queries, latents)
-        x = self.ln_post(x)
-        occ = self.output_proj(x)
-        return occ
-
-
-def generate_dense_grid_points(bbox_min: np.ndarray,
-                               bbox_max: np.ndarray,
-                               octree_depth: int,
-                               indexing: str = "ij",
-                               octree_resolution: int = None,
-                               ):
-    length = bbox_max - bbox_min
-    num_cells = np.exp2(octree_depth)
-    if octree_resolution is not None:
-        num_cells = octree_resolution
-
-    x = np.linspace(bbox_min[0], bbox_max[0], int(num_cells) + 1, dtype=np.float32)
-    y = np.linspace(bbox_min[1], bbox_max[1], int(num_cells) + 1, dtype=np.float32)
-    z = np.linspace(bbox_min[2], bbox_max[2], int(num_cells) + 1, dtype=np.float32)
-    [xs, ys, zs] = np.meshgrid(x, y, z, indexing=indexing)
-    xyz = np.stack((xs, ys, zs), axis=-1)
-    xyz = xyz.reshape(-1, 3)
-    grid_size = [int(num_cells) + 1, int(num_cells) + 1, int(num_cells) + 1]
-
-    return xyz, grid_size, length
-
-
-def center_vertices(vertices):
-    """Translate the vertices so that bounding box is centered at zero."""
-    vert_min = vertices.min(dim=0)[0]
-    vert_max = vertices.max(dim=0)[0]
-    vert_center = 0.5 * (vert_min + vert_max)
-    return vertices - vert_center
-
-
-class Latent2MeshOutput:
-
-    def __init__(self, mesh_v=None, mesh_f=None):
-        self.mesh_v = mesh_v
-        self.mesh_f = mesh_f
-
-
-class ShapeVAE(nn.Module):
-    def __init__(
-        self,
-        *,
-        num_latents: int,
-        embed_dim: int,
-        width: int,
-        heads: int,
-        num_decoder_layers: int,
-        num_freqs: int = 8,
-        include_pi: bool = True,
-        qkv_bias: bool = True,
-        qk_norm: bool = False,
-        label_type: str = "binary",
-        drop_path_rate: float = 0.0,
-        scale_factor: float = 1.0,
-    ):
-        super().__init__()
-        self.fourier_embedder = FourierEmbedder(num_freqs=num_freqs, include_pi=include_pi)
-
-        self.post_kl = nn.Linear(embed_dim, width)
-
-        self.transformer = Transformer(
-            n_ctx=num_latents,
-            width=width,
-            layers=num_decoder_layers,
-            heads=heads,
-            qkv_bias=qkv_bias,
-            qk_norm=qk_norm,
-            drop_path_rate=drop_path_rate
-        )
-
-        self.geo_decoder = CrossAttentionDecoder(
-            fourier_embedder=self.fourier_embedder,
-            out_channels=1,
-            num_latents=num_latents,
-            width=width,
-            heads=heads,
-            qkv_bias=qkv_bias,
-            qk_norm=qk_norm,
-            label_type=label_type,
-        )
-
-        self.scale_factor = scale_factor
-        self.latent_shape = (num_latents, embed_dim)
-
-    def forward(self, latents):
-        latents = self.post_kl(latents)
-        latents = self.transformer(latents)
-        return latents
-
-    @torch.no_grad()
-    def latents2mesh(
-        self,
-        latents: torch.FloatTensor,
-        bounds: Union[Tuple[float], List[float], float] = 1.1,
-        octree_depth: int = 7,
-        num_chunks: int = 10000,
-        mc_level: float = -1 / 512,
-        octree_resolution: int = None,
-        mc_algo: str = 'dmc',
-    ):
-        device = latents.device
-
-        # 1. generate query points
-        if isinstance(bounds, float):
-            bounds = [-bounds, -bounds, -bounds, bounds, bounds, bounds]
-        bbox_min = np.array(bounds[0:3])
-        bbox_max = np.array(bounds[3:6])
-        bbox_size = bbox_max - bbox_min
-        xyz_samples, grid_size, length = generate_dense_grid_points(
-            bbox_min=bbox_min,
-            bbox_max=bbox_max,
-            octree_depth=octree_depth,
-            octree_resolution=octree_resolution,
-            indexing="ij"
-        )
-        xyz_samples = torch.FloatTensor(xyz_samples)
-
-        # 2. latents to 3d volume
-        batch_logits = []
-        batch_size = latents.shape[0]
-        for start in tqdm(range(0, xyz_samples.shape[0], num_chunks),
-                          desc=f"MC Level {mc_level} Implicit Function:"):
-            queries = xyz_samples[start: start + num_chunks, :].to(device)
-            queries = queries.half()
-            batch_queries = repeat(queries, "p c -> b p c", b=batch_size)
-
-            logits = self.geo_decoder(batch_queries.to(latents.dtype), latents)
-            if mc_level == -1:
-                mc_level = 0
-                logits = torch.sigmoid(logits) * 2 - 1
-                print(f'Training with soft labels, inference with sigmoid and marching cubes level 0.')
-            batch_logits.append(logits)
-        grid_logits = torch.cat(batch_logits, dim=1)
-        grid_logits = grid_logits.view((batch_size, grid_size[0], grid_size[1], grid_size[2])).float()
-
-        # 3. extract surface
-        outputs = []
-        for i in range(batch_size):
-            try:
-                if mc_algo == 'mc':
-                    vertices, faces, normals, _ = measure.marching_cubes(
-                        grid_logits[i].cpu().numpy(),
-                        mc_level,
-                        method="lewiner"
-                    )
-                    vertices = vertices / grid_size * bbox_size + bbox_min
-                elif mc_algo == 'dmc':
-                    if not hasattr(self, 'dmc'):
-                        try:
-                            from diso import DiffDMC
-                        except:
-                            raise ImportError("Please install diso via `pip install diso`, or set mc_algo to 'mc'")
-                        self.dmc = DiffDMC(dtype=torch.float32).to(device)
-                    octree_resolution = 2 ** octree_depth if octree_resolution is None else octree_resolution
-                    sdf = -grid_logits[i] / octree_resolution
-                    verts, faces = self.dmc(sdf, deform=None, return_quads=False, normalize=True)
-                    verts = center_vertices(verts)
-                    vertices = verts.detach().cpu().numpy()
-                    faces = faces.detach().cpu().numpy()[:, ::-1]
-                else:
-                    raise ValueError(f"mc_algo {mc_algo} not supported.")
-
-                outputs.append(
-                    Latent2MeshOutput(
-                        mesh_v=vertices.astype(np.float32),
-                        mesh_f=np.ascontiguousarray(faces)
-                    )
-                )
-
-            except ValueError:
-                outputs.append(None)
-            except RuntimeError:
-                outputs.append(None)
-
-        return outputs
diff --git a/hy3dgen/shapegen/pipelines.py b/hy3dgen/shapegen/pipelines.py
deleted file mode 100644
index 02fd79b5976b51df79aa242c11eab2378e92ee34..0000000000000000000000000000000000000000
--- a/hy3dgen/shapegen/pipelines.py
+++ /dev/null
@@ -1,589 +0,0 @@
-# Open Source Model Licensed under the Apache License Version 2.0
-# and Other Licenses of the Third-Party Components therein:
-# The below Model in this distribution may have been modified by THL A29 Limited
-# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
-
-# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
-# The below software and/or models in this distribution may have been
-# modified by THL A29 Limited ("Tencent Modifications").
-# All Tencent Modifications are Copyright (C) THL A29 Limited.
-
-# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
-# except for the third-party components listed below.
-# Hunyuan 3D does not impose any additional limitations beyond what is outlined
-# in the repsective licenses of these third-party components.
-# Users must comply with all terms and conditions of original licenses of these third-party
-# components and must ensure that the usage of the third party components adheres to
-# all relevant laws and regulations.
-
-# For avoidance of doubts, Hunyuan 3D means the large language models and
-# their software and algorithms, including trained model weights, parameters (including
-# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
-# fine-tuning enabling code and other elements of the foregoing made publicly available
-# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
-
-import copy
-import importlib
-import inspect
-import logging
-import os
-from typing import List, Optional, Union
-
-import numpy as np
-import torch
-import trimesh
-import yaml
-from PIL import Image
-from diffusers.utils.torch_utils import randn_tensor
-from tqdm import tqdm
-
-logger = logging.getLogger(__name__)
-
-
-def retrieve_timesteps(
-    scheduler,
-    num_inference_steps: Optional[int] = None,
-    device: Optional[Union[str, torch.device]] = None,
-    timesteps: Optional[List[int]] = None,
-    sigmas: Optional[List[float]] = None,
-    **kwargs,
-):
-    """
-    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
-    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
-
-    Args:
-        scheduler (`SchedulerMixin`):
-            The scheduler to get timesteps from.
-        num_inference_steps (`int`):
-            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
-            must be `None`.
-        device (`str` or `torch.device`, *optional*):
-            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
-        timesteps (`List[int]`, *optional*):
-            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
-            `num_inference_steps` and `sigmas` must be `None`.
-        sigmas (`List[float]`, *optional*):
-            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
-            `num_inference_steps` and `timesteps` must be `None`.
-
-    Returns:
-        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
-        second element is the number of inference steps.
-    """
-    if timesteps is not None and sigmas is not None:
-        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
-    if timesteps is not None:
-        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
-        if not accepts_timesteps:
-            raise ValueError(
-                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
-                f" timestep schedules. Please check whether you are using the correct scheduler."
-            )
-        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
-        timesteps = scheduler.timesteps
-        num_inference_steps = len(timesteps)
-    elif sigmas is not None:
-        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
-        if not accept_sigmas:
-            raise ValueError(
-                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
-                f" sigmas schedules. Please check whether you are using the correct scheduler."
- ) - scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) - timesteps = scheduler.timesteps - num_inference_steps = len(timesteps) - else: - scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) - timesteps = scheduler.timesteps - return timesteps, num_inference_steps - - -def export_to_trimesh(mesh_output): - if isinstance(mesh_output, list): - outputs = [] - for mesh in mesh_output: - if mesh is None: - outputs.append(None) - else: - mesh.mesh_f = mesh.mesh_f[:, ::-1] - mesh_output = trimesh.Trimesh(mesh.mesh_v, mesh.mesh_f) - outputs.append(mesh_output) - return outputs - else: - mesh_output.mesh_f = mesh_output.mesh_f[:, ::-1] - mesh_output = trimesh.Trimesh(mesh_output.mesh_v, mesh_output.mesh_f) - return mesh_output - - -def get_obj_from_str(string, reload=False): - module, cls = string.rsplit(".", 1) - if reload: - module_imp = importlib.import_module(module) - importlib.reload(module_imp) - return getattr(importlib.import_module(module, package=None), cls) - - -def instantiate_from_config(config, **kwargs): - if "target" not in config: - raise KeyError("Expected key `target` to instantiate.") - cls = get_obj_from_str(config["target"]) - params = config.get("params", dict()) - kwargs.update(params) - instance = cls(**kwargs) - return instance - - -class Hunyuan3DDiTPipeline: - @classmethod - def from_single_file( - cls, - ckpt_path, - config_path, - device='cpu', - dtype=torch.float16, - **kwargs, - ): - # load config - with open(config_path, 'r') as f: - config = yaml.safe_load(f) - - # load ckpt - if not os.path.exists(ckpt_path): - raise FileNotFoundError(f"Model file {ckpt_path} not found") - logger.info(f"Loading model from {ckpt_path}") - - if ckpt_path.endswith('.safetensors'): - # parse safetensors - import safetensors.torch - safetensors_ckpt = safetensors.torch.load_file(ckpt_path, device='cpu') - ckpt = {} - for key, value in safetensors_ckpt.items(): - model_name = key.split('.')[0] - new_key = key[len(model_name) + 1:] - if model_name not in ckpt: - ckpt[model_name] = {} - ckpt[model_name][new_key] = value - else: - ckpt = torch.load(ckpt_path, map_location='cpu', weights_only=True) - - # load model - from accelerate import init_empty_weights - with init_empty_weights(): - model = instantiate_from_config(config['model']) - vae = instantiate_from_config(config['vae']) - conditioner = instantiate_from_config(config['conditioner']) - image_processor = instantiate_from_config(config['image_processor']) - scheduler = instantiate_from_config(config['scheduler']) - - model.load_state_dict(ckpt['model'], assign = True) - vae.load_state_dict(ckpt['vae'], assign = True) - if 'conditioner' in ckpt: - conditioner.load_state_dict(ckpt['conditioner'], assign = True) - - model_kwargs = dict( - vae=vae, - model=model, - scheduler=scheduler, - conditioner=conditioner, - image_processor=image_processor, - device=device, - dtype=dtype, - ) - model_kwargs.update(kwargs) - - return cls( - **model_kwargs - ) - - @classmethod - def from_pretrained( - cls, - model_path, - device='cuda', - dtype=torch.float16, - use_safetensors=None, - variant=None, - subfolder='hunyuan3d-dit-v2-0', - **kwargs, - ): - original_model_path = model_path - if not os.path.exists(model_path): - # try local path - base_dir = os.environ.get('HY3DGEN_MODELS', '/content/hy3dgen') - model_path = os.path.expanduser(os.path.join(base_dir, model_path, subfolder)) - if not os.path.exists(model_path): - try: - import huggingface_hub - # download from huggingface - path = 
huggingface_hub.snapshot_download(repo_id=original_model_path) - model_path = os.path.join(path, subfolder) - except ImportError: - logger.warning( - "You need to install HuggingFace Hub to load models from the hub." - ) - raise RuntimeError(f"Model path {model_path} not found") - if not os.path.exists(model_path): - raise FileNotFoundError(f"Model path {original_model_path} not found") - - extension = 'ckpt' if not use_safetensors else 'safetensors' - variant = '' if variant is None else f'.{variant}' - ckpt_name = f'model{variant}.{extension}' - config_path = os.path.join(model_path, 'config.yaml') - ckpt_path = os.path.join(model_path, ckpt_name) - - return cls.from_single_file( - ckpt_path, - config_path, - device=device, - dtype=dtype, - use_safetensors=use_safetensors, - variant=variant, - **kwargs - ) - - def __init__( - self, - vae, - model, - scheduler, - conditioner, - image_processor, - device='cuda', - dtype=torch.float16, - **kwargs - ): - self.vae = vae - self.model = model - self.scheduler = scheduler - self.conditioner = conditioner - self.image_processor = image_processor - - self.to(device, dtype) - - def to(self, device=None, dtype=None): - if device is not None: - self.device = torch.device(device) - self.vae.to(device) - self.model.to(device) - self.conditioner.to(device) - if dtype is not None: - self.dtype = dtype - self.vae.to(dtype=dtype) - self.model.to(dtype=dtype) - self.conditioner.to(dtype=dtype) - - def encode_cond(self, image, mask, do_classifier_free_guidance, dual_guidance): - bsz = image.shape[0] - cond = self.conditioner(image=image, mask=mask) - - if do_classifier_free_guidance: - un_cond = self.conditioner.unconditional_embedding(bsz) - - if dual_guidance: - un_cond_drop_main = copy.deepcopy(un_cond) - un_cond_drop_main['additional'] = cond['additional'] - - def cat_recursive(a, b, c): - if isinstance(a, torch.Tensor): - return torch.cat([a, b, c], dim=0).to(self.dtype) - out = {} - for k in a.keys(): - out[k] = cat_recursive(a[k], b[k], c[k]) - return out - - cond = cat_recursive(cond, un_cond_drop_main, un_cond) - else: - un_cond = self.conditioner.unconditional_embedding(bsz) - - def cat_recursive(a, b): - if isinstance(a, torch.Tensor): - return torch.cat([a, b], dim=0).to(self.dtype) - out = {} - for k in a.keys(): - out[k] = cat_recursive(a[k], b[k]) - return out - - cond = cat_recursive(cond, un_cond) - return cond - - def prepare_extra_step_kwargs(self, generator, eta): - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. - # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta - - # check if the scheduler accepts generator - accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) - if accepts_generator: - extra_step_kwargs["generator"] = generator - return extra_step_kwargs - - def prepare_latents(self, batch_size, dtype, device, generator, latents=None): - shape = (batch_size, *self.vae.latent_shape) - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. 
Make sure the batch size matches the length of the generators." - ) - - if latents is None: - latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) - else: - latents = latents.to(device) - - # scale the initial noise by the standard deviation required by the scheduler - latents = latents * getattr(self.scheduler, 'init_noise_sigma', 1.0) - return latents - - def prepare_image(self, image): - if isinstance(image, str) and not os.path.exists(image): - raise FileNotFoundError(f"Couldn't find image at path {image}") - - if not isinstance(image, list): - image = [image] - image_pts = [] - mask_pts = [] - for img in image: - image_pt, mask_pt = self.image_processor(img, return_mask=True) - image_pts.append(image_pt) - mask_pts.append(mask_pt) - - image_pts = torch.cat(image_pts, dim=0).to(self.device, dtype=self.dtype) - if mask_pts[0] is not None: - mask_pts = torch.cat(mask_pts, dim=0).to(self.device, dtype=self.dtype) - else: - mask_pts = None - return image_pts, mask_pts - - def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32): - """ - See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 - - Args: - timesteps (`torch.Tensor`): - generate embedding vectors at these timesteps - embedding_dim (`int`, *optional*, defaults to 512): - dimension of the embeddings to generate - dtype: - data type of the generated embeddings - - Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` - """ - assert len(w.shape) == 1 - w = w * 1000.0 - - half_dim = embedding_dim // 2 - emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1) - emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb) - emb = w.to(dtype)[:, None] * emb[None, :] - emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1) - if embedding_dim % 2 == 1: # zero pad - emb = torch.nn.functional.pad(emb, (0, 1)) - assert emb.shape == (w.shape[0], embedding_dim) - return emb - - @torch.no_grad() - def __call__( - self, - image: Union[str, List[str], Image.Image] = None, - num_inference_steps: int = 50, - timesteps: List[int] = None, - sigmas: List[float] = None, - eta: float = 0.0, - guidance_scale: float = 7.5, - dual_guidance_scale: float = 10.5, - dual_guidance: bool = True, - generator=None, - box_v=1.01, - octree_resolution=384, - mc_level=-1 / 512, - num_chunks=8000, - mc_algo='mc', - output_type: Optional[str] = "trimesh", - enable_pbar=True, - **kwargs, - ) -> List[List[trimesh.Trimesh]]: - callback = kwargs.pop("callback", None) - callback_steps = kwargs.pop("callback_steps", None) - - device = self.device - dtype = self.dtype - do_classifier_free_guidance = guidance_scale >= 0 and \ - getattr(self.model, 'guidance_cond_proj_dim', None) is None - dual_guidance = dual_guidance_scale >= 0 and dual_guidance - - image, mask = self.prepare_image(image) - cond = self.encode_cond(image=image, - mask=mask, - do_classifier_free_guidance=do_classifier_free_guidance, - dual_guidance=dual_guidance) - batch_size = image.shape[0] - - t_dtype = torch.long - timesteps, num_inference_steps = retrieve_timesteps( - self.scheduler, num_inference_steps, device, timesteps, sigmas) - - latents = self.prepare_latents(batch_size, dtype, device, generator) - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - guidance_cond = None - if getattr(self.model, 'guidance_cond_proj_dim', None) is not None: - print('Using lcm guidance scale') - guidance_scale_tensor = torch.tensor(guidance_scale - 
1).repeat(batch_size) - guidance_cond = self.get_guidance_scale_embedding( - guidance_scale_tensor, embedding_dim=self.model.guidance_cond_proj_dim - ).to(device=device, dtype=latents.dtype) - - for i, t in enumerate(tqdm(timesteps, disable=not enable_pbar, desc="Diffusion Sampling:", leave=False)): - # expand the latents if we are doing classifier free guidance - if do_classifier_free_guidance: - latent_model_input = torch.cat([latents] * (3 if dual_guidance else 2)) - else: - latent_model_input = latents - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - # predict the noise residual - timestep_tensor = torch.tensor([t], dtype=t_dtype, device=device) - timestep_tensor = timestep_tensor.expand(latent_model_input.shape[0]) - noise_pred = self.model(latent_model_input, timestep_tensor, cond, guidance_cond=guidance_cond) - - # no drop, drop clip, all drop - if do_classifier_free_guidance: - if dual_guidance: - noise_pred_clip, noise_pred_dino, noise_pred_uncond = noise_pred.chunk(3) - noise_pred = ( - noise_pred_uncond - + guidance_scale * (noise_pred_clip - noise_pred_dino) - + dual_guidance_scale * (noise_pred_dino - noise_pred_uncond) - ) - else: - noise_pred_cond, noise_pred_uncond = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond) - - # compute the previous noisy sample x_t -> x_t-1 - outputs = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs) - latents = outputs.prev_sample - - if callback is not None and i % callback_steps == 0: - step_idx = i // getattr(self.scheduler, "order", 1) - callback(step_idx, t, outputs) - - return self._export( - latents, - output_type, - box_v, mc_level, num_chunks, octree_resolution, mc_algo, - ) - - def _export(self, latents, output_type, box_v, mc_level, num_chunks, octree_resolution, mc_algo): - if not output_type == "latent": - latents = 1. / self.vae.scale_factor * latents - latents = self.vae(latents) - outputs = self.vae.latents2mesh( - latents, - bounds=box_v, - mc_level=mc_level, - num_chunks=num_chunks, - octree_resolution=octree_resolution, - mc_algo=mc_algo, - ) - else: - outputs = latents - - if output_type == 'trimesh': - outputs = export_to_trimesh(outputs) - - return outputs - - -class Hunyuan3DDiTFlowMatchingPipeline(Hunyuan3DDiTPipeline): - - @torch.no_grad() - def __call__( - self, - image: Union[str, List[str], Image.Image] = None, - num_inference_steps: int = 50, - timesteps: List[int] = None, - sigmas: List[float] = None, - eta: float = 0.0, - guidance_scale: float = 7.5, - generator=None, - box_v=1.01, - octree_resolution=384, - mc_level=0.0, - mc_algo='mc', - num_chunks=8000, - output_type: Optional[str] = "trimesh", - enable_pbar=True, - **kwargs, - ) -> List[List[trimesh.Trimesh]]: - callback = kwargs.pop("callback", None) - callback_steps = kwargs.pop("callback_steps", None) - - device = self.device - dtype = self.dtype - do_classifier_free_guidance = guidance_scale >= 0 and not ( - hasattr(self.model, 'guidance_embed') and - self.model.guidance_embed is True - ) - - image, mask = self.prepare_image(image) - cond = self.encode_cond( - image=image, - mask=mask, - do_classifier_free_guidance=do_classifier_free_guidance, - dual_guidance=False, - ) - batch_size = image.shape[0] - - # 5. Prepare timesteps - # NOTE: this is slightly different from common usage, we start from 0. 
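# Here sampling starts from Gaussian noise at sigma = 0 and integrates toward
# sigma = 1; this is the reverse of diffusers' flow-matching convention, which
# is why the custom FlowMatchEulerDiscreteScheduler in schedulers.py flips the
# usual sigma direction.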
- sigmas = np.linspace(0, 1, num_inference_steps) if sigmas is None else sigmas - timesteps, num_inference_steps = retrieve_timesteps( - self.scheduler, - num_inference_steps, - device, - sigmas=sigmas, - ) - latents = self.prepare_latents(batch_size, dtype, device, generator) - - guidance = None - if hasattr(self.model, 'guidance_embed') and \ - self.model.guidance_embed is True: - guidance = torch.tensor([guidance_scale] * batch_size, device=device, dtype=dtype) - - for i, t in enumerate(tqdm(timesteps, disable=not enable_pbar, desc="Diffusion Sampling:")): - # expand the latents if we are doing classifier free guidance - if do_classifier_free_guidance: - latent_model_input = torch.cat([latents] * 2) - else: - latent_model_input = latents - - # NOTE: we assume model get timesteps ranged from 0 to 1 - timestep = t.expand(latent_model_input.shape[0]).to( - latents.dtype) / self.scheduler.config.num_train_timesteps - noise_pred = self.model(latent_model_input, timestep, cond, guidance=guidance) - - if do_classifier_free_guidance: - noise_pred_cond, noise_pred_uncond = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond) - - # compute the previous noisy sample x_t -> x_t-1 - outputs = self.scheduler.step(noise_pred, t, latents) - latents = outputs.prev_sample - - if callback is not None and i % callback_steps == 0: - step_idx = i // getattr(self.scheduler, "order", 1) - callback(step_idx, t, outputs) - - return self._export( - latents, - output_type, - box_v, mc_level, num_chunks, octree_resolution, mc_algo, - ) diff --git a/hy3dgen/shapegen/postprocessors.py b/hy3dgen/shapegen/postprocessors.py deleted file mode 100644 index 0500fa2d8f70a3a933f8313d11126ad9b27bf57c..0000000000000000000000000000000000000000 --- a/hy3dgen/shapegen/postprocessors.py +++ /dev/null @@ -1,175 +0,0 @@ -# Open Source Model Licensed under the Apache License Version 2.0 -# and Other Licenses of the Third-Party Components therein: -# The below Model in this distribution may have been modified by THL A29 Limited -# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. - -# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. -# The below software and/or models in this distribution may have been -# modified by THL A29 Limited ("Tencent Modifications"). -# All Tencent Modifications are Copyright (C) THL A29 Limited. - -# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT -# except for the third-party components listed below. -# Hunyuan 3D does not impose any additional limitations beyond what is outlined -# in the repsective licenses of these third-party components. -# Users must comply with all terms and conditions of original licenses of these third-party -# components and must ensure that the usage of the third party components adheres to -# all relevant laws and regulations. - -# For avoidance of doubts, Hunyuan 3D means the large language models and -# their software and algorithms, including trained model weights, parameters (including -# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, -# fine-tuning enabling code and other elements of the foregoing made publicly available -# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
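The guidance arithmetic in the two sampling loops above is compact but easy to misread. Below is a minimal sketch of the combine step, assuming the batch layout `[cond, cond-with-main-dropped, uncond]` that `encode_cond` concatenates; the helper name `combine_guidance` is hypothetical, not part of the deleted module.

```python
import torch
from typing import Optional

def combine_guidance(noise_pred: torch.Tensor,
                     guidance_scale: float,
                     dual_guidance_scale: Optional[float] = None) -> torch.Tensor:
    if dual_guidance_scale is not None:
        # Dual guidance: the loop above names these chunks clip / dino / uncond.
        # Steer from the additional-only prediction toward the full prediction,
        # and from unconditional toward additional-only.
        pred_full, pred_additional, pred_uncond = noise_pred.chunk(3)
        return (pred_uncond
                + guidance_scale * (pred_full - pred_additional)
                + dual_guidance_scale * (pred_additional - pred_uncond))
    # Standard classifier-free guidance with a single conditional branch.
    pred_cond, pred_uncond = noise_pred.chunk(2)
    return pred_uncond + guidance_scale * (pred_cond - pred_uncond)
```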
-
-import os
-import tempfile
-from typing import Union
-
-import pymeshlab
-import trimesh
-
-from .models.vae import Latent2MeshOutput
-
-
-def load_mesh(path):
-    if path.endswith(".glb"):
-        mesh = trimesh.load(path)
-    else:
-        mesh = pymeshlab.MeshSet()
-        mesh.load_new_mesh(path)
-    return mesh
-
-
-def reduce_face(mesh: pymeshlab.MeshSet, max_facenum: int = 200000):
-    mesh.apply_filter(
-        "meshing_decimation_quadric_edge_collapse",
-        targetfacenum=max_facenum,
-        qualitythr=1.0,
-        preserveboundary=True,
-        boundaryweight=3,
-        preservenormal=True,
-        preservetopology=True,
-        autoclean=True
-    )
-    return mesh
-
-
-def remove_floater(mesh: pymeshlab.MeshSet):
-    mesh.apply_filter("compute_selection_by_small_disconnected_components_per_face",
-                      nbfaceratio=0.005)
-    mesh.apply_filter("compute_selection_transfer_face_to_vertex", inclusive=False)
-    mesh.apply_filter("meshing_remove_selected_vertices_and_faces")
-    return mesh
-
-
-def pymeshlab2trimesh(mesh: pymeshlab.MeshSet):
-    # pymeshlab and trimesh only interoperate through files, so round-trip
-    # the current mesh via a temporary .ply on disk.
-    temp_file = tempfile.NamedTemporaryFile(suffix='.ply', delete=True)
-    temp_file.close()
-    temp_file_name = temp_file.name
-
-    mesh.save_current_mesh(temp_file_name)
-    mesh = trimesh.load(temp_file_name)
-    if os.path.exists(temp_file_name):
-        os.remove(temp_file_name)
-
-    # Check the type of the loaded object
-    if isinstance(mesh, trimesh.Scene):
-        combined_mesh = trimesh.Trimesh()
-        # If it is a Scene, iterate over all geometries and merge them
-        for geom in mesh.geometry.values():
-            combined_mesh = trimesh.util.concatenate([combined_mesh, geom])
-        mesh = combined_mesh
-    return mesh
-
-
-def trimesh2pymeshlab(mesh: trimesh.Trimesh):
-    temp_file = tempfile.NamedTemporaryFile(suffix='.ply', delete=True)
-    temp_file.close()
-    temp_file_name = temp_file.name
-
-    if isinstance(mesh, trimesh.scene.Scene):
-        for idx, obj in enumerate(mesh.geometry.values()):
-            if idx == 0:
-                temp_mesh = obj
-            else:
-                temp_mesh = temp_mesh + obj
-        mesh = temp_mesh
-    mesh.export(temp_file_name)
-    mesh = pymeshlab.MeshSet()
-    mesh.load_new_mesh(temp_file_name)
-    if os.path.exists(temp_file_name):
-        os.remove(temp_file_name)
-
-    return mesh
-
-
-def export_mesh(input, output):
-    # `input` carries the caller's original type; `output` is the processed
-    # pymeshlab.MeshSet to convert back into that type.
-    if isinstance(input, pymeshlab.MeshSet):
-        mesh = output
-    elif isinstance(input, Latent2MeshOutput):
-        # Copy vertices/faces out of the processed MeshSet into a fresh output.
-        result = Latent2MeshOutput()
-        result.mesh_v = output.current_mesh().vertex_matrix()
-        result.mesh_f = output.current_mesh().face_matrix()
-        mesh = result
-    else:
-        mesh = pymeshlab2trimesh(output)
-    return mesh
-
-
-def import_mesh(mesh: Union[pymeshlab.MeshSet, trimesh.Trimesh, Latent2MeshOutput, str]) -> pymeshlab.MeshSet:
-    if isinstance(mesh, str):
-        mesh = load_mesh(mesh)
-    elif isinstance(mesh, Latent2MeshOutput):
-        # Build the pymeshlab mesh from the latent-decoder output before
-        # rebinding the name to the new MeshSet.
-        mesh_pymeshlab = pymeshlab.Mesh(vertex_matrix=mesh.mesh_v, face_matrix=mesh.mesh_f)
-        mesh = pymeshlab.MeshSet()
-        mesh.add_mesh(mesh_pymeshlab, "converted_mesh")
-
-    if isinstance(mesh, (trimesh.Trimesh, trimesh.scene.Scene)):
-        mesh = trimesh2pymeshlab(mesh)
-
-    return mesh
-
-
-class FaceReducer:
-    def __call__(
-        self,
-        mesh: Union[pymeshlab.MeshSet, trimesh.Trimesh, Latent2MeshOutput, str],
-        max_facenum: int = 40000
-    ) -> Union[pymeshlab.MeshSet, trimesh.Trimesh]:
-        ms = import_mesh(mesh)
-        ms = reduce_face(ms, max_facenum=max_facenum)
-        mesh = export_mesh(mesh, ms)
-        return mesh
-
-
-class FloaterRemover:
-    def __call__(
-        self,
-        mesh: Union[pymeshlab.MeshSet, trimesh.Trimesh, Latent2MeshOutput, str],
-    ) -> Union[pymeshlab.MeshSet, trimesh.Trimesh, Latent2MeshOutput]:
-        ms = import_mesh(mesh)
-        ms = remove_floater(ms)
-        mesh = export_mesh(mesh, ms)
-        return mesh
-
-
-class DegenerateFaceRemover:
-    def
__call__( - self, - mesh: Union[pymeshlab.MeshSet, trimesh.Trimesh, Latent2MeshOutput, str], - ) -> Union[pymeshlab.MeshSet, trimesh.Trimesh, Latent2MeshOutput]: - ms = import_mesh(mesh) - - temp_file = tempfile.NamedTemporaryFile(suffix='.ply', delete=True) - temp_file.close() - temp_file_name = temp_file.name - - ms.save_current_mesh(temp_file_name) - ms = pymeshlab.MeshSet() - ms.load_new_mesh(temp_file_name) - if os.path.exists(temp_file_name): - os.remove(temp_file_name) - - mesh = export_mesh(mesh, ms) - return mesh diff --git a/hy3dgen/shapegen/preprocessors.py b/hy3dgen/shapegen/preprocessors.py deleted file mode 100644 index 2bdaff2d16cc0844d8d23c886d35c2f4e7286ff7..0000000000000000000000000000000000000000 --- a/hy3dgen/shapegen/preprocessors.py +++ /dev/null @@ -1,127 +0,0 @@ -# Open Source Model Licensed under the Apache License Version 2.0 -# and Other Licenses of the Third-Party Components therein: -# The below Model in this distribution may have been modified by THL A29 Limited -# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. -# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. -# The below software and/or models in this distribution may have been -# modified by THL A29 Limited ("Tencent Modifications"). -# All Tencent Modifications are Copyright (C) THL A29 Limited. - -# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT -# except for the third-party components listed below. -# Hunyuan 3D does not impose any additional limitations beyond what is outlined -# in the repsective licenses of these third-party components. -# Users must comply with all terms and conditions of original licenses of these third-party -# components and must ensure that the usage of the third party components adheres to -# all relevant laws and regulations. - -# For avoidance of doubts, Hunyuan 3D means the large language models and -# their software and algorithms, including trained model weights, parameters (including -# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, -# fine-tuning enabling code and other elements of the foregoing made publicly available -# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. - -import cv2 -import numpy as np -import torch -from PIL import Image -from einops import repeat, rearrange - - -def array_to_tensor(np_array): - image_pt = torch.tensor(np_array).float() - image_pt = image_pt / 255 * 2 - 1 - image_pt = rearrange(image_pt, "h w c -> c h w") - image_pts = repeat(image_pt, "c h w -> b c h w", b=1) - return image_pts - - -class ImageProcessorV2: - def __init__(self, size=512, border_ratio=None): - self.size = size - self.border_ratio = border_ratio - - @staticmethod - def recenter(image, border_ratio: float = 0.2): - """ recenter an image to leave some empty space at the image border. - - Args: - image (ndarray): input image, float/uint8 [H, W, 3/4] - mask (ndarray): alpha mask, bool [H, W] - border_ratio (float, optional): border ratio, image will be resized to (1 - border_ratio). Defaults to 0.2. 
- - Returns: - ndarray: output image, float/uint8 [H, W, 3/4] - """ - - if image.shape[-1] == 4: - mask = image[..., 3] - else: - mask = np.ones_like(image[..., 0:1]) * 255 - image = np.concatenate([image, mask], axis=-1) - mask = mask[..., 0] - - H, W, C = image.shape - - size = max(H, W) - result = np.zeros((size, size, C), dtype=np.uint8) - - coords = np.nonzero(mask) - x_min, x_max = coords[0].min(), coords[0].max() - y_min, y_max = coords[1].min(), coords[1].max() - h = x_max - x_min - w = y_max - y_min - if h == 0 or w == 0: - raise ValueError('input image is empty') - desired_size = int(size * (1 - border_ratio)) - scale = desired_size / max(h, w) - h2 = int(h * scale) - w2 = int(w * scale) - x2_min = (size - h2) // 2 - x2_max = x2_min + h2 - - y2_min = (size - w2) // 2 - y2_max = y2_min + w2 - - result[x2_min:x2_max, y2_min:y2_max] = cv2.resize(image[x_min:x_max, y_min:y_max], (w2, h2), - interpolation=cv2.INTER_AREA) - - bg = np.ones((result.shape[0], result.shape[1], 3), dtype=np.uint8) * 255 - # bg = np.zeros((result.shape[0], result.shape[1], 3), dtype=np.uint8) * 255 - mask = result[..., 3:].astype(np.float32) / 255 - result = result[..., :3] * mask + bg * (1 - mask) - - mask = mask * 255 - result = result.clip(0, 255).astype(np.uint8) - mask = mask.clip(0, 255).astype(np.uint8) - return result, mask - - def __call__(self, image, border_ratio=0.15, to_tensor=True, return_mask=False, **kwargs): - if self.border_ratio is not None: - border_ratio = self.border_ratio - print(f"Using border_ratio from init: {border_ratio}") - if isinstance(image, str): - image = cv2.imread(image, cv2.IMREAD_UNCHANGED) - image, mask = self.recenter(image, border_ratio=border_ratio) - image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) - elif isinstance(image, Image.Image): - image = np.asarray(image) - image, mask = self.recenter(image, border_ratio=border_ratio) - - image = cv2.resize(image, (self.size, self.size), interpolation=cv2.INTER_CUBIC) - mask = cv2.resize(mask, (self.size, self.size), interpolation=cv2.INTER_NEAREST) - mask = mask[..., np.newaxis] - - if to_tensor: - image = array_to_tensor(image) - mask = array_to_tensor(mask) - if return_mask: - return image, mask - return image - - -IMAGE_PROCESSORS = { - "v2": ImageProcessorV2, -} - -DEFAULT_IMAGEPROCESSOR = 'v2' diff --git a/hy3dgen/shapegen/schedulers.py b/hy3dgen/shapegen/schedulers.py deleted file mode 100644 index 0069f5cd49c5095930b588f01129a77f172171a7..0000000000000000000000000000000000000000 --- a/hy3dgen/shapegen/schedulers.py +++ /dev/null @@ -1,307 +0,0 @@ -# Copyright 2024 Stability AI, Katherine Crowson and The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
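As a quick reference for the geometry in `recenter` above: the foreground bounding box is scaled uniformly so its longer side fills `(1 - border_ratio)` of the square canvas. A minimal sketch follows; the helper name is hypothetical and a non-empty foreground mask is assumed.

```python
import numpy as np

def recenter_scale(mask: np.ndarray, size: int, border_ratio: float = 0.15):
    # Foreground bounding box, as computed in ImageProcessorV2.recenter above.
    xs, ys = np.nonzero(mask)
    h = xs.max() - xs.min()
    w = ys.max() - ys.min()
    desired = int(size * (1 - border_ratio))  # leave a uniform border
    scale = desired / max(h, w)               # one scale factor, no distortion
    return int(h * scale), int(w * scale)     # resized object extent
```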
- -import math -from dataclasses import dataclass -from typing import List, Optional, Tuple, Union - -import numpy as np -import torch -from diffusers.configuration_utils import ConfigMixin, register_to_config -from diffusers.schedulers.scheduling_utils import SchedulerMixin -from diffusers.utils import BaseOutput, logging - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - - -@dataclass -class FlowMatchEulerDiscreteSchedulerOutput(BaseOutput): - """ - Output class for the scheduler's `step` function output. - - Args: - prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): - Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the - denoising loop. - """ - - prev_sample: torch.FloatTensor - - -class FlowMatchEulerDiscreteScheduler(SchedulerMixin, ConfigMixin): - """ - NOTE: this is very similar to diffusers.FlowMatchEulerDiscreteScheduler. Except our timesteps are reversed - - Euler scheduler. - - This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic - methods the library implements for all schedulers such as loading and saving. - - Args: - num_train_timesteps (`int`, defaults to 1000): - The number of diffusion steps to train the model. - timestep_spacing (`str`, defaults to `"linspace"`): - The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and - Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information. - shift (`float`, defaults to 1.0): - The shift value for the timestep schedule. - """ - - _compatibles = [] - order = 1 - - @register_to_config - def __init__( - self, - num_train_timesteps: int = 1000, - shift: float = 1.0, - use_dynamic_shifting=False, - ): - timesteps = np.linspace(1, num_train_timesteps, num_train_timesteps, dtype=np.float32).copy() - timesteps = torch.from_numpy(timesteps).to(dtype=torch.float32) - - sigmas = timesteps / num_train_timesteps - if not use_dynamic_shifting: - # when use_dynamic_shifting is True, we apply the timestep shifting on the fly based on the image resolution - sigmas = shift * sigmas / (1 + (shift - 1) * sigmas) - - self.timesteps = sigmas * num_train_timesteps - - self._step_index = None - self._begin_index = None - - self.sigmas = sigmas.to("cpu") # to avoid too much CPU/GPU communication - self.sigma_min = self.sigmas[-1].item() - self.sigma_max = self.sigmas[0].item() - - @property - def step_index(self): - """ - The index counter for current timestep. It will increase 1 after each scheduler step. - """ - return self._step_index - - @property - def begin_index(self): - """ - The index for the first timestep. It should be set from pipeline with `set_begin_index` method. - """ - return self._begin_index - - # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index - def set_begin_index(self, begin_index: int = 0): - """ - Sets the begin index for the scheduler. This function should be run from pipeline before the inference. - - Args: - begin_index (`int`): - The begin index for the scheduler. - """ - self._begin_index = begin_index - - def scale_noise( - self, - sample: torch.FloatTensor, - timestep: Union[float, torch.FloatTensor], - noise: Optional[torch.FloatTensor] = None, - ) -> torch.FloatTensor: - """ - Forward process in flow-matching - - Args: - sample (`torch.FloatTensor`): - The input sample. 
- timestep (`int`, *optional*): - The current timestep in the diffusion chain. - - Returns: - `torch.FloatTensor`: - A scaled input sample. - """ - # Make sure sigmas and timesteps have the same device and dtype as original_samples - sigmas = self.sigmas.to(device=sample.device, dtype=sample.dtype) - - if sample.device.type == "mps" and torch.is_floating_point(timestep): - # mps does not support float64 - schedule_timesteps = self.timesteps.to(sample.device, dtype=torch.float32) - timestep = timestep.to(sample.device, dtype=torch.float32) - else: - schedule_timesteps = self.timesteps.to(sample.device) - timestep = timestep.to(sample.device) - - # self.begin_index is None when scheduler is used for training, or pipeline does not implement set_begin_index - if self.begin_index is None: - step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timestep] - elif self.step_index is not None: - # add_noise is called after first denoising step (for inpainting) - step_indices = [self.step_index] * timestep.shape[0] - else: - # add noise is called before first denoising step to create initial latent(img2img) - step_indices = [self.begin_index] * timestep.shape[0] - - sigma = sigmas[step_indices].flatten() - while len(sigma.shape) < len(sample.shape): - sigma = sigma.unsqueeze(-1) - - sample = sigma * noise + (1.0 - sigma) * sample - - return sample - - def _sigma_to_t(self, sigma): - return sigma * self.config.num_train_timesteps - - def time_shift(self, mu: float, sigma: float, t: torch.Tensor): - return math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma) - - def set_timesteps( - self, - num_inference_steps: int = None, - device: Union[str, torch.device] = None, - sigmas: Optional[List[float]] = None, - mu: Optional[float] = None, - ): - """ - Sets the discrete timesteps used for the diffusion chain (to be run before inference). - - Args: - num_inference_steps (`int`): - The number of diffusion steps used when generating samples with a pre-trained model. - device (`str` or `torch.device`, *optional*): - The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. - """ - - if self.config.use_dynamic_shifting and mu is None: - raise ValueError(" you have a pass a value for `mu` when `use_dynamic_shifting` is set to be `True`") - - if sigmas is None: - self.num_inference_steps = num_inference_steps - timesteps = np.linspace( - self._sigma_to_t(self.sigma_max), self._sigma_to_t(self.sigma_min), num_inference_steps - ) - - sigmas = timesteps / self.config.num_train_timesteps - - if self.config.use_dynamic_shifting: - sigmas = self.time_shift(mu, 1.0, sigmas) - else: - sigmas = self.config.shift * sigmas / (1 + (self.config.shift - 1) * sigmas) - - sigmas = torch.from_numpy(sigmas).to(dtype=torch.float32, device=device) - timesteps = sigmas * self.config.num_train_timesteps - - self.timesteps = timesteps.to(device=device) - self.sigmas = torch.cat([sigmas, torch.ones(1, device=sigmas.device)]) - - self._step_index = None - self._begin_index = None - - def index_for_timestep(self, timestep, schedule_timesteps=None): - if schedule_timesteps is None: - schedule_timesteps = self.timesteps - - indices = (schedule_timesteps == timestep).nonzero() - - # The sigma index that is taken for the **very** first `step` - # is always the second index (or the last index if there is only 1) - # This way we can ensure we don't accidentally skip a sigma in - # case we start in the middle of the denoising schedule (e.g. 
for image-to-image) - pos = 1 if len(indices) > 1 else 0 - - return indices[pos].item() - - def _init_step_index(self, timestep): - if self.begin_index is None: - if isinstance(timestep, torch.Tensor): - timestep = timestep.to(self.timesteps.device) - self._step_index = self.index_for_timestep(timestep) - else: - self._step_index = self._begin_index - - def step( - self, - model_output: torch.FloatTensor, - timestep: Union[float, torch.FloatTensor], - sample: torch.FloatTensor, - s_churn: float = 0.0, - s_tmin: float = 0.0, - s_tmax: float = float("inf"), - s_noise: float = 1.0, - generator: Optional[torch.Generator] = None, - return_dict: bool = True, - ) -> Union[FlowMatchEulerDiscreteSchedulerOutput, Tuple]: - """ - Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion - process from the learned model outputs (most often the predicted noise). - - Args: - model_output (`torch.FloatTensor`): - The direct output from learned diffusion model. - timestep (`float`): - The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): - A current instance of a sample created by the diffusion process. - s_churn (`float`): - s_tmin (`float`): - s_tmax (`float`): - s_noise (`float`, defaults to 1.0): - Scaling factor for noise added to the sample. - generator (`torch.Generator`, *optional*): - A random number generator. - return_dict (`bool`): - Whether or not to return a [`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] or - tuple. - - Returns: - [`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] or `tuple`: - If return_dict is `True`, [`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] is - returned, otherwise a tuple is returned where the first element is the sample tensor. - """ - - if ( - isinstance(timestep, int) - or isinstance(timestep, torch.IntTensor) - or isinstance(timestep, torch.LongTensor) - ): - raise ValueError( - ( - "Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to" - " `EulerDiscreteScheduler.step()` is not supported. Make sure to pass" - " one of the `scheduler.timesteps` as a timestep." - ), - ) - - if self.step_index is None: - self._init_step_index(timestep) - - # Upcast to avoid precision issues when computing prev_sample - sample = sample.to(torch.float32) - - sigma = self.sigmas[self.step_index] - sigma_next = self.sigmas[self.step_index + 1] - - prev_sample = sample + (sigma_next - sigma) * model_output - - # Cast sample back to model compatible dtype - prev_sample = prev_sample.to(model_output.dtype) - - # upon completion increase step index by one - self._step_index += 1 - - if not return_dict: - return (prev_sample,) - - return FlowMatchEulerDiscreteSchedulerOutput(prev_sample=prev_sample) - - def __len__(self): - return self.config.num_train_timesteps diff --git a/hy3dgen/texgen/__init__.py b/hy3dgen/texgen/__init__.py deleted file mode 100644 index 1f890f024d507021eca8087d40dc472de36152bd..0000000000000000000000000000000000000000 --- a/hy3dgen/texgen/__init__.py +++ /dev/null @@ -1,26 +0,0 @@ -# Open Source Model Licensed under the Apache License Version 2.0 -# and Other Licenses of the Third-Party Components therein: -# The below Model in this distribution may have been modified by THL A29 Limited -# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. - -# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. 
-# The below software and/or models in this distribution may have been -# modified by THL A29 Limited ("Tencent Modifications"). -# All Tencent Modifications are Copyright (C) THL A29 Limited. - -# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT -# except for the third-party components listed below. -# Hunyuan 3D does not impose any additional limitations beyond what is outlined -# in the repsective licenses of these third-party components. -# Users must comply with all terms and conditions of original licenses of these third-party -# components and must ensure that the usage of the third party components adheres to -# all relevant laws and regulations. - -# For avoidance of doubts, Hunyuan 3D means the large language models and -# their software and algorithms, including trained model weights, parameters (including -# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, -# fine-tuning enabling code and other elements of the foregoing made publicly available -# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. - - -from .pipelines import Hunyuan3DPaintPipeline, Hunyuan3DTexGenConfig diff --git a/hy3dgen/texgen/custom_rasterizer/custom_rasterizer/__init__.py b/hy3dgen/texgen/custom_rasterizer/custom_rasterizer/__init__.py deleted file mode 100644 index df40dcc8d4819eb903263ff1faf70ce902eb7e07..0000000000000000000000000000000000000000 --- a/hy3dgen/texgen/custom_rasterizer/custom_rasterizer/__init__.py +++ /dev/null @@ -1,32 +0,0 @@ -# Open Source Model Licensed under the Apache License Version 2.0 -# and Other Licenses of the Third-Party Components therein: -# The below Model in this distribution may have been modified by THL A29 Limited -# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. - -# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. -# The below software and/or models in this distribution may have been -# modified by THL A29 Limited ("Tencent Modifications"). -# All Tencent Modifications are Copyright (C) THL A29 Limited. - -# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT -# except for the third-party components listed below. -# Hunyuan 3D does not impose any additional limitations beyond what is outlined -# in the repsective licenses of these third-party components. -# Users must comply with all terms and conditions of original licenses of these third-party -# components and must ensure that the usage of the third party components adheres to -# all relevant laws and regulations. - -# For avoidance of doubts, Hunyuan 3D means the large language models and -# their software and algorithms, including trained model weights, parameters (including -# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, -# fine-tuning enabling code and other elements of the foregoing made publicly available -# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
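A toy sketch of what the FlowMatchEulerDiscreteScheduler above does per step: one explicit-Euler update of dx/dsigma = v over the 0-to-1 sigma grid the pipeline passes in. This is not the shipped class; the velocity field here is a stand-in so the loop runs without a model.

```python
import torch

def euler_flow_step(x: torch.Tensor, v: torch.Tensor,
                    sigma: float, sigma_next: float) -> torch.Tensor:
    # step() above reduces to this single explicit-Euler update.
    return x + (sigma_next - sigma) * v

shift = 1.0
sigmas = torch.linspace(0.0, 1.0, 51)                 # the pipeline's 0-to-1 grid
sigmas = shift * sigmas / (1 + (shift - 1) * sigmas)  # timestep shift (identity at 1.0)
x = torch.randn(4)                                    # toy state, no real model
for s, s_next in zip(sigmas[:-1], sigmas[1:]):
    v = -x                                            # stand-in velocity field
    x = euler_flow_step(x, v, s.item(), s_next.item())
```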
- -''' -from .hierarchy import BuildHierarchy, BuildHierarchyWithColor -from .io_obj import LoadObj, LoadObjWithTexture -from .render import rasterize, interpolate -''' -from .io_glb import * -from .io_obj import * -from .render import * diff --git a/hy3dgen/texgen/custom_rasterizer/custom_rasterizer/io_glb.py b/hy3dgen/texgen/custom_rasterizer/custom_rasterizer/io_glb.py deleted file mode 100644 index c5d7dc8c6127e62848dda8e79fdc281c5a7b42cb..0000000000000000000000000000000000000000 --- a/hy3dgen/texgen/custom_rasterizer/custom_rasterizer/io_glb.py +++ /dev/null @@ -1,248 +0,0 @@ -# Open Source Model Licensed under the Apache License Version 2.0 -# and Other Licenses of the Third-Party Components therein: -# The below Model in this distribution may have been modified by THL A29 Limited -# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. - -# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. -# The below software and/or models in this distribution may have been -# modified by THL A29 Limited ("Tencent Modifications"). -# All Tencent Modifications are Copyright (C) THL A29 Limited. - -# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT -# except for the third-party components listed below. -# Hunyuan 3D does not impose any additional limitations beyond what is outlined -# in the repsective licenses of these third-party components. -# Users must comply with all terms and conditions of original licenses of these third-party -# components and must ensure that the usage of the third party components adheres to -# all relevant laws and regulations. - -# For avoidance of doubts, Hunyuan 3D means the large language models and -# their software and algorithms, including trained model weights, parameters (including -# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, -# fine-tuning enabling code and other elements of the foregoing made publicly available -# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
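Since the `__init__.py` above re-exports `io_glb`, `io_obj`, and `render` wholesale, the intended entry points are the module-level functions. A hedged usage sketch: the tensor shapes are assumptions, the package is assumed installed under its own name, and the compiled `custom_rasterizer_kernel` extension must be importable (device requirements depend on how the kernel was built).

```python
import torch
import custom_rasterizer as cr

# Assumed layout: batch of one, three vertices in clip space (x, y, z, w).
pos = torch.tensor([[[-0.5, -0.5, 0.5, 1.0],
                     [ 0.5, -0.5, 0.5, 1.0],
                     [ 0.0,  0.5, 0.5, 1.0]]])
tri = torch.tensor([[0, 1, 2]], dtype=torch.int32)
findices, barycentric = cr.rasterize(pos, tri, (256, 256))
```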
- -import base64 -import io -import os - -import numpy as np -from PIL import Image as PILImage -from pygltflib import GLTF2 -from scipy.spatial.transform import Rotation as R - - -# Function to extract buffer data -def get_buffer_data(gltf, buffer_view): - buffer = gltf.buffers[buffer_view.buffer] - buffer_data = gltf.get_data_from_buffer_uri(buffer.uri) - byte_offset = buffer_view.byteOffset if buffer_view.byteOffset else 0 - byte_length = buffer_view.byteLength - return buffer_data[byte_offset:byte_offset + byte_length] - - -# Function to extract attribute data -def get_attribute_data(gltf, accessor_index): - accessor = gltf.accessors[accessor_index] - buffer_view = gltf.bufferViews[accessor.bufferView] - buffer_data = get_buffer_data(gltf, buffer_view) - - comptype = {5120: np.int8, 5121: np.uint8, 5122: np.int16, 5123: np.uint16, 5125: np.uint32, 5126: np.float32} - dtype = comptype[accessor.componentType] - - t2n = {'SCALAR': 1, 'VEC2': 2, 'VEC3': 3, 'VEC4': 4, 'MAT2': 4, 'MAT3': 9, 'MAT4': 16} - num_components = t2n[accessor.type] - - # Calculate the correct slice of data - byte_offset = accessor.byteOffset if accessor.byteOffset else 0 - byte_stride = buffer_view.byteStride if buffer_view.byteStride else num_components * np.dtype(dtype).itemsize - count = accessor.count - - # Extract the attribute data - attribute_data = np.zeros((count, num_components), dtype=dtype) - for i in range(count): - start = byte_offset + i * byte_stride - end = start + num_components * np.dtype(dtype).itemsize - attribute_data[i] = np.frombuffer(buffer_data[start:end], dtype=dtype) - - return attribute_data - - -# Function to extract image data -def get_image_data(gltf, image, folder): - if image.uri: - if image.uri.startswith('data:'): - # Data URI - header, encoded = image.uri.split(',', 1) - data = base64.b64decode(encoded) - else: - # External file - fn = image.uri - if not os.path.isabs(fn): - fn = folder + '/' + fn - with open(fn, 'rb') as f: - data = f.read() - else: - buffer_view = gltf.bufferViews[image.bufferView] - data = get_buffer_data(gltf, buffer_view) - return data - - -# Function to convert triangle strip to triangles -def convert_triangle_strip_to_triangles(indices): - triangles = [] - for i in range(len(indices) - 2): - if i % 2 == 0: - triangles.append([indices[i], indices[i + 1], indices[i + 2]]) - else: - triangles.append([indices[i], indices[i + 2], indices[i + 1]]) - return np.array(triangles).reshape(-1, 3) - - -# Function to convert triangle fan to triangles -def convert_triangle_fan_to_triangles(indices): - triangles = [] - for i in range(1, len(indices) - 1): - triangles.append([indices[0], indices[i], indices[i + 1]]) - return np.array(triangles).reshape(-1, 3) - - -# Function to get the transformation matrix from a node -def get_node_transform(node): - if node.matrix: - return np.array(node.matrix).reshape(4, 4).T - else: - T = np.eye(4) - if node.translation: - T[:3, 3] = node.translation - if node.rotation: - R_mat = R.from_quat(node.rotation).as_matrix() - T[:3, :3] = R_mat - if node.scale: - S = np.diag(node.scale + [1]) - T = T @ S - return T - - -def get_world_transform(gltf, node_index, parents, world_transforms): - if parents[node_index] == -2: - return world_transforms[node_index] - - node = gltf.nodes[node_index] - if parents[node_index] == -1: - world_transforms[node_index] = get_node_transform(node) - parents[node_index] = -2 - return world_transforms[node_index] - - parent_index = parents[node_index] - parent_transform = get_world_transform(gltf, parent_index, 
parents, world_transforms) - world_transforms[node_index] = parent_transform @ get_node_transform(node) - parents[node_index] = -2 - return world_transforms[node_index] - - -def LoadGlb(path): - # Load the GLB file using pygltflib - gltf = GLTF2().load(path) - - primitives = [] - images = {} - # Iterate through the meshes in the GLB file - - world_transforms = [np.identity(4) for i in range(len(gltf.nodes))] - parents = [-1 for i in range(len(gltf.nodes))] - for node_index, node in enumerate(gltf.nodes): - for idx in node.children: - parents[idx] = node_index - # for i in range(len(gltf.nodes)): - # get_world_transform(gltf, i, parents, world_transform) - - for node_index, node in enumerate(gltf.nodes): - if node.mesh is not None: - world_transform = get_world_transform(gltf, node_index, parents, world_transforms) - # Iterate through the primitives in the mesh - mesh = gltf.meshes[node.mesh] - for primitive in mesh.primitives: - # Access the attributes of the primitive - attributes = primitive.attributes.__dict__ - mode = primitive.mode if primitive.mode is not None else 4 # Default to TRIANGLES - result = {} - if primitive.indices is not None: - indices = get_attribute_data(gltf, primitive.indices) - if mode == 4: # TRIANGLES - face_indices = indices.reshape(-1, 3) - elif mode == 5: # TRIANGLE_STRIP - face_indices = convert_triangle_strip_to_triangles(indices) - elif mode == 6: # TRIANGLE_FAN - face_indices = convert_triangle_fan_to_triangles(indices) - else: - continue - result['F'] = face_indices - - # Extract vertex positions - if 'POSITION' in attributes and attributes['POSITION'] is not None: - positions = get_attribute_data(gltf, attributes['POSITION']) - # Apply the world transformation to the positions - positions_homogeneous = np.hstack([positions, np.ones((positions.shape[0], 1))]) - transformed_positions = (world_transform @ positions_homogeneous.T).T[:, :3] - result['V'] = transformed_positions - - # Extract vertex colors - if 'COLOR_0' in attributes and attributes['COLOR_0'] is not None: - colors = get_attribute_data(gltf, attributes['COLOR_0']) - if colors.shape[-1] > 3: - colors = colors[..., :3] - result['VC'] = colors - - # Extract UVs - if 'TEXCOORD_0' in attributes and not attributes['TEXCOORD_0'] is None: - uvs = get_attribute_data(gltf, attributes['TEXCOORD_0']) - result['UV'] = uvs - - if primitive.material is not None: - material = gltf.materials[primitive.material] - if material.pbrMetallicRoughness is not None and material.pbrMetallicRoughness.baseColorTexture is not None: - texture_index = material.pbrMetallicRoughness.baseColorTexture.index - texture = gltf.textures[texture_index] - image_index = texture.source - if not image_index in images: - image = gltf.images[image_index] - image_data = get_image_data(gltf, image, os.path.dirname(path)) - pil_image = PILImage.open(io.BytesIO(image_data)) - if pil_image.mode != 'RGB': - pil_image = pil_image.convert('RGB') - images[image_index] = pil_image - result['TEX'] = image_index - elif material.emissiveTexture is not None: - texture_index = material.emissiveTexture.index - texture = gltf.textures[texture_index] - image_index = texture.source - if not image_index in images: - image = gltf.images[image_index] - image_data = get_image_data(gltf, image, os.path.dirname(path)) - pil_image = PILImage.open(io.BytesIO(image_data)) - if pil_image.mode != 'RGB': - pil_image = pil_image.convert('RGB') - images[image_index] = pil_image - result['TEX'] = image_index - else: - if material.pbrMetallicRoughness is not None: - 
base_color = material.pbrMetallicRoughness.baseColorFactor - else: - base_color = np.array([0.8, 0.8, 0.8], dtype=np.float32) - result['MC'] = base_color - - primitives.append(result) - - return primitives, images - - -def RotatePrimitives(primitives, transform): - for i in range(len(primitives)): - if 'V' in primitives[i]: - primitives[i]['V'] = primitives[i]['V'] @ transform.T - - -if __name__ == '__main__': - path = 'data/test.glb' - LoadGlb(path) diff --git a/hy3dgen/texgen/custom_rasterizer/custom_rasterizer/io_obj.py b/hy3dgen/texgen/custom_rasterizer/custom_rasterizer/io_obj.py deleted file mode 100644 index a72c478d8efcb9a3d71a67ce5f167559ef76b922..0000000000000000000000000000000000000000 --- a/hy3dgen/texgen/custom_rasterizer/custom_rasterizer/io_obj.py +++ /dev/null @@ -1,76 +0,0 @@ -# Open Source Model Licensed under the Apache License Version 2.0 -# and Other Licenses of the Third-Party Components therein: -# The below Model in this distribution may have been modified by THL A29 Limited -# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. - -# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. -# The below software and/or models in this distribution may have been -# modified by THL A29 Limited ("Tencent Modifications"). -# All Tencent Modifications are Copyright (C) THL A29 Limited. - -# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT -# except for the third-party components listed below. -# Hunyuan 3D does not impose any additional limitations beyond what is outlined -# in the repsective licenses of these third-party components. -# Users must comply with all terms and conditions of original licenses of these third-party -# components and must ensure that the usage of the third party components adheres to -# all relevant laws and regulations. - -# For avoidance of doubts, Hunyuan 3D means the large language models and -# their software and algorithms, including trained model weights, parameters (including -# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, -# fine-tuning enabling code and other elements of the foregoing made publicly available -# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
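The strip and fan handling in the loader above is the usual winding-preserving expansion: even-indexed strip triangles keep vertex order, odd-indexed ones swap two vertices. A self-contained check of that rule, for illustration only:

```python
import numpy as np

# Mirrors convert_triangle_strip_to_triangles above.
strip = [0, 1, 2, 3, 4]
tris = [[strip[i], strip[i + 1], strip[i + 2]] if i % 2 == 0
        else [strip[i], strip[i + 2], strip[i + 1]]
        for i in range(len(strip) - 2)]
print(np.array(tris))  # [[0 1 2] [1 3 2] [2 3 4]]
```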
- -import cv2 -import numpy as np - - -def LoadObj(fn): - lines = [l.strip() for l in open(fn)] - vertices = [] - faces = [] - for l in lines: - words = [w for w in l.split(' ') if w != ''] - if len(words) == 0: - continue - if words[0] == 'v': - v = [float(words[i]) for i in range(1, 4)] - vertices.append(v) - elif words[0] == 'f': - f = [int(words[i]) - 1 for i in range(1, 4)] - faces.append(f) - - return np.array(vertices).astype('float32'), np.array(faces).astype('int32') - - -def LoadObjWithTexture(fn, tex_fn): - lines = [l.strip() for l in open(fn)] - vertices = [] - vertex_textures = [] - faces = [] - face_textures = [] - for l in lines: - words = [w for w in l.split(' ') if w != ''] - if len(words) == 0: - continue - if words[0] == 'v': - v = [float(words[i]) for i in range(1, len(words))] - vertices.append(v) - elif words[0] == 'vt': - v = [float(words[i]) for i in range(1, len(words))] - vertex_textures.append(v) - elif words[0] == 'f': - f = [] - ft = [] - for i in range(1, len(words)): - t = words[i].split('/') - f.append(int(t[0]) - 1) - ft.append(int(t[1]) - 1) - for i in range(2, len(f)): - faces.append([f[0], f[i - 1], f[i]]) - face_textures.append([ft[0], ft[i - 1], ft[i]]) - - tex_image = cv2.cvtColor(cv2.imread(tex_fn), cv2.COLOR_BGR2RGB) - return np.array(vertices).astype('float32'), np.array(vertex_textures).astype('float32'), np.array(faces).astype( - 'int32'), np.array(face_textures).astype('int32'), tex_image diff --git a/hy3dgen/texgen/custom_rasterizer/custom_rasterizer/render.py b/hy3dgen/texgen/custom_rasterizer/custom_rasterizer/render.py deleted file mode 100644 index 743d4aac4da9e1e18374ce712ac24d19e6788870..0000000000000000000000000000000000000000 --- a/hy3dgen/texgen/custom_rasterizer/custom_rasterizer/render.py +++ /dev/null @@ -1,41 +0,0 @@ -# Open Source Model Licensed under the Apache License Version 2.0 -# and Other Licenses of the Third-Party Components therein: -# The below Model in this distribution may have been modified by THL A29 Limited -# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. - -# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. -# The below software and/or models in this distribution may have been -# modified by THL A29 Limited ("Tencent Modifications"). -# All Tencent Modifications are Copyright (C) THL A29 Limited. - -# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT -# except for the third-party components listed below. -# Hunyuan 3D does not impose any additional limitations beyond what is outlined -# in the repsective licenses of these third-party components. -# Users must comply with all terms and conditions of original licenses of these third-party -# components and must ensure that the usage of the third party components adheres to -# all relevant laws and regulations. - -# For avoidance of doubts, Hunyuan 3D means the large language models and -# their software and algorithms, including trained model weights, parameters (including -# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, -# fine-tuning enabling code and other elements of the foregoing made publicly available -# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
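`LoadObjWithTexture` above fan-triangulates each face record after shifting OBJ's 1-based indices down to 0-based. A tiny standalone restatement of that triangulation:

```python
# An OBJ face "f 1/1 2/2 3/3 4/4" becomes triangles (0,1,2) and (0,2,3),
# exactly as the loop over range(2, len(f)) above produces.
face = [0, 1, 2, 3]  # 0-based vertex indices of a quad
triangles = [[face[0], face[i - 1], face[i]] for i in range(2, len(face))]
assert triangles == [[0, 1, 2], [0, 2, 3]]
```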
- -import custom_rasterizer_kernel -import torch - - -def rasterize(pos, tri, resolution, clamp_depth=torch.zeros(0), use_depth_prior=0): - assert (pos.device == tri.device) - findices, barycentric = custom_rasterizer_kernel.rasterize_image(pos[0], tri, clamp_depth, resolution[1], - resolution[0], 1e-6, use_depth_prior) - return findices, barycentric - - -def interpolate(col, findices, barycentric, tri): - f = findices - 1 + (findices == 0) - vcol = col[0, tri.long()[f.long()]] - result = barycentric.view(*barycentric.shape, 1) * vcol - result = torch.sum(result, axis=-2) - return result.view(1, *result.shape) diff --git a/hy3dgen/texgen/custom_rasterizer/lib/custom_rasterizer_kernel/__init__.py b/hy3dgen/texgen/custom_rasterizer/lib/custom_rasterizer_kernel/__init__.py deleted file mode 100644 index e307c3f8c1292da02f308e4b59ef0bcd6fe7305e..0000000000000000000000000000000000000000 --- a/hy3dgen/texgen/custom_rasterizer/lib/custom_rasterizer_kernel/__init__.py +++ /dev/null @@ -1,23 +0,0 @@ -# Open Source Model Licensed under the Apache License Version 2.0 -# and Other Licenses of the Third-Party Components therein: -# The below Model in this distribution may have been modified by THL A29 Limited -# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. - -# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. -# The below software and/or models in this distribution may have been -# modified by THL A29 Limited ("Tencent Modifications"). -# All Tencent Modifications are Copyright (C) THL A29 Limited. - -# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT -# except for the third-party components listed below. -# Hunyuan 3D does not impose any additional limitations beyond what is outlined -# in the repsective licenses of these third-party components. -# Users must comply with all terms and conditions of original licenses of these third-party -# components and must ensure that the usage of the third party components adheres to -# all relevant laws and regulations. - -# For avoidance of doubts, Hunyuan 3D means the large language models and -# their software and algorithms, including trained model weights, parameters (including -# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, -# fine-tuning enabling code and other elements of the foregoing made publicly available -# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
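To make the indexing in `interpolate` above concrete: `findices` is 1-based, with 0 marking background pixels (remapped to face 0 so the gather stays in bounds), and vertex attributes are blended with the barycentric weights. A one-pixel worked example using the same tensor operations:

```python
import torch

col = torch.tensor([[[1.0, 0.0], [0.0, 1.0], [0.5, 0.5]]])  # (1, 3 verts, 2 attrs)
tri = torch.tensor([[0, 1, 2]])
findices = torch.tensor([[1]])               # 1-based face index; 0 = background
bary = torch.tensor([[[0.2, 0.3, 0.5]]])     # (H=1, W=1, 3) barycentric weights

f = findices - 1 + (findices == 0)           # map background 0 -> face 0
vcol = col[0, tri.long()[f.long()]]          # (1, 1, 3, 2) per-vertex attrs
out = (bary.unsqueeze(-1) * vcol).sum(dim=-2)  # 0.2*v0 + 0.3*v1 + 0.5*v2
print(out)  # tensor([[[0.4500, 0.5500]]])
```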
diff --git a/hy3dgen/texgen/custom_rasterizer/lib/custom_rasterizer_kernel/grid_neighbor.cpp b/hy3dgen/texgen/custom_rasterizer/lib/custom_rasterizer_kernel/grid_neighbor.cpp deleted file mode 100644 index dab3983eef9cae227710bcdc4d86fc2e50b4e6be..0000000000000000000000000000000000000000 --- a/hy3dgen/texgen/custom_rasterizer/lib/custom_rasterizer_kernel/grid_neighbor.cpp +++ /dev/null @@ -1,575 +0,0 @@ -#include "rasterizer.h" -#include - -inline int pos2key(float* p, int resolution) { - int x = (p[0] * 0.5 + 0.5) * resolution; - int y = (p[1] * 0.5 + 0.5) * resolution; - int z = (p[2] * 0.5 + 0.5) * resolution; - return (x * resolution + y) * resolution + z; -} - -inline void key2pos(int key, int resolution, float* p) { - int x = key / resolution / resolution; - int y = key / resolution % resolution; - int z = key % resolution; - p[0] = ((x + 0.5) / resolution - 0.5) * 2; - p[1] = ((y + 0.5) / resolution - 0.5) * 2; - p[2] = ((z + 0.5) / resolution - 0.5) * 2; -} - -inline void key2cornerpos(int key, int resolution, float* p) { - int x = key / resolution / resolution; - int y = key / resolution % resolution; - int z = key % resolution; - p[0] = ((x + 0.75) / resolution - 0.5) * 2; - p[1] = ((y + 0.25) / resolution - 0.5) * 2; - p[2] = ((z + 0.75) / resolution - 0.5) * 2; -} - -inline float* pos_ptr(int l, int i, int j, torch::Tensor t) { - float* pdata = t.data_ptr(); - int height = t.size(1); - int width = t.size(2); - return &pdata[((l * height + i) * width + j) * 4]; -} - -struct Grid -{ - std::vector seq2oddcorner; - std::vector seq2evencorner; - std::vector seq2grid; - std::vector seq2normal; - std::vector seq2neighbor; - std::unordered_map grid2seq; - std::vector downsample_seq; - int num_origin_seq; - int resolution; - int stride; -}; - -inline void pos_from_seq(Grid& grid, int seq, float* p) { - auto k = grid.seq2grid[seq]; - key2pos(k, grid.resolution, p); -} - -inline int fetch_seq(Grid& grid, int l, int i, int j, torch::Tensor pdata) { - float* p = pos_ptr(l, i, j, pdata); - if (p[3] == 0) - return -1; - auto key = pos2key(p, grid.resolution); - int seq = grid.grid2seq[key]; - return seq; -} - -inline int fetch_last_seq(Grid& grid, int i, int j, torch::Tensor pdata) { - int num_layers = pdata.size(0); - int l = 0; - int idx = fetch_seq(grid, l, i, j, pdata); - while (l < num_layers - 1) { - l += 1; - int new_idx = fetch_seq(grid, l, i, j, pdata); - if (new_idx == -1) - break; - idx = new_idx; - } - return idx; -} - -inline int fetch_nearest_seq(Grid& grid, int i, int j, int dim, float d, torch::Tensor pdata) { - float p[3]; - float max_dist = 1e10; - int best_idx = -1; - int num_layers = pdata.size(0); - for (int l = 0; l < num_layers; ++l) { - int idx = fetch_seq(grid, l, i, j, pdata); - if (idx == -1) - break; - pos_from_seq(grid, idx, p); - float dist = std::abs(d - p[(dim + 2) % 3]); - if (dist < max_dist) { - max_dist = dist; - best_idx = idx; - } - } - return best_idx; -} - -inline int fetch_nearest_seq_layer(Grid& grid, int i, int j, int dim, float d, torch::Tensor pdata) { - float p[3]; - float max_dist = 1e10; - int best_layer = -1; - int num_layers = pdata.size(0); - for (int l = 0; l < num_layers; ++l) { - int idx = fetch_seq(grid, l, i, j, pdata); - if (idx == -1) - break; - pos_from_seq(grid, idx, p); - float dist = std::abs(d - p[(dim + 2) % 3]); - if (dist < max_dist) { - max_dist = dist; - best_layer = l; - } - } - return best_layer; -} - -void FetchNeighbor(Grid& grid, int seq, float* pos, int dim, int boundary_info, std::vector& view_layer_positions, - int* 
output_indices) -{ - auto t = view_layer_positions[dim]; - int height = t.size(1); - int width = t.size(2); - int top = 0; - int ci = 0; - int cj = 0; - if (dim == 0) { - ci = (pos[1]/2+0.5)*height; - cj = (pos[0]/2+0.5)*width; - } - else if (dim == 1) { - ci = (pos[1]/2+0.5)*height; - cj = (pos[2]/2+0.5)*width; - } - else { - ci = (-pos[2]/2+0.5)*height; - cj = (pos[0]/2+0.5)*width; - } - int stride = grid.stride; - for (int ni = ci + stride; ni >= ci - stride; ni -= stride) { - for (int nj = cj - stride; nj <= cj + stride; nj += stride) { - int idx = -1; - if (ni == ci && nj == cj) - idx = seq; - else if (!(ni < 0 || ni >= height || nj < 0 || nj >= width)) { - if (boundary_info == -1) - idx = fetch_seq(grid, 0, ni, nj, t); - else if (boundary_info == 1) - idx = fetch_last_seq(grid, ni, nj, t); - else - idx = fetch_nearest_seq(grid, ni, nj, dim, pos[(dim + 2) % 3], t); - } - output_indices[top] = idx; - top += 1; - } - } -} - -void DownsampleGrid(Grid& src, Grid& tar) -{ - src.downsample_seq.resize(src.seq2grid.size(), -1); - tar.resolution = src.resolution / 2; - tar.stride = src.stride * 2; - float pos[3]; - std::vector seq2normal_count; - for (int i = 0; i < src.seq2grid.size(); ++i) { - key2pos(src.seq2grid[i], src.resolution, pos); - int k = pos2key(pos, tar.resolution); - int s = seq2normal_count.size(); - if (!tar.grid2seq.count(k)) { - tar.grid2seq[k] = tar.seq2grid.size(); - tar.seq2grid.emplace_back(k); - seq2normal_count.emplace_back(0); - seq2normal_count.emplace_back(0); - seq2normal_count.emplace_back(0); - //tar.seq2normal.emplace_back(src.seq2normal[i]); - } else { - s = tar.grid2seq[k] * 3; - } - seq2normal_count[s + src.seq2normal[i]] += 1; - src.downsample_seq[i] = tar.grid2seq[k]; - } - tar.seq2normal.resize(seq2normal_count.size() / 3); - for (int i = 0; i < seq2normal_count.size(); i += 3) { - int t = 0; - for (int j = 1; j < 3; ++j) { - if (seq2normal_count[i + j] > seq2normal_count[i + t]) - t = j; - } - tar.seq2normal[i / 3] = t; - } -} - -void NeighborGrid(Grid& grid, std::vector view_layer_positions, int v) -{ - grid.seq2evencorner.resize(grid.seq2grid.size(), 0); - grid.seq2oddcorner.resize(grid.seq2grid.size(), 0); - std::unordered_set visited_seq; - for (int vd = 0; vd < 3; ++vd) { - auto t = view_layer_positions[vd]; - auto t0 = view_layer_positions[v]; - int height = t.size(1); - int width = t.size(2); - int num_layers = t.size(0); - int num_view_layers = t0.size(0); - for (int i = 0; i < height; ++i) { - for (int j = 0; j < width; ++j) { - for (int l = 0; l < num_layers; ++l) { - int seq = fetch_seq(grid, l, i, j, t); - if (seq == -1) - break; - int dim = grid.seq2normal[seq]; - if (dim != v) - continue; - - float pos[3]; - pos_from_seq(grid, seq, pos); - - int ci = 0; - int cj = 0; - if (dim == 0) { - ci = (pos[1]/2+0.5)*height; - cj = (pos[0]/2+0.5)*width; - } - else if (dim == 1) { - ci = (pos[1]/2+0.5)*height; - cj = (pos[2]/2+0.5)*width; - } - else { - ci = (-pos[2]/2+0.5)*height; - cj = (pos[0]/2+0.5)*width; - } - - if ((ci % (grid.stride * 2) < grid.stride) && (cj % (grid.stride * 2) >= grid.stride)) - grid.seq2evencorner[seq] = 1; - - if ((ci % (grid.stride * 2) >= grid.stride) && (cj % (grid.stride * 2) < grid.stride)) - grid.seq2oddcorner[seq] = 1; - - bool is_boundary = false; - if (vd == v) { - if (l == 0 || l == num_layers - 1) - is_boundary = true; - else { - int seq_new = fetch_seq(grid, l + 1, i, j, t); - if (seq_new == -1) - is_boundary = true; - } - } - int boundary_info = 0; - if (is_boundary && (l == 0)) - boundary_info = -1; - else if 
(is_boundary) - boundary_info = 1; - if (visited_seq.count(seq)) - continue; - visited_seq.insert(seq); - - FetchNeighbor(grid, seq, pos, dim, boundary_info, view_layer_positions, &grid.seq2neighbor[seq * 9]); - } - } - } - } -} - -void PadGrid(Grid& src, Grid& tar, std::vector& view_layer_positions) { - auto& downsample_seq = src.downsample_seq; - auto& seq2evencorner = src.seq2evencorner; - auto& seq2oddcorner = src.seq2oddcorner; - int indices[9]; - std::vector mapped_even_corners(tar.seq2grid.size(), 0); - std::vector mapped_odd_corners(tar.seq2grid.size(), 0); - for (int i = 0; i < downsample_seq.size(); ++i) { - if (seq2evencorner[i] > 0) { - mapped_even_corners[downsample_seq[i]] = 1; - } - if (seq2oddcorner[i] > 0) { - mapped_odd_corners[downsample_seq[i]] = 1; - } - } - auto& tar_seq2normal = tar.seq2normal; - auto& tar_seq2grid = tar.seq2grid; - for (int i = 0; i < tar_seq2grid.size(); ++i) { - if (mapped_even_corners[i] == 1 && mapped_odd_corners[i] == 1) - continue; - auto k = tar_seq2grid[i]; - float p[3]; - key2cornerpos(k, tar.resolution, p); - - int src_key = pos2key(p, src.resolution); - if (!src.grid2seq.count(src_key)) { - int seq = src.seq2grid.size(); - src.grid2seq[src_key] = seq; - src.seq2evencorner.emplace_back((mapped_even_corners[i] == 0)); - src.seq2oddcorner.emplace_back((mapped_odd_corners[i] == 0)); - src.seq2grid.emplace_back(src_key); - src.seq2normal.emplace_back(tar_seq2normal[i]); - FetchNeighbor(src, seq, p, tar_seq2normal[i], 0, view_layer_positions, indices); - for (int j = 0; j < 9; ++j) { - src.seq2neighbor.emplace_back(indices[j]); - } - src.downsample_seq.emplace_back(i); - } else { - int seq = src.grid2seq[src_key]; - if (mapped_even_corners[i] == 0) - src.seq2evencorner[seq] = 1; - if (mapped_odd_corners[i] == 0) - src.seq2oddcorner[seq] = 1; - } - } -} - -std::vector> build_hierarchy(std::vector view_layer_positions, - std::vector view_layer_normals, int num_level, int resolution) -{ - if (view_layer_positions.size() != 3 || num_level < 1) { - printf("Alert! We require 3 layers and at least 1 level! 
(%d %d)\n", view_layer_positions.size(), num_level); - return {{},{},{},{}}; - } - - std::vector grids; - grids.resize(num_level); - - std::vector seq2pos; - auto& seq2grid = grids[0].seq2grid; - auto& seq2normal = grids[0].seq2normal; - auto& grid2seq = grids[0].grid2seq; - grids[0].resolution = resolution; - grids[0].stride = 1; - - auto int64_options = torch::TensorOptions().dtype(torch::kInt64).requires_grad(false); - auto float_options = torch::TensorOptions().dtype(torch::kFloat32).requires_grad(false); - - for (int v = 0; v < 3; ++v) { - int num_layers = view_layer_positions[v].size(0); - int height = view_layer_positions[v].size(1); - int width = view_layer_positions[v].size(2); - float* data = view_layer_positions[v].data_ptr(); - float* data_normal = view_layer_normals[v].data_ptr(); - for (int l = 0; l < num_layers; ++l) { - for (int i = 0; i < height; ++i) { - for (int j = 0; j < width; ++j) { - float* p = &data[(i * width + j) * 4]; - float* n = &data_normal[(i * width + j) * 3]; - if (p[3] == 0) - continue; - auto k = pos2key(p, resolution); - if (!grid2seq.count(k)) { - int dim = 0; - for (int d = 0; d < 3; ++d) { - if (std::abs(n[d]) > std::abs(n[dim])) - dim = d; - } - dim = (dim + 1) % 3; - grid2seq[k] = seq2grid.size(); - seq2grid.emplace_back(k); - seq2pos.push_back(p[0]); - seq2pos.push_back(p[1]); - seq2pos.push_back(p[2]); - seq2normal.emplace_back(dim); - } - } - } - data += (height * width * 4); - data_normal += (height * width * 3); - } - } - - for (int i = 0; i < num_level - 1; ++i) { - DownsampleGrid(grids[i], grids[i + 1]); - } - - for (int l = 0; l < num_level; ++l) { - grids[l].seq2neighbor.resize(grids[l].seq2grid.size() * 9, -1); - grids[l].num_origin_seq = grids[l].seq2grid.size(); - for (int d = 0; d < 3; ++d) { - NeighborGrid(grids[l], view_layer_positions, d); - } - } - - for (int i = num_level - 2; i >= 0; --i) { - PadGrid(grids[i], grids[i + 1], view_layer_positions); - } - for (int i = grids[0].num_origin_seq; i < grids[0].seq2grid.size(); ++i) { - int k = grids[0].seq2grid[i]; - float p[3]; - key2pos(k, grids[0].resolution, p); - seq2pos.push_back(p[0]); - seq2pos.push_back(p[1]); - seq2pos.push_back(p[2]); - } - - std::vector texture_positions(2); - std::vector grid_neighbors(grids.size()); - std::vector grid_downsamples(grids.size() - 1); - std::vector grid_evencorners(grids.size()); - std::vector grid_oddcorners(grids.size()); - - - texture_positions[0] = torch::zeros({static_cast(seq2pos.size() / 3), static_cast(3)}, float_options); - texture_positions[1] = torch::zeros({static_cast(seq2pos.size() / 3)}, float_options); - float* positions_out_ptr = texture_positions[0].data_ptr(); - memcpy(positions_out_ptr, seq2pos.data(), sizeof(float) * seq2pos.size()); - positions_out_ptr = texture_positions[1].data_ptr(); - for (int i = 0; i < grids[0].seq2grid.size(); ++i) { - positions_out_ptr[i] = (i < grids[0].num_origin_seq); - } - - for (int i = 0; i < grids.size(); ++i) { - grid_neighbors[i] = torch::zeros({static_cast(grids[i].seq2grid.size()), static_cast(9)}, int64_options); - int64_t* nptr = grid_neighbors[i].data_ptr(); - for (int j = 0; j < grids[i].seq2neighbor.size(); ++j) { - nptr[j] = grids[i].seq2neighbor[j]; - } - - grid_evencorners[i] = torch::zeros({static_cast(grids[i].seq2evencorner.size())}, int64_options); - grid_oddcorners[i] = torch::zeros({static_cast(grids[i].seq2oddcorner.size())}, int64_options); - int64_t* dptr = grid_evencorners[i].data_ptr(); - for (int j = 0; j < grids[i].seq2evencorner.size(); ++j) { - dptr[j] = 
grids[i].seq2evencorner[j]; - } - dptr = grid_oddcorners[i].data_ptr(); - for (int j = 0; j < grids[i].seq2oddcorner.size(); ++j) { - dptr[j] = grids[i].seq2oddcorner[j]; - } - if (i + 1 < grids.size()) { - grid_downsamples[i] = torch::zeros({static_cast(grids[i].downsample_seq.size())}, int64_options); - int64_t* dptr = grid_downsamples[i].data_ptr(); - for (int j = 0; j < grids[i].downsample_seq.size(); ++j) { - dptr[j] = grids[i].downsample_seq[j]; - } - } - - } - return {texture_positions, grid_neighbors, grid_downsamples, grid_evencorners, grid_oddcorners}; -} - -std::vector> build_hierarchy_with_feat( - std::vector view_layer_positions, - std::vector view_layer_normals, - std::vector view_layer_feats, - int num_level, int resolution) -{ - if (view_layer_positions.size() != 3 || num_level < 1) { - printf("Alert! We require 3 layers and at least 1 level! (%d %d)\n", view_layer_positions.size(), num_level); - return {{},{},{},{}}; - } - - std::vector grids; - grids.resize(num_level); - - std::vector seq2pos; - std::vector seq2feat; - auto& seq2grid = grids[0].seq2grid; - auto& seq2normal = grids[0].seq2normal; - auto& grid2seq = grids[0].grid2seq; - grids[0].resolution = resolution; - grids[0].stride = 1; - - auto int64_options = torch::TensorOptions().dtype(torch::kInt64).requires_grad(false); - auto float_options = torch::TensorOptions().dtype(torch::kFloat32).requires_grad(false); - - int feat_channel = 3; - for (int v = 0; v < 3; ++v) { - int num_layers = view_layer_positions[v].size(0); - int height = view_layer_positions[v].size(1); - int width = view_layer_positions[v].size(2); - float* data = view_layer_positions[v].data_ptr(); - float* data_normal = view_layer_normals[v].data_ptr(); - float* data_feat = view_layer_feats[v].data_ptr(); - feat_channel = view_layer_feats[v].size(3); - for (int l = 0; l < num_layers; ++l) { - for (int i = 0; i < height; ++i) { - for (int j = 0; j < width; ++j) { - float* p = &data[(i * width + j) * 4]; - float* n = &data_normal[(i * width + j) * 3]; - float* f = &data_feat[(i * width + j) * feat_channel]; - if (p[3] == 0) - continue; - auto k = pos2key(p, resolution); - if (!grid2seq.count(k)) { - int dim = 0; - for (int d = 0; d < 3; ++d) { - if (std::abs(n[d]) > std::abs(n[dim])) - dim = d; - } - dim = (dim + 1) % 3; - grid2seq[k] = seq2grid.size(); - seq2grid.emplace_back(k); - seq2pos.push_back(p[0]); - seq2pos.push_back(p[1]); - seq2pos.push_back(p[2]); - for (int c = 0; c < feat_channel; ++c) { - seq2feat.emplace_back(f[c]); - } - seq2normal.emplace_back(dim); - } - } - } - data += (height * width * 4); - data_normal += (height * width * 3); - data_feat += (height * width * feat_channel); - } - } - - for (int i = 0; i < num_level - 1; ++i) { - DownsampleGrid(grids[i], grids[i + 1]); - } - - for (int l = 0; l < num_level; ++l) { - grids[l].seq2neighbor.resize(grids[l].seq2grid.size() * 9, -1); - grids[l].num_origin_seq = grids[l].seq2grid.size(); - for (int d = 0; d < 3; ++d) { - NeighborGrid(grids[l], view_layer_positions, d); - } - } - - for (int i = num_level - 2; i >= 0; --i) { - PadGrid(grids[i], grids[i + 1], view_layer_positions); - } - for (int i = grids[0].num_origin_seq; i < grids[0].seq2grid.size(); ++i) { - int k = grids[0].seq2grid[i]; - float p[3]; - key2pos(k, grids[0].resolution, p); - seq2pos.push_back(p[0]); - seq2pos.push_back(p[1]); - seq2pos.push_back(p[2]); - for (int c = 0; c < feat_channel; ++c) { - seq2feat.emplace_back(0.5); - } - } - - std::vector texture_positions(2); - std::vector texture_feats(1); - std::vector 
grid_neighbors(grids.size()); - std::vector grid_downsamples(grids.size() - 1); - std::vector grid_evencorners(grids.size()); - std::vector grid_oddcorners(grids.size()); - - texture_positions[0] = torch::zeros({static_cast(seq2pos.size() / 3), static_cast(3)}, float_options); - texture_positions[1] = torch::zeros({static_cast(seq2pos.size() / 3)}, float_options); - texture_feats[0] = torch::zeros({static_cast(seq2feat.size() / feat_channel), static_cast(feat_channel)}, float_options); - float* positions_out_ptr = texture_positions[0].data_ptr(); - memcpy(positions_out_ptr, seq2pos.data(), sizeof(float) * seq2pos.size()); - positions_out_ptr = texture_positions[1].data_ptr(); - for (int i = 0; i < grids[0].seq2grid.size(); ++i) { - positions_out_ptr[i] = (i < grids[0].num_origin_seq); - } - float* feats_out_ptr = texture_feats[0].data_ptr(); - memcpy(feats_out_ptr, seq2feat.data(), sizeof(float) * seq2feat.size()); - - for (int i = 0; i < grids.size(); ++i) { - grid_neighbors[i] = torch::zeros({static_cast(grids[i].seq2grid.size()), static_cast(9)}, int64_options); - int64_t* nptr = grid_neighbors[i].data_ptr(); - for (int j = 0; j < grids[i].seq2neighbor.size(); ++j) { - nptr[j] = grids[i].seq2neighbor[j]; - } - grid_evencorners[i] = torch::zeros({static_cast(grids[i].seq2evencorner.size())}, int64_options); - grid_oddcorners[i] = torch::zeros({static_cast(grids[i].seq2oddcorner.size())}, int64_options); - int64_t* dptr = grid_evencorners[i].data_ptr(); - for (int j = 0; j < grids[i].seq2evencorner.size(); ++j) { - dptr[j] = grids[i].seq2evencorner[j]; - } - dptr = grid_oddcorners[i].data_ptr(); - for (int j = 0; j < grids[i].seq2oddcorner.size(); ++j) { - dptr[j] = grids[i].seq2oddcorner[j]; - } - if (i + 1 < grids.size()) { - grid_downsamples[i] = torch::zeros({static_cast(grids[i].downsample_seq.size())}, int64_options); - int64_t* dptr = grid_downsamples[i].data_ptr(); - for (int j = 0; j < grids[i].downsample_seq.size(); ++j) { - dptr[j] = grids[i].downsample_seq[j]; - } - } - } - return {texture_positions, texture_feats, grid_neighbors, grid_downsamples, grid_evencorners, grid_oddcorners}; -} diff --git a/hy3dgen/texgen/custom_rasterizer/lib/custom_rasterizer_kernel/rasterizer.cpp b/hy3dgen/texgen/custom_rasterizer/lib/custom_rasterizer_kernel/rasterizer.cpp deleted file mode 100644 index 4529d7eb674d5263f5103f7a2c2aa5085ee752d5..0000000000000000000000000000000000000000 --- a/hy3dgen/texgen/custom_rasterizer/lib/custom_rasterizer_kernel/rasterizer.cpp +++ /dev/null @@ -1,139 +0,0 @@ -#include "rasterizer.h" - -void rasterizeTriangleCPU(int idx, float* vt0, float* vt1, float* vt2, int width, int height, INT64* zbuffer, float* d, float occlusion_truncation) { - float x_min = std::min(vt0[0], std::min(vt1[0],vt2[0])); - float x_max = std::max(vt0[0], std::max(vt1[0],vt2[0])); - float y_min = std::min(vt0[1], std::min(vt1[1],vt2[1])); - float y_max = std::max(vt0[1], std::max(vt1[1],vt2[1])); - - for (int px = x_min; px < x_max + 1; ++px) { - if (px < 0 || px >= width) - continue; - for (int py = y_min; py < y_max + 1; ++py) { - if (py < 0 || py >= height) - continue; - float vt[2] = {px + 0.5f, py + 0.5f}; - float baryCentricCoordinate[3]; - calculateBarycentricCoordinate(vt0, vt1, vt2, vt, baryCentricCoordinate); - if (isBarycentricCoordInBounds(baryCentricCoordinate)) { - int pixel = py * width + px; - if (zbuffer == 0) { - zbuffer[pixel] = (INT64)(idx + 1); - continue; - } - - float depth = baryCentricCoordinate[0] * vt0[2] + baryCentricCoordinate[1] * vt1[2] + 
baryCentricCoordinate[2] * vt2[2]; - float depth_thres = 0; - if (d) { - depth_thres = d[pixel] * 0.49999f + 0.5f + occlusion_truncation; - } - - int z_quantize = depth * (2<<17); - INT64 token = (INT64)z_quantize * MAXINT + (INT64)(idx + 1); - if (depth < depth_thres) - continue; - zbuffer[pixel] = std::min(zbuffer[pixel], token); - } - } - } -} - -void barycentricFromImgcoordCPU(float* V, int* F, int* findices, INT64* zbuffer, int width, int height, int num_vertices, int num_faces, - float* barycentric_map, int pix) -{ - INT64 f = zbuffer[pix] % MAXINT; - if (f == (MAXINT-1)) { - findices[pix] = 0; - barycentric_map[pix * 3] = 0; - barycentric_map[pix * 3 + 1] = 0; - barycentric_map[pix * 3 + 2] = 0; - return; - } - findices[pix] = f; - f -= 1; - float barycentric[3] = {0, 0, 0}; - if (f >= 0) { - float vt[2] = {float(pix % width) + 0.5f, float(pix / width) + 0.5f}; - float* vt0_ptr = V + (F[f * 3] * 4); - float* vt1_ptr = V + (F[f * 3 + 1] * 4); - float* vt2_ptr = V + (F[f * 3 + 2] * 4); - - float vt0[2] = {(vt0_ptr[0] / vt0_ptr[3] * 0.5f + 0.5f) * (width - 1) + 0.5f, (0.5f + 0.5f * vt0_ptr[1] / vt0_ptr[3]) * (height - 1) + 0.5f}; - float vt1[2] = {(vt1_ptr[0] / vt1_ptr[3] * 0.5f + 0.5f) * (width - 1) + 0.5f, (0.5f + 0.5f * vt1_ptr[1] / vt1_ptr[3]) * (height - 1) + 0.5f}; - float vt2[2] = {(vt2_ptr[0] / vt2_ptr[3] * 0.5f + 0.5f) * (width - 1) + 0.5f, (0.5f + 0.5f * vt2_ptr[1] / vt2_ptr[3]) * (height - 1) + 0.5f}; - - calculateBarycentricCoordinate(vt0, vt1, vt2, vt, barycentric); - - barycentric[0] = barycentric[0] / vt0_ptr[3]; - barycentric[1] = barycentric[1] / vt1_ptr[3]; - barycentric[2] = barycentric[2] / vt2_ptr[3]; - float w = 1.0f / (barycentric[0] + barycentric[1] + barycentric[2]); - barycentric[0] *= w; - barycentric[1] *= w; - barycentric[2] *= w; - - } - barycentric_map[pix * 3] = barycentric[0]; - barycentric_map[pix * 3 + 1] = barycentric[1]; - barycentric_map[pix * 3 + 2] = barycentric[2]; -} - -void rasterizeImagecoordsKernelCPU(float* V, int* F, float* d, INT64* zbuffer, float occlusion_trunc, int width, int height, int num_vertices, int num_faces, int f) -{ - float* vt0_ptr = V + (F[f * 3] * 4); - float* vt1_ptr = V + (F[f * 3 + 1] * 4); - float* vt2_ptr = V + (F[f * 3 + 2] * 4); - - float vt0[3] = {(vt0_ptr[0] / vt0_ptr[3] * 0.5f + 0.5f) * (width - 1) + 0.5f, (0.5f + 0.5f * vt0_ptr[1] / vt0_ptr[3]) * (height - 1) + 0.5f, vt0_ptr[2] / vt0_ptr[3] * 0.49999f + 0.5f}; - float vt1[3] = {(vt1_ptr[0] / vt1_ptr[3] * 0.5f + 0.5f) * (width - 1) + 0.5f, (0.5f + 0.5f * vt1_ptr[1] / vt1_ptr[3]) * (height - 1) + 0.5f, vt1_ptr[2] / vt1_ptr[3] * 0.49999f + 0.5f}; - float vt2[3] = {(vt2_ptr[0] / vt2_ptr[3] * 0.5f + 0.5f) * (width - 1) + 0.5f, (0.5f + 0.5f * vt2_ptr[1] / vt2_ptr[3]) * (height - 1) + 0.5f, vt2_ptr[2] / vt2_ptr[3] * 0.49999f + 0.5f}; - - rasterizeTriangleCPU(f, vt0, vt1, vt2, width, height, zbuffer, d, occlusion_trunc); -} - -std::vector rasterize_image_cpu(torch::Tensor V, torch::Tensor F, torch::Tensor D, - int width, int height, float occlusion_truncation, int use_depth_prior) -{ - int num_faces = F.size(0); - int num_vertices = V.size(0); - auto options = torch::TensorOptions().dtype(torch::kInt32).requires_grad(false); - auto INT64_options = torch::TensorOptions().dtype(torch::kInt64).requires_grad(false); - auto findices = torch::zeros({height, width}, options); - INT64 maxint = (INT64)MAXINT * (INT64)MAXINT + (MAXINT - 1); - auto z_min = torch::ones({height, width}, INT64_options) * (int64_t)maxint; - - if (!use_depth_prior) { - for (int i = 0; i < num_faces; 
++i) {
-            rasterizeImagecoordsKernelCPU(V.data_ptr<float>(), F.data_ptr<int>(), 0,
-                (INT64*)z_min.data_ptr<int64_t>(), occlusion_truncation, width, height, num_vertices, num_faces, i);
-        }
-    } else {
-        for (int i = 0; i < num_faces; ++i)
-            rasterizeImagecoordsKernelCPU(V.data_ptr<float>(), F.data_ptr<int>(), D.data_ptr<float>(),
-                (INT64*)z_min.data_ptr<int64_t>(), occlusion_truncation, width, height, num_vertices, num_faces, i);
-    }
-
-    auto float_options = torch::TensorOptions().dtype(torch::kFloat32).requires_grad(false);
-    auto barycentric = torch::zeros({height, width, 3}, float_options);
-    for (int i = 0; i < width * height; ++i)
-        barycentricFromImgcoordCPU(V.data_ptr<float>(), F.data_ptr<int>(),
-            findices.data_ptr<int>(), (INT64*)z_min.data_ptr<int64_t>(), width, height, num_vertices, num_faces, barycentric.data_ptr<float>(), i);
-
-    return {findices, barycentric};
-}
-
-std::vector<torch::Tensor> rasterize_image(torch::Tensor V, torch::Tensor F, torch::Tensor D,
-    int width, int height, float occlusion_truncation, int use_depth_prior)
-{
-    int device_id = V.get_device();
-    if (device_id == -1)
-        return rasterize_image_cpu(V, F, D, width, height, occlusion_truncation, use_depth_prior);
-    else
-        return rasterize_image_gpu(V, F, D, width, height, occlusion_truncation, use_depth_prior);
-}
-
-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
-    m.def("rasterize_image", &rasterize_image, "Custom image rasterization");
-    m.def("build_hierarchy", &build_hierarchy, "Custom image rasterization");
-    m.def("build_hierarchy_with_feat", &build_hierarchy_with_feat, "Custom image rasterization");
-}
diff --git a/hy3dgen/texgen/custom_rasterizer/lib/custom_rasterizer_kernel/rasterizer.h b/hy3dgen/texgen/custom_rasterizer/lib/custom_rasterizer_kernel/rasterizer.h
deleted file mode 100644
index a1fa8ff2150cbf34644c5027a77f6400c8c9cdde..0000000000000000000000000000000000000000
--- a/hy3dgen/texgen/custom_rasterizer/lib/custom_rasterizer_kernel/rasterizer.h
+++ /dev/null
@@ -1,54 +0,0 @@
-#ifndef RASTERIZER_H_
-#define RASTERIZER_H_
-
-#include <torch/extension.h>
-#include <cuda.h>
-#include <cuda_runtime.h>
-#include <ATen/cuda/CUDAContext.h> // For CUDA context
-#include <cstdint>
-#define INT64 uint64_t
-#define MAXINT 2147483647
-
-__host__ __device__ inline float calculateSignedArea2(float* a, float* b, float* c) {
-    return ((c[0] - a[0]) * (b[1] - a[1]) - (b[0] - a[0]) * (c[1] - a[1]));
-}
-
-__host__ __device__ inline void calculateBarycentricCoordinate(float* a, float* b, float* c, float* p,
-    float* barycentric)
-{
-    float beta_tri = calculateSignedArea2(a, p, c);
-    float gamma_tri = calculateSignedArea2(a, b, p);
-    float area = calculateSignedArea2(a, b, c);
-    if (area == 0) {
-        barycentric[0] = -1.0;
-        barycentric[1] = -1.0;
-        barycentric[2] = -1.0;
-        return;
-    }
-    float tri_inv = 1.0 / area;
-    float beta = beta_tri * tri_inv;
-    float gamma = gamma_tri * tri_inv;
-    float alpha = 1.0 - beta - gamma;
-    barycentric[0] = alpha;
-    barycentric[1] = beta;
-    barycentric[2] = gamma;
-}
-
-__host__ __device__ inline bool isBarycentricCoordInBounds(float* barycentricCoord) {
-    return barycentricCoord[0] >= 0.0 && barycentricCoord[0] <= 1.0 &&
-           barycentricCoord[1] >= 0.0 && barycentricCoord[1] <= 1.0 &&
-           barycentricCoord[2] >= 0.0 && barycentricCoord[2] <= 1.0;
-}
-
-std::vector<torch::Tensor> rasterize_image_gpu(torch::Tensor V, torch::Tensor F, torch::Tensor D,
-    int width, int height, float occlusion_truncation, int use_depth_prior);
-
-std::vector<std::vector<torch::Tensor>> build_hierarchy(std::vector<torch::Tensor> view_layer_positions, std::vector<torch::Tensor> view_layer_normals, int num_level, int resolution);
-
-std::vector<std::vector<torch::Tensor>> build_hierarchy_with_feat(
-    std::vector<torch::Tensor> view_layer_positions,
-    std::vector<torch::Tensor> view_layer_normals,
-    std::vector<torch::Tensor> 
view_layer_feats, - int num_level, int resolution); - -#endif \ No newline at end of file diff --git a/hy3dgen/texgen/custom_rasterizer/lib/custom_rasterizer_kernel/rasterizer_gpu.cu b/hy3dgen/texgen/custom_rasterizer/lib/custom_rasterizer_kernel/rasterizer_gpu.cu deleted file mode 100644 index cc6f354c0e2801b9ac84ec4547845c8edb606a60..0000000000000000000000000000000000000000 --- a/hy3dgen/texgen/custom_rasterizer/lib/custom_rasterizer_kernel/rasterizer_gpu.cu +++ /dev/null @@ -1,127 +0,0 @@ -#include "rasterizer.h" - -__device__ void rasterizeTriangleGPU(int idx, float* vt0, float* vt1, float* vt2, int width, int height, INT64* zbuffer, float* d, float occlusion_truncation) { - float x_min = std::min(vt0[0], std::min(vt1[0],vt2[0])); - float x_max = std::max(vt0[0], std::max(vt1[0],vt2[0])); - float y_min = std::min(vt0[1], std::min(vt1[1],vt2[1])); - float y_max = std::max(vt0[1], std::max(vt1[1],vt2[1])); - - for (int px = x_min; px < x_max + 1; ++px) { - if (px < 0 || px >= width) - continue; - for (int py = y_min; py < y_max + 1; ++py) { - if (py < 0 || py >= height) - continue; - float vt[2] = {px + 0.5f, py + 0.5f}; - float baryCentricCoordinate[3]; - calculateBarycentricCoordinate(vt0, vt1, vt2, vt, baryCentricCoordinate); - if (isBarycentricCoordInBounds(baryCentricCoordinate)) { - int pixel = py * width + px; - if (zbuffer == 0) { - atomicExch(&zbuffer[pixel], (INT64)(idx + 1)); - continue; - } - float depth = baryCentricCoordinate[0] * vt0[2] + baryCentricCoordinate[1] * vt1[2] + baryCentricCoordinate[2] * vt2[2]; - float depth_thres = 0; - if (d) { - depth_thres = d[pixel] * 0.49999f + 0.5f + occlusion_truncation; - } - - int z_quantize = depth * (2<<17); - INT64 token = (INT64)z_quantize * MAXINT + (INT64)(idx + 1); - if (depth < depth_thres) - continue; - atomicMin(&zbuffer[pixel], token); - } - } - } -} - -__global__ void barycentricFromImgcoordGPU(float* V, int* F, int* findices, INT64* zbuffer, int width, int height, int num_vertices, int num_faces, - float* barycentric_map) -{ - int pix = blockIdx.x * blockDim.x + threadIdx.x; - if (pix >= width * height) - return; - INT64 f = zbuffer[pix] % MAXINT; - if (f == (MAXINT-1)) { - findices[pix] = 0; - barycentric_map[pix * 3] = 0; - barycentric_map[pix * 3 + 1] = 0; - barycentric_map[pix * 3 + 2] = 0; - return; - } - findices[pix] = f; - f -= 1; - float barycentric[3] = {0, 0, 0}; - if (f >= 0) { - float vt[2] = {float(pix % width) + 0.5f, float(pix / width) + 0.5f}; - float* vt0_ptr = V + (F[f * 3] * 4); - float* vt1_ptr = V + (F[f * 3 + 1] * 4); - float* vt2_ptr = V + (F[f * 3 + 2] * 4); - - float vt0[2] = {(vt0_ptr[0] / vt0_ptr[3] * 0.5f + 0.5f) * (width - 1) + 0.5f, (0.5f + 0.5f * vt0_ptr[1] / vt0_ptr[3]) * (height - 1) + 0.5f}; - float vt1[2] = {(vt1_ptr[0] / vt1_ptr[3] * 0.5f + 0.5f) * (width - 1) + 0.5f, (0.5f + 0.5f * vt1_ptr[1] / vt1_ptr[3]) * (height - 1) + 0.5f}; - float vt2[2] = {(vt2_ptr[0] / vt2_ptr[3] * 0.5f + 0.5f) * (width - 1) + 0.5f, (0.5f + 0.5f * vt2_ptr[1] / vt2_ptr[3]) * (height - 1) + 0.5f}; - - calculateBarycentricCoordinate(vt0, vt1, vt2, vt, barycentric); - - barycentric[0] = barycentric[0] / vt0_ptr[3]; - barycentric[1] = barycentric[1] / vt1_ptr[3]; - barycentric[2] = barycentric[2] / vt2_ptr[3]; - float w = 1.0f / (barycentric[0] + barycentric[1] + barycentric[2]); - barycentric[0] *= w; - barycentric[1] *= w; - barycentric[2] *= w; - - } - barycentric_map[pix * 3] = barycentric[0]; - barycentric_map[pix * 3 + 1] = barycentric[1]; - barycentric_map[pix * 3 + 2] = barycentric[2]; -} - -__global__ 
void rasterizeImagecoordsKernelGPU(float* V, int* F, float* d, INT64* zbuffer, float occlusion_trunc, int width, int height, int num_vertices, int num_faces) -{ - int f = blockIdx.x * blockDim.x + threadIdx.x; - if (f >= num_faces) - return; - - float* vt0_ptr = V + (F[f * 3] * 4); - float* vt1_ptr = V + (F[f * 3 + 1] * 4); - float* vt2_ptr = V + (F[f * 3 + 2] * 4); - - float vt0[3] = {(vt0_ptr[0] / vt0_ptr[3] * 0.5f + 0.5f) * (width - 1) + 0.5f, (0.5f + 0.5f * vt0_ptr[1] / vt0_ptr[3]) * (height - 1) + 0.5f, vt0_ptr[2] / vt0_ptr[3] * 0.49999f + 0.5f}; - float vt1[3] = {(vt1_ptr[0] / vt1_ptr[3] * 0.5f + 0.5f) * (width - 1) + 0.5f, (0.5f + 0.5f * vt1_ptr[1] / vt1_ptr[3]) * (height - 1) + 0.5f, vt1_ptr[2] / vt1_ptr[3] * 0.49999f + 0.5f}; - float vt2[3] = {(vt2_ptr[0] / vt2_ptr[3] * 0.5f + 0.5f) * (width - 1) + 0.5f, (0.5f + 0.5f * vt2_ptr[1] / vt2_ptr[3]) * (height - 1) + 0.5f, vt2_ptr[2] / vt2_ptr[3] * 0.49999f + 0.5f}; - - rasterizeTriangleGPU(f, vt0, vt1, vt2, width, height, zbuffer, d, occlusion_trunc); -} - -std::vector rasterize_image_gpu(torch::Tensor V, torch::Tensor F, torch::Tensor D, - int width, int height, float occlusion_truncation, int use_depth_prior) -{ - int device_id = V.get_device(); - cudaSetDevice(device_id); - int num_faces = F.size(0); - int num_vertices = V.size(0); - auto options = torch::TensorOptions().dtype(torch::kInt32).device(torch::kCUDA, device_id).requires_grad(false); - auto INT64_options = torch::TensorOptions().dtype(torch::kInt64).device(torch::kCUDA, device_id).requires_grad(false); - auto findices = torch::zeros({height, width}, options); - INT64 maxint = (INT64)MAXINT * (INT64)MAXINT + (MAXINT - 1); - auto z_min = torch::ones({height, width}, INT64_options) * (int64_t)maxint; - - if (!use_depth_prior) { - rasterizeImagecoordsKernelGPU<<<(num_faces+255)/256,256,0,at::cuda::getCurrentCUDAStream()>>>(V.data_ptr(), F.data_ptr(), 0, - (INT64*)z_min.data_ptr(), occlusion_truncation, width, height, num_vertices, num_faces); - } else { - rasterizeImagecoordsKernelGPU<<<(num_faces+255)/256,256,0,at::cuda::getCurrentCUDAStream()>>>(V.data_ptr(), F.data_ptr(), D.data_ptr(), - (INT64*)z_min.data_ptr(), occlusion_truncation, width, height, num_vertices, num_faces); - } - - auto float_options = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA, device_id).requires_grad(false); - auto barycentric = torch::zeros({height, width, 3}, float_options); - barycentricFromImgcoordGPU<<<(width * height + 255)/256, 256>>>(V.data_ptr(), F.data_ptr(), - findices.data_ptr(), (INT64*)z_min.data_ptr(), width, height, num_vertices, num_faces, barycentric.data_ptr()); - - return {findices, barycentric}; -} diff --git a/hy3dgen/texgen/custom_rasterizer/setup.py b/hy3dgen/texgen/custom_rasterizer/setup.py deleted file mode 100644 index 3e312a7f45689753b5ba3ed4befff1fefecff6fd..0000000000000000000000000000000000000000 --- a/hy3dgen/texgen/custom_rasterizer/setup.py +++ /dev/null @@ -1,26 +0,0 @@ -from setuptools import setup, find_packages -from torch.utils.cpp_extension import BuildExtension, CUDAExtension - -# build custom rasterizer -# build with `python setup.py install` -# nvcc is needed - -custom_rasterizer_module = CUDAExtension('custom_rasterizer_kernel', [ - 'lib/custom_rasterizer_kernel/rasterizer.cpp', - 'lib/custom_rasterizer_kernel/grid_neighbor.cpp', - 'lib/custom_rasterizer_kernel/rasterizer_gpu.cu', -]) - -setup( - packages=find_packages(), - version='0.1', - name='custom_rasterizer', - include_package_data=True, - package_dir={'': '.'}, - ext_modules=[ - 
custom_rasterizer_module, - ], - cmdclass={ - 'build_ext': BuildExtension - } -) diff --git a/hy3dgen/texgen/differentiable_renderer/__init__.py b/hy3dgen/texgen/differentiable_renderer/__init__.py deleted file mode 100644 index e307c3f8c1292da02f308e4b59ef0bcd6fe7305e..0000000000000000000000000000000000000000 --- a/hy3dgen/texgen/differentiable_renderer/__init__.py +++ /dev/null @@ -1,23 +0,0 @@ -# Open Source Model Licensed under the Apache License Version 2.0 -# and Other Licenses of the Third-Party Components therein: -# The below Model in this distribution may have been modified by THL A29 Limited -# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. - -# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. -# The below software and/or models in this distribution may have been -# modified by THL A29 Limited ("Tencent Modifications"). -# All Tencent Modifications are Copyright (C) THL A29 Limited. - -# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT -# except for the third-party components listed below. -# Hunyuan 3D does not impose any additional limitations beyond what is outlined -# in the repsective licenses of these third-party components. -# Users must comply with all terms and conditions of original licenses of these third-party -# components and must ensure that the usage of the third party components adheres to -# all relevant laws and regulations. - -# For avoidance of doubts, Hunyuan 3D means the large language models and -# their software and algorithms, including trained model weights, parameters (including -# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, -# fine-tuning enabling code and other elements of the foregoing made publicly available -# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
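The setup.py above is the build recipe for the CUDA rasterizer kernel, and the PYBIND11_MODULE block in rasterizer.cpp exposes rasterize_image, build_hierarchy, and build_hierarchy_with_feat. A minimal usage sketch under stated assumptions: the extension is built with `python setup.py install` (nvcc required), and the tensor shapes are inferred from rasterizer.cpp rather than documented anywhere.

```python
# Hypothetical usage of the custom_rasterizer_kernel extension built above.
# Shapes are inferred from rasterizer.cpp: V holds clip-space (x, y, z, w)
# vertices, F holds int32 triangle indices, D is an optional depth prior.
import torch
import custom_rasterizer_kernel as crk

V = torch.rand(100, 4)                                   # float32, CPU tensor
F = torch.randint(0, 100, (50, 3), dtype=torch.int32)    # triangle indices
D = torch.zeros(512, 512)                                # ignored when flag is 0

# rasterize_image(V, F, D, width, height, occlusion_truncation, use_depth_prior)
findices, barycentric = crk.rasterize_image(V, F, D, 512, 512, 0.0, 0)
print(findices.shape, barycentric.shape)                 # (512, 512), (512, 512, 3)
```

rasterize_image dispatches on V's device: CPU tensors take the rasterize_image_cpu path, CUDA tensors take rasterize_image_gpu.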
diff --git a/hy3dgen/texgen/differentiable_renderer/build/temp.win-amd64-cpython-310/Release/mesh_processor.cp310-win_amd64.exp b/hy3dgen/texgen/differentiable_renderer/build/temp.win-amd64-cpython-310/Release/mesh_processor.cp310-win_amd64.exp deleted file mode 100644 index cb7a9671b7e96564de44070afdced28da0f631b7..0000000000000000000000000000000000000000 Binary files a/hy3dgen/texgen/differentiable_renderer/build/temp.win-amd64-cpython-310/Release/mesh_processor.cp310-win_amd64.exp and /dev/null differ diff --git a/hy3dgen/texgen/differentiable_renderer/build/temp.win-amd64-cpython-310/Release/mesh_processor.cp310-win_amd64.lib b/hy3dgen/texgen/differentiable_renderer/build/temp.win-amd64-cpython-310/Release/mesh_processor.cp310-win_amd64.lib deleted file mode 100644 index 19b554dd00907fa3cacbf26d59f00247cd76985b..0000000000000000000000000000000000000000 Binary files a/hy3dgen/texgen/differentiable_renderer/build/temp.win-amd64-cpython-310/Release/mesh_processor.cp310-win_amd64.lib and /dev/null differ diff --git a/hy3dgen/texgen/differentiable_renderer/build/temp.win-amd64-cpython-310/Release/mesh_processor.obj b/hy3dgen/texgen/differentiable_renderer/build/temp.win-amd64-cpython-310/Release/mesh_processor.obj deleted file mode 100644 index 318c2eddbb7c258091e2825e02abff7f65ef35b9..0000000000000000000000000000000000000000 --- a/hy3dgen/texgen/differentiable_renderer/build/temp.win-amd64-cpython-310/Release/mesh_processor.obj +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1aa1f67f69a3f4389d88b5824de08503705112177eb5d8c7dd5ad09c2847e8b6 -size 7617045 diff --git a/hy3dgen/texgen/differentiable_renderer/camera_utils.py b/hy3dgen/texgen/differentiable_renderer/camera_utils.py deleted file mode 100644 index 289710ab787a174b39154f1010fc6209e4c92dfe..0000000000000000000000000000000000000000 --- a/hy3dgen/texgen/differentiable_renderer/camera_utils.py +++ /dev/null @@ -1,116 +0,0 @@ -# Open Source Model Licensed under the Apache License Version 2.0 -# and Other Licenses of the Third-Party Components therein: -# The below Model in this distribution may have been modified by THL A29 Limited -# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. - -# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. -# The below software and/or models in this distribution may have been -# modified by THL A29 Limited ("Tencent Modifications"). -# All Tencent Modifications are Copyright (C) THL A29 Limited. - -# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT -# except for the third-party components listed below. -# Hunyuan 3D does not impose any additional limitations beyond what is outlined -# in the repsective licenses of these third-party components. -# Users must comply with all terms and conditions of original licenses of these third-party -# components and must ensure that the usage of the third party components adheres to -# all relevant laws and regulations. - -# For avoidance of doubts, Hunyuan 3D means the large language models and -# their software and algorithms, including trained model weights, parameters (including -# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, -# fine-tuning enabling code and other elements of the foregoing made publicly available -# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
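As orientation for the camera helpers that follow: get_mv_matrix builds a world-to-camera matrix, the two projection helpers map camera space to clip space, and a full MVP is simply their product. A small sanity-check sketch, assuming the signatures defined below are in scope; the angles and bounds are illustrative only.

```python
# Illustrative composition of the helpers below into a single MVP matrix.
# Assumes get_mv_matrix and get_orthographic_projection_matrix are in scope.
import numpy as np

mv = get_mv_matrix(elev=20, azim=30, camera_distance=1.45)    # world -> camera
proj = get_orthographic_projection_matrix(left=-0.6, right=0.6,
                                          bottom=-0.6, top=0.6,
                                          near=0.1, far=100)  # camera -> clip
mvp = proj @ mv

p = np.array([0.0, 0.0, 0.0, 1.0])  # scene center, homogeneous coordinates
clip = mvp @ p
print(clip[:3] / clip[3])           # stays inside the [-1, 1]^3 NDC cube
```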
-
-import math
-
-import numpy as np
-import torch
-
-
-def transform_pos(mtx, pos, keepdim=False):
-    t_mtx = torch.from_numpy(mtx).to(
-        pos.device) if isinstance(
-        mtx, np.ndarray) else mtx
-    if pos.shape[-1] == 3:
-        posw = torch.cat(
-            [pos, torch.ones([pos.shape[0], 1]).to(pos.device)], axis=1)
-    else:
-        posw = pos
-
-    if keepdim:
-        return torch.matmul(posw, t_mtx.t())[...]
-    else:
-        return torch.matmul(posw, t_mtx.t())[None, ...]
-
-
-def get_mv_matrix(elev, azim, camera_distance, center=None):
-    elev = -elev
-    azim += 90
-
-    elev_rad = math.radians(elev)
-    azim_rad = math.radians(azim)
-
-    camera_position = np.array([camera_distance * math.cos(elev_rad) * math.cos(azim_rad),
-                                camera_distance * math.cos(elev_rad) * math.sin(azim_rad),
-                                camera_distance * math.sin(elev_rad)])
-
-    if center is None:
-        center = np.array([0, 0, 0])
-    else:
-        center = np.array(center)
-
-    lookat = center - camera_position
-    lookat = lookat / np.linalg.norm(lookat)
-
-    up = np.array([0, 0, 1.0])
-    right = np.cross(lookat, up)
-    right = right / np.linalg.norm(right)
-    up = np.cross(right, lookat)
-    up = up / np.linalg.norm(up)
-
-    c2w = np.concatenate(
-        [np.stack([right, up, -lookat], axis=-1), camera_position[:, None]], axis=-1)
-
-    w2c = np.zeros((4, 4))
-    w2c[:3, :3] = np.transpose(c2w[:3, :3], (1, 0))
-    w2c[:3, 3:] = -np.matmul(np.transpose(c2w[:3, :3], (1, 0)), c2w[:3, 3:])
-    w2c[3, 3] = 1.0
-
-    return w2c.astype(np.float32)
-
-
-def get_orthographic_projection_matrix(
-        left=-1, right=1, bottom=-1, top=1, near=0, far=2):
-    """
-    Compute an orthographic projection matrix.
-
-    Args:
-        left (float): left boundary of the projection volume.
-        right (float): right boundary of the projection volume.
-        bottom (float): bottom boundary of the projection volume.
-        top (float): top boundary of the projection volume.
-        near (float): distance to the near clipping plane.
-        far (float): distance to the far clipping plane.
-
-    Returns:
-        numpy.ndarray: the orthographic projection matrix.
-    """
-    ortho_matrix = np.eye(4, dtype=np.float32)
-    ortho_matrix[0, 0] = 2 / (right - left)
-    ortho_matrix[1, 1] = 2 / (top - bottom)
-    ortho_matrix[2, 2] = -2 / (far - near)
-    ortho_matrix[0, 3] = -(right + left) / (right - left)
-    ortho_matrix[1, 3] = -(top + bottom) / (top - bottom)
-    ortho_matrix[2, 3] = -(far + near) / (far - near)
-    return ortho_matrix
-
-
-def get_perspective_projection_matrix(fovy, aspect_wh, near, far):
-    fovy_rad = math.radians(fovy)
-    return np.array([[1.0 / (math.tan(fovy_rad / 2.0) * aspect_wh), 0, 0, 0],
-                     [0, 1.0 / math.tan(fovy_rad / 2.0), 0, 0],
-                     [0, 0, -(far + near) / (far - near),
-                      -2.0 * far * near / (far - near)],
-                     [0, 0, -1, 0]]).astype(np.float32)
diff --git a/hy3dgen/texgen/differentiable_renderer/compile_mesh_painter.bat b/hy3dgen/texgen/differentiable_renderer/compile_mesh_painter.bat
deleted file mode 100644
index 3947b0f03f9f6245dac95db7460703076444a304..0000000000000000000000000000000000000000
--- a/hy3dgen/texgen/differentiable_renderer/compile_mesh_painter.bat
+++ /dev/null
@@ -1,3 +0,0 @@
-FOR /F "tokens=*" %%i IN ('python -m pybind11 --includes') DO SET PYINCLUDES=%%i
-echo %PYINCLUDES%
-g++ -O3 -Wall -shared -std=c++11 -fPIC %PYINCLUDES% mesh_processor.cpp -o mesh_processor.pyd -lpython3.12
\ No newline at end of file
diff --git a/hy3dgen/texgen/differentiable_renderer/dist/mesh_processor-0.0.0-py3.10-win-amd64.egg b/hy3dgen/texgen/differentiable_renderer/dist/mesh_processor-0.0.0-py3.10-win-amd64.egg
deleted file mode 100644
index 94aa03de74fc9b82fc5335e097d1c2f538610577..0000000000000000000000000000000000000000
Binary files a/hy3dgen/texgen/differentiable_renderer/dist/mesh_processor-0.0.0-py3.10-win-amd64.egg and /dev/null differ
diff --git a/hy3dgen/texgen/differentiable_renderer/mesh_processor.cpp 
b/hy3dgen/texgen/differentiable_renderer/mesh_processor.cpp deleted file mode 100644 index ca8650fada02099d3fce0f551fa4f953f278cf34..0000000000000000000000000000000000000000 --- a/hy3dgen/texgen/differentiable_renderer/mesh_processor.cpp +++ /dev/null @@ -1,161 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include - -namespace py = pybind11; -using namespace std; - -std::pair, - py::array_t> meshVerticeInpaint_smooth(py::array_t texture, -py::array_t mask, - py::array_t vtx_pos, py::array_t vtx_uv, - py::array_t pos_idx, py::array_t uv_idx) { - auto texture_buf = texture.request(); - auto mask_buf = mask.request(); - auto vtx_pos_buf = vtx_pos.request(); - auto vtx_uv_buf = vtx_uv.request(); - auto pos_idx_buf = pos_idx.request(); - auto uv_idx_buf = uv_idx.request(); - - int texture_height = texture_buf.shape[0]; - int texture_width = texture_buf.shape[1]; - int texture_channel = texture_buf.shape[2]; - float* texture_ptr = static_cast(texture_buf.ptr); - uint8_t* mask_ptr = static_cast(mask_buf.ptr); - - int vtx_num = vtx_pos_buf.shape[0]; - float* vtx_pos_ptr = static_cast(vtx_pos_buf.ptr); - float* vtx_uv_ptr = static_cast(vtx_uv_buf.ptr); - int* pos_idx_ptr = static_cast(pos_idx_buf.ptr); - int* uv_idx_ptr = static_cast(uv_idx_buf.ptr); - - vector vtx_mask(vtx_num, 0.0f); - vector> vtx_color(vtx_num, vector(texture_channel, 0.0f)); - vector uncolored_vtxs; - - vector> G(vtx_num); - - for (int i = 0; i < uv_idx_buf.shape[0]; ++i) { - for (int k = 0; k < 3; ++k) { - int vtx_uv_idx = uv_idx_ptr[i * 3 + k]; - int vtx_idx = pos_idx_ptr[i * 3 + k]; - int uv_v = round(vtx_uv_ptr[vtx_uv_idx * 2] * (texture_width - 1)); - int uv_u = round((1.0 - vtx_uv_ptr[vtx_uv_idx * 2 + 1]) * (texture_height - 1)); - - if (mask_ptr[uv_u * texture_width + uv_v] > 0) { - vtx_mask[vtx_idx] = 1.0f; - for (int c = 0; c < texture_channel; ++c) { - vtx_color[vtx_idx][c] = texture_ptr[(uv_u * texture_width + uv_v) * texture_channel + c]; - } - }else{ - uncolored_vtxs.push_back(vtx_idx); - } - - G[pos_idx_ptr[i * 3 + k]].push_back(pos_idx_ptr[i * 3 + (k + 1) % 3]); - } - } - - int smooth_count = 2; - int last_uncolored_vtx_count = 0; - while (smooth_count>0) { - int uncolored_vtx_count = 0; - - for (int vtx_idx : uncolored_vtxs) { - - vector sum_color(texture_channel, 0.0f); - float total_weight = 0.0f; - - array vtx_0 = {vtx_pos_ptr[vtx_idx * 3], -vtx_pos_ptr[vtx_idx * 3 + 1], vtx_pos_ptr[vtx_idx * 3 + 2]}; - for (int connected_idx : G[vtx_idx]) { - if (vtx_mask[connected_idx] > 0) { - array vtx1 = {vtx_pos_ptr[connected_idx * 3], - vtx_pos_ptr[connected_idx * 3 + 1], vtx_pos_ptr[connected_idx * 3 + 2]}; - float dist_weight = 1.0f / max(sqrt(pow(vtx_0[0] - vtx1[0], 2) + pow(vtx_0[1] - vtx1[1], 2) + \ - pow(vtx_0[2] - vtx1[2], 2)), 1E-4); - dist_weight = dist_weight * dist_weight; - for (int c = 0; c < texture_channel; ++c) { - sum_color[c] += vtx_color[connected_idx][c] * dist_weight; - } - total_weight += dist_weight; - } - } - - if (total_weight > 0.0f) { - for (int c = 0; c < texture_channel; ++c) { - vtx_color[vtx_idx][c] = sum_color[c] / total_weight; - } - vtx_mask[vtx_idx] = 1.0f; - } else { - uncolored_vtx_count++; - } - - } - - if(last_uncolored_vtx_count==uncolored_vtx_count){ - smooth_count--; - }else{ - smooth_count++; - } - last_uncolored_vtx_count = uncolored_vtx_count; - } - - // Create new arrays for the output - py::array_t new_texture(texture_buf.size); - py::array_t new_mask(mask_buf.size); - - auto new_texture_buf = new_texture.request(); - auto new_mask_buf = 
new_mask.request(); - - float* new_texture_ptr = static_cast(new_texture_buf.ptr); - uint8_t* new_mask_ptr = static_cast(new_mask_buf.ptr); - // Copy original texture and mask to new arrays - std::copy(texture_ptr, texture_ptr + texture_buf.size, new_texture_ptr); - std::copy(mask_ptr, mask_ptr + mask_buf.size, new_mask_ptr); - - for (int face_idx = 0; face_idx < uv_idx_buf.shape[0]; ++face_idx) { - for (int k = 0; k < 3; ++k) { - int vtx_uv_idx = uv_idx_ptr[face_idx * 3 + k]; - int vtx_idx = pos_idx_ptr[face_idx * 3 + k]; - - if (vtx_mask[vtx_idx] == 1.0f) { - int uv_v = round(vtx_uv_ptr[vtx_uv_idx * 2] * (texture_width - 1)); - int uv_u = round((1.0 - vtx_uv_ptr[vtx_uv_idx * 2 + 1]) * (texture_height - 1)); - - for (int c = 0; c < texture_channel; ++c) { - new_texture_ptr[(uv_u * texture_width + uv_v) * texture_channel + c] = vtx_color[vtx_idx][c]; - } - new_mask_ptr[uv_u * texture_width + uv_v] = 255; - } - } - } - - // Reshape the new arrays to match the original texture and mask shapes - new_texture.resize({texture_height, texture_width, 3}); - new_mask.resize({texture_height, texture_width}); - return std::make_pair(new_texture, new_mask); -} - - -std::pair, py::array_t> meshVerticeInpaint(py::array_t texture, - py::array_t mask, - py::array_t vtx_pos, py::array_t vtx_uv, - py::array_t pos_idx, py::array_t uv_idx, const std::string& method = "smooth") { - if (method == "smooth") { - return meshVerticeInpaint_smooth(texture, mask, vtx_pos, vtx_uv, pos_idx, uv_idx); - } else { - throw std::invalid_argument("Invalid method. Use 'smooth' or 'forward'."); - } -} - -PYBIND11_MODULE(mesh_processor, m) { - m.def("meshVerticeInpaint", &meshVerticeInpaint, "A function to process mesh", - py::arg("texture"), py::arg("mask"), - py::arg("vtx_pos"), py::arg("vtx_uv"), - py::arg("pos_idx"), py::arg("uv_idx"), - py::arg("method") = "smooth"); -} \ No newline at end of file diff --git a/hy3dgen/texgen/differentiable_renderer/mesh_processor.egg-info/PKG-INFO b/hy3dgen/texgen/differentiable_renderer/mesh_processor.egg-info/PKG-INFO deleted file mode 100644 index ddb5e19214f697ef854a3c010d9e1e1e25a49702..0000000000000000000000000000000000000000 --- a/hy3dgen/texgen/differentiable_renderer/mesh_processor.egg-info/PKG-INFO +++ /dev/null @@ -1,7 +0,0 @@ -Metadata-Version: 2.2 -Name: mesh_processor -Version: 0.0.0 -Requires-Python: >=3.6 -Requires-Dist: pybind11>=2.6.0 -Dynamic: requires-dist -Dynamic: requires-python diff --git a/hy3dgen/texgen/differentiable_renderer/mesh_processor.egg-info/SOURCES.txt b/hy3dgen/texgen/differentiable_renderer/mesh_processor.egg-info/SOURCES.txt deleted file mode 100644 index 0ca24855f9323bfe0f20a2fab4dc2f55e6e34079..0000000000000000000000000000000000000000 --- a/hy3dgen/texgen/differentiable_renderer/mesh_processor.egg-info/SOURCES.txt +++ /dev/null @@ -1,7 +0,0 @@ -mesh_processor.cpp -setup.py -mesh_processor.egg-info/PKG-INFO -mesh_processor.egg-info/SOURCES.txt -mesh_processor.egg-info/dependency_links.txt -mesh_processor.egg-info/requires.txt -mesh_processor.egg-info/top_level.txt \ No newline at end of file diff --git a/hy3dgen/texgen/differentiable_renderer/mesh_processor.egg-info/dependency_links.txt b/hy3dgen/texgen/differentiable_renderer/mesh_processor.egg-info/dependency_links.txt deleted file mode 100644 index 8b137891791fe96927ad78e64b0aad7bded08bdc..0000000000000000000000000000000000000000 --- a/hy3dgen/texgen/differentiable_renderer/mesh_processor.egg-info/dependency_links.txt +++ /dev/null @@ -1 +0,0 @@ - diff --git 
a/hy3dgen/texgen/differentiable_renderer/mesh_processor.egg-info/requires.txt b/hy3dgen/texgen/differentiable_renderer/mesh_processor.egg-info/requires.txt deleted file mode 100644 index d89789fcaa28db9e76d59597b04095a0a9f99fa3..0000000000000000000000000000000000000000 --- a/hy3dgen/texgen/differentiable_renderer/mesh_processor.egg-info/requires.txt +++ /dev/null @@ -1 +0,0 @@ -pybind11>=2.6.0 diff --git a/hy3dgen/texgen/differentiable_renderer/mesh_processor.egg-info/top_level.txt b/hy3dgen/texgen/differentiable_renderer/mesh_processor.egg-info/top_level.txt deleted file mode 100644 index ccd72df0d4e79e7f3ee7e8ad3728d300bde6c3fe..0000000000000000000000000000000000000000 --- a/hy3dgen/texgen/differentiable_renderer/mesh_processor.egg-info/top_level.txt +++ /dev/null @@ -1 +0,0 @@ -mesh_processor diff --git a/hy3dgen/texgen/differentiable_renderer/mesh_processor.py b/hy3dgen/texgen/differentiable_renderer/mesh_processor.py deleted file mode 100644 index a96955c19757df5ad18095b33829962140c04647..0000000000000000000000000000000000000000 --- a/hy3dgen/texgen/differentiable_renderer/mesh_processor.py +++ /dev/null @@ -1,70 +0,0 @@ -import numpy as np - -def meshVerticeInpaint_smooth(texture, mask, vtx_pos, vtx_uv, pos_idx, uv_idx): - texture_height, texture_width, texture_channel = texture.shape - vtx_num = vtx_pos.shape[0] - - vtx_mask = np.zeros(vtx_num, dtype=np.float32) - vtx_color = [np.zeros(texture_channel, dtype=np.float32) for _ in range(vtx_num)] - uncolored_vtxs = [] - G = [[] for _ in range(vtx_num)] - - for i in range(uv_idx.shape[0]): - for k in range(3): - vtx_uv_idx = uv_idx[i, k] - vtx_idx = pos_idx[i, k] - uv_v = int(round(vtx_uv[vtx_uv_idx, 0] * (texture_width - 1))) - uv_u = int(round((1.0 - vtx_uv[vtx_uv_idx, 1]) * (texture_height - 1))) - if mask[uv_u, uv_v] > 0: - vtx_mask[vtx_idx] = 1.0 - vtx_color[vtx_idx] = texture[uv_u, uv_v] - else: - uncolored_vtxs.append(vtx_idx) - G[pos_idx[i, k]].append(pos_idx[i, (k + 1) % 3]) - - smooth_count = 2 - last_uncolored_vtx_count = 0 - while smooth_count > 0: - uncolored_vtx_count = 0 - for vtx_idx in uncolored_vtxs: - sum_color = np.zeros(texture_channel, dtype=np.float32) - total_weight = 0.0 - vtx_0 = vtx_pos[vtx_idx] - for connected_idx in G[vtx_idx]: - if vtx_mask[connected_idx] > 0: - vtx1 = vtx_pos[connected_idx] - dist = np.sqrt(np.sum((vtx_0 - vtx1) ** 2)) - dist_weight = 1.0 / max(dist, 1e-4) - dist_weight *= dist_weight - sum_color += vtx_color[connected_idx] * dist_weight - total_weight += dist_weight - if total_weight > 0: - vtx_color[vtx_idx] = sum_color / total_weight - vtx_mask[vtx_idx] = 1.0 - else: - uncolored_vtx_count += 1 - - if last_uncolored_vtx_count == uncolored_vtx_count: - smooth_count -= 1 - else: - smooth_count += 1 - last_uncolored_vtx_count = uncolored_vtx_count - - new_texture = texture.copy() - new_mask = mask.copy() - for face_idx in range(uv_idx.shape[0]): - for k in range(3): - vtx_uv_idx = uv_idx[face_idx, k] - vtx_idx = pos_idx[face_idx, k] - if vtx_mask[vtx_idx] == 1.0: - uv_v = int(round(vtx_uv[vtx_uv_idx, 0] * (texture_width - 1))) - uv_u = int(round((1.0 - vtx_uv[vtx_uv_idx, 1]) * (texture_height - 1))) - new_texture[uv_u, uv_v] = vtx_color[vtx_idx] - new_mask[uv_u, uv_v] = 255 - return new_texture, new_mask - -def meshVerticeInpaint(texture, mask, vtx_pos, vtx_uv, pos_idx, uv_idx, method="smooth"): - if method == "smooth": - return meshVerticeInpaint_smooth(texture, mask, vtx_pos, vtx_uv, pos_idx, uv_idx) - else: - raise ValueError("Invalid method. 
Use 'smooth' or 'forward'.") \ No newline at end of file diff --git a/hy3dgen/texgen/differentiable_renderer/mesh_render.py b/hy3dgen/texgen/differentiable_renderer/mesh_render.py deleted file mode 100644 index c85b80e043221282e9ff6bfb81764fb32c5d48ed..0000000000000000000000000000000000000000 --- a/hy3dgen/texgen/differentiable_renderer/mesh_render.py +++ /dev/null @@ -1,833 +0,0 @@ -# Open Source Model Licensed under the Apache License Version 2.0 -# and Other Licenses of the Third-Party Components therein: -# The below Model in this distribution may have been modified by THL A29 Limited -# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. - -# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. -# The below software and/or models in this distribution may have been -# modified by THL A29 Limited ("Tencent Modifications"). -# All Tencent Modifications are Copyright (C) THL A29 Limited. - -# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT -# except for the third-party components listed below. -# Hunyuan 3D does not impose any additional limitations beyond what is outlined -# in the repsective licenses of these third-party components. -# Users must comply with all terms and conditions of original licenses of these third-party -# components and must ensure that the usage of the third party components adheres to -# all relevant laws and regulations. - -# For avoidance of doubts, Hunyuan 3D means the large language models and -# their software and algorithms, including trained model weights, parameters (including -# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, -# fine-tuning enabling code and other elements of the foregoing made publicly available -# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
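For reference, the meshVerticeInpaint fallback defined in mesh_processor.py above propagates color from textured to untextured vertices over the mesh adjacency graph, weighting each colored neighbor by inverse squared distance. A minimal call sketch with synthetic placeholder data (shapes and dtypes inferred from the function body):

```python
# Synthetic-data sketch for the vertex inpainting defined above; every value
# here is a placeholder chosen only to satisfy the shapes the function indexes.
import numpy as np
from mesh_processor import meshVerticeInpaint  # defined in mesh_processor.py above

H = W = 64
texture = np.zeros((H, W, 3), dtype=np.float32)  # partially baked texture
mask = np.zeros((H, W), dtype=np.uint8)          # nonzero where texture is valid
mask[: H // 2] = 255
vtx_pos = np.random.rand(8, 3).astype(np.float32)
vtx_uv = np.random.rand(8, 2).astype(np.float32)            # UVs in [0, 1]
pos_idx = np.array([[0, 1, 2], [3, 4, 5]], dtype=np.int32)  # position triangles
uv_idx = pos_idx.copy()                                     # matching UV triangles

new_texture, new_mask = meshVerticeInpaint(texture, mask, vtx_pos, vtx_uv,
                                           pos_idx, uv_idx, method="smooth")
```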
- -import cv2 -import numpy as np -import torch -import torch.nn.functional as F -import trimesh -from PIL import Image - -from .camera_utils import ( - transform_pos, - get_mv_matrix, - get_orthographic_projection_matrix, - get_perspective_projection_matrix, -) -from .mesh_processor import meshVerticeInpaint -from .mesh_utils import load_mesh, save_mesh - - -def stride_from_shape(shape): - stride = [1] - for x in reversed(shape[1:]): - stride.append(stride[-1] * x) - return list(reversed(stride)) - - -def scatter_add_nd_with_count(input, count, indices, values, weights=None): - # input: [..., C], D dimension + C channel - # count: [..., 1], D dimension - # indices: [N, D], long - # values: [N, C] - - D = indices.shape[-1] - C = input.shape[-1] - size = input.shape[:-1] - stride = stride_from_shape(size) - - assert len(size) == D - - input = input.view(-1, C) # [HW, C] - count = count.view(-1, 1) - - flatten_indices = (indices * torch.tensor(stride, - dtype=torch.long, device=indices.device)).sum(-1) # [N] - - if weights is None: - weights = torch.ones_like(values[..., :1]) - - input.scatter_add_(0, flatten_indices.unsqueeze(1).repeat(1, C), values) - count.scatter_add_(0, flatten_indices.unsqueeze(1), weights) - - return input.view(*size, C), count.view(*size, 1) - - -def linear_grid_put_2d(H, W, coords, values, return_count=False): - # coords: [N, 2], float in [0, 1] - # values: [N, C] - - C = values.shape[-1] - - indices = coords * torch.tensor( - [H - 1, W - 1], dtype=torch.float32, device=coords.device - ) - indices_00 = indices.floor().long() # [N, 2] - indices_00[:, 0].clamp_(0, H - 2) - indices_00[:, 1].clamp_(0, W - 2) - indices_01 = indices_00 + torch.tensor( - [0, 1], dtype=torch.long, device=indices.device - ) - indices_10 = indices_00 + torch.tensor( - [1, 0], dtype=torch.long, device=indices.device - ) - indices_11 = indices_00 + torch.tensor( - [1, 1], dtype=torch.long, device=indices.device - ) - - h = indices[..., 0] - indices_00[..., 0].float() - w = indices[..., 1] - indices_00[..., 1].float() - w_00 = (1 - h) * (1 - w) - w_01 = (1 - h) * w - w_10 = h * (1 - w) - w_11 = h * w - - result = torch.zeros(H, W, C, device=values.device, - dtype=values.dtype) # [H, W, C] - count = torch.zeros(H, W, 1, device=values.device, - dtype=values.dtype) # [H, W, 1] - weights = torch.ones_like(values[..., :1]) # [N, 1] - - result, count = scatter_add_nd_with_count( - result, count, indices_00, values * w_00.unsqueeze(1), weights * w_00.unsqueeze(1)) - result, count = scatter_add_nd_with_count( - result, count, indices_01, values * w_01.unsqueeze(1), weights * w_01.unsqueeze(1)) - result, count = scatter_add_nd_with_count( - result, count, indices_10, values * w_10.unsqueeze(1), weights * w_10.unsqueeze(1)) - result, count = scatter_add_nd_with_count( - result, count, indices_11, values * w_11.unsqueeze(1), weights * w_11.unsqueeze(1)) - - if return_count: - return result, count - - mask = (count.squeeze(-1) > 0) - result[mask] = result[mask] / count[mask].repeat(1, C) - - return result - - -class MeshRender(): - def __init__( - self, - camera_distance=1.45, camera_type='orth', - default_resolution=1024, texture_size=1024, - use_antialias=True, max_mip_level=None, filter_mode='linear', - bake_mode='linear', raster_mode='cr', device='cuda'): - - self.device = device - - self.set_default_render_resolution(default_resolution) - self.set_default_texture_resolution(texture_size) - - self.camera_distance = camera_distance - self.use_antialias = use_antialias - self.max_mip_level = max_mip_level 
-        self.filter_mode = filter_mode
-
-        self.bake_angle_thres = 75
-        self.bake_unreliable_kernel_size = int(
-            (2 / 512) * max(self.default_resolution[0], self.default_resolution[1]))
-        self.bake_mode = bake_mode
-
-        self.raster_mode = raster_mode
-        if self.raster_mode == 'cr':
-            import custom_rasterizer as cr
-            self.raster = cr
-        else:
-            raise ValueError(f'No raster named {self.raster_mode}')
-
-        if camera_type == 'orth':
-            self.ortho_scale = 1.2
-            self.camera_proj_mat = get_orthographic_projection_matrix(
-                left=-self.ortho_scale * 0.5, right=self.ortho_scale * 0.5,
-                bottom=-self.ortho_scale * 0.5, top=self.ortho_scale * 0.5,
-                near=0.1, far=100
-            )
-        elif camera_type == 'perspective':
-            self.camera_proj_mat = get_perspective_projection_matrix(
-                49.13, self.default_resolution[1] / self.default_resolution[0],
-                0.01, 100.0
-            )
-        else:
-            raise ValueError(f'No camera type {camera_type}')
-
-    def raster_rasterize(self, pos, tri, resolution, ranges=None, grad_db=True):
-
-        if self.raster_mode == 'cr':
-            rast_out_db = None
-            if pos.dim() == 2:
-                pos = pos.unsqueeze(0)
-            findices, barycentric = self.raster.rasterize(pos, tri, resolution)
-            rast_out = torch.cat((barycentric, findices.unsqueeze(-1)), dim=-1)
-            rast_out = rast_out.unsqueeze(0)
-        else:
-            raise ValueError(f'No raster named {self.raster_mode}')
-
-        return rast_out, rast_out_db
-
-    def raster_interpolate(self, uv, rast_out, uv_idx, rast_db=None, diff_attrs=None):
-
-        if self.raster_mode == 'cr':
-            textd = None
-            barycentric = rast_out[0, ..., :-1]
-            findices = rast_out[0, ..., -1]
-            if uv.dim() == 2:
-                uv = uv.unsqueeze(0)
-            textc = self.raster.interpolate(uv, findices, barycentric, uv_idx)
-        else:
-            raise ValueError(f'No raster named {self.raster_mode}')
-
-        return textc, textd
-
-    def raster_texture(self, tex, uv, uv_da=None, mip_level_bias=None, mip=None, filter_mode='auto',
-                       boundary_mode='wrap', max_mip_level=None):
-
-        if self.raster_mode == 'cr':
-            raise NotImplementedError('Texture is not implemented in cr')
-        else:
-            raise ValueError(f'No raster named {self.raster_mode}')
-
-        return color
-
-    def raster_antialias(self, color, rast, pos, tri, topology_hash=None, pos_gradient_boost=1.0):
-
-        if self.raster_mode == 'cr':
-            # Antialias has not been supported yet
-            color = color
-        else:
-            raise ValueError(f'No raster named {self.raster_mode}')
-
-        return color
-
-    def load_mesh(
-        self,
-        mesh,
-        scale_factor=1.15,
-        auto_center=True,
-    ):
-        vtx_pos, pos_idx, vtx_uv, uv_idx, texture_data = load_mesh(mesh)
-        self.mesh_copy = mesh
-        self.set_mesh(vtx_pos, pos_idx,
-                      vtx_uv=vtx_uv, uv_idx=uv_idx,
-                      scale_factor=scale_factor, auto_center=auto_center
-                      )
-        if texture_data is not None:
-            self.set_texture(texture_data)
-
-    def save_mesh(self):
-        texture_data = self.get_texture()
-        texture_data = Image.fromarray((texture_data * 255).astype(np.uint8))
-        return save_mesh(self.mesh_copy, texture_data)
-
-    def set_mesh(
-        self,
-        vtx_pos, pos_idx,
-        vtx_uv=None, uv_idx=None,
-        scale_factor=1.15, auto_center=True
-    ):
-
-        self.vtx_pos = torch.from_numpy(vtx_pos).to(self.device).float()
-        self.pos_idx = torch.from_numpy(pos_idx).to(self.device).to(torch.int)
-        if (vtx_uv is not None) and (uv_idx is not None):
-            self.vtx_uv = torch.from_numpy(vtx_uv).to(self.device).float()
-            self.uv_idx = torch.from_numpy(uv_idx).to(self.device).to(torch.int)
-        else:
-            self.vtx_uv = None
-            self.uv_idx = None
-
-        self.vtx_pos[:, [0, 1]] = -self.vtx_pos[:, [0, 1]]
-        self.vtx_pos[:, [1, 2]] = self.vtx_pos[:, [2, 1]]
-        if (vtx_uv is not None) and (uv_idx is not None):
-            self.vtx_uv[:, 1] = 1.0 - self.vtx_uv[:, 1]
-
-        if auto_center:
-            max_bb = (self.vtx_pos - 0).max(0)[0]
-            min_bb = (self.vtx_pos - 0).min(0)[0]
-            center = (max_bb + min_bb) / 2
-            scale = torch.norm(self.vtx_pos - center, dim=1).max() * 2.0
-            self.vtx_pos = (self.vtx_pos - center) * \
-                (scale_factor / float(scale))
-        self.scale_factor = scale_factor
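set_mesh above moves vertices into the renderer's internal frame by negating x and y and then swapping y with z; get_mesh below applies the exact inverse. A tiny numpy round trip, purely illustrative:

```python
# Round-trip sketch of the axis convention used by set_mesh / get_mesh above.
import numpy as np

v = np.array([[0.1, 0.2, 0.3]], dtype=np.float32)

internal = v.copy()
internal[:, [0, 1]] = -internal[:, [0, 1]]  # set_mesh: negate x and y
internal[:, [1, 2]] = internal[:, [2, 1]]   # set_mesh: swap y and z

restored = internal.copy()
restored[:, [1, 2]] = restored[:, [2, 1]]   # get_mesh: swap back
restored[:, [0, 1]] = -restored[:, [0, 1]]  # get_mesh: negate back
assert np.allclose(restored, v)
```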
-
-    def set_texture(self, tex):
-        if isinstance(tex, np.ndarray):
-            tex = Image.fromarray((tex * 255).astype(np.uint8))
-        elif isinstance(tex, torch.Tensor):
-            tex = tex.cpu().numpy()
-            tex = Image.fromarray((tex * 255).astype(np.uint8))
-
-        tex = tex.resize(self.texture_size).convert('RGB')
-        tex = np.array(tex) / 255.0
-        self.tex = torch.from_numpy(tex).to(self.device)
-        self.tex = self.tex.float()
-
-    def set_default_render_resolution(self, default_resolution):
-        if isinstance(default_resolution, int):
-            default_resolution = (default_resolution, default_resolution)
-        self.default_resolution = default_resolution
-
-    def set_default_texture_resolution(self, texture_size):
-        if isinstance(texture_size, int):
-            texture_size = (texture_size, texture_size)
-        self.texture_size = texture_size
-
-    def get_mesh(self):
-        vtx_pos = self.vtx_pos.cpu().numpy()
-        pos_idx = self.pos_idx.cpu().numpy()
-        vtx_uv = self.vtx_uv.cpu().numpy()
-        uv_idx = self.uv_idx.cpu().numpy()
-
-        # Inverse of the coordinate transform applied in set_mesh
-        vtx_pos[:, [1, 2]] = vtx_pos[:, [2, 1]]
-        vtx_pos[:, [0, 1]] = -vtx_pos[:, [0, 1]]
-
-        vtx_uv[:, 1] = 1.0 - vtx_uv[:, 1]
-        return vtx_pos, pos_idx, vtx_uv, uv_idx
-
-    def get_texture(self):
-        return self.tex.cpu().numpy()
-
-    def to(self, device):
-        self.device = device
-
-        for attr_name in dir(self):
-            attr_value = getattr(self, attr_name)
-            if isinstance(attr_value, torch.Tensor):
-                setattr(self, attr_name, attr_value.to(self.device))
-
-    def color_rgb_to_srgb(self, image):
-        if isinstance(image, Image.Image):
-            image_rgb = torch.tensor(
-                np.array(image) /
-                255.0).float().to(
-                self.device)
-        elif isinstance(image, np.ndarray):
-            image_rgb = torch.tensor(image).float()
-        else:
-            image_rgb = image.to(self.device)
-
-        image_srgb = torch.where(
-            image_rgb <= 0.0031308,
-            12.92 * image_rgb,
-            1.055 * torch.pow(image_rgb, 1 / 2.4) - 0.055
-        )
-
-        if isinstance(image, Image.Image):
-            image_srgb = Image.fromarray(
-                (image_srgb.cpu().numpy() *
-                 255).astype(
-                    np.uint8))
-        elif isinstance(image, np.ndarray):
-            image_srgb = image_srgb.cpu().numpy()
-        else:
-            image_srgb = image_srgb.to(image.device)
-
-        return image_srgb
-
-    def _render(
-        self,
-        mvp,
-        pos,
-        pos_idx,
-        uv,
-        uv_idx,
-        tex,
-        resolution,
-        max_mip_level,
-        keep_alpha,
-        filter_mode
-    ):
-        pos_clip = transform_pos(mvp, pos)
-        if isinstance(resolution, (int, float)):
-            resolution = [resolution, resolution]
-        rast_out, rast_out_db = self.raster_rasterize(
-            pos_clip, pos_idx, resolution=resolution)
-
-        tex = tex.contiguous()
-        if filter_mode == 'linear-mipmap-linear':
-            texc, texd = self.raster_interpolate(
-                uv[None, ...], rast_out, uv_idx, rast_db=rast_out_db, diff_attrs='all')
-            color = self.raster_texture(
-                tex[None, ...], texc, texd, filter_mode='linear-mipmap-linear', max_mip_level=max_mip_level)
-        else:
-            texc, _ = self.raster_interpolate(uv[None, ...], rast_out, uv_idx)
-            color = self.raster_texture(tex[None, ...], texc, filter_mode=filter_mode)
-
-        visible_mask = torch.clamp(rast_out[..., -1:], 0, 1)
-        color = color * visible_mask  # Mask out background.
-        if self.use_antialias:
-            color = self.raster_antialias(color, rast_out, pos_clip, pos_idx)
-
-        if keep_alpha:
-            color = torch.cat([color, visible_mask], dim=-1)
-        return color[0, ...]
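With _render in place, the public entry points that follow wrap it with camera setup. A hypothetical end-to-end sketch; the asset path is a placeholder, CUDA availability is assumed, and the loaded mesh is assumed to carry a texture:

```python
# Hypothetical driver for MeshRender (this file); 'example.glb' is a
# placeholder asset and the mesh is assumed to ship with a texture.
import trimesh

renderer = MeshRender(camera_type='orth', default_resolution=1024,
                      texture_size=1024, device='cuda')
renderer.load_mesh(trimesh.load('example.glb'))

rgba = renderer.render(elev=0, azim=0, return_type='np')            # (H, W, 4)
normal = renderer.render_normal(elev=0, azim=90, return_type='pl')  # PIL image
```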
- - def render( - self, - elev, - azim, - camera_distance=None, - center=None, - resolution=None, - tex=None, - keep_alpha=True, - bgcolor=None, - filter_mode=None, - return_type='th' - ): - - proj = self.camera_proj_mat - r_mv = get_mv_matrix( - elev=elev, - azim=azim, - camera_distance=self.camera_distance if camera_distance is None else camera_distance, - center=center) - r_mvp = np.matmul(proj, r_mv).astype(np.float32) - if tex is not None: - if isinstance(tex, Image.Image): - tex = torch.tensor(np.array(tex) / 255.0) - elif isinstance(tex, np.ndarray): - tex = torch.tensor(tex) - if tex.dim() == 2: - tex = tex.unsqueeze(-1) - tex = tex.float().to(self.device) - image = self._render(r_mvp, self.vtx_pos, self.pos_idx, self.vtx_uv, self.uv_idx, - self.tex if tex is None else tex, - self.default_resolution if resolution is None else resolution, - self.max_mip_level, True, filter_mode if filter_mode else self.filter_mode) - mask = (image[..., [-1]] == 1).float() - if bgcolor is None: - bgcolor = [0 for _ in range(image.shape[-1] - 1)] - image = image * mask + (1 - mask) * \ - torch.tensor(bgcolor + [0]).to(self.device) - if keep_alpha == False: - image = image[..., :-1] - if return_type == 'np': - image = image.cpu().numpy() - elif return_type == 'pl': - image = image.squeeze(-1).cpu().numpy() * 255 - image = Image.fromarray(image.astype(np.uint8)) - return image - - def render_normal( - self, - elev, - azim, - camera_distance=None, - center=None, - resolution=None, - bg_color=[1, 1, 1], - use_abs_coor=False, - normalize_rgb=True, - return_type='th' - ): - - pos_camera, pos_clip = self.get_pos_from_mvp(elev, azim, camera_distance, center) - if resolution is None: - resolution = self.default_resolution - if isinstance(resolution, (int, float)): - resolution = [resolution, resolution] - rast_out, rast_out_db = self.raster_rasterize( - pos_clip, self.pos_idx, resolution=resolution) - - if use_abs_coor: - mesh_triangles = self.vtx_pos[self.pos_idx[:, :3], :] - else: - pos_camera = pos_camera[:, :3] / pos_camera[:, 3:4] - mesh_triangles = pos_camera[self.pos_idx[:, :3], :] - face_normals = F.normalize( - torch.cross(mesh_triangles[:, - 1, - :] - mesh_triangles[:, - 0, - :], - mesh_triangles[:, - 2, - :] - mesh_triangles[:, - 0, - :], - dim=-1), - dim=-1) - - vertex_normals = trimesh.geometry.mean_vertex_normals(vertex_count=self.vtx_pos.shape[0], - faces=self.pos_idx.cpu(), - face_normals=face_normals.cpu(), ) - vertex_normals = torch.from_numpy( - vertex_normals).float().to(self.device).contiguous() - - # Interpolate normal values across the rasterized pixels - normal, _ = self.raster_interpolate( - vertex_normals[None, ...], rast_out, self.pos_idx) - - visible_mask = torch.clamp(rast_out[..., -1:], 0, 1) - normal = normal * visible_mask + \ - torch.tensor(bg_color, dtype=torch.float32, device=self.device) * (1 - - visible_mask) # Mask out background. - - if normalize_rgb: - normal = (normal + 1) * 0.5 - if self.use_antialias: - normal = self.raster_antialias(normal, rast_out, pos_clip, self.pos_idx) - - image = normal[0, ...] 
- if return_type == 'np': - image = image.cpu().numpy() - elif return_type == 'pl': - image = image.cpu().numpy() * 255 - image = Image.fromarray(image.astype(np.uint8)) - - return image - - def convert_normal_map(self, image): - # blue is front, red is left, green is top - if isinstance(image, Image.Image): - image = np.array(image) - mask = (image == [255, 255, 255]).all(axis=-1) - - image = (image / 255.0) * 2.0 - 1.0 - - image[..., [1]] = -image[..., [1]] - image[..., [1, 2]] = image[..., [2, 1]] - image[..., [0]] = -image[..., [0]] - - image = (image + 1.0) * 0.5 - - image = (image * 255).astype(np.uint8) - image[mask] = [127, 127, 255] - - return Image.fromarray(image) - - def get_pos_from_mvp(self, elev, azim, camera_distance, center): - proj = self.camera_proj_mat - r_mv = get_mv_matrix( - elev=elev, - azim=azim, - camera_distance=self.camera_distance if camera_distance is None else camera_distance, - center=center) - - pos_camera = transform_pos(r_mv, self.vtx_pos, keepdim=True) - pos_clip = transform_pos(proj, pos_camera) - - return pos_camera, pos_clip - - def render_depth( - self, - elev, - azim, - camera_distance=None, - center=None, - resolution=None, - return_type='th' - ): - pos_camera, pos_clip = self.get_pos_from_mvp(elev, azim, camera_distance, center) - - if resolution is None: - resolution = self.default_resolution - if isinstance(resolution, (int, float)): - resolution = [resolution, resolution] - rast_out, rast_out_db = self.raster_rasterize( - pos_clip, self.pos_idx, resolution=resolution) - - pos_camera = pos_camera[:, :3] / pos_camera[:, 3:4] - tex_depth = pos_camera[:, 2].reshape(1, -1, 1).contiguous() - - # Interpolate depth values across the rasterized pixels - depth, _ = self.raster_interpolate(tex_depth, rast_out, self.pos_idx) - - visible_mask = torch.clamp(rast_out[..., -1:], 0, 1) - depth_max, depth_min = depth[visible_mask > - 0].max(), depth[visible_mask > 0].min() - depth = (depth - depth_min) / (depth_max - depth_min) - - depth = depth * visible_mask # Mask out background. - if self.use_antialias: - depth = self.raster_antialias(depth, rast_out, pos_clip, self.pos_idx) - - image = depth[0, ...] - if return_type == 'np': - image = image.cpu().numpy() - elif return_type == 'pl': - image = image.squeeze(-1).cpu().numpy() * 255 - image = Image.fromarray(image.astype(np.uint8)) - return image - - def render_position(self, elev, azim, camera_distance=None, center=None, - resolution=None, bg_color=[1, 1, 1], return_type='th'): - pos_camera, pos_clip = self.get_pos_from_mvp(elev, azim, camera_distance, center) - if resolution is None: - resolution = self.default_resolution - if isinstance(resolution, (int, float)): - resolution = [resolution, resolution] - rast_out, rast_out_db = self.raster_rasterize( - pos_clip, self.pos_idx, resolution=resolution) - - tex_position = 0.5 - self.vtx_pos[:, :3] / self.scale_factor - tex_position = tex_position.contiguous() - - # Interpolate depth values across the rasterized pixels - position, _ = self.raster_interpolate( - tex_position[None, ...], rast_out, self.pos_idx) - - visible_mask = torch.clamp(rast_out[..., -1:], 0, 1) - - position = position * visible_mask + \ - torch.tensor(bg_color, dtype=torch.float32, device=self.device) * (1 - - visible_mask) # Mask out background. - if self.use_antialias: - position = self.raster_antialias(position, rast_out, pos_clip, self.pos_idx) - - image = position[0, ...] 
- - if return_type == 'np': - image = image.cpu().numpy() - elif return_type == 'pl': - image = image.squeeze(-1).cpu().numpy() * 255 - image = Image.fromarray(image.astype(np.uint8)) - return image - - def render_uvpos(self, return_type='th'): - image = self.uv_feature_map(self.vtx_pos * 0.5 + 0.5) - if return_type == 'np': - image = image.cpu().numpy() - elif return_type == 'pl': - image = image.cpu().numpy() * 255 - image = Image.fromarray(image.astype(np.uint8)) - return image - - def uv_feature_map(self, vert_feat, bg=None): - vtx_uv = self.vtx_uv * 2 - 1.0 - vtx_uv = torch.cat( - [vtx_uv, torch.zeros_like(self.vtx_uv)], dim=1).unsqueeze(0) - vtx_uv[..., -1] = 1 - uv_idx = self.uv_idx - rast_out, rast_out_db = self.raster_rasterize( - vtx_uv, uv_idx, resolution=self.texture_size) - feat_map, _ = self.raster_interpolate(vert_feat[None, ...], rast_out, uv_idx) - feat_map = feat_map[0, ...] - if bg is not None: - visible_mask = torch.clamp(rast_out[..., -1:], 0, 1)[0, ...] - feat_map[visible_mask == 0] = bg - return feat_map - - def render_sketch_from_geometry(self, normal_image, depth_image): - normal_image_np = normal_image.cpu().numpy() - depth_image_np = depth_image.cpu().numpy() - - normal_image_np = (normal_image_np * 255).astype(np.uint8) - depth_image_np = (depth_image_np * 255).astype(np.uint8) - normal_image_np = cv2.cvtColor(normal_image_np, cv2.COLOR_RGB2GRAY) - - normal_edges = cv2.Canny(normal_image_np, 80, 150) - depth_edges = cv2.Canny(depth_image_np, 30, 80) - - combined_edges = np.maximum(normal_edges, depth_edges) - - sketch_image = torch.from_numpy(combined_edges).to( - normal_image.device).float() / 255.0 - sketch_image = sketch_image.unsqueeze(-1) - - return sketch_image - - def render_sketch_from_depth(self, depth_image): - depth_image_np = depth_image.cpu().numpy() - depth_image_np = (depth_image_np * 255).astype(np.uint8) - depth_edges = cv2.Canny(depth_image_np, 30, 80) - combined_edges = depth_edges - sketch_image = torch.from_numpy(combined_edges).to( - depth_image.device).float() / 255.0 - sketch_image = sketch_image.unsqueeze(-1) - return sketch_image - - def back_project(self, image, elev, azim, - camera_distance=None, center=None, method=None): - if isinstance(image, Image.Image): - image = torch.tensor(np.array(image) / 255.0) - elif isinstance(image, np.ndarray): - image = torch.tensor(image) - if image.dim() == 2: - image = image.unsqueeze(-1) - image = image.float().to(self.device) - resolution = image.shape[:2] - channel = image.shape[-1] - texture = torch.zeros(self.texture_size + (channel,)).to(self.device) - cos_map = torch.zeros(self.texture_size + (1,)).to(self.device) - - proj = self.camera_proj_mat - r_mv = get_mv_matrix( - elev=elev, - azim=azim, - camera_distance=self.camera_distance if camera_distance is None else camera_distance, - center=center) - pos_camera = transform_pos(r_mv, self.vtx_pos, keepdim=True) - pos_clip = transform_pos(proj, pos_camera) - pos_camera = pos_camera[:, :3] / pos_camera[:, 3:4] - v0 = pos_camera[self.pos_idx[:, 0], :] - v1 = pos_camera[self.pos_idx[:, 1], :] - v2 = pos_camera[self.pos_idx[:, 2], :] - face_normals = F.normalize( - torch.cross( - v1 - v0, - v2 - v0, - dim=-1), - dim=-1) - vertex_normals = trimesh.geometry.mean_vertex_normals(vertex_count=self.vtx_pos.shape[0], - faces=self.pos_idx.cpu(), - face_normals=face_normals.cpu(), ) - vertex_normals = torch.from_numpy( - vertex_normals).float().to(self.device).contiguous() - tex_depth = pos_camera[:, 2].reshape(1, -1, 1).contiguous() - rast_out, rast_out_db 
= self.raster_rasterize( - pos_clip, self.pos_idx, resolution=resolution) - visible_mask = torch.clamp(rast_out[..., -1:], 0, 1)[0, ...] - - normal, _ = self.raster_interpolate( - vertex_normals[None, ...], rast_out, self.pos_idx) - normal = normal[0, ...] - uv, _ = self.raster_interpolate(self.vtx_uv[None, ...], rast_out, self.uv_idx) - depth, _ = self.raster_interpolate(tex_depth, rast_out, self.pos_idx) - depth = depth[0, ...] - - depth_max, depth_min = depth[visible_mask > - 0].max(), depth[visible_mask > 0].min() - depth_normalized = (depth - depth_min) / (depth_max - depth_min) - depth_image = depth_normalized * visible_mask # Mask out background. - - sketch_image = self.render_sketch_from_depth(depth_image) - - lookat = torch.tensor([[0, 0, -1]], device=self.device) - cos_image = torch.nn.functional.cosine_similarity( - lookat, normal.view(-1, 3)) - cos_image = cos_image.view(normal.shape[0], normal.shape[1], 1) - - cos_thres = np.cos(self.bake_angle_thres / 180 * np.pi) - cos_image[cos_image < cos_thres] = 0 - - # shrink - kernel_size = self.bake_unreliable_kernel_size * 2 + 1 - kernel = torch.ones( - (1, 1, kernel_size, kernel_size), dtype=torch.float32).to( - sketch_image.device) - - visible_mask = visible_mask.permute(2, 0, 1).unsqueeze(0).float() - visible_mask = F.conv2d( - 1.0 - visible_mask, - kernel, - padding=kernel_size // 2) - visible_mask = 1.0 - (visible_mask > 0).float() # binarize - visible_mask = visible_mask.squeeze(0).permute(1, 2, 0) - - sketch_image = sketch_image.permute(2, 0, 1).unsqueeze(0) - sketch_image = F.conv2d(sketch_image, kernel, padding=kernel_size // 2) - sketch_image = (sketch_image > 0).float() # binarize - sketch_image = sketch_image.squeeze(0).permute(1, 2, 0) - visible_mask = visible_mask * (sketch_image < 0.5) - - cos_image[visible_mask == 0] = 0 - - method = self.bake_mode if method is None else method - - if method == 'linear': - proj_mask = (visible_mask != 0).view(-1) - uv = uv.squeeze(0).contiguous().view(-1, 2)[proj_mask] - image = image.squeeze(0).contiguous().view(-1, channel)[proj_mask] - cos_image = cos_image.contiguous().view(-1, 1)[proj_mask] - sketch_image = sketch_image.contiguous().view(-1, 1)[proj_mask] - - texture = linear_grid_put_2d( - self.texture_size[1], self.texture_size[0], uv[..., [1, 0]], image) - cos_map = linear_grid_put_2d( - self.texture_size[1], self.texture_size[0], uv[..., [1, 0]], cos_image) - boundary_map = linear_grid_put_2d( - self.texture_size[1], self.texture_size[0], uv[..., [1, 0]], sketch_image) - else: - raise ValueError(f'No bake mode {method}') - - return texture, cos_map, boundary_map - - def bake_texture(self, colors, elevs, azims, - camera_distance=None, center=None, exp=6, weights=None): - for i in range(len(colors)): - if isinstance(colors[i], Image.Image): - colors[i] = torch.tensor( - np.array( - colors[i]) / 255.0, - device=self.device).float() - if weights is None: - weights = [1.0 for _ in range(len(colors))] - textures = [] - cos_maps = [] - for color, elev, azim, weight in zip(colors, elevs, azims, weights): - texture, cos_map, _ = self.back_project( - color, elev, azim, camera_distance, center) - cos_map = weight * (cos_map ** exp) - textures.append(texture) - cos_maps.append(cos_map) - - texture_merge, trust_map_merge = self.fast_bake_texture( - textures, cos_maps) - return texture_merge, trust_map_merge - - @torch.no_grad() - def fast_bake_texture(self, textures, cos_maps): - - channel = textures[0].shape[-1] - texture_merge = torch.zeros( - self.texture_size + (channel,)).to(self.device) - trust_map_merge =
torch.zeros(self.texture_size + (1,)).to(self.device) - for texture, cos_map in zip(textures, cos_maps): - view_sum = (cos_map > 0).sum() - painted_sum = ((cos_map > 0) * (trust_map_merge > 0)).sum() - if painted_sum / view_sum > 0.99: - continue - texture_merge += texture * cos_map - trust_map_merge += cos_map - texture_merge = texture_merge / torch.clamp(trust_map_merge, min=1E-8) - - return texture_merge, trust_map_merge > 1E-8 - - def uv_inpaint(self, texture, mask): - - if isinstance(texture, torch.Tensor): - texture_np = texture.cpu().numpy() - elif isinstance(texture, np.ndarray): - texture_np = texture - elif isinstance(texture, Image.Image): - texture_np = np.array(texture) / 255.0 - - vtx_pos, pos_idx, vtx_uv, uv_idx = self.get_mesh() - - texture_np, mask = meshVerticeInpaint( - texture_np, mask, vtx_pos, vtx_uv, pos_idx, uv_idx) - - texture_np = cv2.inpaint( - (texture_np * - 255).astype( - np.uint8), - 255 - - mask, - 3, - cv2.INPAINT_NS) - - return texture_np diff --git a/hy3dgen/texgen/differentiable_renderer/mesh_utils.py b/hy3dgen/texgen/differentiable_renderer/mesh_utils.py deleted file mode 100644 index ca0ba1a6145c68651ec033b97e80900cd2c9d7ec..0000000000000000000000000000000000000000 --- a/hy3dgen/texgen/differentiable_renderer/mesh_utils.py +++ /dev/null @@ -1,44 +0,0 @@ -# Open Source Model Licensed under the Apache License Version 2.0 -# and Other Licenses of the Third-Party Components therein: -# The below Model in this distribution may have been modified by THL A29 Limited -# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. - -# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. -# The below software and/or models in this distribution may have been -# modified by THL A29 Limited ("Tencent Modifications"). -# All Tencent Modifications are Copyright (C) THL A29 Limited. - -# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT -# except for the third-party components listed below. -# Hunyuan 3D does not impose any additional limitations beyond what is outlined -# in the repsective licenses of these third-party components. -# Users must comply with all terms and conditions of original licenses of these third-party -# components and must ensure that the usage of the third party components adheres to -# all relevant laws and regulations. - -# For avoidance of doubts, Hunyuan 3D means the large language models and -# their software and algorithms, including trained model weights, parameters (including -# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, -# fine-tuning enabling code and other elements of the foregoing made publicly available -# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
- -import trimesh - - -def load_mesh(mesh): - vtx_pos = mesh.vertices if hasattr(mesh, 'vertices') else None - pos_idx = mesh.faces if hasattr(mesh, 'faces') else None - - vtx_uv = mesh.visual.uv if hasattr(mesh.visual, 'uv') else None - uv_idx = mesh.faces if hasattr(mesh, 'faces') else None - - texture_data = None - - return vtx_pos, pos_idx, vtx_uv, uv_idx, texture_data - - -def save_mesh(mesh, texture_data): - material = trimesh.visual.texture.SimpleMaterial(image=texture_data, diffuse=(255, 255, 255)) - texture_visuals = trimesh.visual.TextureVisuals(uv=mesh.visual.uv, image=texture_data, material=material) - mesh.visual = texture_visuals - return mesh diff --git a/hy3dgen/texgen/differentiable_renderer/setup.py b/hy3dgen/texgen/differentiable_renderer/setup.py deleted file mode 100644 index 2ea78693fe96ac027742bd752238421c6d83f8fc..0000000000000000000000000000000000000000 --- a/hy3dgen/texgen/differentiable_renderer/setup.py +++ /dev/null @@ -1,48 +0,0 @@ -from setuptools import setup, Extension -import pybind11 -import sys -import platform - -def get_platform_specific_args(): - system = platform.system().lower() - cpp_std = 'c++14' # Make configurable if needed - - if sys.platform == 'win32': - compile_args = ['/O2', f'/std:{cpp_std}', '/EHsc', '/MP', '/DWIN32_LEAN_AND_MEAN', '/bigobj'] - link_args = [] - extra_includes = [] - elif system == 'linux': - compile_args = ['-O3', f'-std={cpp_std}', '-fPIC', '-Wall', '-Wextra', '-pthread'] - link_args = ['-fPIC', '-pthread'] - extra_includes = [] - elif sys.platform == 'darwin': - compile_args = ['-O3', f'-std={cpp_std}', '-fPIC', '-Wall', '-Wextra', - '-stdlib=libc++', '-mmacosx-version-min=10.14'] - link_args = ['-fPIC', '-stdlib=libc++', '-mmacosx-version-min=10.14', '-dynamiclib'] - extra_includes = [] - else: - raise RuntimeError(f"Unsupported platform: {system}") - - return compile_args, link_args, extra_includes - -extra_compile_args, extra_link_args, platform_includes = get_platform_specific_args() -include_dirs = [pybind11.get_include(), pybind11.get_include(user=True)] -include_dirs.extend(platform_includes) - -ext_modules = [ - Extension( - "mesh_processor", - ["mesh_processor.cpp"], - include_dirs=include_dirs, - language='c++', - extra_compile_args=extra_compile_args, - extra_link_args=extra_link_args, - ), -] - -setup( - name="mesh_processor", - ext_modules=ext_modules, - install_requires=['pybind11>=2.6.0'], - python_requires='>=3.6', -) \ No newline at end of file diff --git a/hy3dgen/texgen/hunyuanpaint/__init__.py b/hy3dgen/texgen/hunyuanpaint/__init__.py deleted file mode 100644 index e307c3f8c1292da02f308e4b59ef0bcd6fe7305e..0000000000000000000000000000000000000000 --- a/hy3dgen/texgen/hunyuanpaint/__init__.py +++ /dev/null @@ -1,23 +0,0 @@ -# Open Source Model Licensed under the Apache License Version 2.0 -# and Other Licenses of the Third-Party Components therein: -# The below Model in this distribution may have been modified by THL A29 Limited -# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. - -# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. -# The below software and/or models in this distribution may have been -# modified by THL A29 Limited ("Tencent Modifications"). -# All Tencent Modifications are Copyright (C) THL A29 Limited. - -# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT -# except for the third-party components listed below. 
-# Hunyuan 3D does not impose any additional limitations beyond what is outlined -# in the repsective licenses of these third-party components. -# Users must comply with all terms and conditions of original licenses of these third-party -# components and must ensure that the usage of the third party components adheres to -# all relevant laws and regulations. - -# For avoidance of doubts, Hunyuan 3D means the large language models and -# their software and algorithms, including trained model weights, parameters (including -# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, -# fine-tuning enabling code and other elements of the foregoing made publicly available -# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. diff --git a/hy3dgen/texgen/hunyuanpaint/pipeline.py b/hy3dgen/texgen/hunyuanpaint/pipeline.py deleted file mode 100644 index 436ce34efb8bc40c3df2b3902b7a29dffa39ae91..0000000000000000000000000000000000000000 --- a/hy3dgen/texgen/hunyuanpaint/pipeline.py +++ /dev/null @@ -1,554 +0,0 @@ -# Open Source Model Licensed under the Apache License Version 2.0 -# and Other Licenses of the Third-Party Components therein: -# The below Model in this distribution may have been modified by THL A29 Limited -# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. - -# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. -# The below software and/or models in this distribution may have been -# modified by THL A29 Limited ("Tencent Modifications"). -# All Tencent Modifications are Copyright (C) THL A29 Limited. - -# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT -# except for the third-party components listed below. -# Hunyuan 3D does not impose any additional limitations beyond what is outlined -# in the repsective licenses of these third-party components. -# Users must comply with all terms and conditions of original licenses of these third-party -# components and must ensure that the usage of the third party components adheres to -# all relevant laws and regulations. - -# For avoidance of doubts, Hunyuan 3D means the large language models and -# their software and algorithms, including trained model weights, parameters (including -# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, -# fine-tuning enabling code and other elements of the foregoing made publicly available -# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
- -from typing import Any, Callable, Dict, List, Optional, Union - -import numpy -import numpy as np -import torch -import torch.distributed -import torch.utils.checkpoint -from PIL import Image -from diffusers import ( - AutoencoderKL, - DiffusionPipeline, - ImagePipelineOutput -) -from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback -from diffusers.image_processor import PipelineImageInput -from diffusers.image_processor import VaeImageProcessor -from diffusers.pipelines.stable_diffusion.pipeline_output import StableDiffusionPipelineOutput -from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import StableDiffusionPipeline, retrieve_timesteps, \ - rescale_noise_cfg -from diffusers.schedulers import KarrasDiffusionSchedulers -from diffusers.utils import deprecate -from einops import rearrange -from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer - -from .unet.modules import UNet2p5DConditionModel - - -def to_rgb_image(maybe_rgba: Image.Image): - if maybe_rgba.mode == 'RGB': - return maybe_rgba - elif maybe_rgba.mode == 'RGBA': - rgba = maybe_rgba - img = numpy.random.randint(127, 128, size=[rgba.size[1], rgba.size[0], 3], dtype=numpy.uint8) - img = Image.fromarray(img, 'RGB') - img.paste(rgba, mask=rgba.getchannel('A')) - return img - else: - raise ValueError("Unsupported image type.", maybe_rgba.mode) - - -class HunyuanPaintPipeline(StableDiffusionPipeline): - - def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2p5DConditionModel, - scheduler: KarrasDiffusionSchedulers, - feature_extractor: CLIPImageProcessor, - safety_checker=None, - use_torch_compile=False, - ): - DiffusionPipeline.__init__(self) - - safety_checker = None - self.register_modules( - vae=torch.compile(vae) if use_torch_compile else vae, - text_encoder=text_encoder, - tokenizer=tokenizer, - unet=unet, - scheduler=scheduler, - safety_checker=safety_checker, - feature_extractor=torch.compile(feature_extractor) if use_torch_compile else feature_extractor, - ) - self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) - self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) - - @torch.no_grad() - def encode_images(self, images): - B = images.shape[0] - images = rearrange(images, 'b n c h w -> (b n) c h w') - - dtype = next(self.vae.parameters()).dtype - images = (images - 0.5) * 2.0 - posterior = self.vae.encode(images.to(dtype)).latent_dist - latents = posterior.sample() * self.vae.config.scaling_factor - - latents = rearrange(latents, '(b n) c h w -> b n c h w', b=B) - return latents - - @torch.no_grad() - def __call__( - self, - image: Image.Image = None, - prompt=None, - negative_prompt='watermark, ugly, deformed, noisy, blurry, low contrast', - *args, - num_images_per_prompt: Optional[int] = 1, - guidance_scale=2.0, - output_type: Optional[str] = "pil", - width=512, - height=512, - num_inference_steps=28, - return_dict=True, - **cached_condition, - ): - if image is None: - raise ValueError("Inputting embeddings not supported for this pipeline. 
Please pass an image.") - assert not isinstance(image, torch.Tensor) - - image = to_rgb_image(image) - - image_vae = torch.tensor(np.array(image) / 255.0) - image_vae = image_vae.unsqueeze(0).permute(0, 3, 1, 2).unsqueeze(0) - image_vae = image_vae.to(device=self.vae.device, dtype=self.vae.dtype) - - batch_size = image_vae.shape[0] - assert batch_size == 1 - assert num_images_per_prompt == 1 - - ref_latents = self.encode_images(image_vae) - - def convert_pil_list_to_tensor(images): - bg_c = [1., 1., 1.] - images_tensor = [] - for batch_imgs in images: - view_imgs = [] - for pil_img in batch_imgs: - img = numpy.asarray(pil_img, dtype=numpy.float32) / 255. - if img.shape[2] > 3: - alpha = img[:, :, 3:] - img = img[:, :, :3] * alpha + bg_c * (1 - alpha) - img = torch.from_numpy(img).permute(2, 0, 1).unsqueeze(0).contiguous().half().to("cuda") - view_imgs.append(img) - view_imgs = torch.cat(view_imgs, dim=0) - images_tensor.append(view_imgs.unsqueeze(0)) - - images_tensor = torch.cat(images_tensor, dim=0) - return images_tensor - - if "normal_imgs" in cached_condition: - - if isinstance(cached_condition["normal_imgs"], List): - cached_condition["normal_imgs"] = convert_pil_list_to_tensor(cached_condition["normal_imgs"]) - - cached_condition['normal_imgs'] = self.encode_images(cached_condition["normal_imgs"]) - - if "position_imgs" in cached_condition: - - if isinstance(cached_condition["position_imgs"], List): - cached_condition["position_imgs"] = convert_pil_list_to_tensor(cached_condition["position_imgs"]) - - cached_condition["position_imgs"] = self.encode_images(cached_condition["position_imgs"]) - - if 'camera_info_gen' in cached_condition: - camera_info = cached_condition['camera_info_gen'] # B,N - if isinstance(camera_info, List): - camera_info = torch.tensor(camera_info) - camera_info = camera_info.to(image_vae.device).to(torch.int64) - cached_condition['camera_info_gen'] = camera_info - if 'camera_info_ref' in cached_condition: - camera_info = cached_condition['camera_info_ref'] # B,N - if isinstance(camera_info, List): - camera_info = torch.tensor(camera_info) - camera_info = camera_info.to(image_vae.device).to(torch.int64) - cached_condition['camera_info_ref'] = camera_info - - cached_condition['ref_latents'] = ref_latents - - if guidance_scale > 1: - negative_ref_latents = torch.zeros_like(cached_condition['ref_latents']) - cached_condition['ref_latents'] = torch.cat([negative_ref_latents, cached_condition['ref_latents']]) - cached_condition['ref_scale'] = torch.as_tensor([0.0, 1.0]).to(cached_condition['ref_latents']) - if "normal_imgs" in cached_condition: - cached_condition['normal_imgs'] = torch.cat( - (cached_condition['normal_imgs'], cached_condition['normal_imgs'])) - - if "position_imgs" in cached_condition: - cached_condition['position_imgs'] = torch.cat( - (cached_condition['position_imgs'], cached_condition['position_imgs'])) - - if 'position_maps' in cached_condition: - cached_condition['position_maps'] = torch.cat( - (cached_condition['position_maps'], cached_condition['position_maps'])) - - if 'camera_info_gen' in cached_condition: - cached_condition['camera_info_gen'] = torch.cat( - (cached_condition['camera_info_gen'], cached_condition['camera_info_gen'])) - if 'camera_info_ref' in cached_condition: - cached_condition['camera_info_ref'] = torch.cat( - (cached_condition['camera_info_ref'], cached_condition['camera_info_ref'])) - - prompt_embeds = self.unet.learned_text_clip_gen.repeat(num_images_per_prompt, 1, 1) - negative_prompt_embeds = 
torch.zeros_like(prompt_embeds) - - latents: torch.Tensor = self.denoise( - None, - *args, - cross_attention_kwargs=None, - guidance_scale=guidance_scale, - num_images_per_prompt=num_images_per_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - num_inference_steps=num_inference_steps, - output_type='latent', - width=width, - height=height, - **cached_condition - ).images - - if not output_type == "latent": - image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0] - else: - image = latents - - image = self.image_processor.postprocess(image, output_type=output_type) - if not return_dict: - return (image,) - - return ImagePipelineOutput(images=image) - - def denoise( - self, - prompt: Union[str, List[str]] = None, - height: Optional[int] = None, - width: Optional[int] = None, - num_inference_steps: int = 50, - timesteps: List[int] = None, - sigmas: List[float] = None, - guidance_scale: float = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - eta: float = 0.0, - generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.Tensor] = None, - prompt_embeds: Optional[torch.Tensor] = None, - negative_prompt_embeds: Optional[torch.Tensor] = None, - ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - cross_attention_kwargs: Optional[Dict[str, Any]] = None, - guidance_rescale: float = 0.0, - clip_skip: Optional[int] = None, - callback_on_step_end: Optional[ - Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks] - ] = None, - callback_on_step_end_tensor_inputs: List[str] = ["latents"], - **kwargs, - ): - r""" - The call function to the pipeline for generation. - - Args: - prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. - height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): - The height in pixels of the generated image. - width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): - The width in pixels of the generated image. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - timesteps (`List[int]`, *optional*): - Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument - in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is - passed will be used. Must be in descending order. - sigmas (`List[float]`, *optional*): - Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in - their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed - will be used. - guidance_scale (`float`, *optional*, defaults to 7.5): - A higher guidance scale value encourages the model to generate images closely linked to the text - `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide what to not include in image generation. 
If not defined, you need to - pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies - to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. - generator (`torch.Generator` or `List[torch.Generator]`, *optional*): - A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make - generation deterministic. - latents (`torch.Tensor`, *optional*): - Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not - provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If - not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. - ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): - Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of - IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should - contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not - provided, embeddings are computed from the `ip_adapter_image` input argument. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generated image. Choose between `PIL.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - cross_attention_kwargs (`dict`, *optional*): - A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in - [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). - guidance_rescale (`float`, *optional*, defaults to 0.0): - Guidance rescale factor from [Common Diffusion Noise Schedules and Sample Steps are - Flawed](https://arxiv.org/pdf/2305.08891.pdf). Guidance rescale factor should fix overexposure when - using zero terminal SNR. - clip_skip (`int`, *optional*): - Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that - the output of the pre-final layer will be used for computing the prompt embeddings. - callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*): - A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of - each denoising step during the inference. with the following arguments: `callback_on_step_end(self: - DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. 
`callback_kwargs` will include a - list of all tensors as specified by `callback_on_step_end_tensor_inputs`. - callback_on_step_end_tensor_inputs (`List`, *optional*): - The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list - will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the - `._callback_tensor_inputs` attribute of your pipeline class. - - Examples: - - Returns: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: - If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned, - otherwise a `tuple` is returned where the first element is a list with the generated images and the - second element is a list of `bool`s indicating whether the corresponding generated image contains - "not-safe-for-work" (nsfw) content. - """ - - callback = kwargs.pop("callback", None) - callback_steps = kwargs.pop("callback_steps", None) - - if callback is not None: - deprecate( - "callback", - "1.0.0", - "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`", - ) - if callback_steps is not None: - deprecate( - "callback_steps", - "1.0.0", - "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`", - ) - - if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)): - callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs - - # 0. Default height and width to unet - height = height or self.unet.config.sample_size * self.vae_scale_factor - width = width or self.unet.config.sample_size * self.vae_scale_factor - # to deal with lora scaling and other possible forward hooks - - # 1. Check inputs. Raise error if not correct - self.check_inputs( - prompt, - height, - width, - callback_steps, - negative_prompt, - prompt_embeds, - negative_prompt_embeds, - ip_adapter_image, - ip_adapter_image_embeds, - callback_on_step_end_tensor_inputs, - ) - - self._guidance_scale = guidance_scale - self._guidance_rescale = guidance_rescale - self._clip_skip = clip_skip - self._cross_attention_kwargs = cross_attention_kwargs - self._interrupt = False - - # 2. Define call parameters - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - device = self._execution_device - - # 3. Encode input prompt - lora_scale = ( - self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None - ) - - prompt_embeds, negative_prompt_embeds = self.encode_prompt( - prompt, - device, - num_images_per_prompt, - self.do_classifier_free_guidance, - negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - lora_scale=lora_scale, - clip_skip=self.clip_skip, - ) - - # For classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - if self.do_classifier_free_guidance: - prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) - - if ip_adapter_image is not None or ip_adapter_image_embeds is not None: - image_embeds = self.prepare_ip_adapter_image_embeds( - ip_adapter_image, - ip_adapter_image_embeds, - device, - batch_size * num_images_per_prompt, - self.do_classifier_free_guidance, - ) - - # 4. 
Prepare timesteps - timesteps, num_inference_steps = retrieve_timesteps( - self.scheduler, num_inference_steps, device, timesteps, sigmas - ) - assert num_images_per_prompt == 1 - # 5. Prepare latent variables - num_channels_latents = self.unet.config.in_channels - latents = self.prepare_latents( - batch_size * kwargs['num_in_batch'], # num_images_per_prompt, - num_channels_latents, - height, - width, - prompt_embeds.dtype, - device, - generator, - latents, - ) - - # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - # 6.1 Add image embeds for IP-Adapter - added_cond_kwargs = ( - {"image_embeds": image_embeds} - if (ip_adapter_image is not None or ip_adapter_image_embeds is not None) - else None - ) - - # 6.2 Optionally get Guidance Scale Embedding - timestep_cond = None - if self.unet.config.time_cond_proj_dim is not None: - guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt) - timestep_cond = self.get_guidance_scale_embedding( - guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim - ).to(device=device, dtype=latents.dtype) - - # 7. Denoising loop - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - self._num_timesteps = len(timesteps) - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps): - if self.interrupt: - continue - - # expand the latents if we are doing classifier free guidance - latents = rearrange(latents, '(b n) c h w -> b n c h w', n=kwargs['num_in_batch']) - latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents - latent_model_input = rearrange(latent_model_input, 'b n c h w -> (b n) c h w') - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - latent_model_input = rearrange(latent_model_input, '(b n) c h w ->b n c h w', n=kwargs['num_in_batch']) - - # predict the noise residual - - noise_pred = self.unet( - latent_model_input, - t, - encoder_hidden_states=prompt_embeds, - timestep_cond=timestep_cond, - cross_attention_kwargs=self.cross_attention_kwargs, - added_cond_kwargs=added_cond_kwargs, - return_dict=False, **kwargs - )[0] - latents = rearrange(latents, 'b n c h w -> (b n) c h w') - # perform guidance - if self.do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond) - - if self.do_classifier_free_guidance and self.guidance_rescale > 0.0: - # Based on 3.4. 
in https://arxiv.org/pdf/2305.08891.pdf - noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale) - - # compute the previous noisy sample x_t -> x_t-1 - latents = \ - self.scheduler.step(noise_pred, t, latents[:, :num_channels_latents, :, :], **extra_step_kwargs, - return_dict=False)[0] - - if callback_on_step_end is not None: - callback_kwargs = {} - for k in callback_on_step_end_tensor_inputs: - callback_kwargs[k] = locals()[k] - callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) - - latents = callback_outputs.pop("latents", latents) - prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) - negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - if callback is not None and i % callback_steps == 0: - step_idx = i // getattr(self.scheduler, "order", 1) - callback(step_idx, t, latents) - - if not output_type == "latent": - image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[ - 0 - ] - image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) - else: - image = latents - has_nsfw_concept = None - - if has_nsfw_concept is None: - do_denormalize = [True] * image.shape[0] - else: - do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] - - image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) - - # Offload all models - self.maybe_free_model_hooks() - - if not return_dict: - return (image, has_nsfw_concept) - - return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/hy3dgen/texgen/hunyuanpaint/unet/__init__.py b/hy3dgen/texgen/hunyuanpaint/unet/__init__.py deleted file mode 100644 index e307c3f8c1292da02f308e4b59ef0bcd6fe7305e..0000000000000000000000000000000000000000 --- a/hy3dgen/texgen/hunyuanpaint/unet/__init__.py +++ /dev/null @@ -1,23 +0,0 @@ -# Open Source Model Licensed under the Apache License Version 2.0 -# and Other Licenses of the Third-Party Components therein: -# The below Model in this distribution may have been modified by THL A29 Limited -# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. - -# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. -# The below software and/or models in this distribution may have been -# modified by THL A29 Limited ("Tencent Modifications"). -# All Tencent Modifications are Copyright (C) THL A29 Limited. - -# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT -# except for the third-party components listed below. -# Hunyuan 3D does not impose any additional limitations beyond what is outlined -# in the repsective licenses of these third-party components. -# Users must comply with all terms and conditions of original licenses of these third-party -# components and must ensure that the usage of the third party components adheres to -# all relevant laws and regulations. 
- -# For avoidance of doubts, Hunyuan 3D means the large language models and -# their software and algorithms, including trained model weights, parameters (including -# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, -# fine-tuning enabling code and other elements of the foregoing made publicly available -# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. diff --git a/hy3dgen/texgen/hunyuanpaint/unet/modules.py b/hy3dgen/texgen/hunyuanpaint/unet/modules.py deleted file mode 100644 index 5d16bc6b6bb1ebc72c602dcb298d122429fe847d..0000000000000000000000000000000000000000 --- a/hy3dgen/texgen/hunyuanpaint/unet/modules.py +++ /dev/null @@ -1,440 +0,0 @@ -# Open Source Model Licensed under the Apache License Version 2.0 -# and Other Licenses of the Third-Party Components therein: -# The below Model in this distribution may have been modified by THL A29 Limited -# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. - -# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. -# The below software and/or models in this distribution may have been -# modified by THL A29 Limited ("Tencent Modifications"). -# All Tencent Modifications are Copyright (C) THL A29 Limited. - -# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT -# except for the third-party components listed below. -# Hunyuan 3D does not impose any additional limitations beyond what is outlined -# in the repsective licenses of these third-party components. -# Users must comply with all terms and conditions of original licenses of these third-party -# components and must ensure that the usage of the third party components adheres to -# all relevant laws and regulations. - -# For avoidance of doubts, Hunyuan 3D means the large language models and -# their software and algorithms, including trained model weights, parameters (including -# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, -# fine-tuning enabling code and other elements of the foregoing made publicly available -# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. - - -import copy -import json -import os -from typing import Any, Dict, Optional - -import torch -import torch.nn as nn -from diffusers.models import UNet2DConditionModel -from diffusers.models.attention_processor import Attention -from diffusers.models.transformers.transformer_2d import BasicTransformerBlock -from einops import rearrange - - -def _chunked_feed_forward(ff: nn.Module, hidden_states: torch.Tensor, chunk_dim: int, chunk_size: int): - # "feed_forward_chunk_size" can be used to save memory - if hidden_states.shape[chunk_dim] % chunk_size != 0: - raise ValueError( - f"`hidden_states` dimension to be chunked: {hidden_states.shape[chunk_dim]} has to be divisible by chunk size: {chunk_size}. Make sure to set an appropriate `chunk_size` when calling `unet.enable_forward_chunking`." 
- ) - - num_chunks = hidden_states.shape[chunk_dim] // chunk_size - ff_output = torch.cat( - [ff(hid_slice) for hid_slice in hidden_states.chunk(num_chunks, dim=chunk_dim)], - dim=chunk_dim, - ) - return ff_output - - -class Basic2p5DTransformerBlock(torch.nn.Module): - def __init__(self, transformer: BasicTransformerBlock, layer_name, use_ma=True, use_ra=True) -> None: - super().__init__() - self.transformer = transformer - self.layer_name = layer_name - self.use_ma = use_ma - self.use_ra = use_ra - - # multiview attn - if self.use_ma: - self.attn_multiview = Attention( - query_dim=self.dim, - heads=self.num_attention_heads, - dim_head=self.attention_head_dim, - dropout=self.dropout, - bias=self.attention_bias, - cross_attention_dim=None, - upcast_attention=self.attn1.upcast_attention, - out_bias=True, - ) - - # ref attn - if self.use_ra: - self.attn_refview = Attention( - query_dim=self.dim, - heads=self.num_attention_heads, - dim_head=self.attention_head_dim, - dropout=self.dropout, - bias=self.attention_bias, - cross_attention_dim=None, - upcast_attention=self.attn1.upcast_attention, - out_bias=True, - ) - - def __getattr__(self, name: str): - try: - return super().__getattr__(name) - except AttributeError: - return getattr(self.transformer, name) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - encoder_hidden_states: Optional[torch.Tensor] = None, - encoder_attention_mask: Optional[torch.Tensor] = None, - timestep: Optional[torch.LongTensor] = None, - cross_attention_kwargs: Dict[str, Any] = None, - class_labels: Optional[torch.LongTensor] = None, - added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None, - ) -> torch.Tensor: - - # Notice that normalization is always applied before the real computation in the following blocks. - # 0. Self-Attention - batch_size = hidden_states.shape[0] - - cross_attention_kwargs = cross_attention_kwargs.copy() if cross_attention_kwargs is not None else {} - num_in_batch = cross_attention_kwargs.pop('num_in_batch', 1) - mode = cross_attention_kwargs.pop('mode', None) - mva_scale = cross_attention_kwargs.pop('mva_scale', 1.0) - ref_scale = cross_attention_kwargs.pop('ref_scale', 1.0) - condition_embed_dict = cross_attention_kwargs.pop("condition_embed_dict", None) - - if self.norm_type == "ada_norm": - norm_hidden_states = self.norm1(hidden_states, timestep) - elif self.norm_type == "ada_norm_zero": - norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1( - hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype - ) - elif self.norm_type in ["layer_norm", "layer_norm_i2vgen"]: - norm_hidden_states = self.norm1(hidden_states) - elif self.norm_type == "ada_norm_continuous": - norm_hidden_states = self.norm1(hidden_states, added_cond_kwargs["pooled_text_emb"]) - elif self.norm_type == "ada_norm_single": - shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = ( - self.scale_shift_table[None] + timestep.reshape(batch_size, 6, -1) - ).chunk(6, dim=1) - norm_hidden_states = self.norm1(hidden_states) - norm_hidden_states = norm_hidden_states * (1 + scale_msa) + shift_msa - else: - raise ValueError("Incorrect norm used") - - if self.pos_embed is not None: - norm_hidden_states = self.pos_embed(norm_hidden_states) - - # 1. 
Prepare GLIGEN inputs - cross_attention_kwargs = cross_attention_kwargs.copy() if cross_attention_kwargs is not None else {} - gligen_kwargs = cross_attention_kwargs.pop("gligen", None) - - attn_output = self.attn1( - norm_hidden_states, - encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None, - attention_mask=attention_mask, - **cross_attention_kwargs, - ) - - if self.norm_type == "ada_norm_zero": - attn_output = gate_msa.unsqueeze(1) * attn_output - elif self.norm_type == "ada_norm_single": - attn_output = gate_msa * attn_output - - hidden_states = attn_output + hidden_states - if hidden_states.ndim == 4: - hidden_states = hidden_states.squeeze(1) - - # 1.2 Reference Attention - if 'w' in mode: - condition_embed_dict[self.layer_name] = rearrange(norm_hidden_states, '(b n) l c -> b (n l) c', - n=num_in_batch) # B, (N L), C - - if 'r' in mode and self.use_ra: - condition_embed = condition_embed_dict[self.layer_name].unsqueeze(1).repeat(1, num_in_batch, 1, - 1) # B N L C - condition_embed = rearrange(condition_embed, 'b n l c -> (b n) l c') - - attn_output = self.attn_refview( - norm_hidden_states, - encoder_hidden_states=condition_embed, - attention_mask=None, - **cross_attention_kwargs - ) - ref_scale_timing = ref_scale - if isinstance(ref_scale, torch.Tensor): - ref_scale_timing = ref_scale.unsqueeze(1).repeat(1, num_in_batch).view(-1) - for _ in range(attn_output.ndim - 1): - ref_scale_timing = ref_scale_timing.unsqueeze(-1) - hidden_states = ref_scale_timing * attn_output + hidden_states - if hidden_states.ndim == 4: - hidden_states = hidden_states.squeeze(1) - - # 1.3 Multiview Attention - if num_in_batch > 1 and self.use_ma: - multivew_hidden_states = rearrange(norm_hidden_states, '(b n) l c -> b (n l) c', n=num_in_batch) - - attn_output = self.attn_multiview( - multivew_hidden_states, - encoder_hidden_states=multivew_hidden_states, - **cross_attention_kwargs - ) - - attn_output = rearrange(attn_output, 'b (n l) c -> (b n) l c', n=num_in_batch) - - hidden_states = mva_scale * attn_output + hidden_states - if hidden_states.ndim == 4: - hidden_states = hidden_states.squeeze(1) - - # 1.2 GLIGEN Control - if gligen_kwargs is not None: - hidden_states = self.fuser(hidden_states, gligen_kwargs["objs"]) - - # 3. Cross-Attention - if self.attn2 is not None: - if self.norm_type == "ada_norm": - norm_hidden_states = self.norm2(hidden_states, timestep) - elif self.norm_type in ["ada_norm_zero", "layer_norm", "layer_norm_i2vgen"]: - norm_hidden_states = self.norm2(hidden_states) - elif self.norm_type == "ada_norm_single": - # For PixArt norm2 isn't applied here: - # https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L70C1-L76C103 - norm_hidden_states = hidden_states - elif self.norm_type == "ada_norm_continuous": - norm_hidden_states = self.norm2(hidden_states, added_cond_kwargs["pooled_text_emb"]) - else: - raise ValueError("Incorrect norm") - - if self.pos_embed is not None and self.norm_type != "ada_norm_single": - norm_hidden_states = self.pos_embed(norm_hidden_states) - - attn_output = self.attn2( - norm_hidden_states, - encoder_hidden_states=encoder_hidden_states, - attention_mask=encoder_attention_mask, - **cross_attention_kwargs, - ) - - hidden_states = attn_output + hidden_states - - # 4. 
Feed-forward - # i2vgen doesn't have this norm 🤷‍♂️ - if self.norm_type == "ada_norm_continuous": - norm_hidden_states = self.norm3(hidden_states, added_cond_kwargs["pooled_text_emb"]) - elif not self.norm_type == "ada_norm_single": - norm_hidden_states = self.norm3(hidden_states) - - if self.norm_type == "ada_norm_zero": - norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None] - - if self.norm_type == "ada_norm_single": - norm_hidden_states = self.norm2(hidden_states) - norm_hidden_states = norm_hidden_states * (1 + scale_mlp) + shift_mlp - - if self._chunk_size is not None: - # "feed_forward_chunk_size" can be used to save memory - ff_output = _chunked_feed_forward(self.ff, norm_hidden_states, self._chunk_dim, self._chunk_size) - else: - ff_output = self.ff(norm_hidden_states) - - if self.norm_type == "ada_norm_zero": - ff_output = gate_mlp.unsqueeze(1) * ff_output - elif self.norm_type == "ada_norm_single": - ff_output = gate_mlp * ff_output - - hidden_states = ff_output + hidden_states - if hidden_states.ndim == 4: - hidden_states = hidden_states.squeeze(1) - - return hidden_states - - -class UNet2p5DConditionModel(torch.nn.Module): - def __init__(self, unet: UNet2DConditionModel) -> None: - super().__init__() - self.unet = unet - - self.use_ma = True - self.use_ra = True - self.use_camera_embedding = True - self.use_dual_stream = True - - if self.use_dual_stream: - self.unet_dual = copy.deepcopy(unet) - self.init_attention(self.unet_dual) - self.init_attention(self.unet, use_ma=self.use_ma, use_ra=self.use_ra) - self.init_condition() - self.init_camera_embedding() - - @staticmethod - def from_pretrained(pretrained_model_name_or_path, **kwargs): - torch_dtype = kwargs.pop('torch_dtype', torch.float32) - config_path = os.path.join(pretrained_model_name_or_path, 'config.json') - unet_ckpt_path = os.path.join(pretrained_model_name_or_path, 'diffusion_pytorch_model.bin') - with open(config_path, 'r', encoding='utf-8') as file: - config = json.load(file) - unet = UNet2DConditionModel(**config) - unet = UNet2p5DConditionModel(unet) - unet_ckpt = torch.load(unet_ckpt_path, map_location='cpu', weights_only=True) - unet.load_state_dict(unet_ckpt, strict=True) - unet = unet.to(torch_dtype) - return unet - - def init_condition(self): - self.unet.conv_in = torch.nn.Conv2d( - 12, - self.unet.conv_in.out_channels, - kernel_size=self.unet.conv_in.kernel_size, - stride=self.unet.conv_in.stride, - padding=self.unet.conv_in.padding, - dilation=self.unet.conv_in.dilation, - groups=self.unet.conv_in.groups, - bias=self.unet.conv_in.bias is not None) - - self.unet.learned_text_clip_gen = nn.Parameter(torch.randn(1, 77, 1024)) - self.unet.learned_text_clip_ref = nn.Parameter(torch.randn(1, 77, 1024)) - - def init_camera_embedding(self): - - if self.use_camera_embedding: - time_embed_dim = 1280 - self.max_num_ref_image = 5 - self.max_num_gen_image = 12 * 3 + 4 * 2 - self.unet.class_embedding = nn.Embedding(self.max_num_ref_image + self.max_num_gen_image, time_embed_dim) - - def init_attention(self, unet, use_ma=False, use_ra=False): - - for down_block_i, down_block in enumerate(unet.down_blocks): - if hasattr(down_block, "has_cross_attention") and down_block.has_cross_attention: - for attn_i, attn in enumerate(down_block.attentions): - for transformer_i, transformer in enumerate(attn.transformer_blocks): - if isinstance(transformer, BasicTransformerBlock): - attn.transformer_blocks[transformer_i] = Basic2p5DTransformerBlock(transformer, - 
f'down_{down_block_i}_{attn_i}_{transformer_i}', - use_ma, use_ra) - - if hasattr(unet.mid_block, "has_cross_attention") and unet.mid_block.has_cross_attention: - for attn_i, attn in enumerate(unet.mid_block.attentions): - for transformer_i, transformer in enumerate(attn.transformer_blocks): - if isinstance(transformer, BasicTransformerBlock): - attn.transformer_blocks[transformer_i] = Basic2p5DTransformerBlock(transformer, - f'mid_{attn_i}_{transformer_i}', - use_ma, use_ra) - - for up_block_i, up_block in enumerate(unet.up_blocks): - if hasattr(up_block, "has_cross_attention") and up_block.has_cross_attention: - for attn_i, attn in enumerate(up_block.attentions): - for transformer_i, transformer in enumerate(attn.transformer_blocks): - if isinstance(transformer, BasicTransformerBlock): - attn.transformer_blocks[transformer_i] = Basic2p5DTransformerBlock(transformer, - f'up_{up_block_i}_{attn_i}_{transformer_i}', - use_ma, use_ra) - - def __getattr__(self, name: str): - try: - return super().__getattr__(name) - except AttributeError: - return getattr(self.unet, name) - - def forward( - self, sample, timestep, encoder_hidden_states, - *args, down_intrablock_additional_residuals=None, - down_block_res_samples=None, mid_block_res_sample=None, - **cached_condition, - ): - B, N_gen, _, H, W = sample.shape - assert H == W - - if self.use_camera_embedding: - camera_info_gen = cached_condition['camera_info_gen'] + self.max_num_ref_image - camera_info_gen = rearrange(camera_info_gen, 'b n -> (b n)') - else: - camera_info_gen = None - - sample = [sample] - if 'normal_imgs' in cached_condition: - sample.append(cached_condition["normal_imgs"]) - if 'position_imgs' in cached_condition: - sample.append(cached_condition["position_imgs"]) - sample = torch.cat(sample, dim=2) - - sample = rearrange(sample, 'b n c h w -> (b n) c h w') - - encoder_hidden_states_gen = encoder_hidden_states.unsqueeze(1).repeat(1, N_gen, 1, 1) - encoder_hidden_states_gen = rearrange(encoder_hidden_states_gen, 'b n l c -> (b n) l c') - - if self.use_ra: - if 'condition_embed_dict' in cached_condition: - condition_embed_dict = cached_condition['condition_embed_dict'] - else: - condition_embed_dict = {} - ref_latents = cached_condition['ref_latents'] - N_ref = ref_latents.shape[1] - if self.use_camera_embedding: - camera_info_ref = cached_condition['camera_info_ref'] - camera_info_ref = rearrange(camera_info_ref, 'b n -> (b n)') - else: - camera_info_ref = None - - ref_latents = rearrange(ref_latents, 'b n c h w -> (b n) c h w') - - encoder_hidden_states_ref = self.unet.learned_text_clip_ref.unsqueeze(1).repeat(B, N_ref, 1, 1) - encoder_hidden_states_ref = rearrange(encoder_hidden_states_ref, 'b n l c -> (b n) l c') - - noisy_ref_latents = ref_latents - timestep_ref = 0 - - if self.use_dual_stream: - unet_ref = self.unet_dual - else: - unet_ref = self.unet - unet_ref( - noisy_ref_latents, timestep_ref, - encoder_hidden_states=encoder_hidden_states_ref, - class_labels=camera_info_ref, - # **kwargs - return_dict=False, - cross_attention_kwargs={ - 'mode': 'w', 'num_in_batch': N_ref, - 'condition_embed_dict': condition_embed_dict}, - ) - cached_condition['condition_embed_dict'] = condition_embed_dict - else: - condition_embed_dict = None - - mva_scale = cached_condition.get('mva_scale', 1.0) - ref_scale = cached_condition.get('ref_scale', 1.0) - - return self.unet( - sample, timestep, - encoder_hidden_states_gen, *args, - class_labels=camera_info_gen, - down_intrablock_additional_residuals=[ - sample.to(dtype=self.unet.dtype) for 
sample in down_intrablock_additional_residuals - ] if down_intrablock_additional_residuals is not None else None, - down_block_additional_residuals=[ - sample.to(dtype=self.unet.dtype) for sample in down_block_res_samples - ] if down_block_res_samples is not None else None, - mid_block_additional_residual=( - mid_block_res_sample.to(dtype=self.unet.dtype) - if mid_block_res_sample is not None else None - ), - return_dict=False, - cross_attention_kwargs={ - 'mode': 'r', 'num_in_batch': N_gen, - 'condition_embed_dict': condition_embed_dict, - 'mva_scale': mva_scale, - 'ref_scale': ref_scale, - }, - ) diff --git a/hy3dgen/texgen/pipelines.py b/hy3dgen/texgen/pipelines.py deleted file mode 100644 index cff817cc7aaaa45f420d099675940c2442b82517..0000000000000000000000000000000000000000 --- a/hy3dgen/texgen/pipelines.py +++ /dev/null @@ -1,227 +0,0 @@ -# Open Source Model Licensed under the Apache License Version 2.0 -# and Other Licenses of the Third-Party Components therein: -# The below Model in this distribution may have been modified by THL A29 Limited -# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. - -# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. -# The below software and/or models in this distribution may have been -# modified by THL A29 Limited ("Tencent Modifications"). -# All Tencent Modifications are Copyright (C) THL A29 Limited. - -# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT -# except for the third-party components listed below. -# Hunyuan 3D does not impose any additional limitations beyond what is outlined -# in the repsective licenses of these third-party components. -# Users must comply with all terms and conditions of original licenses of these third-party -# components and must ensure that the usage of the third party components adheres to -# all relevant laws and regulations. - -# For avoidance of doubts, Hunyuan 3D means the large language models and -# their software and algorithms, including trained model weights, parameters (including -# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, -# fine-tuning enabling code and other elements of the foregoing made publicly available -# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
-
-
-import logging
-import os
-
-import numpy as np
-import torch
-from PIL import Image
-
-from .differentiable_renderer.mesh_render import MeshRender
-from .utils.dehighlight_utils import Light_Shadow_Remover
-from .utils.multiview_utils import Multiview_Diffusion_Net
-from .utils.uv_warp_utils import mesh_uv_wrap
-
-logger = logging.getLogger(__name__)
-
-
-class Hunyuan3DTexGenConfig:
-
-    def __init__(self, light_remover_ckpt_path, multiview_ckpt_path):
-        self.device = 'cpu'
-        self.light_remover_ckpt_path = light_remover_ckpt_path
-        self.multiview_ckpt_path = multiview_ckpt_path
-
-        self.candidate_camera_azims = [0, 90, 180, 270, 0, 180]
-        self.candidate_camera_elevs = [0, 0, 0, 0, 90, -90]
-        self.candidate_view_weights = [1, 0.1, 0.5, 0.1, 0.05, 0.05]
-
-        self.render_size = 2048
-        self.texture_size = 1024
-        self.bake_exp = 4
-        self.merge_method = 'fast'
-
-
-class Hunyuan3DPaintPipeline:
-    @classmethod
-    def from_pretrained(cls, model_path):
-        original_model_path = model_path
-        if not os.path.exists(model_path):
-            # try local path
-            base_dir = os.environ.get('HY3DGEN_MODELS', '~/content/hy3dgen')
-            model_path = os.path.expanduser(os.path.join(base_dir, model_path))
-
-            delight_model_path = os.path.join(model_path, 'hunyuan3d-delight-v2-0')
-            multiview_model_path = os.path.join(model_path, 'hunyuan3d-paint-v2-0')
-
-            if not os.path.exists(delight_model_path) or not os.path.exists(multiview_model_path):
-                try:
-                    import huggingface_hub
-                    # download from huggingface
-                    model_path = huggingface_hub.snapshot_download(repo_id=original_model_path)
-                    delight_model_path = os.path.join(model_path, 'hunyuan3d-delight-v2-0')
-                    multiview_model_path = os.path.join(model_path, 'hunyuan3d-paint-v2-0')
-                    return cls(Hunyuan3DTexGenConfig(delight_model_path, multiview_model_path))
-                except ImportError:
-                    logger.warning(
-                        "You need to install HuggingFace Hub to load models from the hub."
-                    )
-                    raise RuntimeError(f"Model path {model_path} not found")
-            else:
-                return cls(Hunyuan3DTexGenConfig(delight_model_path, multiview_model_path))
-
-        raise FileNotFoundError(f"Model path {original_model_path} not found and it could not be downloaded from Hugging Face")
-
-    def __init__(self, config):
-        self.config = config
-        self.models = {}
-        self.render = MeshRender(
-            default_resolution=self.config.render_size,
-            texture_size=self.config.texture_size)
-
-        self.load_models()
-
-    def load_models(self):
-        # empty cuda cache
-        torch.cuda.empty_cache()
-        # Load model
-        self.models['delight_model'] = Light_Shadow_Remover(self.config)
-        self.models['multiview_model'] = Multiview_Diffusion_Net(self.config)
-
-    def render_normal_multiview(self, camera_elevs, camera_azims, use_abs_coor=True):
-        normal_maps = []
-        for elev, azim in zip(camera_elevs, camera_azims):
-            normal_map = self.render.render_normal(
-                elev, azim, use_abs_coor=use_abs_coor, return_type='pl')
-            normal_maps.append(normal_map)
-
-        return normal_maps
-
-    def render_position_multiview(self, camera_elevs, camera_azims):
-        position_maps = []
-        for elev, azim in zip(camera_elevs, camera_azims):
-            position_map = self.render.render_position(
-                elev, azim, return_type='pl')
-            position_maps.append(position_map)
-
-        return position_maps
-
-    def bake_from_multiview(self, views, camera_elevs,
-                            camera_azims, view_weights, method='graphcut'):
-        project_textures, project_weighted_cos_maps = [], []
-        project_boundary_maps = []
-        for view, camera_elev, camera_azim, weight in zip(
-                views, camera_elevs, camera_azims, view_weights):
-            project_texture, project_cos_map, project_boundary_map = self.render.back_project(
-                view, camera_elev, camera_azim)
-            project_cos_map = weight * (project_cos_map ** self.config.bake_exp)
-            project_textures.append(project_texture)
-            project_weighted_cos_maps.append(project_cos_map)
-            project_boundary_maps.append(project_boundary_map)
-
-        if method == 'fast':
-            texture, ori_trust_map = self.render.fast_bake_texture(
-                project_textures, project_weighted_cos_maps)
-        else:
-            raise ValueError(f'no method {method}')
-        return texture, ori_trust_map > 1E-8
-
-    def texture_inpaint(self, texture, mask):
-
-        texture_np = self.render.uv_inpaint(texture, mask)
-        texture = torch.tensor(texture_np / 255).float().to(texture.device)
-
-        return texture
-
-    def recenter_image(self, image, border_ratio=0.2):
-        if image.mode == 'RGB':
-            return image
-        elif image.mode == 'L':
-            image = image.convert('RGB')
-            return image
-
-        alpha_channel = np.array(image)[:, :, 3]
-        non_zero_indices = np.argwhere(alpha_channel > 0)
-        if non_zero_indices.size == 0:
-            raise ValueError("Image is fully transparent")
-
-        min_row, min_col = non_zero_indices.min(axis=0)
-        max_row, max_col = non_zero_indices.max(axis=0)
-
-        cropped_image = image.crop((min_col, min_row, max_col + 1, max_row + 1))
-
-        width, height = cropped_image.size
-        border_width = int(width * border_ratio)
-        border_height = int(height * border_ratio)
-
-        new_width = width + 2 * border_width
-        new_height = height + 2 * border_height
-
-        square_size = max(new_width, new_height)
-
-        new_image = Image.new('RGBA', (square_size, square_size), (255, 255, 255, 0))
-
-        paste_x = (square_size - new_width) // 2 + border_width
-        paste_y = (square_size - new_height) // 2 + border_height
-
-        new_image.paste(cropped_image, (paste_x, paste_y))
-        return new_image
-
-    @torch.no_grad()
-    def __call__(self, mesh, image):
-
-        if isinstance(image, str):
-            image_prompt = Image.open(image)
-        else:
-            image_prompt = image
-
-        image_prompt = self.recenter_image(image_prompt)
-
-        image_prompt = self.models['delight_model'](image_prompt)
-
-        mesh = mesh_uv_wrap(mesh)
-
-        self.render.load_mesh(mesh)
-
-        selected_camera_elevs, selected_camera_azims, selected_view_weights = \
-            self.config.candidate_camera_elevs, self.config.candidate_camera_azims, self.config.candidate_view_weights
-
-        normal_maps = self.render_normal_multiview(
-            selected_camera_elevs, selected_camera_azims, use_abs_coor=True)
-        position_maps = self.render_position_multiview(
-            selected_camera_elevs, selected_camera_azims)
-
-        # map each (azimuth, elevation) pair to a discrete camera index: azimuths are
-        # binned in 30-degree steps (coarser for top/bottom views), then offset by an
-        # elevation-dependent base index
-        camera_info = [(((azim // 30) + 9) % 12) // {-20: 1, 0: 1, 20: 1, -90: 3, 90: 3}[
-            elev] + {-20: 0, 0: 12, 20: 24, -90: 36, 90: 40}[elev] for azim, elev in
-                       zip(selected_camera_azims, selected_camera_elevs)]
-        multiviews = self.models['multiview_model'](image_prompt, normal_maps + position_maps, camera_info)
-
-        for i in range(len(multiviews)):
-            multiviews[i] = multiviews[i].resize(
-                (self.config.render_size, self.config.render_size))
-
-        texture, mask = self.bake_from_multiview(multiviews,
-                                                 selected_camera_elevs, selected_camera_azims, selected_view_weights,
-                                                 method=self.config.merge_method)
-
-        mask_np = (mask.squeeze(-1).cpu().numpy() * 255).astype(np.uint8)
-
-        texture = self.texture_inpaint(texture, mask_np)
-
-        self.render.set_texture(texture)
-        textured_mesh = self.render.save_mesh()
-
-        return textured_mesh
diff --git a/hy3dgen/texgen/utils/__init__.py b/hy3dgen/texgen/utils/__init__.py
deleted file mode 100644
index e307c3f8c1292da02f308e4b59ef0bcd6fe7305e..0000000000000000000000000000000000000000
--- a/hy3dgen/texgen/utils/__init__.py
+++ /dev/null
@@ -1,23 +0,0 @@
-# Open Source Model Licensed under the Apache License Version 2.0
-# and Other Licenses of the Third-Party Components therein:
-# The below Model in this distribution may have been modified by THL A29 Limited
-# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
-
-# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
-# The below software and/or models in this distribution may have been
-# modified by THL A29 Limited ("Tencent Modifications").
-# All Tencent Modifications are Copyright (C) THL A29 Limited.
-
-# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
-# except for the third-party components listed below.
-# Hunyuan 3D does not impose any additional limitations beyond what is outlined
-# in the respective licenses of these third-party components.
-# Users must comply with all terms and conditions of original licenses of these third-party
-# components and must ensure that the usage of the third party components adheres to
-# all relevant laws and regulations.
-
-# For avoidance of doubts, Hunyuan 3D means the large language models and
-# their software and algorithms, including trained model weights, parameters (including
-# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
-# fine-tuning enabling code and other elements of the foregoing made publicly available
-# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
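For orientation, a minimal sketch of how the Hunyuan3DPaintPipeline deleted above was typically driven; the repo id, the file names, and the assumption that save_mesh() yields a trimesh-compatible object are illustrative, not part of this diff:

import trimesh
from hy3dgen.texgen.pipelines import Hunyuan3DPaintPipeline

# from_pretrained() resolves a local checkout under HY3DGEN_MODELS first, then
# falls back to a Hugging Face snapshot download (the repo id below is assumed).
paint_pipeline = Hunyuan3DPaintPipeline.from_pretrained('tencent/Hunyuan3D-2')

mesh = trimesh.load('demo_shape.glb', force='mesh')           # hypothetical input mesh
textured_mesh = paint_pipeline(mesh, image='demo_image.png')  # __call__(mesh, image) as defined above
textured_mesh.export('demo_textured.glb')                     # assumes a trimesh-compatible return value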
diff --git a/hy3dgen/texgen/utils/alignImg4Tex_utils.py b/hy3dgen/texgen/utils/alignImg4Tex_utils.py
deleted file mode 100644
index 0a09c17cfe1a3f1ac850688e96b66341f0226418..0000000000000000000000000000000000000000
--- a/hy3dgen/texgen/utils/alignImg4Tex_utils.py
+++ /dev/null
@@ -1,132 +0,0 @@
-# Open Source Model Licensed under the Apache License Version 2.0
-# and Other Licenses of the Third-Party Components therein:
-# The below Model in this distribution may have been modified by THL A29 Limited
-# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
-
-# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
-# The below software and/or models in this distribution may have been
-# modified by THL A29 Limited ("Tencent Modifications").
-# All Tencent Modifications are Copyright (C) THL A29 Limited.
-
-# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
-# except for the third-party components listed below.
-# Hunyuan 3D does not impose any additional limitations beyond what is outlined
-# in the respective licenses of these third-party components.
-# Users must comply with all terms and conditions of original licenses of these third-party
-# components and must ensure that the usage of the third party components adheres to
-# all relevant laws and regulations.
-
-# For avoidance of doubts, Hunyuan 3D means the large language models and
-# their software and algorithms, including trained model weights, parameters (including
-# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
-# fine-tuning enabling code and other elements of the foregoing made publicly available
-# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
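# The deleted module below bundled two depth-guided helpers: Img2img_Control_Ip_adapter
# pairs SD 1.5 with a depth ControlNet and an IP-Adapter reference image, while
# HesModel does the same on SDXL img2img with the fp16-fix VAE; both condition
# generation on a depth control image plus an IP-Adapter reference.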
-
-
-import torch
-from diffusers import EulerAncestralDiscreteScheduler
-from diffusers import StableDiffusionControlNetPipeline, StableDiffusionXLControlNetImg2ImgPipeline, ControlNetModel, \
-    AutoencoderKL
-
-
-class Img2img_Control_Ip_adapter:
-    def __init__(self, device):
-        controlnet = ControlNetModel.from_pretrained('lllyasviel/control_v11f1p_sd15_depth', torch_dtype=torch.float16,
-                                                     variant="fp16", use_safetensors=True)
-        pipe = StableDiffusionControlNetPipeline.from_pretrained(
-            'runwayml/stable-diffusion-v1-5', controlnet=controlnet, torch_dtype=torch.float16, use_safetensors=True
-        )
-        pipe.load_ip_adapter('h94/IP-Adapter', subfolder="models", weight_name="ip-adapter-plus_sd15.safetensors")
-        pipe.set_ip_adapter_scale(0.7)
-
-        pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config)
-        # pipe.enable_model_cpu_offload()
-        self.pipe = pipe.to(device)
-
-    def __call__(
-        self,
-        prompt,
-        control_image,
-        ip_adapter_image,
-        negative_prompt,
-        height=512,
-        width=512,
-        num_inference_steps=20,
-        guidance_scale=8.0,
-        controlnet_conditioning_scale=1.0,
-        output_type="pil",
-        **kwargs,
-    ):
-        results = self.pipe(
-            prompt=prompt,
-            negative_prompt=negative_prompt,
-            image=control_image,
-            ip_adapter_image=ip_adapter_image,
-            # seeding goes through the generator; this text-to-image ControlNet
-            # pipeline accepts neither a raw seed nor a strength argument
-            generator=torch.manual_seed(42),
-            num_inference_steps=num_inference_steps,
-            guidance_scale=guidance_scale,
-            controlnet_conditioning_scale=controlnet_conditioning_scale,
-            # clip_skip=2,
-            height=height,
-            width=width,
-            output_type=output_type,
-            **kwargs,
-        ).images[0]
-        return results
-
-
-################################################################
-
-class HesModel:
-    def __init__(self):
-        controlnet_depth = ControlNetModel.from_pretrained(
-            'diffusers/controlnet-depth-sdxl-1.0',
-            torch_dtype=torch.float16,
-            variant="fp16",
-            use_safetensors=True
-        )
-        self.pipe = StableDiffusionXLControlNetImg2ImgPipeline.from_pretrained(
-            'stabilityai/stable-diffusion-xl-base-1.0',
-            torch_dtype=torch.float16,
-            variant="fp16",
-            controlnet=controlnet_depth,
-            use_safetensors=True,
-        )
-        self.pipe.vae = AutoencoderKL.from_pretrained(
-            'madebyollin/sdxl-vae-fp16-fix',
-            torch_dtype=torch.float16
-        )
-
-        self.pipe.load_ip_adapter('h94/IP-Adapter', subfolder="sdxl_models", weight_name="ip-adapter_sdxl.safetensors")
-        self.pipe.set_ip_adapter_scale(0.7)
-        self.pipe.to("cuda")
-
-    def __call__(self,
-                 init_image,
-                 control_image,
-                 ip_adapter_image=None,
-                 prompt='3D image',
-                 negative_prompt='2D image',
-                 seed=42,
-                 strength=0.8,
-                 num_inference_steps=40,
-                 guidance_scale=7.5,
-                 controlnet_conditioning_scale=0.5,
-                 **kwargs
-                 ):
-        image = self.pipe(
-            prompt=prompt,
-            image=init_image,
-            control_image=control_image,
-            ip_adapter_image=ip_adapter_image,
-            negative_prompt=negative_prompt,
-            num_inference_steps=num_inference_steps,
-            guidance_scale=guidance_scale,
-            strength=strength,
-            controlnet_conditioning_scale=controlnet_conditioning_scale,
-            generator=torch.manual_seed(seed),  # the pipeline takes a generator, not a raw seed
-            **kwargs
-        ).images[0]
-        return image
diff --git a/hy3dgen/texgen/utils/counter_utils.py b/hy3dgen/texgen/utils/counter_utils.py
deleted file mode 100644
index e0374fc327ad2127ec84bb0c267c19a3b9c8d738..0000000000000000000000000000000000000000
--- a/hy3dgen/texgen/utils/counter_utils.py
+++ /dev/null
@@ -1,58 +0,0 @@
-# Open Source Model Licensed under the Apache License Version 2.0
-# and Other Licenses of the Third-Party Components therein:
-# The below Model in this distribution may have been modified by THL A29 Limited
-# ("Tencent Modifications").
-# All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
-
-# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
-# The below software and/or models in this distribution may have been
-# modified by THL A29 Limited ("Tencent Modifications").
-# All Tencent Modifications are Copyright (C) THL A29 Limited.
-
-# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
-# except for the third-party components listed below.
-# Hunyuan 3D does not impose any additional limitations beyond what is outlined
-# in the respective licenses of these third-party components.
-# Users must comply with all terms and conditions of original licenses of these third-party
-# components and must ensure that the usage of the third party components adheres to
-# all relevant laws and regulations.
-
-# For avoidance of doubts, Hunyuan 3D means the large language models and
-# their software and algorithms, including trained model weights, parameters (including
-# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
-# fine-tuning enabling code and other elements of the foregoing made publicly available
-# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
-
-
-class RunningStats():
-    def __init__(self) -> None:
-        self.count = 0
-        self.sum = 0
-        self.mean = 0
-        self.min = None
-        self.max = None
-
-    def add_value(self, value):
-        self.count += 1
-        self.sum += value
-        self.mean = self.sum / self.count
-
-        if self.min is None or value < self.min:
-            self.min = value
-
-        if self.max is None or value > self.max:
-            self.max = value
-
-    def get_count(self):
-        return self.count
-
-    def get_sum(self):
-        return self.sum
-
-    def get_mean(self):
-        return self.mean
-
-    def get_min(self):
-        return self.min
-
-    def get_max(self):
-        return self.max
diff --git a/hy3dgen/texgen/utils/dehighlight_utils.py b/hy3dgen/texgen/utils/dehighlight_utils.py
deleted file mode 100644
index 089076b08f712ec0db882835f422183fd7f94457..0000000000000000000000000000000000000000
--- a/hy3dgen/texgen/utils/dehighlight_utils.py
+++ /dev/null
@@ -1,84 +0,0 @@
-# Open Source Model Licensed under the Apache License Version 2.0
-# and Other Licenses of the Third-Party Components therein:
-# The below Model in this distribution may have been modified by THL A29 Limited
-# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
-
-# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
-# The below software and/or models in this distribution may have been
-# modified by THL A29 Limited ("Tencent Modifications").
-# All Tencent Modifications are Copyright (C) THL A29 Limited.
-
-# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
-# except for the third-party components listed below.
-# Hunyuan 3D does not impose any additional limitations beyond what is outlined
-# in the respective licenses of these third-party components.
-# Users must comply with all terms and conditions of original licenses of these third-party
-# components and must ensure that the usage of the third party components adheres to
-# all relevant laws and regulations.
-
-# For avoidance of doubts, Hunyuan 3D means the large language models and
-# their software and algorithms, including trained model weights, parameters (including
-# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
-# fine-tuning enabling code and other elements of the foregoing made publicly available
-# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
-
-import cv2
-import numpy as np
-import torch
-from PIL import Image
-from diffusers import StableDiffusionInstructPix2PixPipeline, EulerAncestralDiscreteScheduler
-
-
-class Light_Shadow_Remover():
-    def __init__(self, config):
-        self.device = config.device
-        self.cfg_image = 1.5
-        self.cfg_text = 1.0
-
-        pipeline = StableDiffusionInstructPix2PixPipeline.from_pretrained(
-            config.light_remover_ckpt_path,
-            torch_dtype=torch.float16,
-            safety_checker=None,
-        )
-        pipeline.scheduler = EulerAncestralDiscreteScheduler.from_config(pipeline.scheduler.config)
-        pipeline.set_progress_bar_config(disable=True)
-
-        # self.pipeline = pipeline.to(self.device, torch.float16)
-        self.pipeline = pipeline  # not moved to the device here, to avoid displaying the warning
-
-    @torch.no_grad()
-    def __call__(self, image):
-
-        image = image.resize((512, 512))
-
-        if image.mode == 'RGBA':
-            image_array = np.array(image)
-            alpha_channel = image_array[:, :, 3]
-            erosion_size = 3
-            kernel = np.ones((erosion_size, erosion_size), np.uint8)
-            alpha_channel = cv2.erode(alpha_channel, kernel, iterations=1)
-            image_array[alpha_channel == 0, :3] = 255
-            image_array[:, :, 3] = alpha_channel
-            image = Image.fromarray(image_array)
-
-            # alpha and rgb_target are computed but unused in this variant
-            image_tensor = torch.tensor(np.array(image) / 255.0).to(self.device)
-            alpha = image_tensor[:, :, 3:]
-            rgb_target = image_tensor[:, :, :3]
-        else:
-            image_tensor = torch.tensor(np.array(image) / 255.0).to(self.device)
-            alpha = torch.ones_like(image_tensor)[:, :, :1]
-            rgb_target = image_tensor[:, :, :3]
-
-        image = image.convert('RGB')
-
-        image = self.pipeline(
-            prompt="",
-            image=image,
-            generator=torch.manual_seed(42),
-            height=512,
-            width=512,
-            num_inference_steps=50,
-            image_guidance_scale=self.cfg_image,
-            guidance_scale=self.cfg_text,
-        ).images[0]
-
-        return image
diff --git a/hy3dgen/texgen/utils/multiview_utils.py b/hy3dgen/texgen/utils/multiview_utils.py
deleted file mode 100644
index ba5708b617e0d58d6d37025fcb94a75324b9e5a9..0000000000000000000000000000000000000000
--- a/hy3dgen/texgen/utils/multiview_utils.py
+++ /dev/null
@@ -1,86 +0,0 @@
-# Open Source Model Licensed under the Apache License Version 2.0
-# and Other Licenses of the Third-Party Components therein:
-# The below Model in this distribution may have been modified by THL A29 Limited
-# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
-
-# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
-# The below software and/or models in this distribution may have been
-# modified by THL A29 Limited ("Tencent Modifications").
-# All Tencent Modifications are Copyright (C) THL A29 Limited.
-
-# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
-# except for the third-party components listed below.
-# Hunyuan 3D does not impose any additional limitations beyond what is outlined
-# in the respective licenses of these third-party components.
-# Users must comply with all terms and conditions of original licenses of these third-party
-# components and must ensure that the usage of the third party components adheres to
-# all relevant laws and regulations.
-
-# For avoidance of doubts, Hunyuan 3D means the large language models and
-# their software and algorithms, including trained model weights, parameters (including
-# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
-# fine-tuning enabling code and other elements of the foregoing made publicly available
-# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
-
-import os
-import random
-
-import numpy as np
-import torch
-from diffusers import DiffusionPipeline
-from diffusers import EulerAncestralDiscreteScheduler
-
-
-class Multiview_Diffusion_Net():
-    def __init__(self, config) -> None:
-        self.device = config.device
-        self.view_size = 512
-        multiview_ckpt_path = config.multiview_ckpt_path
-
-        current_file_path = os.path.abspath(__file__)
-        custom_pipeline_path = os.path.join(os.path.dirname(current_file_path), '..', 'hunyuanpaint')
-
-        pipeline = DiffusionPipeline.from_pretrained(
-            multiview_ckpt_path,
-            custom_pipeline=custom_pipeline_path, torch_dtype=torch.float16)
-
-        pipeline.scheduler = EulerAncestralDiscreteScheduler.from_config(pipeline.scheduler.config,
-                                                                         timestep_spacing='trailing')
-
-        pipeline.set_progress_bar_config(disable=True)
-        self.pipeline = pipeline  # .to(self.device) omitted; purely cosmetic, avoids displaying the warning
-
-    def seed_everything(self, seed):
-        random.seed(seed)
-        np.random.seed(seed)
-        torch.manual_seed(seed)
-        os.environ["PL_GLOBAL_SEED"] = str(seed)
-
-    def __call__(self, input_image, control_images, camera_info):
-
-        self.seed_everything(0)
-
-        input_image = input_image.resize((self.view_size, self.view_size))
-        for i in range(len(control_images)):
-            control_images[i] = control_images[i].resize((self.view_size, self.view_size))
-            if control_images[i].mode == 'L':
-                control_images[i] = control_images[i].point(lambda x: 255 if x > 1 else 0, mode='1')
-
-        kwargs = dict(generator=torch.Generator(device=self.pipeline.device).manual_seed(0))
-
-        num_view = len(control_images) // 2
-        normal_image = [[control_images[i] for i in range(num_view)]]
-        position_image = [[control_images[i + num_view] for i in range(num_view)]]
-
-        camera_info_gen = [camera_info]
-        camera_info_ref = [[0]]
-        kwargs['width'] = self.view_size
-        kwargs['height'] = self.view_size
-        kwargs['num_in_batch'] = num_view
-        kwargs['camera_info_gen'] = camera_info_gen
-        kwargs['camera_info_ref'] = camera_info_ref
-        kwargs["normal_imgs"] = normal_image
-        kwargs["position_imgs"] = position_image
-
-        mvd_image = self.pipeline(input_image, num_inference_steps=30, **kwargs).images
-        return mvd_image
diff --git a/hy3dgen/texgen/utils/simplify_mesh_utils.py b/hy3dgen/texgen/utils/simplify_mesh_utils.py
deleted file mode 100644
index 915284d337e648c57fae886dee3333c0203856b6..0000000000000000000000000000000000000000
--- a/hy3dgen/texgen/utils/simplify_mesh_utils.py
+++ /dev/null
@@ -1,46 +0,0 @@
-# Open Source Model Licensed under the Apache License Version 2.0
-# and Other Licenses of the Third-Party Components therein:
-# The below Model in this distribution may have been modified by THL A29 Limited
-# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
-
-# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
-# The below software and/or models in this distribution may have been
-# modified by THL A29 Limited ("Tencent Modifications").
-# All Tencent Modifications are Copyright (C) THL A29 Limited.
-
-# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
-# except for the third-party components listed below.
-# Hunyuan 3D does not impose any additional limitations beyond what is outlined
-# in the respective licenses of these third-party components.
-# Users must comply with all terms and conditions of original licenses of these third-party
-# components and must ensure that the usage of the third party components adheres to
-# all relevant laws and regulations.
-
-# For avoidance of doubts, Hunyuan 3D means the large language models and
-# their software and algorithms, including trained model weights, parameters (including
-# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
-# fine-tuning enabling code and other elements of the foregoing made publicly available
-# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
-
-import trimesh
-
-
-def remesh_mesh(mesh_path, remesh_path, method='trimesh'):
-    if method == 'trimesh':
-        mesh_simplify_trimesh(mesh_path, remesh_path)
-    else:
-        raise NotImplementedError(f'Method {method} has not been implemented.')
-
-
-def mesh_simplify_trimesh(inputpath, outputpath):
-    import pymeshlab
-    ms = pymeshlab.MeshSet()
-    ms.load_new_mesh(inputpath, load_in_a_single_layer=True)
-    ms.save_current_mesh(outputpath.replace('.glb', '.obj'), save_textures=False)
-
-    current = trimesh.load(outputpath.replace('.glb', '.obj'), force='mesh')
-    face_num = current.faces.shape[0]
-
-    if face_num > 100000:
-        current = current.simplify_quadric_decimation(40000)
-    current.export(outputpath)
diff --git a/hy3dgen/texgen/utils/uv_warp_utils.py b/hy3dgen/texgen/utils/uv_warp_utils.py
deleted file mode 100644
index b4f4082274b900aebcdbfcf29a7d6a9532dfa8cb..0000000000000000000000000000000000000000
--- a/hy3dgen/texgen/utils/uv_warp_utils.py
+++ /dev/null
@@ -1,42 +0,0 @@
-# Open Source Model Licensed under the Apache License Version 2.0
-# and Other Licenses of the Third-Party Components therein:
-# The below Model in this distribution may have been modified by THL A29 Limited
-# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
-
-# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
-# The below software and/or models in this distribution may have been
-# modified by THL A29 Limited ("Tencent Modifications").
-# All Tencent Modifications are Copyright (C) THL A29 Limited.
-
-# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
-# except for the third-party components listed below.
-# Hunyuan 3D does not impose any additional limitations beyond what is outlined
-# in the respective licenses of these third-party components.
-# Users must comply with all terms and conditions of original licenses of these third-party
-# components and must ensure that the usage of the third party components adheres to
-# all relevant laws and regulations.
-
-# For avoidance of doubts, Hunyuan 3D means the large language models and
-# their software and algorithms, including trained model weights, parameters (including
-# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
-# fine-tuning enabling code and other elements of the foregoing made publicly available
-# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
-
-import trimesh
-import xatlas
-
-
-def mesh_uv_wrap(mesh):
-    if isinstance(mesh, trimesh.Scene):
-        mesh = mesh.dump(concatenate=True)
-
-    # if len(mesh.faces) > 50000:
-    #     raise ValueError("The mesh has more than 50,000 faces, which is not supported.")
-
-    vmapping, indices, uvs = xatlas.parametrize(mesh.vertices, mesh.faces)
-
-    mesh.vertices = mesh.vertices[vmapping]
-    mesh.faces = indices
-    mesh.visual.uv = uvs
-
-    return mesh
diff --git a/hy3dgen/text2image.py b/hy3dgen/text2image.py
deleted file mode 100644
index be920672cb72238cbe49cba930e3e02a7c287b82..0000000000000000000000000000000000000000
--- a/hy3dgen/text2image.py
+++ /dev/null
@@ -1,93 +0,0 @@
-# Open Source Model Licensed under the Apache License Version 2.0
-# and Other Licenses of the Third-Party Components therein:
-# The below Model in this distribution may have been modified by THL A29 Limited
-# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
-
-# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
-# The below software and/or models in this distribution may have been
-# modified by THL A29 Limited ("Tencent Modifications").
-# All Tencent Modifications are Copyright (C) THL A29 Limited.
-
-# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
-# except for the third-party components listed below.
-# Hunyuan 3D does not impose any additional limitations beyond what is outlined
-# in the respective licenses of these third-party components.
-# Users must comply with all terms and conditions of original licenses of these third-party
-# components and must ensure that the usage of the third party components adheres to
-# all relevant laws and regulations.
-
-# For avoidance of doubts, Hunyuan 3D means the large language models and
-# their software and algorithms, including trained model weights, parameters (including
-# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
-# fine-tuning enabling code and other elements of the foregoing made publicly available
-# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
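# The deleted module below provided text-to-image prompting: HunyuanDiTPipeline wraps
# diffusers' AutoPipelineForText2Image with perturbed-attention guidance (PAG),
# prefixes the user prompt with a fixed Chinese style string ("white background,
# 3D style, best quality"), and samples 1024x1024 images against a standard
# negative prompt.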
-
-
-import os
-import random
-
-import numpy as np
-import torch
-from diffusers import AutoPipelineForText2Image
-
-
-def seed_everything(seed):
-    random.seed(seed)
-    np.random.seed(seed)
-    torch.manual_seed(seed)
-    os.environ["PL_GLOBAL_SEED"] = str(seed)
-
-
-class HunyuanDiTPipeline:
-    def __init__(
-        self,
-        model_path="Tencent-Hunyuan/HunyuanDiT-v1.1-Diffusers-Distilled",
-        device='cpu'
-    ):
-        torch.set_default_device('cpu')
-        self.device = device
-        self.pipe = AutoPipelineForText2Image.from_pretrained(
-            model_path,
-            torch_dtype=torch.float16,
-            enable_pag=True,
-            pag_applied_layers=["blocks.(16|17|18|19)"]
-        )  # .to(device) omitted to avoid displaying the warning
-        self.pos_txt = ",白色背景,3D风格,最佳质量"  # ", white background, 3D style, best quality"
-        # negative prompt: "text, close-up, cropped, out of frame, worst quality, low quality,
-        # JPEG artifacts, PGLY, duplicate, morbid, mutilated, extra fingers, mutated hands,
-        # poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated,
-        # bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions,
-        # malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers,
-        # too many fingers, long neck"
-        self.neg_txt = "文本,特写,裁剪,出框,最差质量,低质量,JPEG伪影,PGLY,重复,病态," \
-                       "残缺,多余的手指,变异的手,画得不好的手,画得不好的脸,变异,畸形,模糊,脱水,糟糕的解剖学," \
-                       "糟糕的比例,多余的肢体,克隆的脸,毁容,恶心的比例,畸形的肢体,缺失的手臂,缺失的腿," \
-                       "额外的手臂,额外的腿,融合的手指,手指太多,长脖子"
-
-    def compile(self):
-        # accelerate the Hunyuan-DiT transformer; the first inference will take a long time
-        torch.set_float32_matmul_precision('high')
-        self.pipe.transformer = torch.compile(self.pipe.transformer, fullgraph=True)
-        # self.pipe.vae.decode = torch.compile(self.pipe.vae.decode, fullgraph=True)
-        generator = torch.Generator(device=self.pipe.device)  # infer once for hot-start
-        out_img = self.pipe(
-            prompt='美少女战士',  # "Sailor Moon", warm-up prompt
-            negative_prompt='模糊',  # "blurry"
-            num_inference_steps=25,
-            pag_scale=1.3,
-            width=1024,
-            height=1024,
-            generator=generator,
-            return_dict=False
-        )[0][0]
-
-    @torch.no_grad()
-    def __call__(self, prompt, seed=0):
-        seed_everything(seed)
-        generator = torch.Generator(device="cuda")  # hard-coded "cuda" rather than self.pipe.device
-        generator = generator.manual_seed(int(seed))
-        out_img = self.pipe(
-            prompt=self.pos_txt + prompt,
-            negative_prompt=self.neg_txt,
-            num_inference_steps=20,
-            pag_scale=1.3,
-            width=1024,
-            height=1024,
-            generator=generator,
-            return_dict=False
-        )[0][0]
-        return out_img
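For reference, a minimal sketch of how the deleted HunyuanDiTPipeline was driven; the prompt and output path are illustrative, and a CUDA device is assumed because __call__ seeds a "cuda" generator:

from hy3dgen.text2image import HunyuanDiTPipeline

t2i = HunyuanDiTPipeline(device='cuda')  # model_path defaults to the distilled HunyuanDiT checkpoint
t2i.compile()                            # optional torch.compile warm-up; the first call is slow
image = t2i('a plush owl toy', seed=0)   # illustrative prompt
image.save('owl.png')                    # illustrative output path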