diff --git a/.gitignore b/.gitignore
index 0850089305657382152fbd25dd190ed3307c33c9..9bc430eb447ad26c675cffeccb4ac7fc1f804741 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,6 +7,8 @@ __pycache__/
 venv/
 env/
 .venv/
+build/
+dist/
 
 # Jupyter Notebook checkpoints
 .ipynb_checkpoints/
@@ -25,4 +27,4 @@ env/
 .vscode/
 
 # Hugging Face cache (optional)
-/content/huggingface/
+~/.cache/huggingface/
diff --git a/dist/hy3dgen-2.0.0-py3.12.egg b/dist/hy3dgen-2.0.0-py3.12.egg
deleted file mode 100644
index 31ccfc3573626346a5da66f2afb8405d256c18db..0000000000000000000000000000000000000000
Binary files a/dist/hy3dgen-2.0.0-py3.12.egg and /dev/null differ
diff --git a/hy3dgen/__init__.py b/hy3dgen/__init__.py
deleted file mode 100644
index e307c3f8c1292da02f308e4b59ef0bcd6fe7305e..0000000000000000000000000000000000000000
--- a/hy3dgen/__init__.py
+++ /dev/null
@@ -1,23 +0,0 @@
-# Open Source Model Licensed under the Apache License Version 2.0
-# and Other Licenses of the Third-Party Components therein:
-# The below Model in this distribution may have been modified by THL A29 Limited
-# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
-
-# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
-# The below software and/or models in this distribution may have been
-# modified by THL A29 Limited ("Tencent Modifications").
-# All Tencent Modifications are Copyright (C) THL A29 Limited.
-
-# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
-# except for the third-party components listed below.
-# Hunyuan 3D does not impose any additional limitations beyond what is outlined
-# in the repsective licenses of these third-party components.
-# Users must comply with all terms and conditions of original licenses of these third-party
-# components and must ensure that the usage of the third party components adheres to
-# all relevant laws and regulations.
-
-# For avoidance of doubts, Hunyuan 3D means the large language models and
-# their software and algorithms, including trained model weights, parameters (including
-# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
-# fine-tuning enabling code and other elements of the foregoing made publicly available
-# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
diff --git a/hy3dgen/rembg.py b/hy3dgen/rembg.py
deleted file mode 100644
index c0d99483c8354fc10c6689b5cf12ebcd44368d92..0000000000000000000000000000000000000000
--- a/hy3dgen/rembg.py
+++ /dev/null
@@ -1,36 +0,0 @@
-# Open Source Model Licensed under the Apache License Version 2.0
-# and Other Licenses of the Third-Party Components therein:
-# The below Model in this distribution may have been modified by THL A29 Limited
-# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
-
-# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
-# The below software and/or models in this distribution may have been
-# modified by THL A29 Limited ("Tencent Modifications").
-# All Tencent Modifications are Copyright (C) THL A29 Limited.
-
-# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
-# except for the third-party components listed below.
-# Hunyuan 3D does not impose any additional limitations beyond what is outlined
-# in the repsective licenses of these third-party components.
-# Users must comply with all terms and conditions of original licenses of these third-party
-# components and must ensure that the usage of the third party components adheres to
-# all relevant laws and regulations.
-
-# For avoidance of doubts, Hunyuan 3D means the large language models and
-# their software and algorithms, including trained model weights, parameters (including
-# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
-# fine-tuning enabling code and other elements of the foregoing made publicly available
-# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
-
-
-from PIL import Image
-from rembg import remove, new_session
-
-
-class BackgroundRemover():
-    def __init__(self):
-        self.session = new_session()
-
-    def __call__(self, image: Image.Image):
-        output = remove(image, session=self.session, bgcolor=[255, 255, 255, 0])
-        return output
diff --git a/hy3dgen/shapegen/__init__.py b/hy3dgen/shapegen/__init__.py
deleted file mode 100644
index d1f9534c15d029511d910d29e45da5ba7b8c8714..0000000000000000000000000000000000000000
--- a/hy3dgen/shapegen/__init__.py
+++ /dev/null
@@ -1,27 +0,0 @@
-# Open Source Model Licensed under the Apache License Version 2.0
-# and Other Licenses of the Third-Party Components therein:
-# The below Model in this distribution may have been modified by THL A29 Limited
-# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
-
-# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
-# The below software and/or models in this distribution may have been
-# modified by THL A29 Limited ("Tencent Modifications").
-# All Tencent Modifications are Copyright (C) THL A29 Limited.
-
-# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
-# except for the third-party components listed below.
-# Hunyuan 3D does not impose any additional limitations beyond what is outlined
-# in the repsective licenses of these third-party components.
-# Users must comply with all terms and conditions of original licenses of these third-party
-# components and must ensure that the usage of the third party components adheres to
-# all relevant laws and regulations.
-
-# For avoidance of doubts, Hunyuan 3D means the large language models and
-# their software and algorithms, including trained model weights, parameters (including
-# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
-# fine-tuning enabling code and other elements of the foregoing made publicly available
-# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
-
-from .pipelines import Hunyuan3DDiTPipeline, Hunyuan3DDiTFlowMatchingPipeline
-from .postprocessors import FaceReducer, FloaterRemover, DegenerateFaceRemover
-from .preprocessors import ImageProcessorV2, IMAGE_PROCESSORS, DEFAULT_IMAGEPROCESSOR
diff --git a/hy3dgen/shapegen/models/__init__.py b/hy3dgen/shapegen/models/__init__.py
deleted file mode 100644
index 684b3e389737fb988f5e363e777c34f6cd1fe4ea..0000000000000000000000000000000000000000
--- a/hy3dgen/shapegen/models/__init__.py
+++ /dev/null
@@ -1,28 +0,0 @@
-# Open Source Model Licensed under the Apache License Version 2.0
-# and Other Licenses of the Third-Party Components therein:
-# The below Model in this distribution may have been modified by THL A29 Limited
-# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
-
-# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
-# The below software and/or models in this distribution may have been
-# modified by THL A29 Limited ("Tencent Modifications").
-# All Tencent Modifications are Copyright (C) THL A29 Limited.
-
-# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
-# except for the third-party components listed below.
-# Hunyuan 3D does not impose any additional limitations beyond what is outlined
-# in the repsective licenses of these third-party components.
-# Users must comply with all terms and conditions of original licenses of these third-party
-# components and must ensure that the usage of the third party components adheres to
-# all relevant laws and regulations.
-
-# For avoidance of doubts, Hunyuan 3D means the large language models and
-# their software and algorithms, including trained model weights, parameters (including
-# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
-# fine-tuning enabling code and other elements of the foregoing made publicly available
-# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
-
-
-from .conditioner import DualImageEncoder, SingleImageEncoder, DinoImageEncoder, CLIPImageEncoder
-from .hunyuan3ddit import Hunyuan3DDiT
-from .vae import ShapeVAE
diff --git a/hy3dgen/shapegen/models/conditioner.py b/hy3dgen/shapegen/models/conditioner.py
deleted file mode 100644
index 1af4c0cc440a193167c0837621c3494242b95f3d..0000000000000000000000000000000000000000
--- a/hy3dgen/shapegen/models/conditioner.py
+++ /dev/null
@@ -1,165 +0,0 @@
-# Open Source Model Licensed under the Apache License Version 2.0
-# and Other Licenses of the Third-Party Components therein:
-# The below Model in this distribution may have been modified by THL A29 Limited
-# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
-
-# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
-# The below software and/or models in this distribution may have been
-# modified by THL A29 Limited ("Tencent Modifications").
-# All Tencent Modifications are Copyright (C) THL A29 Limited.
-
-# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
-# except for the third-party components listed below.
-# Hunyuan 3D does not impose any additional limitations beyond what is outlined
-# in the repsective licenses of these third-party components.
-# Users must comply with all terms and conditions of original licenses of these third-party
-# components and must ensure that the usage of the third party components adheres to
-# all relevant laws and regulations.
-
-# For avoidance of doubts, Hunyuan 3D means the large language models and
-# their software and algorithms, including trained model weights, parameters (including
-# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
-# fine-tuning enabling code and other elements of the foregoing made publicly available
-# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
-
-import torch
-import torch.nn as nn
-from torchvision import transforms
-from transformers import (
-    CLIPVisionModelWithProjection,
-    CLIPVisionConfig,
-    Dinov2Model,
-    Dinov2Config,
-)
-
-
-class ImageEncoder(nn.Module):
-    def __init__(
-        self,
-        version=None,
-        config=None,
-        use_cls_token=True,
-        image_size=224,
-        **kwargs,
-    ):
-        super().__init__()
-
-        if config is None:
-            self.model = self.MODEL_CLASS.from_pretrained(version)
-        else:
-            self.model = self.MODEL_CLASS(self.MODEL_CONFIG_CLASS.from_dict(config))
-        self.model.eval()
-        self.model.requires_grad_(False)
-        self.use_cls_token = use_cls_token
-        self.size = image_size // 14
-        self.num_patches = (image_size // 14) ** 2
-        if self.use_cls_token:
-            self.num_patches += 1
-
-        self.transform = transforms.Compose(
-            [
-                transforms.Resize(image_size, transforms.InterpolationMode.BILINEAR, antialias=True),
-                transforms.CenterCrop(image_size),
-                transforms.Normalize(
-                    mean=self.mean,
-                    std=self.std,
-                ),
-            ]
-        )
-
-    def forward(self, image, mask=None, value_range=(-1, 1)):
-        if value_range is not None:
-            low, high = value_range
-            image = (image - low) / (high - low)
-
-        image = image.to(self.model.device, dtype=self.model.dtype)
-        inputs = self.transform(image)
-        outputs = self.model(inputs)
-
-        last_hidden_state = outputs.last_hidden_state
-        if not self.use_cls_token:
-            last_hidden_state = last_hidden_state[:, 1:, :]
-
-        return last_hidden_state
-
-    def unconditional_embedding(self, batch_size):
-        device = next(self.model.parameters()).device
-        dtype = next(self.model.parameters()).dtype
-        zero = torch.zeros(
-            batch_size,
-            self.num_patches,
-            self.model.config.hidden_size,
-            device=device,
-            dtype=dtype,
-        )
-
-        return zero
-
-
-class CLIPImageEncoder(ImageEncoder):
-    MODEL_CLASS = CLIPVisionModelWithProjection
-    MODEL_CONFIG_CLASS = CLIPVisionConfig
-    mean = [0.48145466, 0.4578275, 0.40821073]
-    std = [0.26862954, 0.26130258, 0.27577711]
-
-
-class DinoImageEncoder(ImageEncoder):
-    MODEL_CLASS = Dinov2Model
-    MODEL_CONFIG_CLASS = Dinov2Config
-    mean = [0.485, 0.456, 0.406]
-    std = [0.229, 0.224, 0.225]
-
-
-def build_image_encoder(config):
-    if config['type'] == 'CLIPImageEncoder':
-        return CLIPImageEncoder(**config['kwargs'])
-    elif config['type'] == 'DinoImageEncoder':
-        return DinoImageEncoder(**config['kwargs'])
-    else:
-        raise ValueError(f'Unknown image encoder type: {config["type"]}')
-
-
-class DualImageEncoder(nn.Module):
-    def __init__(
-        self,
-        main_image_encoder,
-        additional_image_encoder,
-    ):
-        super().__init__()
-        self.main_image_encoder = build_image_encoder(main_image_encoder)
-        self.additional_image_encoder = build_image_encoder(additional_image_encoder)
-
-    def forward(self, image, mask=None):
-        outputs = {
-            'main': self.main_image_encoder(image, mask=mask),
-            'additional': self.additional_image_encoder(image, mask=mask),
-        }
-        return outputs
-
-    def unconditional_embedding(self, batch_size):
-        outputs = {
-            'main': self.main_image_encoder.unconditional_embedding(batch_size),
-            'additional': self.additional_image_encoder.unconditional_embedding(batch_size),
-        }
-        return outputs
-
-
-class SingleImageEncoder(nn.Module):
-    def __init__(
-        self,
-        main_image_encoder,
-    ):
-        super().__init__()
-        self.main_image_encoder = build_image_encoder(main_image_encoder)
-
-    def forward(self, image, mask=None):
-        outputs = {
-            'main': self.main_image_encoder(image, mask=mask),
-        }
-        return outputs
-
-    def unconditional_embedding(self, batch_size):
-        outputs = {
-            'main': self.main_image_encoder.unconditional_embedding(batch_size),
-        }
-        return outputs
diff --git a/hy3dgen/shapegen/models/hunyuan3ddit.py b/hy3dgen/shapegen/models/hunyuan3ddit.py
deleted file mode 100644
index d1c778666890cb13538eba15460cf0c05c7f9130..0000000000000000000000000000000000000000
--- a/hy3dgen/shapegen/models/hunyuan3ddit.py
+++ /dev/null
@@ -1,390 +0,0 @@
-# Open Source Model Licensed under the Apache License Version 2.0
-# and Other Licenses of the Third-Party Components therein:
-# The below Model in this distribution may have been modified by THL A29 Limited
-# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
-
-# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
-# The below software and/or models in this distribution may have been
-# modified by THL A29 Limited ("Tencent Modifications").
-# All Tencent Modifications are Copyright (C) THL A29 Limited.
-
-# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
-# except for the third-party components listed below.
-# Hunyuan 3D does not impose any additional limitations beyond what is outlined
-# in the repsective licenses of these third-party components.
-# Users must comply with all terms and conditions of original licenses of these third-party
-# components and must ensure that the usage of the third party components adheres to
-# all relevant laws and regulations.
-
-# For avoidance of doubts, Hunyuan 3D means the large language models and
-# their software and algorithms, including trained model weights, parameters (including
-# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
-# fine-tuning enabling code and other elements of the foregoing made publicly available
-# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
-
-import math
-from dataclasses import dataclass
-from typing import List, Tuple, Optional
-
-import torch
-from einops import rearrange
-from torch import Tensor, nn
-
-
-def attention(q: Tensor, k: Tensor, v: Tensor, **kwargs) -> Tensor:
-    x = torch.nn.functional.scaled_dot_product_attention(q, k, v)
-    x = rearrange(x, "B H L D -> B L (H D)")
-    return x
-
-
-def timestep_embedding(t: Tensor, dim, max_period=10000, time_factor: float = 1000.0):
-    """
-    Create sinusoidal timestep embeddings.
-    :param t: a 1-D Tensor of N indices, one per batch element.
-        These may be fractional.
-    :param dim: the dimension of the output.
-    :param max_period: controls the minimum frequency of the embeddings.
-    :return: an (N, D) Tensor of positional embeddings.
- """ - t = time_factor * t - half = dim // 2 - freqs = torch.exp(-math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half).to( - t.device - ) - - args = t[:, None].float() * freqs[None] - embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1) - if dim % 2: - embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1) - if torch.is_floating_point(t): - embedding = embedding.to(t) - return embedding - - -class MLPEmbedder(nn.Module): - def __init__(self, in_dim: int, hidden_dim: int): - super().__init__() - self.in_layer = nn.Linear(in_dim, hidden_dim, bias=True) - self.silu = nn.SiLU() - self.out_layer = nn.Linear(hidden_dim, hidden_dim, bias=True) - - def forward(self, x: Tensor) -> Tensor: - return self.out_layer(self.silu(self.in_layer(x))) - - -class RMSNorm(torch.nn.Module): - def __init__(self, dim: int): - super().__init__() - self.scale = nn.Parameter(torch.ones(dim)) - - def forward(self, x: Tensor): - x_dtype = x.dtype - x = x.float() - rrms = torch.rsqrt(torch.mean(x ** 2, dim=-1, keepdim=True) + 1e-6) - return (x * rrms).to(dtype=x_dtype) * self.scale - - -class QKNorm(torch.nn.Module): - def __init__(self, dim: int): - super().__init__() - self.query_norm = RMSNorm(dim) - self.key_norm = RMSNorm(dim) - - def forward(self, q: Tensor, k: Tensor, v: Tensor) -> Tuple[Tensor, Tensor]: - q = self.query_norm(q) - k = self.key_norm(k) - return q.to(v), k.to(v) - - -class SelfAttention(nn.Module): - def __init__( - self, - dim: int, - num_heads: int = 8, - qkv_bias: bool = False, - ): - super().__init__() - self.num_heads = num_heads - head_dim = dim // num_heads - - self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) - self.norm = QKNorm(head_dim) - self.proj = nn.Linear(dim, dim) - - def forward(self, x: Tensor, pe: Tensor) -> Tensor: - qkv = self.qkv(x) - q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads) - q, k = self.norm(q, k, v) - x = attention(q, k, v, pe=pe) - x = self.proj(x) - return x - - -@dataclass -class ModulationOut: - shift: Tensor - scale: Tensor - gate: Tensor - - -class Modulation(nn.Module): - def __init__(self, dim: int, double: bool): - super().__init__() - self.is_double = double - self.multiplier = 6 if double else 3 - self.lin = nn.Linear(dim, self.multiplier * dim, bias=True) - - def forward(self, vec: Tensor) -> Tuple[ModulationOut, Optional[ModulationOut]]: - out = self.lin(nn.functional.silu(vec))[:, None, :] - out = out.chunk(self.multiplier, dim=-1) - - return ( - ModulationOut(*out[:3]), - ModulationOut(*out[3:]) if self.is_double else None, - ) - - -class DoubleStreamBlock(nn.Module): - def __init__( - self, - hidden_size: int, - num_heads: int, - mlp_ratio: float, - qkv_bias: bool = False, - ): - super().__init__() - mlp_hidden_dim = int(hidden_size * mlp_ratio) - self.num_heads = num_heads - self.hidden_size = hidden_size - self.img_mod = Modulation(hidden_size, double=True) - self.img_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) - self.img_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias) - - self.img_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) - self.img_mlp = nn.Sequential( - nn.Linear(hidden_size, mlp_hidden_dim, bias=True), - nn.GELU(approximate="tanh"), - nn.Linear(mlp_hidden_dim, hidden_size, bias=True), - ) - - self.txt_mod = Modulation(hidden_size, double=True) - self.txt_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) - self.txt_attn = 
SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias) - - self.txt_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) - self.txt_mlp = nn.Sequential( - nn.Linear(hidden_size, mlp_hidden_dim, bias=True), - nn.GELU(approximate="tanh"), - nn.Linear(mlp_hidden_dim, hidden_size, bias=True), - ) - - def forward(self, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor) -> Tuple[Tensor, Tensor]: - img_mod1, img_mod2 = self.img_mod(vec) - txt_mod1, txt_mod2 = self.txt_mod(vec) - - img_modulated = self.img_norm1(img) - img_modulated = (1 + img_mod1.scale) * img_modulated + img_mod1.shift - img_qkv = self.img_attn.qkv(img_modulated) - img_q, img_k, img_v = rearrange(img_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads) - img_q, img_k = self.img_attn.norm(img_q, img_k, img_v) - - txt_modulated = self.txt_norm1(txt) - txt_modulated = (1 + txt_mod1.scale) * txt_modulated + txt_mod1.shift - txt_qkv = self.txt_attn.qkv(txt_modulated) - txt_q, txt_k, txt_v = rearrange(txt_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads) - txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k, txt_v) - - q = torch.cat((txt_q, img_q), dim=2) - k = torch.cat((txt_k, img_k), dim=2) - v = torch.cat((txt_v, img_v), dim=2) - - attn = attention(q, k, v, pe=pe) - txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1]:] - - img = img + img_mod1.gate * self.img_attn.proj(img_attn) - img = img + img_mod2.gate * self.img_mlp((1 + img_mod2.scale) * self.img_norm2(img) + img_mod2.shift) - - txt = txt + txt_mod1.gate * self.txt_attn.proj(txt_attn) - txt = txt + txt_mod2.gate * self.txt_mlp((1 + txt_mod2.scale) * self.txt_norm2(txt) + txt_mod2.shift) - return img, txt - - -class SingleStreamBlock(nn.Module): - """ - A DiT block with parallel linear layers as described in - https://arxiv.org/abs/2302.05442 and adapted modulation interface. 
- """ - - def __init__( - self, - hidden_size: int, - num_heads: int, - mlp_ratio: float = 4.0, - qk_scale: Optional[float] = None, - ): - super().__init__() - - self.hidden_dim = hidden_size - self.num_heads = num_heads - head_dim = hidden_size // num_heads - self.scale = qk_scale or head_dim ** -0.5 - - self.mlp_hidden_dim = int(hidden_size * mlp_ratio) - # qkv and mlp_in - self.linear1 = nn.Linear(hidden_size, hidden_size * 3 + self.mlp_hidden_dim) - # proj and mlp_out - self.linear2 = nn.Linear(hidden_size + self.mlp_hidden_dim, hidden_size) - - self.norm = QKNorm(head_dim) - - self.hidden_size = hidden_size - self.pre_norm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) - - self.mlp_act = nn.GELU(approximate="tanh") - self.modulation = Modulation(hidden_size, double=False) - - def forward(self, x: Tensor, vec: Tensor, pe: Tensor) -> Tensor: - mod, _ = self.modulation(vec) - - x_mod = (1 + mod.scale) * self.pre_norm(x) + mod.shift - qkv, mlp = torch.split(self.linear1(x_mod), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1) - - q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads) - q, k = self.norm(q, k, v) - - # compute attention - attn = attention(q, k, v, pe=pe) - # compute activation in mlp stream, cat again and run second linear layer - output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2)) - return x + mod.gate * output - - -class LastLayer(nn.Module): - def __init__(self, hidden_size: int, patch_size: int, out_channels: int): - super().__init__() - self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) - self.linear = nn.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True) - self.adaLN_modulation = nn.Sequential(nn.SiLU(), nn.Linear(hidden_size, 2 * hidden_size, bias=True)) - - def forward(self, x: Tensor, vec: Tensor) -> Tensor: - shift, scale = self.adaLN_modulation(vec).chunk(2, dim=1) - x = (1 + scale[:, None, :]) * self.norm_final(x) + shift[:, None, :] - x = self.linear(x) - return x - - -class Hunyuan3DDiT(nn.Module): - def __init__( - self, - in_channels: int = 64, - context_in_dim: int = 1536, - hidden_size: int = 1024, - mlp_ratio: float = 4.0, - num_heads: int = 16, - depth: int = 16, - depth_single_blocks: int = 32, - axes_dim: List[int] = [64], - theta: int = 10_000, - qkv_bias: bool = True, - time_factor: float = 1000, - ckpt_path: Optional[str] = None, - **kwargs, - ): - super().__init__() - self.in_channels = in_channels - self.context_in_dim = context_in_dim - self.hidden_size = hidden_size - self.mlp_ratio = mlp_ratio - self.num_heads = num_heads - self.depth = depth - self.depth_single_blocks = depth_single_blocks - self.axes_dim = axes_dim - self.theta = theta - self.qkv_bias = qkv_bias - self.time_factor = time_factor - self.out_channels = self.in_channels - - if hidden_size % num_heads != 0: - raise ValueError( - f"Hidden size {hidden_size} must be divisible by num_heads {num_heads}" - ) - pe_dim = hidden_size // num_heads - if sum(axes_dim) != pe_dim: - raise ValueError(f"Got {axes_dim} but expected positional dim {pe_dim}") - self.hidden_size = hidden_size - self.num_heads = num_heads - self.latent_in = nn.Linear(self.in_channels, self.hidden_size, bias=True) - self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size) - self.cond_in = nn.Linear(context_in_dim, self.hidden_size) - - self.double_blocks = nn.ModuleList( - [ - DoubleStreamBlock( - self.hidden_size, - self.num_heads, - mlp_ratio=mlp_ratio, - qkv_bias=qkv_bias, - ) - for _ in range(depth) - ] 
-        )
-
-        self.single_blocks = nn.ModuleList(
-            [
-                SingleStreamBlock(
-                    self.hidden_size,
-                    self.num_heads,
-                    mlp_ratio=mlp_ratio,
-                )
-                for _ in range(depth_single_blocks)
-            ]
-        )
-
-        self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels)
-
-        if ckpt_path is not None:
-            print('restored denoiser ckpt', ckpt_path)
-
-            ckpt = torch.load(ckpt_path, map_location="cpu")
-            if 'state_dict' not in ckpt:
-                # deepspeed ckpt
-                state_dict = {}
-                for k in ckpt.keys():
-                    new_k = k.replace('_forward_module.', '')
-                    state_dict[new_k] = ckpt[k]
-            else:
-                state_dict = ckpt["state_dict"]
-
-            final_state_dict = {}
-            for k, v in state_dict.items():
-                if k.startswith('model.'):
-                    final_state_dict[k.replace('model.', '')] = v
-                else:
-                    final_state_dict[k] = v
-            missing, unexpected = self.load_state_dict(final_state_dict, strict=False)
-            print('unexpected keys:', unexpected)
-            print('missing keys:', missing)
-
-    def forward(
-        self,
-        x,
-        t,
-        contexts,
-        **kwargs,
-    ) -> Tensor:
-        cond = contexts['main']
-        latent = self.latent_in(x)
-        vec = self.time_in(timestep_embedding(t, 256, self.time_factor).to(dtype=latent.dtype))
-        cond = self.cond_in(cond)
-        pe = None
-
-        for block in self.double_blocks:
-            latent, cond = block(img=latent, txt=cond, vec=vec, pe=pe)
-
-        latent = torch.cat((cond, latent), 1)
-        for block in self.single_blocks:
-            latent = block(latent, vec=vec, pe=pe)
-
-        latent = latent[:, cond.shape[1]:, ...]
-        latent = self.final_layer(latent, vec)
-        return latent
diff --git a/hy3dgen/shapegen/models/vae.py b/hy3dgen/shapegen/models/vae.py
deleted file mode 100644
index aef2784ac0db653714e711d12697eafc962c2aa3..0000000000000000000000000000000000000000
--- a/hy3dgen/shapegen/models/vae.py
+++ /dev/null
@@ -1,636 +0,0 @@
-# Open Source Model Licensed under the Apache License Version 2.0
-# and Other Licenses of the Third-Party Components therein:
-# The below Model in this distribution may have been modified by THL A29 Limited
-# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
-
-# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
-# The below software and/or models in this distribution may have been
-# modified by THL A29 Limited ("Tencent Modifications").
-# All Tencent Modifications are Copyright (C) THL A29 Limited.
-
-# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
-# except for the third-party components listed below.
-# Hunyuan 3D does not impose any additional limitations beyond what is outlined
-# in the repsective licenses of these third-party components.
-# Users must comply with all terms and conditions of original licenses of these third-party
-# components and must ensure that the usage of the third party components adheres to
-# all relevant laws and regulations.
-
-# For avoidance of doubts, Hunyuan 3D means the large language models and
-# their software and algorithms, including trained model weights, parameters (including
-# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
-# fine-tuning enabling code and other elements of the foregoing made publicly available
-# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
-
-from typing import Tuple, List, Union, Optional
-
-import numpy as np
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from einops import rearrange, repeat
-from skimage import measure
-from tqdm import tqdm
-
-
-class FourierEmbedder(nn.Module):
-    """The sin/cosine positional embedding. Given an input tensor `x` of shape [n_batch, ..., c_dim], it converts
-    each feature dimension of `x[..., i]` into:
-        [
-            sin(x[..., i]),
-            sin(f_1*x[..., i]),
-            sin(f_2*x[..., i]),
-            ...
-            sin(f_N * x[..., i]),
-            cos(x[..., i]),
-            cos(f_1*x[..., i]),
-            cos(f_2*x[..., i]),
-            ...
-            cos(f_N * x[..., i]),
-            x[..., i]     # only present if include_input is True.
-        ], here f_i is the frequency.
-
-    Denote the space is [0 / num_freqs, 1 / num_freqs, 2 / num_freqs, 3 / num_freqs, ..., (num_freqs - 1) / num_freqs].
-    If logspace is True, then the frequency f_i is [2^(0 / num_freqs), ..., 2^(i / num_freqs), ...];
-    Otherwise, the frequencies are linearly spaced between [1.0, 2^(num_freqs - 1)].
-
-    Args:
-        num_freqs (int): the number of frequencies, default is 6;
-        logspace (bool): If logspace is True, then the frequency f_i is [..., 2^(i / num_freqs), ...],
-            otherwise, the frequencies are linearly spaced between [1.0, 2^(num_freqs - 1)];
-        input_dim (int): the input dimension, default is 3;
-        include_input (bool): include the input tensor or not, default is True.
-
-    Attributes:
-        frequencies (torch.Tensor): If logspace is True, then the frequency f_i is [..., 2^(i / num_freqs), ...],
-            otherwise, the frequencies are linearly spaced between [1.0, 2^(num_freqs - 1);
-
-        out_dim (int): the embedding size, if include_input is True, it is input_dim * (num_freqs * 2 + 1),
-            otherwise, it is input_dim * num_freqs * 2.
-
-    """
-
-    def __init__(self,
-                 num_freqs: int = 6,
-                 logspace: bool = True,
-                 input_dim: int = 3,
-                 include_input: bool = True,
-                 include_pi: bool = True) -> None:
-
-        """The initialization"""
-
-        super().__init__()
-
-        if logspace:
-            frequencies = 2.0 ** torch.arange(
-                num_freqs,
-                dtype=torch.float32
-            )
-        else:
-            frequencies = torch.linspace(
-                1.0,
-                2.0 ** (num_freqs - 1),
-                num_freqs,
-                dtype=torch.float32
-            )
-
-        if include_pi:
-            frequencies *= torch.pi
-
-        self.register_buffer("frequencies", frequencies, persistent=False)
-        self.include_input = include_input
-        self.num_freqs = num_freqs
-
-        self.out_dim = self.get_dims(input_dim)
-
-    def get_dims(self, input_dim):
-        temp = 1 if self.include_input or self.num_freqs == 0 else 0
-        out_dim = input_dim * (self.num_freqs * 2 + temp)
-
-        return out_dim
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        """ Forward process.
-
-        Args:
-            x: tensor of shape [..., dim]
-
-        Returns:
-            embedding: an embedding of `x` of shape [..., dim * (num_freqs * 2 + temp)]
-                where temp is 1 if include_input is True and 0 otherwise.
-        """
-
-        if self.num_freqs > 0:
-            embed = (x[..., None].contiguous() * self.frequencies).view(*x.shape[:-1], -1)
-            if self.include_input:
-                return torch.cat((x, embed.sin(), embed.cos()), dim=-1)
-            else:
-                return torch.cat((embed.sin(), embed.cos()), dim=-1)
-        else:
-            return x
-
-
-class DropPath(nn.Module):
-    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
-    """
-
-    def __init__(self, drop_prob: float = 0., scale_by_keep: bool = True):
-        super(DropPath, self).__init__()
-        self.drop_prob = drop_prob
-        self.scale_by_keep = scale_by_keep
-
-    def forward(self, x):
-        """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
-
-        This is the same as the DropConnect impl I created for EfficientNet, etc networks, however,
-        the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
-        See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for
-        changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
-        'survival rate' as the argument.
-
-        """
-        if self.drop_prob == 0. or not self.training:
-            return x
-        keep_prob = 1 - self.drop_prob
-        shape = (x.shape[0],) + (1,) * (x.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
-        random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
-        if keep_prob > 0.0 and self.scale_by_keep:
-            random_tensor.div_(keep_prob)
-        return x * random_tensor
-
-    def extra_repr(self):
-        return f'drop_prob={round(self.drop_prob, 3):0.3f}'
-
-
-class MLP(nn.Module):
-    def __init__(
-        self, *,
-        width: int,
-        output_width: int = None,
-        drop_path_rate: float = 0.0
-    ):
-        super().__init__()
-        self.width = width
-        self.c_fc = nn.Linear(width, width * 4)
-        self.c_proj = nn.Linear(width * 4, output_width if output_width is not None else width)
-        self.gelu = nn.GELU()
-        self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
-
-    def forward(self, x):
-        return self.drop_path(self.c_proj(self.gelu(self.c_fc(x))))
-
-
-class QKVMultiheadCrossAttention(nn.Module):
-    def __init__(
-        self,
-        *,
-        heads: int,
-        n_data: Optional[int] = None,
-        width=None,
-        qk_norm=False,
-        norm_layer=nn.LayerNorm
-    ):
-        super().__init__()
-        self.heads = heads
-        self.n_data = n_data
-        self.q_norm = norm_layer(width // heads, elementwise_affine=True, eps=1e-6) if qk_norm else nn.Identity()
-        self.k_norm = norm_layer(width // heads, elementwise_affine=True, eps=1e-6) if qk_norm else nn.Identity()
-
-    def forward(self, q, kv):
-        _, n_ctx, _ = q.shape
-        bs, n_data, width = kv.shape
-        attn_ch = width // self.heads // 2
-        q = q.view(bs, n_ctx, self.heads, -1)
-        kv = kv.view(bs, n_data, self.heads, -1)
-        k, v = torch.split(kv, attn_ch, dim=-1)
-
-        q = self.q_norm(q)
-        k = self.k_norm(k)
-
-        q, k, v = map(lambda t: rearrange(t, 'b n h d -> b h n d', h=self.heads), (q, k, v))
-        out = F.scaled_dot_product_attention(q, k, v).transpose(1, 2).reshape(bs, n_ctx, -1)
-
-        return out
-
-
-class MultiheadCrossAttention(nn.Module):
-    def __init__(
-        self,
-        *,
-        width: int,
-        heads: int,
-        qkv_bias: bool = True,
-        n_data: Optional[int] = None,
-        data_width: Optional[int] = None,
-        norm_layer=nn.LayerNorm,
-        qk_norm: bool = False
-    ):
-        super().__init__()
-        self.n_data = n_data
-        self.width = width
-        self.heads = heads
-        self.data_width = width if data_width is None else data_width
-        self.c_q = nn.Linear(width, width, bias=qkv_bias)
-        self.c_kv = nn.Linear(self.data_width, width * 2, bias=qkv_bias)
-        self.c_proj = nn.Linear(width, width)
-        self.attention = QKVMultiheadCrossAttention(
-            heads=heads,
-            n_data=n_data,
-            width=width,
-            norm_layer=norm_layer,
-            qk_norm=qk_norm
-        )
-
-    def forward(self, x, data):
-        x = self.c_q(x)
-        data = self.c_kv(data)
-        x = self.attention(x, data)
-        x = self.c_proj(x)
-        return x
-
-
-class ResidualCrossAttentionBlock(nn.Module):
-    def __init__(
-        self,
-        *,
-        n_data: Optional[int] = None,
-        width: int,
-        heads: int,
-        data_width: Optional[int] = None,
-        qkv_bias: bool = True,
-        norm_layer=nn.LayerNorm,
-        qk_norm: bool = False
-    ):
-        super().__init__()
-
-        if data_width is None:
-            data_width = width
-
-        self.attn = MultiheadCrossAttention(
-            n_data=n_data,
-            width=width,
-            heads=heads,
-            data_width=data_width,
-            qkv_bias=qkv_bias,
-            norm_layer=norm_layer,
-            qk_norm=qk_norm
-        )
-        self.ln_1 = norm_layer(width, elementwise_affine=True, eps=1e-6)
-        self.ln_2 = norm_layer(data_width, elementwise_affine=True, eps=1e-6)
-        self.ln_3 = norm_layer(width, elementwise_affine=True, eps=1e-6)
-        self.mlp = MLP(width=width)
-
-    def forward(self, x: torch.Tensor, data: torch.Tensor):
-        x = x + self.attn(self.ln_1(x), self.ln_2(data))
-        x = x + self.mlp(self.ln_3(x))
-        return x
-
-
-class QKVMultiheadAttention(nn.Module):
-    def __init__(
-        self,
-        *,
-        heads: int,
-        n_ctx: int,
-        width=None,
-        qk_norm=False,
-        norm_layer=nn.LayerNorm
-    ):
-        super().__init__()
-        self.heads = heads
-        self.n_ctx = n_ctx
-        self.q_norm = norm_layer(width // heads, elementwise_affine=True, eps=1e-6) if qk_norm else nn.Identity()
-        self.k_norm = norm_layer(width // heads, elementwise_affine=True, eps=1e-6) if qk_norm else nn.Identity()
-
-    def forward(self, qkv):
-        bs, n_ctx, width = qkv.shape
-        attn_ch = width // self.heads // 3
-        qkv = qkv.view(bs, n_ctx, self.heads, -1)
-        q, k, v = torch.split(qkv, attn_ch, dim=-1)
-
-        q = self.q_norm(q)
-        k = self.k_norm(k)
-
-        q, k, v = map(lambda t: rearrange(t, 'b n h d -> b h n d', h=self.heads), (q, k, v))
-        out = F.scaled_dot_product_attention(q, k, v).transpose(1, 2).reshape(bs, n_ctx, -1)
-        return out
-
-
-class MultiheadAttention(nn.Module):
-    def __init__(
-        self,
-        *,
-        n_ctx: int,
-        width: int,
-        heads: int,
-        qkv_bias: bool,
-        norm_layer=nn.LayerNorm,
-        qk_norm: bool = False,
-        drop_path_rate: float = 0.0
-    ):
-        super().__init__()
-        self.n_ctx = n_ctx
-        self.width = width
-        self.heads = heads
-        self.c_qkv = nn.Linear(width, width * 3, bias=qkv_bias)
-        self.c_proj = nn.Linear(width, width)
-        self.attention = QKVMultiheadAttention(
-            heads=heads,
-            n_ctx=n_ctx,
-            width=width,
-            norm_layer=norm_layer,
-            qk_norm=qk_norm
-        )
-        self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
-
-    def forward(self, x):
-        x = self.c_qkv(x)
-        x = self.attention(x)
-        x = self.drop_path(self.c_proj(x))
-        return x
-
-
-class ResidualAttentionBlock(nn.Module):
-    def __init__(
-        self,
-        *,
-        n_ctx: int,
-        width: int,
-        heads: int,
-        qkv_bias: bool = True,
-        norm_layer=nn.LayerNorm,
-        qk_norm: bool = False,
-        drop_path_rate: float = 0.0,
-    ):
-        super().__init__()
-        self.attn = MultiheadAttention(
-            n_ctx=n_ctx,
-            width=width,
-            heads=heads,
-            qkv_bias=qkv_bias,
-            norm_layer=norm_layer,
-            qk_norm=qk_norm,
-            drop_path_rate=drop_path_rate
-        )
-        self.ln_1 = norm_layer(width, elementwise_affine=True, eps=1e-6)
-        self.mlp = MLP(width=width, drop_path_rate=drop_path_rate)
-        self.ln_2 = norm_layer(width, elementwise_affine=True, eps=1e-6)
-
-    def forward(self, x: torch.Tensor):
-        x = x + self.attn(self.ln_1(x))
-        x = x + self.mlp(self.ln_2(x))
-        return x
-
-
-class Transformer(nn.Module):
-    def __init__(
-        self,
-        *,
-        n_ctx: int,
-        width: int,
-        layers: int,
-        heads: int,
-        qkv_bias: bool = True,
-        norm_layer=nn.LayerNorm,
-        qk_norm: bool = False,
-        drop_path_rate: float = 0.0
-    ):
-        super().__init__()
-        self.n_ctx = n_ctx
-        self.width = width
-        self.layers = layers
-        self.resblocks = nn.ModuleList(
-            [
-                ResidualAttentionBlock(
-                    n_ctx=n_ctx,
-                    width=width,
-                    heads=heads,
-                    qkv_bias=qkv_bias,
-                    norm_layer=norm_layer,
-                    qk_norm=qk_norm,
-                    drop_path_rate=drop_path_rate
-                )
-                for _ in range(layers)
-            ]
-        )
-
-    def forward(self, x: torch.Tensor):
-        for block in self.resblocks:
-            x = block(x)
-        return x
-
-
-class CrossAttentionDecoder(nn.Module):
-
-    def __init__(
-        self,
-        *,
-        num_latents: int,
-        out_channels: int,
-        fourier_embedder: FourierEmbedder,
-        width: int,
-        heads: int,
-        qkv_bias: bool = True,
-        qk_norm: bool = False,
-        label_type: str = "binary"
-    ):
-        super().__init__()
-
-        self.fourier_embedder = fourier_embedder
-
-        self.query_proj = nn.Linear(self.fourier_embedder.out_dim, width)
-
-        self.cross_attn_decoder = ResidualCrossAttentionBlock(
-            n_data=num_latents,
-            width=width,
-            heads=heads,
-            qkv_bias=qkv_bias,
-            qk_norm=qk_norm
-        )
-
-        self.ln_post = nn.LayerNorm(width)
-        self.output_proj = nn.Linear(width, out_channels)
-        self.label_type = label_type
-
-    def forward(self, queries: torch.FloatTensor, latents: torch.FloatTensor):
-        queries = self.query_proj(self.fourier_embedder(queries).to(latents.dtype))
-        x = self.cross_attn_decoder(queries, latents)
-        x = self.ln_post(x)
-        occ = self.output_proj(x)
-        return occ
-
-
-def generate_dense_grid_points(bbox_min: np.ndarray,
-                               bbox_max: np.ndarray,
-                               octree_depth: int,
-                               indexing: str = "ij",
-                               octree_resolution: int = None,
-                               ):
-    length = bbox_max - bbox_min
-    num_cells = np.exp2(octree_depth)
-    if octree_resolution is not None:
-        num_cells = octree_resolution
-
-    x = np.linspace(bbox_min[0], bbox_max[0], int(num_cells) + 1, dtype=np.float32)
-    y = np.linspace(bbox_min[1], bbox_max[1], int(num_cells) + 1, dtype=np.float32)
-    z = np.linspace(bbox_min[2], bbox_max[2], int(num_cells) + 1, dtype=np.float32)
-    [xs, ys, zs] = np.meshgrid(x, y, z, indexing=indexing)
-    xyz = np.stack((xs, ys, zs), axis=-1)
-    xyz = xyz.reshape(-1, 3)
-    grid_size = [int(num_cells) + 1, int(num_cells) + 1, int(num_cells) + 1]
-
-    return xyz, grid_size, length
-
-
-def center_vertices(vertices):
-    """Translate the vertices so that bounding box is centered at zero."""
-    vert_min = vertices.min(dim=0)[0]
-    vert_max = vertices.max(dim=0)[0]
-    vert_center = 0.5 * (vert_min + vert_max)
-    return vertices - vert_center
-
-
-class Latent2MeshOutput:
-
-    def __init__(self, mesh_v=None, mesh_f=None):
-        self.mesh_v = mesh_v
-        self.mesh_f = mesh_f
-
-
-class ShapeVAE(nn.Module):
-    def __init__(
-        self,
-        *,
-        num_latents: int,
-        embed_dim: int,
-        width: int,
-        heads: int,
-        num_decoder_layers: int,
-        num_freqs: int = 8,
-        include_pi: bool = True,
-        qkv_bias: bool = True,
-        qk_norm: bool = False,
-        label_type: str = "binary",
-        drop_path_rate: float = 0.0,
-        scale_factor: float = 1.0,
-    ):
-        super().__init__()
-        self.fourier_embedder = FourierEmbedder(num_freqs=num_freqs, include_pi=include_pi)
-
-        self.post_kl = nn.Linear(embed_dim, width)
-
-        self.transformer = Transformer(
-            n_ctx=num_latents,
-            width=width,
-            layers=num_decoder_layers,
-            heads=heads,
-            qkv_bias=qkv_bias,
-            qk_norm=qk_norm,
-            drop_path_rate=drop_path_rate
-        )
-
-        self.geo_decoder = CrossAttentionDecoder(
-            fourier_embedder=self.fourier_embedder,
-            out_channels=1,
-            num_latents=num_latents,
-            width=width,
-            heads=heads,
-            qkv_bias=qkv_bias,
-            qk_norm=qk_norm,
-            label_type=label_type,
-        )
-
-        self.scale_factor = scale_factor
-        self.latent_shape = (num_latents, embed_dim)
-
-    def forward(self, latents):
-        latents = self.post_kl(latents)
-        latents = self.transformer(latents)
-        return latents
-
-    @torch.no_grad()
-    def latents2mesh(
-        self,
-        latents: torch.FloatTensor,
-        bounds: Union[Tuple[float], List[float], float] = 1.1,
-        octree_depth: int = 7,
-        num_chunks: int = 10000,
-        mc_level: float = -1 / 512,
-        octree_resolution: int = None,
-        mc_algo: str = 'dmc',
-    ):
-        device = latents.device
-
-        # 1. generate query points
-        if isinstance(bounds, float):
-            bounds = [-bounds, -bounds, -bounds, bounds, bounds, bounds]
-        bbox_min = np.array(bounds[0:3])
-        bbox_max = np.array(bounds[3:6])
-        bbox_size = bbox_max - bbox_min
-        xyz_samples, grid_size, length = generate_dense_grid_points(
-            bbox_min=bbox_min,
-            bbox_max=bbox_max,
-            octree_depth=octree_depth,
-            octree_resolution=octree_resolution,
-            indexing="ij"
-        )
-        xyz_samples = torch.FloatTensor(xyz_samples)
-
-        # 2. latents to 3d volume
-        batch_logits = []
-        batch_size = latents.shape[0]
-        for start in tqdm(range(0, xyz_samples.shape[0], num_chunks),
-                          desc=f"MC Level {mc_level} Implicit Function:"):
-            queries = xyz_samples[start: start + num_chunks, :].to(device)
-            queries = queries.half()
-            batch_queries = repeat(queries, "p c -> b p c", b=batch_size)
-
-            logits = self.geo_decoder(batch_queries.to(latents.dtype), latents)
-            if mc_level == -1:
-                mc_level = 0
-                logits = torch.sigmoid(logits) * 2 - 1
-                print(f'Training with soft labels, inference with sigmoid and marching cubes level 0.')
-            batch_logits.append(logits)
-        grid_logits = torch.cat(batch_logits, dim=1)
-        grid_logits = grid_logits.view((batch_size, grid_size[0], grid_size[1], grid_size[2])).float()
-
-        # 3. extract surface
-        outputs = []
-        for i in range(batch_size):
-            try:
-                if mc_algo == 'mc':
-                    vertices, faces, normals, _ = measure.marching_cubes(
-                        grid_logits[i].cpu().numpy(),
-                        mc_level,
-                        method="lewiner"
-                    )
-                    vertices = vertices / grid_size * bbox_size + bbox_min
-                elif mc_algo == 'dmc':
-                    if not hasattr(self, 'dmc'):
-                        try:
-                            from diso import DiffDMC
-                        except:
-                            raise ImportError("Please install diso via `pip install diso`, or set mc_algo to 'mc'")
-                        self.dmc = DiffDMC(dtype=torch.float32).to(device)
-                    octree_resolution = 2 ** octree_depth if octree_resolution is None else octree_resolution
-                    sdf = -grid_logits[i] / octree_resolution
-                    verts, faces = self.dmc(sdf, deform=None, return_quads=False, normalize=True)
-                    verts = center_vertices(verts)
-                    vertices = verts.detach().cpu().numpy()
-                    faces = faces.detach().cpu().numpy()[:, ::-1]
-                else:
-                    raise ValueError(f"mc_algo {mc_algo} not supported.")
-
-                outputs.append(
-                    Latent2MeshOutput(
-                        mesh_v=vertices.astype(np.float32),
-                        mesh_f=np.ascontiguousarray(faces)
-                    )
-                )
-
-            except ValueError:
-                outputs.append(None)
-            except RuntimeError:
-                outputs.append(None)
-
-        return outputs
diff --git a/hy3dgen/shapegen/pipelines.py b/hy3dgen/shapegen/pipelines.py
deleted file mode 100644
index 02fd79b5976b51df79aa242c11eab2378e92ee34..0000000000000000000000000000000000000000
--- a/hy3dgen/shapegen/pipelines.py
+++ /dev/null
@@ -1,589 +0,0 @@
-# Open Source Model Licensed under the Apache License Version 2.0
-# and Other Licenses of the Third-Party Components therein:
-# The below Model in this distribution may have been modified by THL A29 Limited
-# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
-
-# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
-# The below software and/or models in this distribution may have been
-# modified by THL A29 Limited ("Tencent Modifications").
-# All Tencent Modifications are Copyright (C) THL A29 Limited.
-
-# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
-# except for the third-party components listed below.
-# Hunyuan 3D does not impose any additional limitations beyond what is outlined
-# in the repsective licenses of these third-party components.
-# Users must comply with all terms and conditions of original licenses of these third-party
-# components and must ensure that the usage of the third party components adheres to
-# all relevant laws and regulations.
-
-# For avoidance of doubts, Hunyuan 3D means the large language models and
-# their software and algorithms, including trained model weights, parameters (including
-# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
-# fine-tuning enabling code and other elements of the foregoing made publicly available
-# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
-
-import copy
-import importlib
-import inspect
-import logging
-import os
-from typing import List, Optional, Union
-
-import numpy as np
-import torch
-import trimesh
-import yaml
-from PIL import Image
-from diffusers.utils.torch_utils import randn_tensor
-from tqdm import tqdm
-
-logger = logging.getLogger(__name__)
-
-
-def retrieve_timesteps(
-    scheduler,
-    num_inference_steps: Optional[int] = None,
-    device: Optional[Union[str, torch.device]] = None,
-    timesteps: Optional[List[int]] = None,
-    sigmas: Optional[List[float]] = None,
-    **kwargs,
-):
-    """
-    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
-    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
-
-    Args:
-        scheduler (`SchedulerMixin`):
-            The scheduler to get timesteps from.
-        num_inference_steps (`int`):
-            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
-            must be `None`.
-        device (`str` or `torch.device`, *optional*):
-            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
-        timesteps (`List[int]`, *optional*):
-            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
-            `num_inference_steps` and `sigmas` must be `None`.
-        sigmas (`List[float]`, *optional*):
-            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
-            `num_inference_steps` and `timesteps` must be `None`.
-
-    Returns:
-        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
-        second element is the number of inference steps.
-    """
-    if timesteps is not None and sigmas is not None:
-        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
-    if timesteps is not None:
-        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
-        if not accepts_timesteps:
-            raise ValueError(
-                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
-                f" timestep schedules. Please check whether you are using the correct scheduler."
-            )
-        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
-        timesteps = scheduler.timesteps
-        num_inference_steps = len(timesteps)
-    elif sigmas is not None:
-        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
-        if not accept_sigmas:
-            raise ValueError(
-                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
-                f" sigmas schedules. Please check whether you are using the correct scheduler."
- ) - scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) - timesteps = scheduler.timesteps - num_inference_steps = len(timesteps) - else: - scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) - timesteps = scheduler.timesteps - return timesteps, num_inference_steps - - -def export_to_trimesh(mesh_output): - if isinstance(mesh_output, list): - outputs = [] - for mesh in mesh_output: - if mesh is None: - outputs.append(None) - else: - mesh.mesh_f = mesh.mesh_f[:, ::-1] - mesh_output = trimesh.Trimesh(mesh.mesh_v, mesh.mesh_f) - outputs.append(mesh_output) - return outputs - else: - mesh_output.mesh_f = mesh_output.mesh_f[:, ::-1] - mesh_output = trimesh.Trimesh(mesh_output.mesh_v, mesh_output.mesh_f) - return mesh_output - - -def get_obj_from_str(string, reload=False): - module, cls = string.rsplit(".", 1) - if reload: - module_imp = importlib.import_module(module) - importlib.reload(module_imp) - return getattr(importlib.import_module(module, package=None), cls) - - -def instantiate_from_config(config, **kwargs): - if "target" not in config: - raise KeyError("Expected key `target` to instantiate.") - cls = get_obj_from_str(config["target"]) - params = config.get("params", dict()) - kwargs.update(params) - instance = cls(**kwargs) - return instance - - -class Hunyuan3DDiTPipeline: - @classmethod - def from_single_file( - cls, - ckpt_path, - config_path, - device='cpu', - dtype=torch.float16, - **kwargs, - ): - # load config - with open(config_path, 'r') as f: - config = yaml.safe_load(f) - - # load ckpt - if not os.path.exists(ckpt_path): - raise FileNotFoundError(f"Model file {ckpt_path} not found") - logger.info(f"Loading model from {ckpt_path}") - - if ckpt_path.endswith('.safetensors'): - # parse safetensors - import safetensors.torch - safetensors_ckpt = safetensors.torch.load_file(ckpt_path, device='cpu') - ckpt = {} - for key, value in safetensors_ckpt.items(): - model_name = key.split('.')[0] - new_key = key[len(model_name) + 1:] - if model_name not in ckpt: - ckpt[model_name] = {} - ckpt[model_name][new_key] = value - else: - ckpt = torch.load(ckpt_path, map_location='cpu', weights_only=True) - - # load model - from accelerate import init_empty_weights - with init_empty_weights(): - model = instantiate_from_config(config['model']) - vae = instantiate_from_config(config['vae']) - conditioner = instantiate_from_config(config['conditioner']) - image_processor = instantiate_from_config(config['image_processor']) - scheduler = instantiate_from_config(config['scheduler']) - - model.load_state_dict(ckpt['model'], assign = True) - vae.load_state_dict(ckpt['vae'], assign = True) - if 'conditioner' in ckpt: - conditioner.load_state_dict(ckpt['conditioner'], assign = True) - - model_kwargs = dict( - vae=vae, - model=model, - scheduler=scheduler, - conditioner=conditioner, - image_processor=image_processor, - device=device, - dtype=dtype, - ) - model_kwargs.update(kwargs) - - return cls( - **model_kwargs - ) - - @classmethod - def from_pretrained( - cls, - model_path, - device='cuda', - dtype=torch.float16, - use_safetensors=None, - variant=None, - subfolder='hunyuan3d-dit-v2-0', - **kwargs, - ): - original_model_path = model_path - if not os.path.exists(model_path): - # try local path - base_dir = os.environ.get('HY3DGEN_MODELS', '/content/hy3dgen') - model_path = os.path.expanduser(os.path.join(base_dir, model_path, subfolder)) - if not os.path.exists(model_path): - try: - import huggingface_hub - # download from huggingface - path = 
huggingface_hub.snapshot_download(repo_id=original_model_path) - model_path = os.path.join(path, subfolder) - except ImportError: - logger.warning( - "You need to install HuggingFace Hub to load models from the hub." - ) - raise RuntimeError(f"Model path {model_path} not found") - if not os.path.exists(model_path): - raise FileNotFoundError(f"Model path {original_model_path} not found") - - extension = 'ckpt' if not use_safetensors else 'safetensors' - variant = '' if variant is None else f'.{variant}' - ckpt_name = f'model{variant}.{extension}' - config_path = os.path.join(model_path, 'config.yaml') - ckpt_path = os.path.join(model_path, ckpt_name) - - return cls.from_single_file( - ckpt_path, - config_path, - device=device, - dtype=dtype, - use_safetensors=use_safetensors, - variant=variant, - **kwargs - ) - - def __init__( - self, - vae, - model, - scheduler, - conditioner, - image_processor, - device='cuda', - dtype=torch.float16, - **kwargs - ): - self.vae = vae - self.model = model - self.scheduler = scheduler - self.conditioner = conditioner - self.image_processor = image_processor - - self.to(device, dtype) - - def to(self, device=None, dtype=None): - if device is not None: - self.device = torch.device(device) - self.vae.to(device) - self.model.to(device) - self.conditioner.to(device) - if dtype is not None: - self.dtype = dtype - self.vae.to(dtype=dtype) - self.model.to(dtype=dtype) - self.conditioner.to(dtype=dtype) - - def encode_cond(self, image, mask, do_classifier_free_guidance, dual_guidance): - bsz = image.shape[0] - cond = self.conditioner(image=image, mask=mask) - - if do_classifier_free_guidance: - un_cond = self.conditioner.unconditional_embedding(bsz) - - if dual_guidance: - un_cond_drop_main = copy.deepcopy(un_cond) - un_cond_drop_main['additional'] = cond['additional'] - - def cat_recursive(a, b, c): - if isinstance(a, torch.Tensor): - return torch.cat([a, b, c], dim=0).to(self.dtype) - out = {} - for k in a.keys(): - out[k] = cat_recursive(a[k], b[k], c[k]) - return out - - cond = cat_recursive(cond, un_cond_drop_main, un_cond) - else: - un_cond = self.conditioner.unconditional_embedding(bsz) - - def cat_recursive(a, b): - if isinstance(a, torch.Tensor): - return torch.cat([a, b], dim=0).to(self.dtype) - out = {} - for k in a.keys(): - out[k] = cat_recursive(a[k], b[k]) - return out - - cond = cat_recursive(cond, un_cond) - return cond - - def prepare_extra_step_kwargs(self, generator, eta): - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. - # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta - - # check if the scheduler accepts generator - accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) - if accepts_generator: - extra_step_kwargs["generator"] = generator - return extra_step_kwargs - - def prepare_latents(self, batch_size, dtype, device, generator, latents=None): - shape = (batch_size, *self.vae.latent_shape) - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. 
Make sure the batch size matches the length of the generators." - ) - - if latents is None: - latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) - else: - latents = latents.to(device) - - # scale the initial noise by the standard deviation required by the scheduler - latents = latents * getattr(self.scheduler, 'init_noise_sigma', 1.0) - return latents - - def prepare_image(self, image): - if isinstance(image, str) and not os.path.exists(image): - raise FileNotFoundError(f"Couldn't find image at path {image}") - - if not isinstance(image, list): - image = [image] - image_pts = [] - mask_pts = [] - for img in image: - image_pt, mask_pt = self.image_processor(img, return_mask=True) - image_pts.append(image_pt) - mask_pts.append(mask_pt) - - image_pts = torch.cat(image_pts, dim=0).to(self.device, dtype=self.dtype) - if mask_pts[0] is not None: - mask_pts = torch.cat(mask_pts, dim=0).to(self.device, dtype=self.dtype) - else: - mask_pts = None - return image_pts, mask_pts - - def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32): - """ - See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 - - Args: - timesteps (`torch.Tensor`): - generate embedding vectors at these timesteps - embedding_dim (`int`, *optional*, defaults to 512): - dimension of the embeddings to generate - dtype: - data type of the generated embeddings - - Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` - """ - assert len(w.shape) == 1 - w = w * 1000.0 - - half_dim = embedding_dim // 2 - emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1) - emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb) - emb = w.to(dtype)[:, None] * emb[None, :] - emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1) - if embedding_dim % 2 == 1: # zero pad - emb = torch.nn.functional.pad(emb, (0, 1)) - assert emb.shape == (w.shape[0], embedding_dim) - return emb - - @torch.no_grad() - def __call__( - self, - image: Union[str, List[str], Image.Image] = None, - num_inference_steps: int = 50, - timesteps: List[int] = None, - sigmas: List[float] = None, - eta: float = 0.0, - guidance_scale: float = 7.5, - dual_guidance_scale: float = 10.5, - dual_guidance: bool = True, - generator=None, - box_v=1.01, - octree_resolution=384, - mc_level=-1 / 512, - num_chunks=8000, - mc_algo='mc', - output_type: Optional[str] = "trimesh", - enable_pbar=True, - **kwargs, - ) -> List[List[trimesh.Trimesh]]: - callback = kwargs.pop("callback", None) - callback_steps = kwargs.pop("callback_steps", None) - - device = self.device - dtype = self.dtype - do_classifier_free_guidance = guidance_scale >= 0 and \ - getattr(self.model, 'guidance_cond_proj_dim', None) is None - dual_guidance = dual_guidance_scale >= 0 and dual_guidance - - image, mask = self.prepare_image(image) - cond = self.encode_cond(image=image, - mask=mask, - do_classifier_free_guidance=do_classifier_free_guidance, - dual_guidance=dual_guidance) - batch_size = image.shape[0] - - t_dtype = torch.long - timesteps, num_inference_steps = retrieve_timesteps( - self.scheduler, num_inference_steps, device, timesteps, sigmas) - - latents = self.prepare_latents(batch_size, dtype, device, generator) - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - guidance_cond = None - if getattr(self.model, 'guidance_cond_proj_dim', None) is not None: - print('Using lcm guidance scale') - guidance_scale_tensor = torch.tensor(guidance_scale - 
1).repeat(batch_size) - guidance_cond = self.get_guidance_scale_embedding( - guidance_scale_tensor, embedding_dim=self.model.guidance_cond_proj_dim - ).to(device=device, dtype=latents.dtype) - - for i, t in enumerate(tqdm(timesteps, disable=not enable_pbar, desc="Diffusion Sampling:", leave=False)): - # expand the latents if we are doing classifier free guidance - if do_classifier_free_guidance: - latent_model_input = torch.cat([latents] * (3 if dual_guidance else 2)) - else: - latent_model_input = latents - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - # predict the noise residual - timestep_tensor = torch.tensor([t], dtype=t_dtype, device=device) - timestep_tensor = timestep_tensor.expand(latent_model_input.shape[0]) - noise_pred = self.model(latent_model_input, timestep_tensor, cond, guidance_cond=guidance_cond) - - # no drop, drop clip, all drop - if do_classifier_free_guidance: - if dual_guidance: - noise_pred_clip, noise_pred_dino, noise_pred_uncond = noise_pred.chunk(3) - noise_pred = ( - noise_pred_uncond - + guidance_scale * (noise_pred_clip - noise_pred_dino) - + dual_guidance_scale * (noise_pred_dino - noise_pred_uncond) - ) - else: - noise_pred_cond, noise_pred_uncond = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond) - - # compute the previous noisy sample x_t -> x_t-1 - outputs = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs) - latents = outputs.prev_sample - - if callback is not None and i % callback_steps == 0: - step_idx = i // getattr(self.scheduler, "order", 1) - callback(step_idx, t, outputs) - - return self._export( - latents, - output_type, - box_v, mc_level, num_chunks, octree_resolution, mc_algo, - ) - - def _export(self, latents, output_type, box_v, mc_level, num_chunks, octree_resolution, mc_algo): - if not output_type == "latent": - latents = 1. / self.vae.scale_factor * latents - latents = self.vae(latents) - outputs = self.vae.latents2mesh( - latents, - bounds=box_v, - mc_level=mc_level, - num_chunks=num_chunks, - octree_resolution=octree_resolution, - mc_algo=mc_algo, - ) - else: - outputs = latents - - if output_type == 'trimesh': - outputs = export_to_trimesh(outputs) - - return outputs - - -class Hunyuan3DDiTFlowMatchingPipeline(Hunyuan3DDiTPipeline): - - @torch.no_grad() - def __call__( - self, - image: Union[str, List[str], Image.Image] = None, - num_inference_steps: int = 50, - timesteps: List[int] = None, - sigmas: List[float] = None, - eta: float = 0.0, - guidance_scale: float = 7.5, - generator=None, - box_v=1.01, - octree_resolution=384, - mc_level=0.0, - mc_algo='mc', - num_chunks=8000, - output_type: Optional[str] = "trimesh", - enable_pbar=True, - **kwargs, - ) -> List[List[trimesh.Trimesh]]: - callback = kwargs.pop("callback", None) - callback_steps = kwargs.pop("callback_steps", None) - - device = self.device - dtype = self.dtype - do_classifier_free_guidance = guidance_scale >= 0 and not ( - hasattr(self.model, 'guidance_embed') and - self.model.guidance_embed is True - ) - - image, mask = self.prepare_image(image) - cond = self.encode_cond( - image=image, - mask=mask, - do_classifier_free_guidance=do_classifier_free_guidance, - dual_guidance=False, - ) - batch_size = image.shape[0] - - # 5. Prepare timesteps - # NOTE: this is slightly different from common usage, we start from 0. 
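# Here sampling starts from Gaussian noise at sigma = 0 and integrates toward
# sigma = 1; this is the reverse of diffusers' flow-matching convention, which
# is why the custom FlowMatchEulerDiscreteScheduler in schedulers.py flips the
# usual sigma direction.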
- sigmas = np.linspace(0, 1, num_inference_steps) if sigmas is None else sigmas - timesteps, num_inference_steps = retrieve_timesteps( - self.scheduler, - num_inference_steps, - device, - sigmas=sigmas, - ) - latents = self.prepare_latents(batch_size, dtype, device, generator) - - guidance = None - if hasattr(self.model, 'guidance_embed') and \ - self.model.guidance_embed is True: - guidance = torch.tensor([guidance_scale] * batch_size, device=device, dtype=dtype) - - for i, t in enumerate(tqdm(timesteps, disable=not enable_pbar, desc="Diffusion Sampling:")): - # expand the latents if we are doing classifier free guidance - if do_classifier_free_guidance: - latent_model_input = torch.cat([latents] * 2) - else: - latent_model_input = latents - - # NOTE: we assume model get timesteps ranged from 0 to 1 - timestep = t.expand(latent_model_input.shape[0]).to( - latents.dtype) / self.scheduler.config.num_train_timesteps - noise_pred = self.model(latent_model_input, timestep, cond, guidance=guidance) - - if do_classifier_free_guidance: - noise_pred_cond, noise_pred_uncond = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond) - - # compute the previous noisy sample x_t -> x_t-1 - outputs = self.scheduler.step(noise_pred, t, latents) - latents = outputs.prev_sample - - if callback is not None and i % callback_steps == 0: - step_idx = i // getattr(self.scheduler, "order", 1) - callback(step_idx, t, outputs) - - return self._export( - latents, - output_type, - box_v, mc_level, num_chunks, octree_resolution, mc_algo, - ) diff --git a/hy3dgen/shapegen/postprocessors.py b/hy3dgen/shapegen/postprocessors.py deleted file mode 100644 index 0500fa2d8f70a3a933f8313d11126ad9b27bf57c..0000000000000000000000000000000000000000 --- a/hy3dgen/shapegen/postprocessors.py +++ /dev/null @@ -1,175 +0,0 @@ -# Open Source Model Licensed under the Apache License Version 2.0 -# and Other Licenses of the Third-Party Components therein: -# The below Model in this distribution may have been modified by THL A29 Limited -# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. - -# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. -# The below software and/or models in this distribution may have been -# modified by THL A29 Limited ("Tencent Modifications"). -# All Tencent Modifications are Copyright (C) THL A29 Limited. - -# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT -# except for the third-party components listed below. -# Hunyuan 3D does not impose any additional limitations beyond what is outlined -# in the repsective licenses of these third-party components. -# Users must comply with all terms and conditions of original licenses of these third-party -# components and must ensure that the usage of the third party components adheres to -# all relevant laws and regulations. - -# For avoidance of doubts, Hunyuan 3D means the large language models and -# their software and algorithms, including trained model weights, parameters (including -# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, -# fine-tuning enabling code and other elements of the foregoing made publicly available -# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
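The guidance arithmetic in the two sampling loops above is compact but easy to misread. Below is a minimal sketch of the combine step, assuming the batch layout `[cond, cond-with-main-dropped, uncond]` that `encode_cond` concatenates; the helper name `combine_guidance` is hypothetical, not part of the deleted module.

```python
import torch
from typing import Optional

def combine_guidance(noise_pred: torch.Tensor,
                     guidance_scale: float,
                     dual_guidance_scale: Optional[float] = None) -> torch.Tensor:
    if dual_guidance_scale is not None:
        # Dual guidance: the loop above names these chunks clip / dino / uncond.
        # Steer from the additional-only prediction toward the full prediction,
        # and from unconditional toward additional-only.
        pred_full, pred_additional, pred_uncond = noise_pred.chunk(3)
        return (pred_uncond
                + guidance_scale * (pred_full - pred_additional)
                + dual_guidance_scale * (pred_additional - pred_uncond))
    # Standard classifier-free guidance with a single conditional branch.
    pred_cond, pred_uncond = noise_pred.chunk(2)
    return pred_uncond + guidance_scale * (pred_cond - pred_uncond)
```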
-
-import os
-import tempfile
-from typing import Union
-
-import pymeshlab
-import trimesh
-
-from .models.vae import Latent2MeshOutput
-
-
-def load_mesh(path):
-    if path.endswith(".glb"):
-        mesh = trimesh.load(path)
-    else:
-        mesh = pymeshlab.MeshSet()
-        mesh.load_new_mesh(path)
-    return mesh
-
-
-def reduce_face(mesh: pymeshlab.MeshSet, max_facenum: int = 200000):
-    mesh.apply_filter(
-        "meshing_decimation_quadric_edge_collapse",
-        targetfacenum=max_facenum,
-        qualitythr=1.0,
-        preserveboundary=True,
-        boundaryweight=3,
-        preservenormal=True,
-        preservetopology=True,
-        autoclean=True
-    )
-    return mesh
-
-
-def remove_floater(mesh: pymeshlab.MeshSet):
-    mesh.apply_filter("compute_selection_by_small_disconnected_components_per_face",
-                      nbfaceratio=0.005)
-    mesh.apply_filter("compute_selection_transfer_face_to_vertex", inclusive=False)
-    mesh.apply_filter("meshing_remove_selected_vertices_and_faces")
-    return mesh
-
-
-def pymeshlab2trimesh(mesh: pymeshlab.MeshSet):
-    # pymeshlab and trimesh only interoperate through files, so round-trip
-    # the current mesh via a temporary .ply on disk.
-    temp_file = tempfile.NamedTemporaryFile(suffix='.ply', delete=True)
-    temp_file.close()
-    temp_file_name = temp_file.name
-
-    mesh.save_current_mesh(temp_file_name)
-    mesh = trimesh.load(temp_file_name)
-    if os.path.exists(temp_file_name):
-        os.remove(temp_file_name)
-
-    # Check the type of the loaded object
-    if isinstance(mesh, trimesh.Scene):
-        combined_mesh = trimesh.Trimesh()
-        # If it is a Scene, iterate over all geometries and merge them
-        for geom in mesh.geometry.values():
-            combined_mesh = trimesh.util.concatenate([combined_mesh, geom])
-        mesh = combined_mesh
-    return mesh
-
-
-def trimesh2pymeshlab(mesh: trimesh.Trimesh):
-    temp_file = tempfile.NamedTemporaryFile(suffix='.ply', delete=True)
-    temp_file.close()
-    temp_file_name = temp_file.name
-
-    if isinstance(mesh, trimesh.scene.Scene):
-        for idx, obj in enumerate(mesh.geometry.values()):
-            if idx == 0:
-                temp_mesh = obj
-            else:
-                temp_mesh = temp_mesh + obj
-        mesh = temp_mesh
-    mesh.export(temp_file_name)
-    mesh = pymeshlab.MeshSet()
-    mesh.load_new_mesh(temp_file_name)
-    if os.path.exists(temp_file_name):
-        os.remove(temp_file_name)
-
-    return mesh
-
-
-def export_mesh(input, output):
-    # `input` carries the caller's original type; `output` is the processed
-    # pymeshlab.MeshSet to convert back into that type.
-    if isinstance(input, pymeshlab.MeshSet):
-        mesh = output
-    elif isinstance(input, Latent2MeshOutput):
-        # Copy vertices/faces out of the processed MeshSet into a fresh output.
-        result = Latent2MeshOutput()
-        result.mesh_v = output.current_mesh().vertex_matrix()
-        result.mesh_f = output.current_mesh().face_matrix()
-        mesh = result
-    else:
-        mesh = pymeshlab2trimesh(output)
-    return mesh
-
-
-def import_mesh(mesh: Union[pymeshlab.MeshSet, trimesh.Trimesh, Latent2MeshOutput, str]) -> pymeshlab.MeshSet:
-    if isinstance(mesh, str):
-        mesh = load_mesh(mesh)
-    elif isinstance(mesh, Latent2MeshOutput):
-        # Build the pymeshlab mesh from the latent-decoder output before
-        # rebinding the name to the new MeshSet.
-        mesh_pymeshlab = pymeshlab.Mesh(vertex_matrix=mesh.mesh_v, face_matrix=mesh.mesh_f)
-        mesh = pymeshlab.MeshSet()
-        mesh.add_mesh(mesh_pymeshlab, "converted_mesh")
-
-    if isinstance(mesh, (trimesh.Trimesh, trimesh.scene.Scene)):
-        mesh = trimesh2pymeshlab(mesh)
-
-    return mesh
-
-
-class FaceReducer:
-    def __call__(
-        self,
-        mesh: Union[pymeshlab.MeshSet, trimesh.Trimesh, Latent2MeshOutput, str],
-        max_facenum: int = 40000
-    ) -> Union[pymeshlab.MeshSet, trimesh.Trimesh]:
-        ms = import_mesh(mesh)
-        ms = reduce_face(ms, max_facenum=max_facenum)
-        mesh = export_mesh(mesh, ms)
-        return mesh
-
-
-class FloaterRemover:
-    def __call__(
-        self,
-        mesh: Union[pymeshlab.MeshSet, trimesh.Trimesh, Latent2MeshOutput, str],
-    ) -> Union[pymeshlab.MeshSet, trimesh.Trimesh, Latent2MeshOutput]:
-        ms = import_mesh(mesh)
-        ms = remove_floater(ms)
-        mesh = export_mesh(mesh, ms)
-        return mesh
-
-
-class DegenerateFaceRemover:
-    def
__call__( - self, - mesh: Union[pymeshlab.MeshSet, trimesh.Trimesh, Latent2MeshOutput, str], - ) -> Union[pymeshlab.MeshSet, trimesh.Trimesh, Latent2MeshOutput]: - ms = import_mesh(mesh) - - temp_file = tempfile.NamedTemporaryFile(suffix='.ply', delete=True) - temp_file.close() - temp_file_name = temp_file.name - - ms.save_current_mesh(temp_file_name) - ms = pymeshlab.MeshSet() - ms.load_new_mesh(temp_file_name) - if os.path.exists(temp_file_name): - os.remove(temp_file_name) - - mesh = export_mesh(mesh, ms) - return mesh diff --git a/hy3dgen/shapegen/preprocessors.py b/hy3dgen/shapegen/preprocessors.py deleted file mode 100644 index 2bdaff2d16cc0844d8d23c886d35c2f4e7286ff7..0000000000000000000000000000000000000000 --- a/hy3dgen/shapegen/preprocessors.py +++ /dev/null @@ -1,127 +0,0 @@ -# Open Source Model Licensed under the Apache License Version 2.0 -# and Other Licenses of the Third-Party Components therein: -# The below Model in this distribution may have been modified by THL A29 Limited -# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. -# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. -# The below software and/or models in this distribution may have been -# modified by THL A29 Limited ("Tencent Modifications"). -# All Tencent Modifications are Copyright (C) THL A29 Limited. - -# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT -# except for the third-party components listed below. -# Hunyuan 3D does not impose any additional limitations beyond what is outlined -# in the repsective licenses of these third-party components. -# Users must comply with all terms and conditions of original licenses of these third-party -# components and must ensure that the usage of the third party components adheres to -# all relevant laws and regulations. - -# For avoidance of doubts, Hunyuan 3D means the large language models and -# their software and algorithms, including trained model weights, parameters (including -# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, -# fine-tuning enabling code and other elements of the foregoing made publicly available -# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. - -import cv2 -import numpy as np -import torch -from PIL import Image -from einops import repeat, rearrange - - -def array_to_tensor(np_array): - image_pt = torch.tensor(np_array).float() - image_pt = image_pt / 255 * 2 - 1 - image_pt = rearrange(image_pt, "h w c -> c h w") - image_pts = repeat(image_pt, "c h w -> b c h w", b=1) - return image_pts - - -class ImageProcessorV2: - def __init__(self, size=512, border_ratio=None): - self.size = size - self.border_ratio = border_ratio - - @staticmethod - def recenter(image, border_ratio: float = 0.2): - """ recenter an image to leave some empty space at the image border. - - Args: - image (ndarray): input image, float/uint8 [H, W, 3/4] - mask (ndarray): alpha mask, bool [H, W] - border_ratio (float, optional): border ratio, image will be resized to (1 - border_ratio). Defaults to 0.2. 
- - Returns: - ndarray: output image, float/uint8 [H, W, 3/4] - """ - - if image.shape[-1] == 4: - mask = image[..., 3] - else: - mask = np.ones_like(image[..., 0:1]) * 255 - image = np.concatenate([image, mask], axis=-1) - mask = mask[..., 0] - - H, W, C = image.shape - - size = max(H, W) - result = np.zeros((size, size, C), dtype=np.uint8) - - coords = np.nonzero(mask) - x_min, x_max = coords[0].min(), coords[0].max() - y_min, y_max = coords[1].min(), coords[1].max() - h = x_max - x_min - w = y_max - y_min - if h == 0 or w == 0: - raise ValueError('input image is empty') - desired_size = int(size * (1 - border_ratio)) - scale = desired_size / max(h, w) - h2 = int(h * scale) - w2 = int(w * scale) - x2_min = (size - h2) // 2 - x2_max = x2_min + h2 - - y2_min = (size - w2) // 2 - y2_max = y2_min + w2 - - result[x2_min:x2_max, y2_min:y2_max] = cv2.resize(image[x_min:x_max, y_min:y_max], (w2, h2), - interpolation=cv2.INTER_AREA) - - bg = np.ones((result.shape[0], result.shape[1], 3), dtype=np.uint8) * 255 - # bg = np.zeros((result.shape[0], result.shape[1], 3), dtype=np.uint8) * 255 - mask = result[..., 3:].astype(np.float32) / 255 - result = result[..., :3] * mask + bg * (1 - mask) - - mask = mask * 255 - result = result.clip(0, 255).astype(np.uint8) - mask = mask.clip(0, 255).astype(np.uint8) - return result, mask - - def __call__(self, image, border_ratio=0.15, to_tensor=True, return_mask=False, **kwargs): - if self.border_ratio is not None: - border_ratio = self.border_ratio - print(f"Using border_ratio from init: {border_ratio}") - if isinstance(image, str): - image = cv2.imread(image, cv2.IMREAD_UNCHANGED) - image, mask = self.recenter(image, border_ratio=border_ratio) - image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) - elif isinstance(image, Image.Image): - image = np.asarray(image) - image, mask = self.recenter(image, border_ratio=border_ratio) - - image = cv2.resize(image, (self.size, self.size), interpolation=cv2.INTER_CUBIC) - mask = cv2.resize(mask, (self.size, self.size), interpolation=cv2.INTER_NEAREST) - mask = mask[..., np.newaxis] - - if to_tensor: - image = array_to_tensor(image) - mask = array_to_tensor(mask) - if return_mask: - return image, mask - return image - - -IMAGE_PROCESSORS = { - "v2": ImageProcessorV2, -} - -DEFAULT_IMAGEPROCESSOR = 'v2' diff --git a/hy3dgen/shapegen/schedulers.py b/hy3dgen/shapegen/schedulers.py deleted file mode 100644 index 0069f5cd49c5095930b588f01129a77f172171a7..0000000000000000000000000000000000000000 --- a/hy3dgen/shapegen/schedulers.py +++ /dev/null @@ -1,307 +0,0 @@ -# Copyright 2024 Stability AI, Katherine Crowson and The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
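As a quick reference for the geometry in `recenter` above: the foreground bounding box is scaled uniformly so its longer side fills `(1 - border_ratio)` of the square canvas. A minimal sketch follows; the helper name is hypothetical and a non-empty foreground mask is assumed.

```python
import numpy as np

def recenter_scale(mask: np.ndarray, size: int, border_ratio: float = 0.15):
    # Foreground bounding box, as computed in ImageProcessorV2.recenter above.
    xs, ys = np.nonzero(mask)
    h = xs.max() - xs.min()
    w = ys.max() - ys.min()
    desired = int(size * (1 - border_ratio))  # leave a uniform border
    scale = desired / max(h, w)               # one scale factor, no distortion
    return int(h * scale), int(w * scale)     # resized object extent
```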
- -import math -from dataclasses import dataclass -from typing import List, Optional, Tuple, Union - -import numpy as np -import torch -from diffusers.configuration_utils import ConfigMixin, register_to_config -from diffusers.schedulers.scheduling_utils import SchedulerMixin -from diffusers.utils import BaseOutput, logging - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - - -@dataclass -class FlowMatchEulerDiscreteSchedulerOutput(BaseOutput): - """ - Output class for the scheduler's `step` function output. - - Args: - prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): - Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the - denoising loop. - """ - - prev_sample: torch.FloatTensor - - -class FlowMatchEulerDiscreteScheduler(SchedulerMixin, ConfigMixin): - """ - NOTE: this is very similar to diffusers.FlowMatchEulerDiscreteScheduler. Except our timesteps are reversed - - Euler scheduler. - - This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic - methods the library implements for all schedulers such as loading and saving. - - Args: - num_train_timesteps (`int`, defaults to 1000): - The number of diffusion steps to train the model. - timestep_spacing (`str`, defaults to `"linspace"`): - The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and - Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information. - shift (`float`, defaults to 1.0): - The shift value for the timestep schedule. - """ - - _compatibles = [] - order = 1 - - @register_to_config - def __init__( - self, - num_train_timesteps: int = 1000, - shift: float = 1.0, - use_dynamic_shifting=False, - ): - timesteps = np.linspace(1, num_train_timesteps, num_train_timesteps, dtype=np.float32).copy() - timesteps = torch.from_numpy(timesteps).to(dtype=torch.float32) - - sigmas = timesteps / num_train_timesteps - if not use_dynamic_shifting: - # when use_dynamic_shifting is True, we apply the timestep shifting on the fly based on the image resolution - sigmas = shift * sigmas / (1 + (shift - 1) * sigmas) - - self.timesteps = sigmas * num_train_timesteps - - self._step_index = None - self._begin_index = None - - self.sigmas = sigmas.to("cpu") # to avoid too much CPU/GPU communication - self.sigma_min = self.sigmas[-1].item() - self.sigma_max = self.sigmas[0].item() - - @property - def step_index(self): - """ - The index counter for current timestep. It will increase 1 after each scheduler step. - """ - return self._step_index - - @property - def begin_index(self): - """ - The index for the first timestep. It should be set from pipeline with `set_begin_index` method. - """ - return self._begin_index - - # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index - def set_begin_index(self, begin_index: int = 0): - """ - Sets the begin index for the scheduler. This function should be run from pipeline before the inference. - - Args: - begin_index (`int`): - The begin index for the scheduler. - """ - self._begin_index = begin_index - - def scale_noise( - self, - sample: torch.FloatTensor, - timestep: Union[float, torch.FloatTensor], - noise: Optional[torch.FloatTensor] = None, - ) -> torch.FloatTensor: - """ - Forward process in flow-matching - - Args: - sample (`torch.FloatTensor`): - The input sample. 
- timestep (`int`, *optional*): - The current timestep in the diffusion chain. - - Returns: - `torch.FloatTensor`: - A scaled input sample. - """ - # Make sure sigmas and timesteps have the same device and dtype as original_samples - sigmas = self.sigmas.to(device=sample.device, dtype=sample.dtype) - - if sample.device.type == "mps" and torch.is_floating_point(timestep): - # mps does not support float64 - schedule_timesteps = self.timesteps.to(sample.device, dtype=torch.float32) - timestep = timestep.to(sample.device, dtype=torch.float32) - else: - schedule_timesteps = self.timesteps.to(sample.device) - timestep = timestep.to(sample.device) - - # self.begin_index is None when scheduler is used for training, or pipeline does not implement set_begin_index - if self.begin_index is None: - step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timestep] - elif self.step_index is not None: - # add_noise is called after first denoising step (for inpainting) - step_indices = [self.step_index] * timestep.shape[0] - else: - # add noise is called before first denoising step to create initial latent(img2img) - step_indices = [self.begin_index] * timestep.shape[0] - - sigma = sigmas[step_indices].flatten() - while len(sigma.shape) < len(sample.shape): - sigma = sigma.unsqueeze(-1) - - sample = sigma * noise + (1.0 - sigma) * sample - - return sample - - def _sigma_to_t(self, sigma): - return sigma * self.config.num_train_timesteps - - def time_shift(self, mu: float, sigma: float, t: torch.Tensor): - return math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma) - - def set_timesteps( - self, - num_inference_steps: int = None, - device: Union[str, torch.device] = None, - sigmas: Optional[List[float]] = None, - mu: Optional[float] = None, - ): - """ - Sets the discrete timesteps used for the diffusion chain (to be run before inference). - - Args: - num_inference_steps (`int`): - The number of diffusion steps used when generating samples with a pre-trained model. - device (`str` or `torch.device`, *optional*): - The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. - """ - - if self.config.use_dynamic_shifting and mu is None: - raise ValueError(" you have a pass a value for `mu` when `use_dynamic_shifting` is set to be `True`") - - if sigmas is None: - self.num_inference_steps = num_inference_steps - timesteps = np.linspace( - self._sigma_to_t(self.sigma_max), self._sigma_to_t(self.sigma_min), num_inference_steps - ) - - sigmas = timesteps / self.config.num_train_timesteps - - if self.config.use_dynamic_shifting: - sigmas = self.time_shift(mu, 1.0, sigmas) - else: - sigmas = self.config.shift * sigmas / (1 + (self.config.shift - 1) * sigmas) - - sigmas = torch.from_numpy(sigmas).to(dtype=torch.float32, device=device) - timesteps = sigmas * self.config.num_train_timesteps - - self.timesteps = timesteps.to(device=device) - self.sigmas = torch.cat([sigmas, torch.ones(1, device=sigmas.device)]) - - self._step_index = None - self._begin_index = None - - def index_for_timestep(self, timestep, schedule_timesteps=None): - if schedule_timesteps is None: - schedule_timesteps = self.timesteps - - indices = (schedule_timesteps == timestep).nonzero() - - # The sigma index that is taken for the **very** first `step` - # is always the second index (or the last index if there is only 1) - # This way we can ensure we don't accidentally skip a sigma in - # case we start in the middle of the denoising schedule (e.g. 
for image-to-image) - pos = 1 if len(indices) > 1 else 0 - - return indices[pos].item() - - def _init_step_index(self, timestep): - if self.begin_index is None: - if isinstance(timestep, torch.Tensor): - timestep = timestep.to(self.timesteps.device) - self._step_index = self.index_for_timestep(timestep) - else: - self._step_index = self._begin_index - - def step( - self, - model_output: torch.FloatTensor, - timestep: Union[float, torch.FloatTensor], - sample: torch.FloatTensor, - s_churn: float = 0.0, - s_tmin: float = 0.0, - s_tmax: float = float("inf"), - s_noise: float = 1.0, - generator: Optional[torch.Generator] = None, - return_dict: bool = True, - ) -> Union[FlowMatchEulerDiscreteSchedulerOutput, Tuple]: - """ - Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion - process from the learned model outputs (most often the predicted noise). - - Args: - model_output (`torch.FloatTensor`): - The direct output from learned diffusion model. - timestep (`float`): - The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): - A current instance of a sample created by the diffusion process. - s_churn (`float`): - s_tmin (`float`): - s_tmax (`float`): - s_noise (`float`, defaults to 1.0): - Scaling factor for noise added to the sample. - generator (`torch.Generator`, *optional*): - A random number generator. - return_dict (`bool`): - Whether or not to return a [`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] or - tuple. - - Returns: - [`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] or `tuple`: - If return_dict is `True`, [`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] is - returned, otherwise a tuple is returned where the first element is the sample tensor. - """ - - if ( - isinstance(timestep, int) - or isinstance(timestep, torch.IntTensor) - or isinstance(timestep, torch.LongTensor) - ): - raise ValueError( - ( - "Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to" - " `EulerDiscreteScheduler.step()` is not supported. Make sure to pass" - " one of the `scheduler.timesteps` as a timestep." - ), - ) - - if self.step_index is None: - self._init_step_index(timestep) - - # Upcast to avoid precision issues when computing prev_sample - sample = sample.to(torch.float32) - - sigma = self.sigmas[self.step_index] - sigma_next = self.sigmas[self.step_index + 1] - - prev_sample = sample + (sigma_next - sigma) * model_output - - # Cast sample back to model compatible dtype - prev_sample = prev_sample.to(model_output.dtype) - - # upon completion increase step index by one - self._step_index += 1 - - if not return_dict: - return (prev_sample,) - - return FlowMatchEulerDiscreteSchedulerOutput(prev_sample=prev_sample) - - def __len__(self): - return self.config.num_train_timesteps diff --git a/hy3dgen/texgen/__init__.py b/hy3dgen/texgen/__init__.py deleted file mode 100644 index 1f890f024d507021eca8087d40dc472de36152bd..0000000000000000000000000000000000000000 --- a/hy3dgen/texgen/__init__.py +++ /dev/null @@ -1,26 +0,0 @@ -# Open Source Model Licensed under the Apache License Version 2.0 -# and Other Licenses of the Third-Party Components therein: -# The below Model in this distribution may have been modified by THL A29 Limited -# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. - -# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. 
-# The below software and/or models in this distribution may have been -# modified by THL A29 Limited ("Tencent Modifications"). -# All Tencent Modifications are Copyright (C) THL A29 Limited. - -# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT -# except for the third-party components listed below. -# Hunyuan 3D does not impose any additional limitations beyond what is outlined -# in the repsective licenses of these third-party components. -# Users must comply with all terms and conditions of original licenses of these third-party -# components and must ensure that the usage of the third party components adheres to -# all relevant laws and regulations. - -# For avoidance of doubts, Hunyuan 3D means the large language models and -# their software and algorithms, including trained model weights, parameters (including -# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, -# fine-tuning enabling code and other elements of the foregoing made publicly available -# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. - - -from .pipelines import Hunyuan3DPaintPipeline, Hunyuan3DTexGenConfig diff --git a/hy3dgen/texgen/custom_rasterizer/custom_rasterizer/__init__.py b/hy3dgen/texgen/custom_rasterizer/custom_rasterizer/__init__.py deleted file mode 100644 index df40dcc8d4819eb903263ff1faf70ce902eb7e07..0000000000000000000000000000000000000000 --- a/hy3dgen/texgen/custom_rasterizer/custom_rasterizer/__init__.py +++ /dev/null @@ -1,32 +0,0 @@ -# Open Source Model Licensed under the Apache License Version 2.0 -# and Other Licenses of the Third-Party Components therein: -# The below Model in this distribution may have been modified by THL A29 Limited -# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. - -# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. -# The below software and/or models in this distribution may have been -# modified by THL A29 Limited ("Tencent Modifications"). -# All Tencent Modifications are Copyright (C) THL A29 Limited. - -# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT -# except for the third-party components listed below. -# Hunyuan 3D does not impose any additional limitations beyond what is outlined -# in the repsective licenses of these third-party components. -# Users must comply with all terms and conditions of original licenses of these third-party -# components and must ensure that the usage of the third party components adheres to -# all relevant laws and regulations. - -# For avoidance of doubts, Hunyuan 3D means the large language models and -# their software and algorithms, including trained model weights, parameters (including -# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, -# fine-tuning enabling code and other elements of the foregoing made publicly available -# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
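A toy sketch of what the FlowMatchEulerDiscreteScheduler above does per step: one explicit-Euler update of dx/dsigma = v over the 0-to-1 sigma grid the pipeline passes in. This is not the shipped class; the velocity field here is a stand-in so the loop runs without a model.

```python
import torch

def euler_flow_step(x: torch.Tensor, v: torch.Tensor,
                    sigma: float, sigma_next: float) -> torch.Tensor:
    # step() above reduces to this single explicit-Euler update.
    return x + (sigma_next - sigma) * v

shift = 1.0
sigmas = torch.linspace(0.0, 1.0, 51)                 # the pipeline's 0-to-1 grid
sigmas = shift * sigmas / (1 + (shift - 1) * sigmas)  # timestep shift (identity at 1.0)
x = torch.randn(4)                                    # toy state, no real model
for s, s_next in zip(sigmas[:-1], sigmas[1:]):
    v = -x                                            # stand-in velocity field
    x = euler_flow_step(x, v, s.item(), s_next.item())
```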
- -''' -from .hierarchy import BuildHierarchy, BuildHierarchyWithColor -from .io_obj import LoadObj, LoadObjWithTexture -from .render import rasterize, interpolate -''' -from .io_glb import * -from .io_obj import * -from .render import * diff --git a/hy3dgen/texgen/custom_rasterizer/custom_rasterizer/io_glb.py b/hy3dgen/texgen/custom_rasterizer/custom_rasterizer/io_glb.py deleted file mode 100644 index c5d7dc8c6127e62848dda8e79fdc281c5a7b42cb..0000000000000000000000000000000000000000 --- a/hy3dgen/texgen/custom_rasterizer/custom_rasterizer/io_glb.py +++ /dev/null @@ -1,248 +0,0 @@ -# Open Source Model Licensed under the Apache License Version 2.0 -# and Other Licenses of the Third-Party Components therein: -# The below Model in this distribution may have been modified by THL A29 Limited -# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. - -# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. -# The below software and/or models in this distribution may have been -# modified by THL A29 Limited ("Tencent Modifications"). -# All Tencent Modifications are Copyright (C) THL A29 Limited. - -# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT -# except for the third-party components listed below. -# Hunyuan 3D does not impose any additional limitations beyond what is outlined -# in the repsective licenses of these third-party components. -# Users must comply with all terms and conditions of original licenses of these third-party -# components and must ensure that the usage of the third party components adheres to -# all relevant laws and regulations. - -# For avoidance of doubts, Hunyuan 3D means the large language models and -# their software and algorithms, including trained model weights, parameters (including -# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, -# fine-tuning enabling code and other elements of the foregoing made publicly available -# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
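Since the `__init__.py` above re-exports `io_glb`, `io_obj`, and `render` wholesale, the intended entry points are the module-level functions. A hedged usage sketch: the tensor shapes are assumptions, the package is assumed installed under its own name, and the compiled `custom_rasterizer_kernel` extension must be importable (device requirements depend on how the kernel was built).

```python
import torch
import custom_rasterizer as cr

# Assumed layout: batch of one, three vertices in clip space (x, y, z, w).
pos = torch.tensor([[[-0.5, -0.5, 0.5, 1.0],
                     [ 0.5, -0.5, 0.5, 1.0],
                     [ 0.0,  0.5, 0.5, 1.0]]])
tri = torch.tensor([[0, 1, 2]], dtype=torch.int32)
findices, barycentric = cr.rasterize(pos, tri, (256, 256))
```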
- -import base64 -import io -import os - -import numpy as np -from PIL import Image as PILImage -from pygltflib import GLTF2 -from scipy.spatial.transform import Rotation as R - - -# Function to extract buffer data -def get_buffer_data(gltf, buffer_view): - buffer = gltf.buffers[buffer_view.buffer] - buffer_data = gltf.get_data_from_buffer_uri(buffer.uri) - byte_offset = buffer_view.byteOffset if buffer_view.byteOffset else 0 - byte_length = buffer_view.byteLength - return buffer_data[byte_offset:byte_offset + byte_length] - - -# Function to extract attribute data -def get_attribute_data(gltf, accessor_index): - accessor = gltf.accessors[accessor_index] - buffer_view = gltf.bufferViews[accessor.bufferView] - buffer_data = get_buffer_data(gltf, buffer_view) - - comptype = {5120: np.int8, 5121: np.uint8, 5122: np.int16, 5123: np.uint16, 5125: np.uint32, 5126: np.float32} - dtype = comptype[accessor.componentType] - - t2n = {'SCALAR': 1, 'VEC2': 2, 'VEC3': 3, 'VEC4': 4, 'MAT2': 4, 'MAT3': 9, 'MAT4': 16} - num_components = t2n[accessor.type] - - # Calculate the correct slice of data - byte_offset = accessor.byteOffset if accessor.byteOffset else 0 - byte_stride = buffer_view.byteStride if buffer_view.byteStride else num_components * np.dtype(dtype).itemsize - count = accessor.count - - # Extract the attribute data - attribute_data = np.zeros((count, num_components), dtype=dtype) - for i in range(count): - start = byte_offset + i * byte_stride - end = start + num_components * np.dtype(dtype).itemsize - attribute_data[i] = np.frombuffer(buffer_data[start:end], dtype=dtype) - - return attribute_data - - -# Function to extract image data -def get_image_data(gltf, image, folder): - if image.uri: - if image.uri.startswith('data:'): - # Data URI - header, encoded = image.uri.split(',', 1) - data = base64.b64decode(encoded) - else: - # External file - fn = image.uri - if not os.path.isabs(fn): - fn = folder + '/' + fn - with open(fn, 'rb') as f: - data = f.read() - else: - buffer_view = gltf.bufferViews[image.bufferView] - data = get_buffer_data(gltf, buffer_view) - return data - - -# Function to convert triangle strip to triangles -def convert_triangle_strip_to_triangles(indices): - triangles = [] - for i in range(len(indices) - 2): - if i % 2 == 0: - triangles.append([indices[i], indices[i + 1], indices[i + 2]]) - else: - triangles.append([indices[i], indices[i + 2], indices[i + 1]]) - return np.array(triangles).reshape(-1, 3) - - -# Function to convert triangle fan to triangles -def convert_triangle_fan_to_triangles(indices): - triangles = [] - for i in range(1, len(indices) - 1): - triangles.append([indices[0], indices[i], indices[i + 1]]) - return np.array(triangles).reshape(-1, 3) - - -# Function to get the transformation matrix from a node -def get_node_transform(node): - if node.matrix: - return np.array(node.matrix).reshape(4, 4).T - else: - T = np.eye(4) - if node.translation: - T[:3, 3] = node.translation - if node.rotation: - R_mat = R.from_quat(node.rotation).as_matrix() - T[:3, :3] = R_mat - if node.scale: - S = np.diag(node.scale + [1]) - T = T @ S - return T - - -def get_world_transform(gltf, node_index, parents, world_transforms): - if parents[node_index] == -2: - return world_transforms[node_index] - - node = gltf.nodes[node_index] - if parents[node_index] == -1: - world_transforms[node_index] = get_node_transform(node) - parents[node_index] = -2 - return world_transforms[node_index] - - parent_index = parents[node_index] - parent_transform = get_world_transform(gltf, parent_index, 
parents, world_transforms) - world_transforms[node_index] = parent_transform @ get_node_transform(node) - parents[node_index] = -2 - return world_transforms[node_index] - - -def LoadGlb(path): - # Load the GLB file using pygltflib - gltf = GLTF2().load(path) - - primitives = [] - images = {} - # Iterate through the meshes in the GLB file - - world_transforms = [np.identity(4) for i in range(len(gltf.nodes))] - parents = [-1 for i in range(len(gltf.nodes))] - for node_index, node in enumerate(gltf.nodes): - for idx in node.children: - parents[idx] = node_index - # for i in range(len(gltf.nodes)): - # get_world_transform(gltf, i, parents, world_transform) - - for node_index, node in enumerate(gltf.nodes): - if node.mesh is not None: - world_transform = get_world_transform(gltf, node_index, parents, world_transforms) - # Iterate through the primitives in the mesh - mesh = gltf.meshes[node.mesh] - for primitive in mesh.primitives: - # Access the attributes of the primitive - attributes = primitive.attributes.__dict__ - mode = primitive.mode if primitive.mode is not None else 4 # Default to TRIANGLES - result = {} - if primitive.indices is not None: - indices = get_attribute_data(gltf, primitive.indices) - if mode == 4: # TRIANGLES - face_indices = indices.reshape(-1, 3) - elif mode == 5: # TRIANGLE_STRIP - face_indices = convert_triangle_strip_to_triangles(indices) - elif mode == 6: # TRIANGLE_FAN - face_indices = convert_triangle_fan_to_triangles(indices) - else: - continue - result['F'] = face_indices - - # Extract vertex positions - if 'POSITION' in attributes and attributes['POSITION'] is not None: - positions = get_attribute_data(gltf, attributes['POSITION']) - # Apply the world transformation to the positions - positions_homogeneous = np.hstack([positions, np.ones((positions.shape[0], 1))]) - transformed_positions = (world_transform @ positions_homogeneous.T).T[:, :3] - result['V'] = transformed_positions - - # Extract vertex colors - if 'COLOR_0' in attributes and attributes['COLOR_0'] is not None: - colors = get_attribute_data(gltf, attributes['COLOR_0']) - if colors.shape[-1] > 3: - colors = colors[..., :3] - result['VC'] = colors - - # Extract UVs - if 'TEXCOORD_0' in attributes and not attributes['TEXCOORD_0'] is None: - uvs = get_attribute_data(gltf, attributes['TEXCOORD_0']) - result['UV'] = uvs - - if primitive.material is not None: - material = gltf.materials[primitive.material] - if material.pbrMetallicRoughness is not None and material.pbrMetallicRoughness.baseColorTexture is not None: - texture_index = material.pbrMetallicRoughness.baseColorTexture.index - texture = gltf.textures[texture_index] - image_index = texture.source - if not image_index in images: - image = gltf.images[image_index] - image_data = get_image_data(gltf, image, os.path.dirname(path)) - pil_image = PILImage.open(io.BytesIO(image_data)) - if pil_image.mode != 'RGB': - pil_image = pil_image.convert('RGB') - images[image_index] = pil_image - result['TEX'] = image_index - elif material.emissiveTexture is not None: - texture_index = material.emissiveTexture.index - texture = gltf.textures[texture_index] - image_index = texture.source - if not image_index in images: - image = gltf.images[image_index] - image_data = get_image_data(gltf, image, os.path.dirname(path)) - pil_image = PILImage.open(io.BytesIO(image_data)) - if pil_image.mode != 'RGB': - pil_image = pil_image.convert('RGB') - images[image_index] = pil_image - result['TEX'] = image_index - else: - if material.pbrMetallicRoughness is not None: - 
base_color = material.pbrMetallicRoughness.baseColorFactor - else: - base_color = np.array([0.8, 0.8, 0.8], dtype=np.float32) - result['MC'] = base_color - - primitives.append(result) - - return primitives, images - - -def RotatePrimitives(primitives, transform): - for i in range(len(primitives)): - if 'V' in primitives[i]: - primitives[i]['V'] = primitives[i]['V'] @ transform.T - - -if __name__ == '__main__': - path = 'data/test.glb' - LoadGlb(path) diff --git a/hy3dgen/texgen/custom_rasterizer/custom_rasterizer/io_obj.py b/hy3dgen/texgen/custom_rasterizer/custom_rasterizer/io_obj.py deleted file mode 100644 index a72c478d8efcb9a3d71a67ce5f167559ef76b922..0000000000000000000000000000000000000000 --- a/hy3dgen/texgen/custom_rasterizer/custom_rasterizer/io_obj.py +++ /dev/null @@ -1,76 +0,0 @@ -# Open Source Model Licensed under the Apache License Version 2.0 -# and Other Licenses of the Third-Party Components therein: -# The below Model in this distribution may have been modified by THL A29 Limited -# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. - -# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. -# The below software and/or models in this distribution may have been -# modified by THL A29 Limited ("Tencent Modifications"). -# All Tencent Modifications are Copyright (C) THL A29 Limited. - -# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT -# except for the third-party components listed below. -# Hunyuan 3D does not impose any additional limitations beyond what is outlined -# in the repsective licenses of these third-party components. -# Users must comply with all terms and conditions of original licenses of these third-party -# components and must ensure that the usage of the third party components adheres to -# all relevant laws and regulations. - -# For avoidance of doubts, Hunyuan 3D means the large language models and -# their software and algorithms, including trained model weights, parameters (including -# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, -# fine-tuning enabling code and other elements of the foregoing made publicly available -# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
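The strip and fan handling in the loader above is the usual winding-preserving expansion: even-indexed strip triangles keep vertex order, odd-indexed ones swap two vertices. A self-contained check of that rule, for illustration only:

```python
import numpy as np

# Mirrors convert_triangle_strip_to_triangles above.
strip = [0, 1, 2, 3, 4]
tris = [[strip[i], strip[i + 1], strip[i + 2]] if i % 2 == 0
        else [strip[i], strip[i + 2], strip[i + 1]]
        for i in range(len(strip) - 2)]
print(np.array(tris))  # [[0 1 2] [1 3 2] [2 3 4]]
```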
- -import cv2 -import numpy as np - - -def LoadObj(fn): - lines = [l.strip() for l in open(fn)] - vertices = [] - faces = [] - for l in lines: - words = [w for w in l.split(' ') if w != ''] - if len(words) == 0: - continue - if words[0] == 'v': - v = [float(words[i]) for i in range(1, 4)] - vertices.append(v) - elif words[0] == 'f': - f = [int(words[i]) - 1 for i in range(1, 4)] - faces.append(f) - - return np.array(vertices).astype('float32'), np.array(faces).astype('int32') - - -def LoadObjWithTexture(fn, tex_fn): - lines = [l.strip() for l in open(fn)] - vertices = [] - vertex_textures = [] - faces = [] - face_textures = [] - for l in lines: - words = [w for w in l.split(' ') if w != ''] - if len(words) == 0: - continue - if words[0] == 'v': - v = [float(words[i]) for i in range(1, len(words))] - vertices.append(v) - elif words[0] == 'vt': - v = [float(words[i]) for i in range(1, len(words))] - vertex_textures.append(v) - elif words[0] == 'f': - f = [] - ft = [] - for i in range(1, len(words)): - t = words[i].split('/') - f.append(int(t[0]) - 1) - ft.append(int(t[1]) - 1) - for i in range(2, len(f)): - faces.append([f[0], f[i - 1], f[i]]) - face_textures.append([ft[0], ft[i - 1], ft[i]]) - - tex_image = cv2.cvtColor(cv2.imread(tex_fn), cv2.COLOR_BGR2RGB) - return np.array(vertices).astype('float32'), np.array(vertex_textures).astype('float32'), np.array(faces).astype( - 'int32'), np.array(face_textures).astype('int32'), tex_image diff --git a/hy3dgen/texgen/custom_rasterizer/custom_rasterizer/render.py b/hy3dgen/texgen/custom_rasterizer/custom_rasterizer/render.py deleted file mode 100644 index 743d4aac4da9e1e18374ce712ac24d19e6788870..0000000000000000000000000000000000000000 --- a/hy3dgen/texgen/custom_rasterizer/custom_rasterizer/render.py +++ /dev/null @@ -1,41 +0,0 @@ -# Open Source Model Licensed under the Apache License Version 2.0 -# and Other Licenses of the Third-Party Components therein: -# The below Model in this distribution may have been modified by THL A29 Limited -# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. - -# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. -# The below software and/or models in this distribution may have been -# modified by THL A29 Limited ("Tencent Modifications"). -# All Tencent Modifications are Copyright (C) THL A29 Limited. - -# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT -# except for the third-party components listed below. -# Hunyuan 3D does not impose any additional limitations beyond what is outlined -# in the repsective licenses of these third-party components. -# Users must comply with all terms and conditions of original licenses of these third-party -# components and must ensure that the usage of the third party components adheres to -# all relevant laws and regulations. - -# For avoidance of doubts, Hunyuan 3D means the large language models and -# their software and algorithms, including trained model weights, parameters (including -# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, -# fine-tuning enabling code and other elements of the foregoing made publicly available -# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
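`LoadObjWithTexture` above fan-triangulates each face record after shifting OBJ's 1-based indices down to 0-based. A tiny standalone restatement of that triangulation:

```python
# An OBJ face "f 1/1 2/2 3/3 4/4" becomes triangles (0,1,2) and (0,2,3),
# exactly as the loop over range(2, len(f)) above produces.
face = [0, 1, 2, 3]  # 0-based vertex indices of a quad
triangles = [[face[0], face[i - 1], face[i]] for i in range(2, len(face))]
assert triangles == [[0, 1, 2], [0, 2, 3]]
```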
- -import custom_rasterizer_kernel -import torch - - -def rasterize(pos, tri, resolution, clamp_depth=torch.zeros(0), use_depth_prior=0): - assert (pos.device == tri.device) - findices, barycentric = custom_rasterizer_kernel.rasterize_image(pos[0], tri, clamp_depth, resolution[1], - resolution[0], 1e-6, use_depth_prior) - return findices, barycentric - - -def interpolate(col, findices, barycentric, tri): - f = findices - 1 + (findices == 0) - vcol = col[0, tri.long()[f.long()]] - result = barycentric.view(*barycentric.shape, 1) * vcol - result = torch.sum(result, axis=-2) - return result.view(1, *result.shape) diff --git a/hy3dgen/texgen/custom_rasterizer/lib/custom_rasterizer_kernel/__init__.py b/hy3dgen/texgen/custom_rasterizer/lib/custom_rasterizer_kernel/__init__.py deleted file mode 100644 index e307c3f8c1292da02f308e4b59ef0bcd6fe7305e..0000000000000000000000000000000000000000 --- a/hy3dgen/texgen/custom_rasterizer/lib/custom_rasterizer_kernel/__init__.py +++ /dev/null @@ -1,23 +0,0 @@ -# Open Source Model Licensed under the Apache License Version 2.0 -# and Other Licenses of the Third-Party Components therein: -# The below Model in this distribution may have been modified by THL A29 Limited -# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. - -# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. -# The below software and/or models in this distribution may have been -# modified by THL A29 Limited ("Tencent Modifications"). -# All Tencent Modifications are Copyright (C) THL A29 Limited. - -# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT -# except for the third-party components listed below. -# Hunyuan 3D does not impose any additional limitations beyond what is outlined -# in the repsective licenses of these third-party components. -# Users must comply with all terms and conditions of original licenses of these third-party -# components and must ensure that the usage of the third party components adheres to -# all relevant laws and regulations. - -# For avoidance of doubts, Hunyuan 3D means the large language models and -# their software and algorithms, including trained model weights, parameters (including -# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, -# fine-tuning enabling code and other elements of the foregoing made publicly available -# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
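To make the indexing in `interpolate` above concrete: `findices` is 1-based, with 0 marking background pixels (remapped to face 0 so the gather stays in bounds), and vertex attributes are blended with the barycentric weights. A one-pixel worked example using the same tensor operations:

```python
import torch

col = torch.tensor([[[1.0, 0.0], [0.0, 1.0], [0.5, 0.5]]])  # (1, 3 verts, 2 attrs)
tri = torch.tensor([[0, 1, 2]])
findices = torch.tensor([[1]])               # 1-based face index; 0 = background
bary = torch.tensor([[[0.2, 0.3, 0.5]]])     # (H=1, W=1, 3) barycentric weights

f = findices - 1 + (findices == 0)           # map background 0 -> face 0
vcol = col[0, tri.long()[f.long()]]          # (1, 1, 3, 2) per-vertex attrs
out = (bary.unsqueeze(-1) * vcol).sum(dim=-2)  # 0.2*v0 + 0.3*v1 + 0.5*v2
print(out)  # tensor([[[0.4500, 0.5500]]])
```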
diff --git a/hy3dgen/texgen/custom_rasterizer/lib/custom_rasterizer_kernel/grid_neighbor.cpp b/hy3dgen/texgen/custom_rasterizer/lib/custom_rasterizer_kernel/grid_neighbor.cpp deleted file mode 100644 index dab3983eef9cae227710bcdc4d86fc2e50b4e6be..0000000000000000000000000000000000000000 --- a/hy3dgen/texgen/custom_rasterizer/lib/custom_rasterizer_kernel/grid_neighbor.cpp +++ /dev/null @@ -1,575 +0,0 @@ -#include "rasterizer.h" -#include - -inline int pos2key(float* p, int resolution) { - int x = (p[0] * 0.5 + 0.5) * resolution; - int y = (p[1] * 0.5 + 0.5) * resolution; - int z = (p[2] * 0.5 + 0.5) * resolution; - return (x * resolution + y) * resolution + z; -} - -inline void key2pos(int key, int resolution, float* p) { - int x = key / resolution / resolution; - int y = key / resolution % resolution; - int z = key % resolution; - p[0] = ((x + 0.5) / resolution - 0.5) * 2; - p[1] = ((y + 0.5) / resolution - 0.5) * 2; - p[2] = ((z + 0.5) / resolution - 0.5) * 2; -} - -inline void key2cornerpos(int key, int resolution, float* p) { - int x = key / resolution / resolution; - int y = key / resolution % resolution; - int z = key % resolution; - p[0] = ((x + 0.75) / resolution - 0.5) * 2; - p[1] = ((y + 0.25) / resolution - 0.5) * 2; - p[2] = ((z + 0.75) / resolution - 0.5) * 2; -} - -inline float* pos_ptr(int l, int i, int j, torch::Tensor t) { - float* pdata = t.data_ptr(); - int height = t.size(1); - int width = t.size(2); - return &pdata[((l * height + i) * width + j) * 4]; -} - -struct Grid -{ - std::vector seq2oddcorner; - std::vector seq2evencorner; - std::vector seq2grid; - std::vector seq2normal; - std::vector seq2neighbor; - std::unordered_map grid2seq; - std::vector downsample_seq; - int num_origin_seq; - int resolution; - int stride; -}; - -inline void pos_from_seq(Grid& grid, int seq, float* p) { - auto k = grid.seq2grid[seq]; - key2pos(k, grid.resolution, p); -} - -inline int fetch_seq(Grid& grid, int l, int i, int j, torch::Tensor pdata) { - float* p = pos_ptr(l, i, j, pdata); - if (p[3] == 0) - return -1; - auto key = pos2key(p, grid.resolution); - int seq = grid.grid2seq[key]; - return seq; -} - -inline int fetch_last_seq(Grid& grid, int i, int j, torch::Tensor pdata) { - int num_layers = pdata.size(0); - int l = 0; - int idx = fetch_seq(grid, l, i, j, pdata); - while (l < num_layers - 1) { - l += 1; - int new_idx = fetch_seq(grid, l, i, j, pdata); - if (new_idx == -1) - break; - idx = new_idx; - } - return idx; -} - -inline int fetch_nearest_seq(Grid& grid, int i, int j, int dim, float d, torch::Tensor pdata) { - float p[3]; - float max_dist = 1e10; - int best_idx = -1; - int num_layers = pdata.size(0); - for (int l = 0; l < num_layers; ++l) { - int idx = fetch_seq(grid, l, i, j, pdata); - if (idx == -1) - break; - pos_from_seq(grid, idx, p); - float dist = std::abs(d - p[(dim + 2) % 3]); - if (dist < max_dist) { - max_dist = dist; - best_idx = idx; - } - } - return best_idx; -} - -inline int fetch_nearest_seq_layer(Grid& grid, int i, int j, int dim, float d, torch::Tensor pdata) { - float p[3]; - float max_dist = 1e10; - int best_layer = -1; - int num_layers = pdata.size(0); - for (int l = 0; l < num_layers; ++l) { - int idx = fetch_seq(grid, l, i, j, pdata); - if (idx == -1) - break; - pos_from_seq(grid, idx, p); - float dist = std::abs(d - p[(dim + 2) % 3]); - if (dist < max_dist) { - max_dist = dist; - best_layer = l; - } - } - return best_layer; -} - -void FetchNeighbor(Grid& grid, int seq, float* pos, int dim, int boundary_info, std::vector& view_layer_positions, - int* 
output_indices) -{ - auto t = view_layer_positions[dim]; - int height = t.size(1); - int width = t.size(2); - int top = 0; - int ci = 0; - int cj = 0; - if (dim == 0) { - ci = (pos[1]/2+0.5)*height; - cj = (pos[0]/2+0.5)*width; - } - else if (dim == 1) { - ci = (pos[1]/2+0.5)*height; - cj = (pos[2]/2+0.5)*width; - } - else { - ci = (-pos[2]/2+0.5)*height; - cj = (pos[0]/2+0.5)*width; - } - int stride = grid.stride; - for (int ni = ci + stride; ni >= ci - stride; ni -= stride) { - for (int nj = cj - stride; nj <= cj + stride; nj += stride) { - int idx = -1; - if (ni == ci && nj == cj) - idx = seq; - else if (!(ni < 0 || ni >= height || nj < 0 || nj >= width)) { - if (boundary_info == -1) - idx = fetch_seq(grid, 0, ni, nj, t); - else if (boundary_info == 1) - idx = fetch_last_seq(grid, ni, nj, t); - else - idx = fetch_nearest_seq(grid, ni, nj, dim, pos[(dim + 2) % 3], t); - } - output_indices[top] = idx; - top += 1; - } - } -} - -void DownsampleGrid(Grid& src, Grid& tar) -{ - src.downsample_seq.resize(src.seq2grid.size(), -1); - tar.resolution = src.resolution / 2; - tar.stride = src.stride * 2; - float pos[3]; - std::vector seq2normal_count; - for (int i = 0; i < src.seq2grid.size(); ++i) { - key2pos(src.seq2grid[i], src.resolution, pos); - int k = pos2key(pos, tar.resolution); - int s = seq2normal_count.size(); - if (!tar.grid2seq.count(k)) { - tar.grid2seq[k] = tar.seq2grid.size(); - tar.seq2grid.emplace_back(k); - seq2normal_count.emplace_back(0); - seq2normal_count.emplace_back(0); - seq2normal_count.emplace_back(0); - //tar.seq2normal.emplace_back(src.seq2normal[i]); - } else { - s = tar.grid2seq[k] * 3; - } - seq2normal_count[s + src.seq2normal[i]] += 1; - src.downsample_seq[i] = tar.grid2seq[k]; - } - tar.seq2normal.resize(seq2normal_count.size() / 3); - for (int i = 0; i < seq2normal_count.size(); i += 3) { - int t = 0; - for (int j = 1; j < 3; ++j) { - if (seq2normal_count[i + j] > seq2normal_count[i + t]) - t = j; - } - tar.seq2normal[i / 3] = t; - } -} - -void NeighborGrid(Grid& grid, std::vector view_layer_positions, int v) -{ - grid.seq2evencorner.resize(grid.seq2grid.size(), 0); - grid.seq2oddcorner.resize(grid.seq2grid.size(), 0); - std::unordered_set visited_seq; - for (int vd = 0; vd < 3; ++vd) { - auto t = view_layer_positions[vd]; - auto t0 = view_layer_positions[v]; - int height = t.size(1); - int width = t.size(2); - int num_layers = t.size(0); - int num_view_layers = t0.size(0); - for (int i = 0; i < height; ++i) { - for (int j = 0; j < width; ++j) { - for (int l = 0; l < num_layers; ++l) { - int seq = fetch_seq(grid, l, i, j, t); - if (seq == -1) - break; - int dim = grid.seq2normal[seq]; - if (dim != v) - continue; - - float pos[3]; - pos_from_seq(grid, seq, pos); - - int ci = 0; - int cj = 0; - if (dim == 0) { - ci = (pos[1]/2+0.5)*height; - cj = (pos[0]/2+0.5)*width; - } - else if (dim == 1) { - ci = (pos[1]/2+0.5)*height; - cj = (pos[2]/2+0.5)*width; - } - else { - ci = (-pos[2]/2+0.5)*height; - cj = (pos[0]/2+0.5)*width; - } - - if ((ci % (grid.stride * 2) < grid.stride) && (cj % (grid.stride * 2) >= grid.stride)) - grid.seq2evencorner[seq] = 1; - - if ((ci % (grid.stride * 2) >= grid.stride) && (cj % (grid.stride * 2) < grid.stride)) - grid.seq2oddcorner[seq] = 1; - - bool is_boundary = false; - if (vd == v) { - if (l == 0 || l == num_layers - 1) - is_boundary = true; - else { - int seq_new = fetch_seq(grid, l + 1, i, j, t); - if (seq_new == -1) - is_boundary = true; - } - } - int boundary_info = 0; - if (is_boundary && (l == 0)) - boundary_info = -1; - else if 
(is_boundary) - boundary_info = 1; - if (visited_seq.count(seq)) - continue; - visited_seq.insert(seq); - - FetchNeighbor(grid, seq, pos, dim, boundary_info, view_layer_positions, &grid.seq2neighbor[seq * 9]); - } - } - } - } -} - -void PadGrid(Grid& src, Grid& tar, std::vector& view_layer_positions) { - auto& downsample_seq = src.downsample_seq; - auto& seq2evencorner = src.seq2evencorner; - auto& seq2oddcorner = src.seq2oddcorner; - int indices[9]; - std::vector mapped_even_corners(tar.seq2grid.size(), 0); - std::vector mapped_odd_corners(tar.seq2grid.size(), 0); - for (int i = 0; i < downsample_seq.size(); ++i) { - if (seq2evencorner[i] > 0) { - mapped_even_corners[downsample_seq[i]] = 1; - } - if (seq2oddcorner[i] > 0) { - mapped_odd_corners[downsample_seq[i]] = 1; - } - } - auto& tar_seq2normal = tar.seq2normal; - auto& tar_seq2grid = tar.seq2grid; - for (int i = 0; i < tar_seq2grid.size(); ++i) { - if (mapped_even_corners[i] == 1 && mapped_odd_corners[i] == 1) - continue; - auto k = tar_seq2grid[i]; - float p[3]; - key2cornerpos(k, tar.resolution, p); - - int src_key = pos2key(p, src.resolution); - if (!src.grid2seq.count(src_key)) { - int seq = src.seq2grid.size(); - src.grid2seq[src_key] = seq; - src.seq2evencorner.emplace_back((mapped_even_corners[i] == 0)); - src.seq2oddcorner.emplace_back((mapped_odd_corners[i] == 0)); - src.seq2grid.emplace_back(src_key); - src.seq2normal.emplace_back(tar_seq2normal[i]); - FetchNeighbor(src, seq, p, tar_seq2normal[i], 0, view_layer_positions, indices); - for (int j = 0; j < 9; ++j) { - src.seq2neighbor.emplace_back(indices[j]); - } - src.downsample_seq.emplace_back(i); - } else { - int seq = src.grid2seq[src_key]; - if (mapped_even_corners[i] == 0) - src.seq2evencorner[seq] = 1; - if (mapped_odd_corners[i] == 0) - src.seq2oddcorner[seq] = 1; - } - } -} - -std::vector> build_hierarchy(std::vector view_layer_positions, - std::vector view_layer_normals, int num_level, int resolution) -{ - if (view_layer_positions.size() != 3 || num_level < 1) { - printf("Alert! We require 3 layers and at least 1 level! 
(%d %d)\n", view_layer_positions.size(), num_level); - return {{},{},{},{}}; - } - - std::vector grids; - grids.resize(num_level); - - std::vector seq2pos; - auto& seq2grid = grids[0].seq2grid; - auto& seq2normal = grids[0].seq2normal; - auto& grid2seq = grids[0].grid2seq; - grids[0].resolution = resolution; - grids[0].stride = 1; - - auto int64_options = torch::TensorOptions().dtype(torch::kInt64).requires_grad(false); - auto float_options = torch::TensorOptions().dtype(torch::kFloat32).requires_grad(false); - - for (int v = 0; v < 3; ++v) { - int num_layers = view_layer_positions[v].size(0); - int height = view_layer_positions[v].size(1); - int width = view_layer_positions[v].size(2); - float* data = view_layer_positions[v].data_ptr(); - float* data_normal = view_layer_normals[v].data_ptr(); - for (int l = 0; l < num_layers; ++l) { - for (int i = 0; i < height; ++i) { - for (int j = 0; j < width; ++j) { - float* p = &data[(i * width + j) * 4]; - float* n = &data_normal[(i * width + j) * 3]; - if (p[3] == 0) - continue; - auto k = pos2key(p, resolution); - if (!grid2seq.count(k)) { - int dim = 0; - for (int d = 0; d < 3; ++d) { - if (std::abs(n[d]) > std::abs(n[dim])) - dim = d; - } - dim = (dim + 1) % 3; - grid2seq[k] = seq2grid.size(); - seq2grid.emplace_back(k); - seq2pos.push_back(p[0]); - seq2pos.push_back(p[1]); - seq2pos.push_back(p[2]); - seq2normal.emplace_back(dim); - } - } - } - data += (height * width * 4); - data_normal += (height * width * 3); - } - } - - for (int i = 0; i < num_level - 1; ++i) { - DownsampleGrid(grids[i], grids[i + 1]); - } - - for (int l = 0; l < num_level; ++l) { - grids[l].seq2neighbor.resize(grids[l].seq2grid.size() * 9, -1); - grids[l].num_origin_seq = grids[l].seq2grid.size(); - for (int d = 0; d < 3; ++d) { - NeighborGrid(grids[l], view_layer_positions, d); - } - } - - for (int i = num_level - 2; i >= 0; --i) { - PadGrid(grids[i], grids[i + 1], view_layer_positions); - } - for (int i = grids[0].num_origin_seq; i < grids[0].seq2grid.size(); ++i) { - int k = grids[0].seq2grid[i]; - float p[3]; - key2pos(k, grids[0].resolution, p); - seq2pos.push_back(p[0]); - seq2pos.push_back(p[1]); - seq2pos.push_back(p[2]); - } - - std::vector texture_positions(2); - std::vector grid_neighbors(grids.size()); - std::vector grid_downsamples(grids.size() - 1); - std::vector grid_evencorners(grids.size()); - std::vector grid_oddcorners(grids.size()); - - - texture_positions[0] = torch::zeros({static_cast(seq2pos.size() / 3), static_cast(3)}, float_options); - texture_positions[1] = torch::zeros({static_cast(seq2pos.size() / 3)}, float_options); - float* positions_out_ptr = texture_positions[0].data_ptr(); - memcpy(positions_out_ptr, seq2pos.data(), sizeof(float) * seq2pos.size()); - positions_out_ptr = texture_positions[1].data_ptr(); - for (int i = 0; i < grids[0].seq2grid.size(); ++i) { - positions_out_ptr[i] = (i < grids[0].num_origin_seq); - } - - for (int i = 0; i < grids.size(); ++i) { - grid_neighbors[i] = torch::zeros({static_cast(grids[i].seq2grid.size()), static_cast(9)}, int64_options); - int64_t* nptr = grid_neighbors[i].data_ptr(); - for (int j = 0; j < grids[i].seq2neighbor.size(); ++j) { - nptr[j] = grids[i].seq2neighbor[j]; - } - - grid_evencorners[i] = torch::zeros({static_cast(grids[i].seq2evencorner.size())}, int64_options); - grid_oddcorners[i] = torch::zeros({static_cast(grids[i].seq2oddcorner.size())}, int64_options); - int64_t* dptr = grid_evencorners[i].data_ptr(); - for (int j = 0; j < grids[i].seq2evencorner.size(); ++j) { - dptr[j] = 
grids[i].seq2evencorner[j]; - } - dptr = grid_oddcorners[i].data_ptr(); - for (int j = 0; j < grids[i].seq2oddcorner.size(); ++j) { - dptr[j] = grids[i].seq2oddcorner[j]; - } - if (i + 1 < grids.size()) { - grid_downsamples[i] = torch::zeros({static_cast(grids[i].downsample_seq.size())}, int64_options); - int64_t* dptr = grid_downsamples[i].data_ptr(); - for (int j = 0; j < grids[i].downsample_seq.size(); ++j) { - dptr[j] = grids[i].downsample_seq[j]; - } - } - - } - return {texture_positions, grid_neighbors, grid_downsamples, grid_evencorners, grid_oddcorners}; -} - -std::vector> build_hierarchy_with_feat( - std::vector view_layer_positions, - std::vector view_layer_normals, - std::vector view_layer_feats, - int num_level, int resolution) -{ - if (view_layer_positions.size() != 3 || num_level < 1) { - printf("Alert! We require 3 layers and at least 1 level! (%d %d)\n", view_layer_positions.size(), num_level); - return {{},{},{},{}}; - } - - std::vector grids; - grids.resize(num_level); - - std::vector seq2pos; - std::vector seq2feat; - auto& seq2grid = grids[0].seq2grid; - auto& seq2normal = grids[0].seq2normal; - auto& grid2seq = grids[0].grid2seq; - grids[0].resolution = resolution; - grids[0].stride = 1; - - auto int64_options = torch::TensorOptions().dtype(torch::kInt64).requires_grad(false); - auto float_options = torch::TensorOptions().dtype(torch::kFloat32).requires_grad(false); - - int feat_channel = 3; - for (int v = 0; v < 3; ++v) { - int num_layers = view_layer_positions[v].size(0); - int height = view_layer_positions[v].size(1); - int width = view_layer_positions[v].size(2); - float* data = view_layer_positions[v].data_ptr(); - float* data_normal = view_layer_normals[v].data_ptr(); - float* data_feat = view_layer_feats[v].data_ptr(); - feat_channel = view_layer_feats[v].size(3); - for (int l = 0; l < num_layers; ++l) { - for (int i = 0; i < height; ++i) { - for (int j = 0; j < width; ++j) { - float* p = &data[(i * width + j) * 4]; - float* n = &data_normal[(i * width + j) * 3]; - float* f = &data_feat[(i * width + j) * feat_channel]; - if (p[3] == 0) - continue; - auto k = pos2key(p, resolution); - if (!grid2seq.count(k)) { - int dim = 0; - for (int d = 0; d < 3; ++d) { - if (std::abs(n[d]) > std::abs(n[dim])) - dim = d; - } - dim = (dim + 1) % 3; - grid2seq[k] = seq2grid.size(); - seq2grid.emplace_back(k); - seq2pos.push_back(p[0]); - seq2pos.push_back(p[1]); - seq2pos.push_back(p[2]); - for (int c = 0; c < feat_channel; ++c) { - seq2feat.emplace_back(f[c]); - } - seq2normal.emplace_back(dim); - } - } - } - data += (height * width * 4); - data_normal += (height * width * 3); - data_feat += (height * width * feat_channel); - } - } - - for (int i = 0; i < num_level - 1; ++i) { - DownsampleGrid(grids[i], grids[i + 1]); - } - - for (int l = 0; l < num_level; ++l) { - grids[l].seq2neighbor.resize(grids[l].seq2grid.size() * 9, -1); - grids[l].num_origin_seq = grids[l].seq2grid.size(); - for (int d = 0; d < 3; ++d) { - NeighborGrid(grids[l], view_layer_positions, d); - } - } - - for (int i = num_level - 2; i >= 0; --i) { - PadGrid(grids[i], grids[i + 1], view_layer_positions); - } - for (int i = grids[0].num_origin_seq; i < grids[0].seq2grid.size(); ++i) { - int k = grids[0].seq2grid[i]; - float p[3]; - key2pos(k, grids[0].resolution, p); - seq2pos.push_back(p[0]); - seq2pos.push_back(p[1]); - seq2pos.push_back(p[2]); - for (int c = 0; c < feat_channel; ++c) { - seq2feat.emplace_back(0.5); - } - } - - std::vector texture_positions(2); - std::vector texture_feats(1); - std::vector 
grid_neighbors(grids.size()); - std::vector grid_downsamples(grids.size() - 1); - std::vector grid_evencorners(grids.size()); - std::vector grid_oddcorners(grids.size()); - - texture_positions[0] = torch::zeros({static_cast(seq2pos.size() / 3), static_cast(3)}, float_options); - texture_positions[1] = torch::zeros({static_cast(seq2pos.size() / 3)}, float_options); - texture_feats[0] = torch::zeros({static_cast(seq2feat.size() / feat_channel), static_cast(feat_channel)}, float_options); - float* positions_out_ptr = texture_positions[0].data_ptr(); - memcpy(positions_out_ptr, seq2pos.data(), sizeof(float) * seq2pos.size()); - positions_out_ptr = texture_positions[1].data_ptr(); - for (int i = 0; i < grids[0].seq2grid.size(); ++i) { - positions_out_ptr[i] = (i < grids[0].num_origin_seq); - } - float* feats_out_ptr = texture_feats[0].data_ptr(); - memcpy(feats_out_ptr, seq2feat.data(), sizeof(float) * seq2feat.size()); - - for (int i = 0; i < grids.size(); ++i) { - grid_neighbors[i] = torch::zeros({static_cast(grids[i].seq2grid.size()), static_cast(9)}, int64_options); - int64_t* nptr = grid_neighbors[i].data_ptr(); - for (int j = 0; j < grids[i].seq2neighbor.size(); ++j) { - nptr[j] = grids[i].seq2neighbor[j]; - } - grid_evencorners[i] = torch::zeros({static_cast(grids[i].seq2evencorner.size())}, int64_options); - grid_oddcorners[i] = torch::zeros({static_cast(grids[i].seq2oddcorner.size())}, int64_options); - int64_t* dptr = grid_evencorners[i].data_ptr(); - for (int j = 0; j < grids[i].seq2evencorner.size(); ++j) { - dptr[j] = grids[i].seq2evencorner[j]; - } - dptr = grid_oddcorners[i].data_ptr(); - for (int j = 0; j < grids[i].seq2oddcorner.size(); ++j) { - dptr[j] = grids[i].seq2oddcorner[j]; - } - if (i + 1 < grids.size()) { - grid_downsamples[i] = torch::zeros({static_cast(grids[i].downsample_seq.size())}, int64_options); - int64_t* dptr = grid_downsamples[i].data_ptr(); - for (int j = 0; j < grids[i].downsample_seq.size(); ++j) { - dptr[j] = grids[i].downsample_seq[j]; - } - } - } - return {texture_positions, texture_feats, grid_neighbors, grid_downsamples, grid_evencorners, grid_oddcorners}; -} diff --git a/hy3dgen/texgen/custom_rasterizer/lib/custom_rasterizer_kernel/rasterizer.cpp b/hy3dgen/texgen/custom_rasterizer/lib/custom_rasterizer_kernel/rasterizer.cpp deleted file mode 100644 index 4529d7eb674d5263f5103f7a2c2aa5085ee752d5..0000000000000000000000000000000000000000 --- a/hy3dgen/texgen/custom_rasterizer/lib/custom_rasterizer_kernel/rasterizer.cpp +++ /dev/null @@ -1,139 +0,0 @@ -#include "rasterizer.h" - -void rasterizeTriangleCPU(int idx, float* vt0, float* vt1, float* vt2, int width, int height, INT64* zbuffer, float* d, float occlusion_truncation) { - float x_min = std::min(vt0[0], std::min(vt1[0],vt2[0])); - float x_max = std::max(vt0[0], std::max(vt1[0],vt2[0])); - float y_min = std::min(vt0[1], std::min(vt1[1],vt2[1])); - float y_max = std::max(vt0[1], std::max(vt1[1],vt2[1])); - - for (int px = x_min; px < x_max + 1; ++px) { - if (px < 0 || px >= width) - continue; - for (int py = y_min; py < y_max + 1; ++py) { - if (py < 0 || py >= height) - continue; - float vt[2] = {px + 0.5f, py + 0.5f}; - float baryCentricCoordinate[3]; - calculateBarycentricCoordinate(vt0, vt1, vt2, vt, baryCentricCoordinate); - if (isBarycentricCoordInBounds(baryCentricCoordinate)) { - int pixel = py * width + px; - if (zbuffer == 0) { - zbuffer[pixel] = (INT64)(idx + 1); - continue; - } - - float depth = baryCentricCoordinate[0] * vt0[2] + baryCentricCoordinate[1] * vt1[2] + 
baryCentricCoordinate[2] * vt2[2]; - float depth_thres = 0; - if (d) { - depth_thres = d[pixel] * 0.49999f + 0.5f + occlusion_truncation; - } - - int z_quantize = depth * (2<<17); - INT64 token = (INT64)z_quantize * MAXINT + (INT64)(idx + 1); - if (depth < depth_thres) - continue; - zbuffer[pixel] = std::min(zbuffer[pixel], token); - } - } - } -} - -void barycentricFromImgcoordCPU(float* V, int* F, int* findices, INT64* zbuffer, int width, int height, int num_vertices, int num_faces, - float* barycentric_map, int pix) -{ - INT64 f = zbuffer[pix] % MAXINT; - if (f == (MAXINT-1)) { - findices[pix] = 0; - barycentric_map[pix * 3] = 0; - barycentric_map[pix * 3 + 1] = 0; - barycentric_map[pix * 3 + 2] = 0; - return; - } - findices[pix] = f; - f -= 1; - float barycentric[3] = {0, 0, 0}; - if (f >= 0) { - float vt[2] = {float(pix % width) + 0.5f, float(pix / width) + 0.5f}; - float* vt0_ptr = V + (F[f * 3] * 4); - float* vt1_ptr = V + (F[f * 3 + 1] * 4); - float* vt2_ptr = V + (F[f * 3 + 2] * 4); - - float vt0[2] = {(vt0_ptr[0] / vt0_ptr[3] * 0.5f + 0.5f) * (width - 1) + 0.5f, (0.5f + 0.5f * vt0_ptr[1] / vt0_ptr[3]) * (height - 1) + 0.5f}; - float vt1[2] = {(vt1_ptr[0] / vt1_ptr[3] * 0.5f + 0.5f) * (width - 1) + 0.5f, (0.5f + 0.5f * vt1_ptr[1] / vt1_ptr[3]) * (height - 1) + 0.5f}; - float vt2[2] = {(vt2_ptr[0] / vt2_ptr[3] * 0.5f + 0.5f) * (width - 1) + 0.5f, (0.5f + 0.5f * vt2_ptr[1] / vt2_ptr[3]) * (height - 1) + 0.5f}; - - calculateBarycentricCoordinate(vt0, vt1, vt2, vt, barycentric); - - barycentric[0] = barycentric[0] / vt0_ptr[3]; - barycentric[1] = barycentric[1] / vt1_ptr[3]; - barycentric[2] = barycentric[2] / vt2_ptr[3]; - float w = 1.0f / (barycentric[0] + barycentric[1] + barycentric[2]); - barycentric[0] *= w; - barycentric[1] *= w; - barycentric[2] *= w; - - } - barycentric_map[pix * 3] = barycentric[0]; - barycentric_map[pix * 3 + 1] = barycentric[1]; - barycentric_map[pix * 3 + 2] = barycentric[2]; -} - -void rasterizeImagecoordsKernelCPU(float* V, int* F, float* d, INT64* zbuffer, float occlusion_trunc, int width, int height, int num_vertices, int num_faces, int f) -{ - float* vt0_ptr = V + (F[f * 3] * 4); - float* vt1_ptr = V + (F[f * 3 + 1] * 4); - float* vt2_ptr = V + (F[f * 3 + 2] * 4); - - float vt0[3] = {(vt0_ptr[0] / vt0_ptr[3] * 0.5f + 0.5f) * (width - 1) + 0.5f, (0.5f + 0.5f * vt0_ptr[1] / vt0_ptr[3]) * (height - 1) + 0.5f, vt0_ptr[2] / vt0_ptr[3] * 0.49999f + 0.5f}; - float vt1[3] = {(vt1_ptr[0] / vt1_ptr[3] * 0.5f + 0.5f) * (width - 1) + 0.5f, (0.5f + 0.5f * vt1_ptr[1] / vt1_ptr[3]) * (height - 1) + 0.5f, vt1_ptr[2] / vt1_ptr[3] * 0.49999f + 0.5f}; - float vt2[3] = {(vt2_ptr[0] / vt2_ptr[3] * 0.5f + 0.5f) * (width - 1) + 0.5f, (0.5f + 0.5f * vt2_ptr[1] / vt2_ptr[3]) * (height - 1) + 0.5f, vt2_ptr[2] / vt2_ptr[3] * 0.49999f + 0.5f}; - - rasterizeTriangleCPU(f, vt0, vt1, vt2, width, height, zbuffer, d, occlusion_trunc); -} - -std::vector rasterize_image_cpu(torch::Tensor V, torch::Tensor F, torch::Tensor D, - int width, int height, float occlusion_truncation, int use_depth_prior) -{ - int num_faces = F.size(0); - int num_vertices = V.size(0); - auto options = torch::TensorOptions().dtype(torch::kInt32).requires_grad(false); - auto INT64_options = torch::TensorOptions().dtype(torch::kInt64).requires_grad(false); - auto findices = torch::zeros({height, width}, options); - INT64 maxint = (INT64)MAXINT * (INT64)MAXINT + (MAXINT - 1); - auto z_min = torch::ones({height, width}, INT64_options) * (int64_t)maxint; - - if (!use_depth_prior) { - for (int i = 0; i < num_faces; 
++i) {
-            rasterizeImagecoordsKernelCPU(V.data_ptr<float>(), F.data_ptr<int>(), 0,
-                (INT64*)z_min.data_ptr<int64_t>(), occlusion_truncation, width, height, num_vertices, num_faces, i);
-        }
-    } else {
-        for (int i = 0; i < num_faces; ++i)
-            rasterizeImagecoordsKernelCPU(V.data_ptr<float>(), F.data_ptr<int>(), D.data_ptr<float>(),
-                (INT64*)z_min.data_ptr<int64_t>(), occlusion_truncation, width, height, num_vertices, num_faces, i);
-    }
-
-    auto float_options = torch::TensorOptions().dtype(torch::kFloat32).requires_grad(false);
-    auto barycentric = torch::zeros({height, width, 3}, float_options);
-    for (int i = 0; i < width * height; ++i)
-        barycentricFromImgcoordCPU(V.data_ptr<float>(), F.data_ptr<int>(),
-            findices.data_ptr<int>(), (INT64*)z_min.data_ptr<int64_t>(), width, height, num_vertices, num_faces, barycentric.data_ptr<float>(), i);
-
-    return {findices, barycentric};
-}
-
-std::vector<torch::Tensor> rasterize_image(torch::Tensor V, torch::Tensor F, torch::Tensor D,
-    int width, int height, float occlusion_truncation, int use_depth_prior)
-{
-    int device_id = V.get_device();
-    if (device_id == -1)
-        return rasterize_image_cpu(V, F, D, width, height, occlusion_truncation, use_depth_prior);
-    else
-        return rasterize_image_gpu(V, F, D, width, height, occlusion_truncation, use_depth_prior);
-}
-
-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
-    m.def("rasterize_image", &rasterize_image, "Custom image rasterization");
-    m.def("build_hierarchy", &build_hierarchy, "Custom image rasterization");
-    m.def("build_hierarchy_with_feat", &build_hierarchy_with_feat, "Custom image rasterization");
-}
diff --git a/hy3dgen/texgen/custom_rasterizer/lib/custom_rasterizer_kernel/rasterizer.h b/hy3dgen/texgen/custom_rasterizer/lib/custom_rasterizer_kernel/rasterizer.h
deleted file mode 100644
index a1fa8ff2150cbf34644c5027a77f6400c8c9cdde..0000000000000000000000000000000000000000
--- a/hy3dgen/texgen/custom_rasterizer/lib/custom_rasterizer_kernel/rasterizer.h
+++ /dev/null
@@ -1,54 +0,0 @@
-#ifndef RASTERIZER_H_
-#define RASTERIZER_H_
-
-#include <torch/extension.h>
-#include <cuda.h>
-#include <cuda_runtime.h>
-#include <ATen/cuda/CUDAContext.h> // For CUDA context
-#include <cstdint>
-#define INT64 uint64_t
-#define MAXINT 2147483647
-
-__host__ __device__ inline float calculateSignedArea2(float* a, float* b, float* c) {
-    return ((c[0] - a[0]) * (b[1] - a[1]) - (b[0] - a[0]) * (c[1] - a[1]));
-}
-
-__host__ __device__ inline void calculateBarycentricCoordinate(float* a, float* b, float* c, float* p,
-    float* barycentric)
-{
-    float beta_tri = calculateSignedArea2(a, p, c);
-    float gamma_tri = calculateSignedArea2(a, b, p);
-    float area = calculateSignedArea2(a, b, c);
-    if (area == 0) {
-        barycentric[0] = -1.0;
-        barycentric[1] = -1.0;
-        barycentric[2] = -1.0;
-        return;
-    }
-    float tri_inv = 1.0 / area;
-    float beta = beta_tri * tri_inv;
-    float gamma = gamma_tri * tri_inv;
-    float alpha = 1.0 - beta - gamma;
-    barycentric[0] = alpha;
-    barycentric[1] = beta;
-    barycentric[2] = gamma;
-}
-
-__host__ __device__ inline bool isBarycentricCoordInBounds(float* barycentricCoord) {
-    return barycentricCoord[0] >= 0.0 && barycentricCoord[0] <= 1.0 &&
-           barycentricCoord[1] >= 0.0 && barycentricCoord[1] <= 1.0 &&
-           barycentricCoord[2] >= 0.0 && barycentricCoord[2] <= 1.0;
-}
-
-std::vector<torch::Tensor> rasterize_image_gpu(torch::Tensor V, torch::Tensor F, torch::Tensor D,
-    int width, int height, float occlusion_truncation, int use_depth_prior);
-
-std::vector<std::vector<torch::Tensor>> build_hierarchy(std::vector<torch::Tensor> view_layer_positions, std::vector<torch::Tensor> view_layer_normals, int num_level, int resolution);
-
-std::vector<std::vector<torch::Tensor>> build_hierarchy_with_feat(
-    std::vector<torch::Tensor> view_layer_positions,
-    std::vector<torch::Tensor> view_layer_normals,
-    std::vector<torch::Tensor> 
view_layer_feats, - int num_level, int resolution); - -#endif \ No newline at end of file diff --git a/hy3dgen/texgen/custom_rasterizer/lib/custom_rasterizer_kernel/rasterizer_gpu.cu b/hy3dgen/texgen/custom_rasterizer/lib/custom_rasterizer_kernel/rasterizer_gpu.cu deleted file mode 100644 index cc6f354c0e2801b9ac84ec4547845c8edb606a60..0000000000000000000000000000000000000000 --- a/hy3dgen/texgen/custom_rasterizer/lib/custom_rasterizer_kernel/rasterizer_gpu.cu +++ /dev/null @@ -1,127 +0,0 @@ -#include "rasterizer.h" - -__device__ void rasterizeTriangleGPU(int idx, float* vt0, float* vt1, float* vt2, int width, int height, INT64* zbuffer, float* d, float occlusion_truncation) { - float x_min = std::min(vt0[0], std::min(vt1[0],vt2[0])); - float x_max = std::max(vt0[0], std::max(vt1[0],vt2[0])); - float y_min = std::min(vt0[1], std::min(vt1[1],vt2[1])); - float y_max = std::max(vt0[1], std::max(vt1[1],vt2[1])); - - for (int px = x_min; px < x_max + 1; ++px) { - if (px < 0 || px >= width) - continue; - for (int py = y_min; py < y_max + 1; ++py) { - if (py < 0 || py >= height) - continue; - float vt[2] = {px + 0.5f, py + 0.5f}; - float baryCentricCoordinate[3]; - calculateBarycentricCoordinate(vt0, vt1, vt2, vt, baryCentricCoordinate); - if (isBarycentricCoordInBounds(baryCentricCoordinate)) { - int pixel = py * width + px; - if (zbuffer == 0) { - atomicExch(&zbuffer[pixel], (INT64)(idx + 1)); - continue; - } - float depth = baryCentricCoordinate[0] * vt0[2] + baryCentricCoordinate[1] * vt1[2] + baryCentricCoordinate[2] * vt2[2]; - float depth_thres = 0; - if (d) { - depth_thres = d[pixel] * 0.49999f + 0.5f + occlusion_truncation; - } - - int z_quantize = depth * (2<<17); - INT64 token = (INT64)z_quantize * MAXINT + (INT64)(idx + 1); - if (depth < depth_thres) - continue; - atomicMin(&zbuffer[pixel], token); - } - } - } -} - -__global__ void barycentricFromImgcoordGPU(float* V, int* F, int* findices, INT64* zbuffer, int width, int height, int num_vertices, int num_faces, - float* barycentric_map) -{ - int pix = blockIdx.x * blockDim.x + threadIdx.x; - if (pix >= width * height) - return; - INT64 f = zbuffer[pix] % MAXINT; - if (f == (MAXINT-1)) { - findices[pix] = 0; - barycentric_map[pix * 3] = 0; - barycentric_map[pix * 3 + 1] = 0; - barycentric_map[pix * 3 + 2] = 0; - return; - } - findices[pix] = f; - f -= 1; - float barycentric[3] = {0, 0, 0}; - if (f >= 0) { - float vt[2] = {float(pix % width) + 0.5f, float(pix / width) + 0.5f}; - float* vt0_ptr = V + (F[f * 3] * 4); - float* vt1_ptr = V + (F[f * 3 + 1] * 4); - float* vt2_ptr = V + (F[f * 3 + 2] * 4); - - float vt0[2] = {(vt0_ptr[0] / vt0_ptr[3] * 0.5f + 0.5f) * (width - 1) + 0.5f, (0.5f + 0.5f * vt0_ptr[1] / vt0_ptr[3]) * (height - 1) + 0.5f}; - float vt1[2] = {(vt1_ptr[0] / vt1_ptr[3] * 0.5f + 0.5f) * (width - 1) + 0.5f, (0.5f + 0.5f * vt1_ptr[1] / vt1_ptr[3]) * (height - 1) + 0.5f}; - float vt2[2] = {(vt2_ptr[0] / vt2_ptr[3] * 0.5f + 0.5f) * (width - 1) + 0.5f, (0.5f + 0.5f * vt2_ptr[1] / vt2_ptr[3]) * (height - 1) + 0.5f}; - - calculateBarycentricCoordinate(vt0, vt1, vt2, vt, barycentric); - - barycentric[0] = barycentric[0] / vt0_ptr[3]; - barycentric[1] = barycentric[1] / vt1_ptr[3]; - barycentric[2] = barycentric[2] / vt2_ptr[3]; - float w = 1.0f / (barycentric[0] + barycentric[1] + barycentric[2]); - barycentric[0] *= w; - barycentric[1] *= w; - barycentric[2] *= w; - - } - barycentric_map[pix * 3] = barycentric[0]; - barycentric_map[pix * 3 + 1] = barycentric[1]; - barycentric_map[pix * 3 + 2] = barycentric[2]; -} - -__global__ 
void rasterizeImagecoordsKernelGPU(float* V, int* F, float* d, INT64* zbuffer, float occlusion_trunc, int width, int height, int num_vertices, int num_faces) -{ - int f = blockIdx.x * blockDim.x + threadIdx.x; - if (f >= num_faces) - return; - - float* vt0_ptr = V + (F[f * 3] * 4); - float* vt1_ptr = V + (F[f * 3 + 1] * 4); - float* vt2_ptr = V + (F[f * 3 + 2] * 4); - - float vt0[3] = {(vt0_ptr[0] / vt0_ptr[3] * 0.5f + 0.5f) * (width - 1) + 0.5f, (0.5f + 0.5f * vt0_ptr[1] / vt0_ptr[3]) * (height - 1) + 0.5f, vt0_ptr[2] / vt0_ptr[3] * 0.49999f + 0.5f}; - float vt1[3] = {(vt1_ptr[0] / vt1_ptr[3] * 0.5f + 0.5f) * (width - 1) + 0.5f, (0.5f + 0.5f * vt1_ptr[1] / vt1_ptr[3]) * (height - 1) + 0.5f, vt1_ptr[2] / vt1_ptr[3] * 0.49999f + 0.5f}; - float vt2[3] = {(vt2_ptr[0] / vt2_ptr[3] * 0.5f + 0.5f) * (width - 1) + 0.5f, (0.5f + 0.5f * vt2_ptr[1] / vt2_ptr[3]) * (height - 1) + 0.5f, vt2_ptr[2] / vt2_ptr[3] * 0.49999f + 0.5f}; - - rasterizeTriangleGPU(f, vt0, vt1, vt2, width, height, zbuffer, d, occlusion_trunc); -} - -std::vector rasterize_image_gpu(torch::Tensor V, torch::Tensor F, torch::Tensor D, - int width, int height, float occlusion_truncation, int use_depth_prior) -{ - int device_id = V.get_device(); - cudaSetDevice(device_id); - int num_faces = F.size(0); - int num_vertices = V.size(0); - auto options = torch::TensorOptions().dtype(torch::kInt32).device(torch::kCUDA, device_id).requires_grad(false); - auto INT64_options = torch::TensorOptions().dtype(torch::kInt64).device(torch::kCUDA, device_id).requires_grad(false); - auto findices = torch::zeros({height, width}, options); - INT64 maxint = (INT64)MAXINT * (INT64)MAXINT + (MAXINT - 1); - auto z_min = torch::ones({height, width}, INT64_options) * (int64_t)maxint; - - if (!use_depth_prior) { - rasterizeImagecoordsKernelGPU<<<(num_faces+255)/256,256,0,at::cuda::getCurrentCUDAStream()>>>(V.data_ptr(), F.data_ptr(), 0, - (INT64*)z_min.data_ptr(), occlusion_truncation, width, height, num_vertices, num_faces); - } else { - rasterizeImagecoordsKernelGPU<<<(num_faces+255)/256,256,0,at::cuda::getCurrentCUDAStream()>>>(V.data_ptr(), F.data_ptr(), D.data_ptr(), - (INT64*)z_min.data_ptr(), occlusion_truncation, width, height, num_vertices, num_faces); - } - - auto float_options = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA, device_id).requires_grad(false); - auto barycentric = torch::zeros({height, width, 3}, float_options); - barycentricFromImgcoordGPU<<<(width * height + 255)/256, 256>>>(V.data_ptr(), F.data_ptr(), - findices.data_ptr(), (INT64*)z_min.data_ptr(), width, height, num_vertices, num_faces, barycentric.data_ptr()); - - return {findices, barycentric}; -} diff --git a/hy3dgen/texgen/custom_rasterizer/setup.py b/hy3dgen/texgen/custom_rasterizer/setup.py deleted file mode 100644 index 3e312a7f45689753b5ba3ed4befff1fefecff6fd..0000000000000000000000000000000000000000 --- a/hy3dgen/texgen/custom_rasterizer/setup.py +++ /dev/null @@ -1,26 +0,0 @@ -from setuptools import setup, find_packages -from torch.utils.cpp_extension import BuildExtension, CUDAExtension - -# build custom rasterizer -# build with `python setup.py install` -# nvcc is needed - -custom_rasterizer_module = CUDAExtension('custom_rasterizer_kernel', [ - 'lib/custom_rasterizer_kernel/rasterizer.cpp', - 'lib/custom_rasterizer_kernel/grid_neighbor.cpp', - 'lib/custom_rasterizer_kernel/rasterizer_gpu.cu', -]) - -setup( - packages=find_packages(), - version='0.1', - name='custom_rasterizer', - include_package_data=True, - package_dir={'': '.'}, - ext_modules=[ - 
custom_rasterizer_module, - ], - cmdclass={ - 'build_ext': BuildExtension - } -) diff --git a/hy3dgen/texgen/differentiable_renderer/__init__.py b/hy3dgen/texgen/differentiable_renderer/__init__.py deleted file mode 100644 index e307c3f8c1292da02f308e4b59ef0bcd6fe7305e..0000000000000000000000000000000000000000 --- a/hy3dgen/texgen/differentiable_renderer/__init__.py +++ /dev/null @@ -1,23 +0,0 @@ -# Open Source Model Licensed under the Apache License Version 2.0 -# and Other Licenses of the Third-Party Components therein: -# The below Model in this distribution may have been modified by THL A29 Limited -# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. - -# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. -# The below software and/or models in this distribution may have been -# modified by THL A29 Limited ("Tencent Modifications"). -# All Tencent Modifications are Copyright (C) THL A29 Limited. - -# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT -# except for the third-party components listed below. -# Hunyuan 3D does not impose any additional limitations beyond what is outlined -# in the repsective licenses of these third-party components. -# Users must comply with all terms and conditions of original licenses of these third-party -# components and must ensure that the usage of the third party components adheres to -# all relevant laws and regulations. - -# For avoidance of doubts, Hunyuan 3D means the large language models and -# their software and algorithms, including trained model weights, parameters (including -# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, -# fine-tuning enabling code and other elements of the foregoing made publicly available -# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
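The setup.py above is the build recipe for the CUDA rasterizer kernel, and the PYBIND11_MODULE block in rasterizer.cpp exposes rasterize_image, build_hierarchy, and build_hierarchy_with_feat. A minimal usage sketch under stated assumptions: the extension is built with `python setup.py install` (nvcc required), and the tensor shapes are inferred from rasterizer.cpp rather than documented anywhere.

```python
# Hypothetical usage of the custom_rasterizer_kernel extension built above.
# Shapes are inferred from rasterizer.cpp: V holds clip-space (x, y, z, w)
# vertices, F holds int32 triangle indices, D is an optional depth prior.
import torch
import custom_rasterizer_kernel as crk

V = torch.rand(100, 4)                                   # float32, CPU tensor
F = torch.randint(0, 100, (50, 3), dtype=torch.int32)    # triangle indices
D = torch.zeros(512, 512)                                # ignored when flag is 0

# rasterize_image(V, F, D, width, height, occlusion_truncation, use_depth_prior)
findices, barycentric = crk.rasterize_image(V, F, D, 512, 512, 0.0, 0)
print(findices.shape, barycentric.shape)                 # (512, 512), (512, 512, 3)
```

rasterize_image dispatches on V's device: CPU tensors take the rasterize_image_cpu path, CUDA tensors take rasterize_image_gpu.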
diff --git a/hy3dgen/texgen/differentiable_renderer/build/temp.win-amd64-cpython-310/Release/mesh_processor.cp310-win_amd64.exp b/hy3dgen/texgen/differentiable_renderer/build/temp.win-amd64-cpython-310/Release/mesh_processor.cp310-win_amd64.exp deleted file mode 100644 index cb7a9671b7e96564de44070afdced28da0f631b7..0000000000000000000000000000000000000000 Binary files a/hy3dgen/texgen/differentiable_renderer/build/temp.win-amd64-cpython-310/Release/mesh_processor.cp310-win_amd64.exp and /dev/null differ diff --git a/hy3dgen/texgen/differentiable_renderer/build/temp.win-amd64-cpython-310/Release/mesh_processor.cp310-win_amd64.lib b/hy3dgen/texgen/differentiable_renderer/build/temp.win-amd64-cpython-310/Release/mesh_processor.cp310-win_amd64.lib deleted file mode 100644 index 19b554dd00907fa3cacbf26d59f00247cd76985b..0000000000000000000000000000000000000000 Binary files a/hy3dgen/texgen/differentiable_renderer/build/temp.win-amd64-cpython-310/Release/mesh_processor.cp310-win_amd64.lib and /dev/null differ diff --git a/hy3dgen/texgen/differentiable_renderer/build/temp.win-amd64-cpython-310/Release/mesh_processor.obj b/hy3dgen/texgen/differentiable_renderer/build/temp.win-amd64-cpython-310/Release/mesh_processor.obj deleted file mode 100644 index 318c2eddbb7c258091e2825e02abff7f65ef35b9..0000000000000000000000000000000000000000 --- a/hy3dgen/texgen/differentiable_renderer/build/temp.win-amd64-cpython-310/Release/mesh_processor.obj +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1aa1f67f69a3f4389d88b5824de08503705112177eb5d8c7dd5ad09c2847e8b6 -size 7617045 diff --git a/hy3dgen/texgen/differentiable_renderer/camera_utils.py b/hy3dgen/texgen/differentiable_renderer/camera_utils.py deleted file mode 100644 index 289710ab787a174b39154f1010fc6209e4c92dfe..0000000000000000000000000000000000000000 --- a/hy3dgen/texgen/differentiable_renderer/camera_utils.py +++ /dev/null @@ -1,116 +0,0 @@ -# Open Source Model Licensed under the Apache License Version 2.0 -# and Other Licenses of the Third-Party Components therein: -# The below Model in this distribution may have been modified by THL A29 Limited -# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. - -# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. -# The below software and/or models in this distribution may have been -# modified by THL A29 Limited ("Tencent Modifications"). -# All Tencent Modifications are Copyright (C) THL A29 Limited. - -# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT -# except for the third-party components listed below. -# Hunyuan 3D does not impose any additional limitations beyond what is outlined -# in the repsective licenses of these third-party components. -# Users must comply with all terms and conditions of original licenses of these third-party -# components and must ensure that the usage of the third party components adheres to -# all relevant laws and regulations. - -# For avoidance of doubts, Hunyuan 3D means the large language models and -# their software and algorithms, including trained model weights, parameters (including -# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, -# fine-tuning enabling code and other elements of the foregoing made publicly available -# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
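As orientation for the camera helpers that follow: get_mv_matrix builds a world-to-camera matrix, the two projection helpers map camera space to clip space, and a full MVP is simply their product. A small sanity-check sketch, assuming the signatures defined below are in scope; the angles and bounds are illustrative only.

```python
# Illustrative composition of the helpers below into a single MVP matrix.
# Assumes get_mv_matrix and get_orthographic_projection_matrix are in scope.
import numpy as np

mv = get_mv_matrix(elev=20, azim=30, camera_distance=1.45)    # world -> camera
proj = get_orthographic_projection_matrix(left=-0.6, right=0.6,
                                          bottom=-0.6, top=0.6,
                                          near=0.1, far=100)  # camera -> clip
mvp = proj @ mv

p = np.array([0.0, 0.0, 0.0, 1.0])  # scene center, homogeneous coordinates
clip = mvp @ p
print(clip[:3] / clip[3])           # stays inside the [-1, 1]^3 NDC cube
```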
-
-import math
-
-import numpy as np
-import torch
-
-
-def transform_pos(mtx, pos, keepdim=False):
-    t_mtx = torch.from_numpy(mtx).to(
-        pos.device) if isinstance(
-        mtx, np.ndarray) else mtx
-    if pos.shape[-1] == 3:
-        posw = torch.cat(
-            [pos, torch.ones([pos.shape[0], 1]).to(pos.device)], axis=1)
-    else:
-        posw = pos
-
-    if keepdim:
-        return torch.matmul(posw, t_mtx.t())[...]
-    else:
-        return torch.matmul(posw, t_mtx.t())[None, ...]
-
-
-def get_mv_matrix(elev, azim, camera_distance, center=None):
-    elev = -elev
-    azim += 90
-
-    elev_rad = math.radians(elev)
-    azim_rad = math.radians(azim)
-
-    camera_position = np.array([camera_distance * math.cos(elev_rad) * math.cos(azim_rad),
-                                camera_distance * math.cos(elev_rad) * math.sin(azim_rad),
-                                camera_distance * math.sin(elev_rad)])
-
-    if center is None:
-        center = np.array([0, 0, 0])
-    else:
-        center = np.array(center)
-
-    lookat = center - camera_position
-    lookat = lookat / np.linalg.norm(lookat)
-
-    up = np.array([0, 0, 1.0])
-    right = np.cross(lookat, up)
-    right = right / np.linalg.norm(right)
-    up = np.cross(right, lookat)
-    up = up / np.linalg.norm(up)
-
-    c2w = np.concatenate(
-        [np.stack([right, up, -lookat], axis=-1), camera_position[:, None]], axis=-1)
-
-    w2c = np.zeros((4, 4))
-    w2c[:3, :3] = np.transpose(c2w[:3, :3], (1, 0))
-    w2c[:3, 3:] = -np.matmul(np.transpose(c2w[:3, :3], (1, 0)), c2w[:3, 3:])
-    w2c[3, 3] = 1.0
-
-    return w2c.astype(np.float32)
-
-
-def get_orthographic_projection_matrix(
-        left=-1, right=1, bottom=-1, top=1, near=0, far=2):
-    """
-    Compute an orthographic projection matrix.
-
-    Args:
-        left (float): left boundary of the projection volume.
-        right (float): right boundary of the projection volume.
-        bottom (float): bottom boundary of the projection volume.
-        top (float): top boundary of the projection volume.
-        near (float): distance to the near clipping plane.
-        far (float): distance to the far clipping plane.
-
-    Returns:
-        numpy.ndarray: the orthographic projection matrix.
-    """
-    ortho_matrix = np.eye(4, dtype=np.float32)
-    ortho_matrix[0, 0] = 2 / (right - left)
-    ortho_matrix[1, 1] = 2 / (top - bottom)
-    ortho_matrix[2, 2] = -2 / (far - near)
-    ortho_matrix[0, 3] = -(right + left) / (right - left)
-    ortho_matrix[1, 3] = -(top + bottom) / (top - bottom)
-    ortho_matrix[2, 3] = -(far + near) / (far - near)
-    return ortho_matrix
-
-
-def get_perspective_projection_matrix(fovy, aspect_wh, near, far):
-    fovy_rad = math.radians(fovy)
-    return np.array([[1.0 / (math.tan(fovy_rad / 2.0) * aspect_wh), 0, 0, 0],
-                     [0, 1.0 / math.tan(fovy_rad / 2.0), 0, 0],
-                     [0, 0, -(far + near) / (far - near),
-                      -2.0 * far * near / (far - near)],
-                     [0, 0, -1, 0]]).astype(np.float32)
diff --git a/hy3dgen/texgen/differentiable_renderer/compile_mesh_painter.bat b/hy3dgen/texgen/differentiable_renderer/compile_mesh_painter.bat
deleted file mode 100644
index 3947b0f03f9f6245dac95db7460703076444a304..0000000000000000000000000000000000000000
--- a/hy3dgen/texgen/differentiable_renderer/compile_mesh_painter.bat
+++ /dev/null
@@ -1,3 +0,0 @@
-FOR /F "tokens=*" %%i IN ('python -m pybind11 --includes') DO SET PYINCLUDES=%%i
-echo %PYINCLUDES%
-g++ -O3 -Wall -shared -std=c++11 -fPIC %PYINCLUDES% mesh_processor.cpp -o mesh_processor.pyd -lpython3.12
\ No newline at end of file
diff --git a/hy3dgen/texgen/differentiable_renderer/dist/mesh_processor-0.0.0-py3.10-win-amd64.egg b/hy3dgen/texgen/differentiable_renderer/dist/mesh_processor-0.0.0-py3.10-win-amd64.egg
deleted file mode 100644
index 94aa03de74fc9b82fc5335e097d1c2f538610577..0000000000000000000000000000000000000000
Binary files a/hy3dgen/texgen/differentiable_renderer/dist/mesh_processor-0.0.0-py3.10-win-amd64.egg and /dev/null differ
diff --git a/hy3dgen/texgen/differentiable_renderer/mesh_processor.cpp 
b/hy3dgen/texgen/differentiable_renderer/mesh_processor.cpp deleted file mode 100644 index ca8650fada02099d3fce0f551fa4f953f278cf34..0000000000000000000000000000000000000000 --- a/hy3dgen/texgen/differentiable_renderer/mesh_processor.cpp +++ /dev/null @@ -1,161 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include - -namespace py = pybind11; -using namespace std; - -std::pair, - py::array_t> meshVerticeInpaint_smooth(py::array_t texture, -py::array_t mask, - py::array_t vtx_pos, py::array_t vtx_uv, - py::array_t pos_idx, py::array_t uv_idx) { - auto texture_buf = texture.request(); - auto mask_buf = mask.request(); - auto vtx_pos_buf = vtx_pos.request(); - auto vtx_uv_buf = vtx_uv.request(); - auto pos_idx_buf = pos_idx.request(); - auto uv_idx_buf = uv_idx.request(); - - int texture_height = texture_buf.shape[0]; - int texture_width = texture_buf.shape[1]; - int texture_channel = texture_buf.shape[2]; - float* texture_ptr = static_cast(texture_buf.ptr); - uint8_t* mask_ptr = static_cast(mask_buf.ptr); - - int vtx_num = vtx_pos_buf.shape[0]; - float* vtx_pos_ptr = static_cast(vtx_pos_buf.ptr); - float* vtx_uv_ptr = static_cast(vtx_uv_buf.ptr); - int* pos_idx_ptr = static_cast(pos_idx_buf.ptr); - int* uv_idx_ptr = static_cast(uv_idx_buf.ptr); - - vector vtx_mask(vtx_num, 0.0f); - vector> vtx_color(vtx_num, vector(texture_channel, 0.0f)); - vector uncolored_vtxs; - - vector> G(vtx_num); - - for (int i = 0; i < uv_idx_buf.shape[0]; ++i) { - for (int k = 0; k < 3; ++k) { - int vtx_uv_idx = uv_idx_ptr[i * 3 + k]; - int vtx_idx = pos_idx_ptr[i * 3 + k]; - int uv_v = round(vtx_uv_ptr[vtx_uv_idx * 2] * (texture_width - 1)); - int uv_u = round((1.0 - vtx_uv_ptr[vtx_uv_idx * 2 + 1]) * (texture_height - 1)); - - if (mask_ptr[uv_u * texture_width + uv_v] > 0) { - vtx_mask[vtx_idx] = 1.0f; - for (int c = 0; c < texture_channel; ++c) { - vtx_color[vtx_idx][c] = texture_ptr[(uv_u * texture_width + uv_v) * texture_channel + c]; - } - }else{ - uncolored_vtxs.push_back(vtx_idx); - } - - G[pos_idx_ptr[i * 3 + k]].push_back(pos_idx_ptr[i * 3 + (k + 1) % 3]); - } - } - - int smooth_count = 2; - int last_uncolored_vtx_count = 0; - while (smooth_count>0) { - int uncolored_vtx_count = 0; - - for (int vtx_idx : uncolored_vtxs) { - - vector sum_color(texture_channel, 0.0f); - float total_weight = 0.0f; - - array vtx_0 = {vtx_pos_ptr[vtx_idx * 3], -vtx_pos_ptr[vtx_idx * 3 + 1], vtx_pos_ptr[vtx_idx * 3 + 2]}; - for (int connected_idx : G[vtx_idx]) { - if (vtx_mask[connected_idx] > 0) { - array vtx1 = {vtx_pos_ptr[connected_idx * 3], - vtx_pos_ptr[connected_idx * 3 + 1], vtx_pos_ptr[connected_idx * 3 + 2]}; - float dist_weight = 1.0f / max(sqrt(pow(vtx_0[0] - vtx1[0], 2) + pow(vtx_0[1] - vtx1[1], 2) + \ - pow(vtx_0[2] - vtx1[2], 2)), 1E-4); - dist_weight = dist_weight * dist_weight; - for (int c = 0; c < texture_channel; ++c) { - sum_color[c] += vtx_color[connected_idx][c] * dist_weight; - } - total_weight += dist_weight; - } - } - - if (total_weight > 0.0f) { - for (int c = 0; c < texture_channel; ++c) { - vtx_color[vtx_idx][c] = sum_color[c] / total_weight; - } - vtx_mask[vtx_idx] = 1.0f; - } else { - uncolored_vtx_count++; - } - - } - - if(last_uncolored_vtx_count==uncolored_vtx_count){ - smooth_count--; - }else{ - smooth_count++; - } - last_uncolored_vtx_count = uncolored_vtx_count; - } - - // Create new arrays for the output - py::array_t new_texture(texture_buf.size); - py::array_t new_mask(mask_buf.size); - - auto new_texture_buf = new_texture.request(); - auto new_mask_buf = 
new_mask.request(); - - float* new_texture_ptr = static_cast(new_texture_buf.ptr); - uint8_t* new_mask_ptr = static_cast(new_mask_buf.ptr); - // Copy original texture and mask to new arrays - std::copy(texture_ptr, texture_ptr + texture_buf.size, new_texture_ptr); - std::copy(mask_ptr, mask_ptr + mask_buf.size, new_mask_ptr); - - for (int face_idx = 0; face_idx < uv_idx_buf.shape[0]; ++face_idx) { - for (int k = 0; k < 3; ++k) { - int vtx_uv_idx = uv_idx_ptr[face_idx * 3 + k]; - int vtx_idx = pos_idx_ptr[face_idx * 3 + k]; - - if (vtx_mask[vtx_idx] == 1.0f) { - int uv_v = round(vtx_uv_ptr[vtx_uv_idx * 2] * (texture_width - 1)); - int uv_u = round((1.0 - vtx_uv_ptr[vtx_uv_idx * 2 + 1]) * (texture_height - 1)); - - for (int c = 0; c < texture_channel; ++c) { - new_texture_ptr[(uv_u * texture_width + uv_v) * texture_channel + c] = vtx_color[vtx_idx][c]; - } - new_mask_ptr[uv_u * texture_width + uv_v] = 255; - } - } - } - - // Reshape the new arrays to match the original texture and mask shapes - new_texture.resize({texture_height, texture_width, 3}); - new_mask.resize({texture_height, texture_width}); - return std::make_pair(new_texture, new_mask); -} - - -std::pair, py::array_t> meshVerticeInpaint(py::array_t texture, - py::array_t mask, - py::array_t vtx_pos, py::array_t vtx_uv, - py::array_t pos_idx, py::array_t uv_idx, const std::string& method = "smooth") { - if (method == "smooth") { - return meshVerticeInpaint_smooth(texture, mask, vtx_pos, vtx_uv, pos_idx, uv_idx); - } else { - throw std::invalid_argument("Invalid method. Use 'smooth' or 'forward'."); - } -} - -PYBIND11_MODULE(mesh_processor, m) { - m.def("meshVerticeInpaint", &meshVerticeInpaint, "A function to process mesh", - py::arg("texture"), py::arg("mask"), - py::arg("vtx_pos"), py::arg("vtx_uv"), - py::arg("pos_idx"), py::arg("uv_idx"), - py::arg("method") = "smooth"); -} \ No newline at end of file diff --git a/hy3dgen/texgen/differentiable_renderer/mesh_processor.egg-info/PKG-INFO b/hy3dgen/texgen/differentiable_renderer/mesh_processor.egg-info/PKG-INFO deleted file mode 100644 index ddb5e19214f697ef854a3c010d9e1e1e25a49702..0000000000000000000000000000000000000000 --- a/hy3dgen/texgen/differentiable_renderer/mesh_processor.egg-info/PKG-INFO +++ /dev/null @@ -1,7 +0,0 @@ -Metadata-Version: 2.2 -Name: mesh_processor -Version: 0.0.0 -Requires-Python: >=3.6 -Requires-Dist: pybind11>=2.6.0 -Dynamic: requires-dist -Dynamic: requires-python diff --git a/hy3dgen/texgen/differentiable_renderer/mesh_processor.egg-info/SOURCES.txt b/hy3dgen/texgen/differentiable_renderer/mesh_processor.egg-info/SOURCES.txt deleted file mode 100644 index 0ca24855f9323bfe0f20a2fab4dc2f55e6e34079..0000000000000000000000000000000000000000 --- a/hy3dgen/texgen/differentiable_renderer/mesh_processor.egg-info/SOURCES.txt +++ /dev/null @@ -1,7 +0,0 @@ -mesh_processor.cpp -setup.py -mesh_processor.egg-info/PKG-INFO -mesh_processor.egg-info/SOURCES.txt -mesh_processor.egg-info/dependency_links.txt -mesh_processor.egg-info/requires.txt -mesh_processor.egg-info/top_level.txt \ No newline at end of file diff --git a/hy3dgen/texgen/differentiable_renderer/mesh_processor.egg-info/dependency_links.txt b/hy3dgen/texgen/differentiable_renderer/mesh_processor.egg-info/dependency_links.txt deleted file mode 100644 index 8b137891791fe96927ad78e64b0aad7bded08bdc..0000000000000000000000000000000000000000 --- a/hy3dgen/texgen/differentiable_renderer/mesh_processor.egg-info/dependency_links.txt +++ /dev/null @@ -1 +0,0 @@ - diff --git 
a/hy3dgen/texgen/differentiable_renderer/mesh_processor.egg-info/requires.txt b/hy3dgen/texgen/differentiable_renderer/mesh_processor.egg-info/requires.txt deleted file mode 100644 index d89789fcaa28db9e76d59597b04095a0a9f99fa3..0000000000000000000000000000000000000000 --- a/hy3dgen/texgen/differentiable_renderer/mesh_processor.egg-info/requires.txt +++ /dev/null @@ -1 +0,0 @@ -pybind11>=2.6.0 diff --git a/hy3dgen/texgen/differentiable_renderer/mesh_processor.egg-info/top_level.txt b/hy3dgen/texgen/differentiable_renderer/mesh_processor.egg-info/top_level.txt deleted file mode 100644 index ccd72df0d4e79e7f3ee7e8ad3728d300bde6c3fe..0000000000000000000000000000000000000000 --- a/hy3dgen/texgen/differentiable_renderer/mesh_processor.egg-info/top_level.txt +++ /dev/null @@ -1 +0,0 @@ -mesh_processor diff --git a/hy3dgen/texgen/differentiable_renderer/mesh_processor.py b/hy3dgen/texgen/differentiable_renderer/mesh_processor.py deleted file mode 100644 index a96955c19757df5ad18095b33829962140c04647..0000000000000000000000000000000000000000 --- a/hy3dgen/texgen/differentiable_renderer/mesh_processor.py +++ /dev/null @@ -1,70 +0,0 @@ -import numpy as np - -def meshVerticeInpaint_smooth(texture, mask, vtx_pos, vtx_uv, pos_idx, uv_idx): - texture_height, texture_width, texture_channel = texture.shape - vtx_num = vtx_pos.shape[0] - - vtx_mask = np.zeros(vtx_num, dtype=np.float32) - vtx_color = [np.zeros(texture_channel, dtype=np.float32) for _ in range(vtx_num)] - uncolored_vtxs = [] - G = [[] for _ in range(vtx_num)] - - for i in range(uv_idx.shape[0]): - for k in range(3): - vtx_uv_idx = uv_idx[i, k] - vtx_idx = pos_idx[i, k] - uv_v = int(round(vtx_uv[vtx_uv_idx, 0] * (texture_width - 1))) - uv_u = int(round((1.0 - vtx_uv[vtx_uv_idx, 1]) * (texture_height - 1))) - if mask[uv_u, uv_v] > 0: - vtx_mask[vtx_idx] = 1.0 - vtx_color[vtx_idx] = texture[uv_u, uv_v] - else: - uncolored_vtxs.append(vtx_idx) - G[pos_idx[i, k]].append(pos_idx[i, (k + 1) % 3]) - - smooth_count = 2 - last_uncolored_vtx_count = 0 - while smooth_count > 0: - uncolored_vtx_count = 0 - for vtx_idx in uncolored_vtxs: - sum_color = np.zeros(texture_channel, dtype=np.float32) - total_weight = 0.0 - vtx_0 = vtx_pos[vtx_idx] - for connected_idx in G[vtx_idx]: - if vtx_mask[connected_idx] > 0: - vtx1 = vtx_pos[connected_idx] - dist = np.sqrt(np.sum((vtx_0 - vtx1) ** 2)) - dist_weight = 1.0 / max(dist, 1e-4) - dist_weight *= dist_weight - sum_color += vtx_color[connected_idx] * dist_weight - total_weight += dist_weight - if total_weight > 0: - vtx_color[vtx_idx] = sum_color / total_weight - vtx_mask[vtx_idx] = 1.0 - else: - uncolored_vtx_count += 1 - - if last_uncolored_vtx_count == uncolored_vtx_count: - smooth_count -= 1 - else: - smooth_count += 1 - last_uncolored_vtx_count = uncolored_vtx_count - - new_texture = texture.copy() - new_mask = mask.copy() - for face_idx in range(uv_idx.shape[0]): - for k in range(3): - vtx_uv_idx = uv_idx[face_idx, k] - vtx_idx = pos_idx[face_idx, k] - if vtx_mask[vtx_idx] == 1.0: - uv_v = int(round(vtx_uv[vtx_uv_idx, 0] * (texture_width - 1))) - uv_u = int(round((1.0 - vtx_uv[vtx_uv_idx, 1]) * (texture_height - 1))) - new_texture[uv_u, uv_v] = vtx_color[vtx_idx] - new_mask[uv_u, uv_v] = 255 - return new_texture, new_mask - -def meshVerticeInpaint(texture, mask, vtx_pos, vtx_uv, pos_idx, uv_idx, method="smooth"): - if method == "smooth": - return meshVerticeInpaint_smooth(texture, mask, vtx_pos, vtx_uv, pos_idx, uv_idx) - else: - raise ValueError("Invalid method. 
Use 'smooth' or 'forward'.") \ No newline at end of file diff --git a/hy3dgen/texgen/differentiable_renderer/mesh_render.py b/hy3dgen/texgen/differentiable_renderer/mesh_render.py deleted file mode 100644 index c85b80e043221282e9ff6bfb81764fb32c5d48ed..0000000000000000000000000000000000000000 --- a/hy3dgen/texgen/differentiable_renderer/mesh_render.py +++ /dev/null @@ -1,833 +0,0 @@ -# Open Source Model Licensed under the Apache License Version 2.0 -# and Other Licenses of the Third-Party Components therein: -# The below Model in this distribution may have been modified by THL A29 Limited -# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. - -# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. -# The below software and/or models in this distribution may have been -# modified by THL A29 Limited ("Tencent Modifications"). -# All Tencent Modifications are Copyright (C) THL A29 Limited. - -# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT -# except for the third-party components listed below. -# Hunyuan 3D does not impose any additional limitations beyond what is outlined -# in the repsective licenses of these third-party components. -# Users must comply with all terms and conditions of original licenses of these third-party -# components and must ensure that the usage of the third party components adheres to -# all relevant laws and regulations. - -# For avoidance of doubts, Hunyuan 3D means the large language models and -# their software and algorithms, including trained model weights, parameters (including -# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, -# fine-tuning enabling code and other elements of the foregoing made publicly available -# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
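For reference, the meshVerticeInpaint fallback defined in mesh_processor.py above propagates color from textured to untextured vertices over the mesh adjacency graph, weighting each colored neighbor by inverse squared distance. A minimal call sketch with synthetic placeholder data (shapes and dtypes inferred from the function body):

```python
# Synthetic-data sketch for the vertex inpainting defined above; every value
# here is a placeholder chosen only to satisfy the shapes the function indexes.
import numpy as np
from mesh_processor import meshVerticeInpaint  # defined in mesh_processor.py above

H = W = 64
texture = np.zeros((H, W, 3), dtype=np.float32)  # partially baked texture
mask = np.zeros((H, W), dtype=np.uint8)          # nonzero where texture is valid
mask[: H // 2] = 255
vtx_pos = np.random.rand(8, 3).astype(np.float32)
vtx_uv = np.random.rand(8, 2).astype(np.float32)            # UVs in [0, 1]
pos_idx = np.array([[0, 1, 2], [3, 4, 5]], dtype=np.int32)  # position triangles
uv_idx = pos_idx.copy()                                     # matching UV triangles

new_texture, new_mask = meshVerticeInpaint(texture, mask, vtx_pos, vtx_uv,
                                           pos_idx, uv_idx, method="smooth")
```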
- -import cv2 -import numpy as np -import torch -import torch.nn.functional as F -import trimesh -from PIL import Image - -from .camera_utils import ( - transform_pos, - get_mv_matrix, - get_orthographic_projection_matrix, - get_perspective_projection_matrix, -) -from .mesh_processor import meshVerticeInpaint -from .mesh_utils import load_mesh, save_mesh - - -def stride_from_shape(shape): - stride = [1] - for x in reversed(shape[1:]): - stride.append(stride[-1] * x) - return list(reversed(stride)) - - -def scatter_add_nd_with_count(input, count, indices, values, weights=None): - # input: [..., C], D dimension + C channel - # count: [..., 1], D dimension - # indices: [N, D], long - # values: [N, C] - - D = indices.shape[-1] - C = input.shape[-1] - size = input.shape[:-1] - stride = stride_from_shape(size) - - assert len(size) == D - - input = input.view(-1, C) # [HW, C] - count = count.view(-1, 1) - - flatten_indices = (indices * torch.tensor(stride, - dtype=torch.long, device=indices.device)).sum(-1) # [N] - - if weights is None: - weights = torch.ones_like(values[..., :1]) - - input.scatter_add_(0, flatten_indices.unsqueeze(1).repeat(1, C), values) - count.scatter_add_(0, flatten_indices.unsqueeze(1), weights) - - return input.view(*size, C), count.view(*size, 1) - - -def linear_grid_put_2d(H, W, coords, values, return_count=False): - # coords: [N, 2], float in [0, 1] - # values: [N, C] - - C = values.shape[-1] - - indices = coords * torch.tensor( - [H - 1, W - 1], dtype=torch.float32, device=coords.device - ) - indices_00 = indices.floor().long() # [N, 2] - indices_00[:, 0].clamp_(0, H - 2) - indices_00[:, 1].clamp_(0, W - 2) - indices_01 = indices_00 + torch.tensor( - [0, 1], dtype=torch.long, device=indices.device - ) - indices_10 = indices_00 + torch.tensor( - [1, 0], dtype=torch.long, device=indices.device - ) - indices_11 = indices_00 + torch.tensor( - [1, 1], dtype=torch.long, device=indices.device - ) - - h = indices[..., 0] - indices_00[..., 0].float() - w = indices[..., 1] - indices_00[..., 1].float() - w_00 = (1 - h) * (1 - w) - w_01 = (1 - h) * w - w_10 = h * (1 - w) - w_11 = h * w - - result = torch.zeros(H, W, C, device=values.device, - dtype=values.dtype) # [H, W, C] - count = torch.zeros(H, W, 1, device=values.device, - dtype=values.dtype) # [H, W, 1] - weights = torch.ones_like(values[..., :1]) # [N, 1] - - result, count = scatter_add_nd_with_count( - result, count, indices_00, values * w_00.unsqueeze(1), weights * w_00.unsqueeze(1)) - result, count = scatter_add_nd_with_count( - result, count, indices_01, values * w_01.unsqueeze(1), weights * w_01.unsqueeze(1)) - result, count = scatter_add_nd_with_count( - result, count, indices_10, values * w_10.unsqueeze(1), weights * w_10.unsqueeze(1)) - result, count = scatter_add_nd_with_count( - result, count, indices_11, values * w_11.unsqueeze(1), weights * w_11.unsqueeze(1)) - - if return_count: - return result, count - - mask = (count.squeeze(-1) > 0) - result[mask] = result[mask] / count[mask].repeat(1, C) - - return result - - -class MeshRender(): - def __init__( - self, - camera_distance=1.45, camera_type='orth', - default_resolution=1024, texture_size=1024, - use_antialias=True, max_mip_level=None, filter_mode='linear', - bake_mode='linear', raster_mode='cr', device='cuda'): - - self.device = device - - self.set_default_render_resolution(default_resolution) - self.set_default_texture_resolution(texture_size) - - self.camera_distance = camera_distance - self.use_antialias = use_antialias - self.max_mip_level = max_mip_level 
-        self.filter_mode = filter_mode
-
-        self.bake_angle_thres = 75
-        self.bake_unreliable_kernel_size = int(
-            (2 / 512) * max(self.default_resolution[0], self.default_resolution[1]))
-        self.bake_mode = bake_mode
-
-        self.raster_mode = raster_mode
-        if self.raster_mode == 'cr':
-            import custom_rasterizer as cr
-            self.raster = cr
-        else:
-            raise ValueError(f'No raster named {self.raster_mode}')
-
-        if camera_type == 'orth':
-            self.ortho_scale = 1.2
-            self.camera_proj_mat = get_orthographic_projection_matrix(
-                left=-self.ortho_scale * 0.5, right=self.ortho_scale * 0.5,
-                bottom=-self.ortho_scale * 0.5, top=self.ortho_scale * 0.5,
-                near=0.1, far=100
-            )
-        elif camera_type == 'perspective':
-            self.camera_proj_mat = get_perspective_projection_matrix(
-                49.13, self.default_resolution[1] / self.default_resolution[0],
-                0.01, 100.0
-            )
-        else:
-            raise ValueError(f'No camera type {camera_type}')
-
-    def raster_rasterize(self, pos, tri, resolution, ranges=None, grad_db=True):
-
-        if self.raster_mode == 'cr':
-            rast_out_db = None
-            if pos.dim() == 2:
-                pos = pos.unsqueeze(0)
-            findices, barycentric = self.raster.rasterize(pos, tri, resolution)
-            rast_out = torch.cat((barycentric, findices.unsqueeze(-1)), dim=-1)
-            rast_out = rast_out.unsqueeze(0)
-        else:
-            raise ValueError(f'No raster named {self.raster_mode}')
-
-        return rast_out, rast_out_db
-
-    def raster_interpolate(self, uv, rast_out, uv_idx, rast_db=None, diff_attrs=None):
-
-        if self.raster_mode == 'cr':
-            textd = None
-            barycentric = rast_out[0, ..., :-1]
-            findices = rast_out[0, ..., -1]
-            if uv.dim() == 2:
-                uv = uv.unsqueeze(0)
-            textc = self.raster.interpolate(uv, findices, barycentric, uv_idx)
-        else:
-            raise ValueError(f'No raster named {self.raster_mode}')
-
-        return textc, textd
-
-    def raster_texture(self, tex, uv, uv_da=None, mip_level_bias=None, mip=None, filter_mode='auto',
-                       boundary_mode='wrap', max_mip_level=None):
-
-        if self.raster_mode == 'cr':
-            raise NotImplementedError('Texture is not implemented in cr')
-        else:
-            raise ValueError(f'No raster named {self.raster_mode}')
-
-        return color
-
-    def raster_antialias(self, color, rast, pos, tri, topology_hash=None, pos_gradient_boost=1.0):
-
-        if self.raster_mode == 'cr':
-            # Antialias has not been supported yet
-            color = color
-        else:
-            raise ValueError(f'No raster named {self.raster_mode}')
-
-        return color
-
-    def load_mesh(
-        self,
-        mesh,
-        scale_factor=1.15,
-        auto_center=True,
-    ):
-        vtx_pos, pos_idx, vtx_uv, uv_idx, texture_data = load_mesh(mesh)
-        self.mesh_copy = mesh
-        self.set_mesh(vtx_pos, pos_idx,
-                      vtx_uv=vtx_uv, uv_idx=uv_idx,
-                      scale_factor=scale_factor, auto_center=auto_center
-                      )
-        if texture_data is not None:
-            self.set_texture(texture_data)
-
-    def save_mesh(self):
-        texture_data = self.get_texture()
-        texture_data = Image.fromarray((texture_data * 255).astype(np.uint8))
-        return save_mesh(self.mesh_copy, texture_data)
-
-    def set_mesh(
-        self,
-        vtx_pos, pos_idx,
-        vtx_uv=None, uv_idx=None,
-        scale_factor=1.15, auto_center=True
-    ):
-
-        self.vtx_pos = torch.from_numpy(vtx_pos).to(self.device).float()
-        self.pos_idx = torch.from_numpy(pos_idx).to(self.device).to(torch.int)
-        if (vtx_uv is not None) and (uv_idx is not None):
-            self.vtx_uv = torch.from_numpy(vtx_uv).to(self.device).float()
-            self.uv_idx = torch.from_numpy(uv_idx).to(self.device).to(torch.int)
-        else:
-            self.vtx_uv = None
-            self.uv_idx = None
-
-        self.vtx_pos[:, [0, 1]] = -self.vtx_pos[:, [0, 1]]
-        self.vtx_pos[:, [1, 2]] = self.vtx_pos[:, [2, 1]]
-        if (vtx_uv is not None) and (uv_idx is not None):
-            self.vtx_uv[:, 1] = 1.0 - self.vtx_uv[:, 1]
-
-        if auto_center:
-            max_bb = (self.vtx_pos - 0).max(0)[0]
-            min_bb = (self.vtx_pos - 0).min(0)[0]
-            center = (max_bb + min_bb) / 2
-            scale = torch.norm(self.vtx_pos - center, dim=1).max() * 2.0
-            self.vtx_pos = (self.vtx_pos - center) * \
-                (scale_factor / float(scale))
-        self.scale_factor = scale_factor
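set_mesh above moves vertices into the renderer's internal frame by negating x and y and then swapping y with z; get_mesh below applies the exact inverse. A tiny numpy round trip, purely illustrative:

```python
# Round-trip sketch of the axis convention used by set_mesh / get_mesh above.
import numpy as np

v = np.array([[0.1, 0.2, 0.3]], dtype=np.float32)

internal = v.copy()
internal[:, [0, 1]] = -internal[:, [0, 1]]  # set_mesh: negate x and y
internal[:, [1, 2]] = internal[:, [2, 1]]   # set_mesh: swap y and z

restored = internal.copy()
restored[:, [1, 2]] = restored[:, [2, 1]]   # get_mesh: swap back
restored[:, [0, 1]] = -restored[:, [0, 1]]  # get_mesh: negate back
assert np.allclose(restored, v)
```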
-
-    def set_texture(self, tex):
-        if isinstance(tex, np.ndarray):
-            tex = Image.fromarray((tex * 255).astype(np.uint8))
-        elif isinstance(tex, torch.Tensor):
-            tex = tex.cpu().numpy()
-            tex = Image.fromarray((tex * 255).astype(np.uint8))
-
-        tex = tex.resize(self.texture_size).convert('RGB')
-        tex = np.array(tex) / 255.0
-        self.tex = torch.from_numpy(tex).to(self.device)
-        self.tex = self.tex.float()
-
-    def set_default_render_resolution(self, default_resolution):
-        if isinstance(default_resolution, int):
-            default_resolution = (default_resolution, default_resolution)
-        self.default_resolution = default_resolution
-
-    def set_default_texture_resolution(self, texture_size):
-        if isinstance(texture_size, int):
-            texture_size = (texture_size, texture_size)
-        self.texture_size = texture_size
-
-    def get_mesh(self):
-        vtx_pos = self.vtx_pos.cpu().numpy()
-        pos_idx = self.pos_idx.cpu().numpy()
-        vtx_uv = self.vtx_uv.cpu().numpy()
-        uv_idx = self.uv_idx.cpu().numpy()
-
-        # Inverse of the coordinate transform applied in set_mesh
-        vtx_pos[:, [1, 2]] = vtx_pos[:, [2, 1]]
-        vtx_pos[:, [0, 1]] = -vtx_pos[:, [0, 1]]
-
-        vtx_uv[:, 1] = 1.0 - vtx_uv[:, 1]
-        return vtx_pos, pos_idx, vtx_uv, uv_idx
-
-    def get_texture(self):
-        return self.tex.cpu().numpy()
-
-    def to(self, device):
-        self.device = device
-
-        for attr_name in dir(self):
-            attr_value = getattr(self, attr_name)
-            if isinstance(attr_value, torch.Tensor):
-                setattr(self, attr_name, attr_value.to(self.device))
-
-    def color_rgb_to_srgb(self, image):
-        if isinstance(image, Image.Image):
-            image_rgb = torch.tensor(
-                np.array(image) /
-                255.0).float().to(
-                self.device)
-        elif isinstance(image, np.ndarray):
-            image_rgb = torch.tensor(image).float()
-        else:
-            image_rgb = image.to(self.device)
-
-        image_srgb = torch.where(
-            image_rgb <= 0.0031308,
-            12.92 * image_rgb,
-            1.055 * torch.pow(image_rgb, 1 / 2.4) - 0.055
-        )
-
-        if isinstance(image, Image.Image):
-            image_srgb = Image.fromarray(
-                (image_srgb.cpu().numpy() *
-                 255).astype(
-                    np.uint8))
-        elif isinstance(image, np.ndarray):
-            image_srgb = image_srgb.cpu().numpy()
-        else:
-            image_srgb = image_srgb.to(image.device)
-
-        return image_srgb
-
-    def _render(
-        self,
-        mvp,
-        pos,
-        pos_idx,
-        uv,
-        uv_idx,
-        tex,
-        resolution,
-        max_mip_level,
-        keep_alpha,
-        filter_mode
-    ):
-        pos_clip = transform_pos(mvp, pos)
-        if isinstance(resolution, (int, float)):
-            resolution = [resolution, resolution]
-        rast_out, rast_out_db = self.raster_rasterize(
-            pos_clip, pos_idx, resolution=resolution)
-
-        tex = tex.contiguous()
-        if filter_mode == 'linear-mipmap-linear':
-            texc, texd = self.raster_interpolate(
-                uv[None, ...], rast_out, uv_idx, rast_db=rast_out_db, diff_attrs='all')
-            color = self.raster_texture(
-                tex[None, ...], texc, texd, filter_mode='linear-mipmap-linear', max_mip_level=max_mip_level)
-        else:
-            texc, _ = self.raster_interpolate(uv[None, ...], rast_out, uv_idx)
-            color = self.raster_texture(tex[None, ...], texc, filter_mode=filter_mode)
-
-        visible_mask = torch.clamp(rast_out[..., -1:], 0, 1)
-        color = color * visible_mask  # Mask out background.
-        if self.use_antialias:
-            color = self.raster_antialias(color, rast_out, pos_clip, pos_idx)
-
-        if keep_alpha:
-            color = torch.cat([color, visible_mask], dim=-1)
-        return color[0, ...]
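With _render in place, the public entry points that follow wrap it with camera setup. A hypothetical end-to-end sketch; the asset path is a placeholder, CUDA availability is assumed, and the loaded mesh is assumed to carry a texture:

```python
# Hypothetical driver for MeshRender (this file); 'example.glb' is a
# placeholder asset and the mesh is assumed to ship with a texture.
import trimesh

renderer = MeshRender(camera_type='orth', default_resolution=1024,
                      texture_size=1024, device='cuda')
renderer.load_mesh(trimesh.load('example.glb'))

rgba = renderer.render(elev=0, azim=0, return_type='np')            # (H, W, 4)
normal = renderer.render_normal(elev=0, azim=90, return_type='pl')  # PIL image
```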
- - def render( - self, - elev, - azim, - camera_distance=None, - center=None, - resolution=None, - tex=None, - keep_alpha=True, - bgcolor=None, - filter_mode=None, - return_type='th' - ): - - proj = self.camera_proj_mat - r_mv = get_mv_matrix( - elev=elev, - azim=azim, - camera_distance=self.camera_distance if camera_distance is None else camera_distance, - center=center) - r_mvp = np.matmul(proj, r_mv).astype(np.float32) - if tex is not None: - if isinstance(tex, Image.Image): - tex = torch.tensor(np.array(tex) / 255.0) - elif isinstance(tex, np.ndarray): - tex = torch.tensor(tex) - if tex.dim() == 2: - tex = tex.unsqueeze(-1) - tex = tex.float().to(self.device) - image = self._render(r_mvp, self.vtx_pos, self.pos_idx, self.vtx_uv, self.uv_idx, - self.tex if tex is None else tex, - self.default_resolution if resolution is None else resolution, - self.max_mip_level, True, filter_mode if filter_mode else self.filter_mode) - mask = (image[..., [-1]] == 1).float() - if bgcolor is None: - bgcolor = [0 for _ in range(image.shape[-1] - 1)] - image = image * mask + (1 - mask) * \ - torch.tensor(bgcolor + [0]).to(self.device) - if keep_alpha == False: - image = image[..., :-1] - if return_type == 'np': - image = image.cpu().numpy() - elif return_type == 'pl': - image = image.squeeze(-1).cpu().numpy() * 255 - image = Image.fromarray(image.astype(np.uint8)) - return image - - def render_normal( - self, - elev, - azim, - camera_distance=None, - center=None, - resolution=None, - bg_color=[1, 1, 1], - use_abs_coor=False, - normalize_rgb=True, - return_type='th' - ): - - pos_camera, pos_clip = self.get_pos_from_mvp(elev, azim, camera_distance, center) - if resolution is None: - resolution = self.default_resolution - if isinstance(resolution, (int, float)): - resolution = [resolution, resolution] - rast_out, rast_out_db = self.raster_rasterize( - pos_clip, self.pos_idx, resolution=resolution) - - if use_abs_coor: - mesh_triangles = self.vtx_pos[self.pos_idx[:, :3], :] - else: - pos_camera = pos_camera[:, :3] / pos_camera[:, 3:4] - mesh_triangles = pos_camera[self.pos_idx[:, :3], :] - face_normals = F.normalize( - torch.cross(mesh_triangles[:, - 1, - :] - mesh_triangles[:, - 0, - :], - mesh_triangles[:, - 2, - :] - mesh_triangles[:, - 0, - :], - dim=-1), - dim=-1) - - vertex_normals = trimesh.geometry.mean_vertex_normals(vertex_count=self.vtx_pos.shape[0], - faces=self.pos_idx.cpu(), - face_normals=face_normals.cpu(), ) - vertex_normals = torch.from_numpy( - vertex_normals).float().to(self.device).contiguous() - - # Interpolate normal values across the rasterized pixels - normal, _ = self.raster_interpolate( - vertex_normals[None, ...], rast_out, self.pos_idx) - - visible_mask = torch.clamp(rast_out[..., -1:], 0, 1) - normal = normal * visible_mask + \ - torch.tensor(bg_color, dtype=torch.float32, device=self.device) * (1 - - visible_mask) # Mask out background. - - if normalize_rgb: - normal = (normal + 1) * 0.5 - if self.use_antialias: - normal = self.raster_antialias(normal, rast_out, pos_clip, self.pos_idx) - - image = normal[0, ...] 
- if return_type == 'np': - image = image.cpu().numpy() - elif return_type == 'pl': - image = image.cpu().numpy() * 255 - image = Image.fromarray(image.astype(np.uint8)) - - return image - - def convert_normal_map(self, image): - # blue is front, red is left, green is top - if isinstance(image, Image.Image): - image = np.array(image) - mask = (image == [255, 255, 255]).all(axis=-1) - - image = (image / 255.0) * 2.0 - 1.0 - - image[..., [1]] = -image[..., [1]] - image[..., [1, 2]] = image[..., [2, 1]] - image[..., [0]] = -image[..., [0]] - - image = (image + 1.0) * 0.5 - - image = (image * 255).astype(np.uint8) - image[mask] = [127, 127, 255] - - return Image.fromarray(image) - - def get_pos_from_mvp(self, elev, azim, camera_distance, center): - proj = self.camera_proj_mat - r_mv = get_mv_matrix( - elev=elev, - azim=azim, - camera_distance=self.camera_distance if camera_distance is None else camera_distance, - center=center) - - pos_camera = transform_pos(r_mv, self.vtx_pos, keepdim=True) - pos_clip = transform_pos(proj, pos_camera) - - return pos_camera, pos_clip - - def render_depth( - self, - elev, - azim, - camera_distance=None, - center=None, - resolution=None, - return_type='th' - ): - pos_camera, pos_clip = self.get_pos_from_mvp(elev, azim, camera_distance, center) - - if resolution is None: - resolution = self.default_resolution - if isinstance(resolution, (int, float)): - resolution = [resolution, resolution] - rast_out, rast_out_db = self.raster_rasterize( - pos_clip, self.pos_idx, resolution=resolution) - - pos_camera = pos_camera[:, :3] / pos_camera[:, 3:4] - tex_depth = pos_camera[:, 2].reshape(1, -1, 1).contiguous() - - # Interpolate depth values across the rasterized pixels - depth, _ = self.raster_interpolate(tex_depth, rast_out, self.pos_idx) - - visible_mask = torch.clamp(rast_out[..., -1:], 0, 1) - depth_max, depth_min = depth[visible_mask > - 0].max(), depth[visible_mask > 0].min() - depth = (depth - depth_min) / (depth_max - depth_min) - - depth = depth * visible_mask # Mask out background. - if self.use_antialias: - depth = self.raster_antialias(depth, rast_out, pos_clip, self.pos_idx) - - image = depth[0, ...] - if return_type == 'np': - image = image.cpu().numpy() - elif return_type == 'pl': - image = image.squeeze(-1).cpu().numpy() * 255 - image = Image.fromarray(image.astype(np.uint8)) - return image - - def render_position(self, elev, azim, camera_distance=None, center=None, - resolution=None, bg_color=[1, 1, 1], return_type='th'): - pos_camera, pos_clip = self.get_pos_from_mvp(elev, azim, camera_distance, center) - if resolution is None: - resolution = self.default_resolution - if isinstance(resolution, (int, float)): - resolution = [resolution, resolution] - rast_out, rast_out_db = self.raster_rasterize( - pos_clip, self.pos_idx, resolution=resolution) - - tex_position = 0.5 - self.vtx_pos[:, :3] / self.scale_factor - tex_position = tex_position.contiguous() - - # Interpolate depth values across the rasterized pixels - position, _ = self.raster_interpolate( - tex_position[None, ...], rast_out, self.pos_idx) - - visible_mask = torch.clamp(rast_out[..., -1:], 0, 1) - - position = position * visible_mask + \ - torch.tensor(bg_color, dtype=torch.float32, device=self.device) * (1 - - visible_mask) # Mask out background. - if self.use_antialias: - position = self.raster_antialias(position, rast_out, pos_clip, self.pos_idx) - - image = position[0, ...] 
- - if return_type == 'np': - image = image.cpu().numpy() - elif return_type == 'pl': - image = image.squeeze(-1).cpu().numpy() * 255 - image = Image.fromarray(image.astype(np.uint8)) - return image - - def render_uvpos(self, return_type='th'): - image = self.uv_feature_map(self.vtx_pos * 0.5 + 0.5) - if return_type == 'np': - image = image.cpu().numpy() - elif return_type == 'pl': - image = image.cpu().numpy() * 255 - image = Image.fromarray(image.astype(np.uint8)) - return image - - def uv_feature_map(self, vert_feat, bg=None): - vtx_uv = self.vtx_uv * 2 - 1.0 - vtx_uv = torch.cat( - [vtx_uv, torch.zeros_like(self.vtx_uv)], dim=1).unsqueeze(0) - vtx_uv[..., -1] = 1 - uv_idx = self.uv_idx - rast_out, rast_out_db = self.raster_rasterize( - vtx_uv, uv_idx, resolution=self.texture_size) - feat_map, _ = self.raster_interpolate(vert_feat[None, ...], rast_out, uv_idx) - feat_map = feat_map[0, ...] - if bg is not None: - visible_mask = torch.clamp(rast_out[..., -1:], 0, 1)[0, ...] - feat_map[visible_mask == 0] = bg - return feat_map - - def render_sketch_from_geometry(self, normal_image, depth_image): - normal_image_np = normal_image.cpu().numpy() - depth_image_np = depth_image.cpu().numpy() - - normal_image_np = (normal_image_np * 255).astype(np.uint8) - depth_image_np = (depth_image_np * 255).astype(np.uint8) - normal_image_np = cv2.cvtColor(normal_image_np, cv2.COLOR_RGB2GRAY) - - normal_edges = cv2.Canny(normal_image_np, 80, 150) - depth_edges = cv2.Canny(depth_image_np, 30, 80) - - combined_edges = np.maximum(normal_edges, depth_edges) - - sketch_image = torch.from_numpy(combined_edges).to( - normal_image.device).float() / 255.0 - sketch_image = sketch_image.unsqueeze(-1) - - return sketch_image - - def render_sketch_from_depth(self, depth_image): - depth_image_np = depth_image.cpu().numpy() - depth_image_np = (depth_image_np * 255).astype(np.uint8) - depth_edges = cv2.Canny(depth_image_np, 30, 80) - combined_edges = depth_edges - sketch_image = torch.from_numpy(combined_edges).to( - depth_image.device).float() / 255.0 - sketch_image = sketch_image.unsqueeze(-1) - return sketch_image - - def back_project(self, image, elev, azim, - camera_distance=None, center=None, method=None): - if isinstance(image, Image.Image): - image = torch.tensor(np.array(image) / 255.0) - elif isinstance(image, np.ndarray): - image = torch.tensor(image) - if image.dim() == 2: - image = image.unsqueeze(-1) - image = image.float().to(self.device) - resolution = image.shape[:2] - channel = image.shape[-1] - texture = torch.zeros(self.texture_size + (channel,)).to(self.device) - cos_map = torch.zeros(self.texture_size + (1,)).to(self.device) - - proj = self.camera_proj_mat - r_mv = get_mv_matrix( - elev=elev, - azim=azim, - camera_distance=self.camera_distance if camera_distance is None else camera_distance, - center=center) - pos_camera = transform_pos(r_mv, self.vtx_pos, keepdim=True) - pos_clip = transform_pos(proj, pos_camera) - pos_camera = pos_camera[:, :3] / pos_camera[:, 3:4] - v0 = pos_camera[self.pos_idx[:, 0], :] - v1 = pos_camera[self.pos_idx[:, 1], :] - v2 = pos_camera[self.pos_idx[:, 2], :] - face_normals = F.normalize( - torch.cross( - v1 - v0, - v2 - v0, - dim=-1), - dim=-1) - vertex_normals = trimesh.geometry.mean_vertex_normals(vertex_count=self.vtx_pos.shape[0], - faces=self.pos_idx.cpu(), - face_normals=face_normals.cpu(), ) - vertex_normals = torch.from_numpy( - vertex_normals).float().to(self.device).contiguous() - tex_depth = pos_camera[:, 2].reshape(1, -1, 1).contiguous() - rast_out, rast_out_db 
= self.raster_rasterize( - pos_clip, self.pos_idx, resolution=resolution) - visible_mask = torch.clamp(rast_out[..., -1:], 0, 1)[0, ...] - - normal, _ = self.raster_interpolate( - vertex_normals[None, ...], rast_out, self.pos_idx) - normal = normal[0, ...] - uv, _ = self.raster_interpolate(self.vtx_uv[None, ...], rast_out, self.uv_idx) - depth, _ = self.raster_interpolate(tex_depth, rast_out, self.pos_idx) - depth = depth[0, ...] - - depth_max, depth_min = depth[visible_mask > - 0].max(), depth[visible_mask > 0].min() - depth_normalized = (depth - depth_min) / (depth_max - depth_min) - depth_image = depth_normalized * visible_mask # Mask out background. - - sketch_image = self.render_sketch_from_depth(depth_image) - - lookat = torch.tensor([[0, 0, -1]], device=self.device) - cos_image = torch.nn.functional.cosine_similarity( - lookat, normal.view(-1, 3)) - cos_image = cos_image.view(normal.shape[0], normal.shape[1], 1) - - cos_thres = np.cos(self.bake_angle_thres / 180 * np.pi) - cos_image[cos_image < cos_thres] = 0 - - # shrink - kernel_size = self.bake_unreliable_kernel_size * 2 + 1 - kernel = torch.ones( - (1, 1, kernel_size, kernel_size), dtype=torch.float32).to( - sketch_image.device) - - visible_mask = visible_mask.permute(2, 0, 1).unsqueeze(0).float() - visible_mask = F.conv2d( - 1.0 - visible_mask, - kernel, - padding=kernel_size // 2) - visible_mask = 1.0 - (visible_mask > 0).float() # binarize - visible_mask = visible_mask.squeeze(0).permute(1, 2, 0) - - sketch_image = sketch_image.permute(2, 0, 1).unsqueeze(0) - sketch_image = F.conv2d(sketch_image, kernel, padding=kernel_size // 2) - sketch_image = (sketch_image > 0).float() # binarize - sketch_image = sketch_image.squeeze(0).permute(1, 2, 0) - visible_mask = visible_mask * (sketch_image < 0.5) - - cos_image[visible_mask == 0] = 0 - - method = self.bake_mode if method is None else method - - if method == 'linear': - proj_mask = (visible_mask != 0).view(-1) - uv = uv.squeeze(0).contiguous().view(-1, 2)[proj_mask] - image = image.squeeze(0).contiguous().view(-1, channel)[proj_mask] - cos_image = cos_image.contiguous().view(-1, 1)[proj_mask] - sketch_image = sketch_image.contiguous().view(-1, 1)[proj_mask] - - texture = linear_grid_put_2d( - self.texture_size[1], self.texture_size[0], uv[..., [1, 0]], image) - cos_map = linear_grid_put_2d( - self.texture_size[1], self.texture_size[0], uv[..., [1, 0]], cos_image) - boundary_map = linear_grid_put_2d( - self.texture_size[1], self.texture_size[0], uv[..., [1, 0]], sketch_image) - else: - raise ValueError(f'No bake mode {method}') - - return texture, cos_map, boundary_map - - def bake_texture(self, colors, elevs, azims, - camera_distance=None, center=None, exp=6, weights=None): - for i in range(len(colors)): - if isinstance(colors[i], Image.Image): - colors[i] = torch.tensor( - np.array( - colors[i]) / 255.0, - device=self.device).float() - if weights is None: - weights = [1.0 for _ in range(len(colors))] - textures = [] - cos_maps = [] - for color, elev, azim, weight in zip(colors, elevs, azims, weights): - texture, cos_map, _ = self.back_project( - color, elev, azim, camera_distance, center) - cos_map = weight * (cos_map ** exp) - textures.append(texture) - cos_maps.append(cos_map) - - texture_merge, trust_map_merge = self.fast_bake_texture( - textures, cos_maps) - return texture_merge, trust_map_merge - - @torch.no_grad() - def fast_bake_texture(self, textures, cos_maps): - - channel = textures[0].shape[-1] - texture_merge = torch.zeros( - self.texture_size + (channel,)).to(self.device) - trust_map_merge =
torch.zeros(self.texture_size + (1,)).to(self.device) - for texture, cos_map in zip(textures, cos_maps): - view_sum = (cos_map > 0).sum() - painted_sum = ((cos_map > 0) * (trust_map_merge > 0)).sum() - if painted_sum / view_sum > 0.99: - continue - texture_merge += texture * cos_map - trust_map_merge += cos_map - texture_merge = texture_merge / torch.clamp(trust_map_merge, min=1E-8) - - return texture_merge, trust_map_merge > 1E-8 - - def uv_inpaint(self, texture, mask): - - if isinstance(texture, torch.Tensor): - texture_np = texture.cpu().numpy() - elif isinstance(texture, np.ndarray): - texture_np = texture - elif isinstance(texture, Image.Image): - texture_np = np.array(texture) / 255.0 - - vtx_pos, pos_idx, vtx_uv, uv_idx = self.get_mesh() - - texture_np, mask = meshVerticeInpaint( - texture_np, mask, vtx_pos, vtx_uv, pos_idx, uv_idx) - - texture_np = cv2.inpaint( - (texture_np * - 255).astype( - np.uint8), - 255 - - mask, - 3, - cv2.INPAINT_NS) - - return texture_np diff --git a/hy3dgen/texgen/differentiable_renderer/mesh_utils.py b/hy3dgen/texgen/differentiable_renderer/mesh_utils.py deleted file mode 100644 index ca0ba1a6145c68651ec033b97e80900cd2c9d7ec..0000000000000000000000000000000000000000 --- a/hy3dgen/texgen/differentiable_renderer/mesh_utils.py +++ /dev/null @@ -1,44 +0,0 @@ -# Open Source Model Licensed under the Apache License Version 2.0 -# and Other Licenses of the Third-Party Components therein: -# The below Model in this distribution may have been modified by THL A29 Limited -# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. - -# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. -# The below software and/or models in this distribution may have been -# modified by THL A29 Limited ("Tencent Modifications"). -# All Tencent Modifications are Copyright (C) THL A29 Limited. - -# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT -# except for the third-party components listed below. -# Hunyuan 3D does not impose any additional limitations beyond what is outlined -# in the repsective licenses of these third-party components. -# Users must comply with all terms and conditions of original licenses of these third-party -# components and must ensure that the usage of the third party components adheres to -# all relevant laws and regulations. - -# For avoidance of doubts, Hunyuan 3D means the large language models and -# their software and algorithms, including trained model weights, parameters (including -# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, -# fine-tuning enabling code and other elements of the foregoing made publicly available -# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
- -import trimesh - - -def load_mesh(mesh): - vtx_pos = mesh.vertices if hasattr(mesh, 'vertices') else None - pos_idx = mesh.faces if hasattr(mesh, 'faces') else None - - vtx_uv = mesh.visual.uv if hasattr(mesh.visual, 'uv') else None - uv_idx = mesh.faces if hasattr(mesh, 'faces') else None - - texture_data = None - - return vtx_pos, pos_idx, vtx_uv, uv_idx, texture_data - - -def save_mesh(mesh, texture_data): - material = trimesh.visual.texture.SimpleMaterial(image=texture_data, diffuse=(255, 255, 255)) - texture_visuals = trimesh.visual.TextureVisuals(uv=mesh.visual.uv, image=texture_data, material=material) - mesh.visual = texture_visuals - return mesh diff --git a/hy3dgen/texgen/differentiable_renderer/setup.py b/hy3dgen/texgen/differentiable_renderer/setup.py deleted file mode 100644 index 2ea78693fe96ac027742bd752238421c6d83f8fc..0000000000000000000000000000000000000000 --- a/hy3dgen/texgen/differentiable_renderer/setup.py +++ /dev/null @@ -1,48 +0,0 @@ -from setuptools import setup, Extension -import pybind11 -import sys -import platform - -def get_platform_specific_args(): - system = platform.system().lower() - cpp_std = 'c++14' # Make configurable if needed - - if sys.platform == 'win32': - compile_args = ['/O2', f'/std:{cpp_std}', '/EHsc', '/MP', '/DWIN32_LEAN_AND_MEAN', '/bigobj'] - link_args = [] - extra_includes = [] - elif system == 'linux': - compile_args = ['-O3', f'-std={cpp_std}', '-fPIC', '-Wall', '-Wextra', '-pthread'] - link_args = ['-fPIC', '-pthread'] - extra_includes = [] - elif sys.platform == 'darwin': - compile_args = ['-O3', f'-std={cpp_std}', '-fPIC', '-Wall', '-Wextra', - '-stdlib=libc++', '-mmacosx-version-min=10.14'] - link_args = ['-fPIC', '-stdlib=libc++', '-mmacosx-version-min=10.14', '-dynamiclib'] - extra_includes = [] - else: - raise RuntimeError(f"Unsupported platform: {system}") - - return compile_args, link_args, extra_includes - -extra_compile_args, extra_link_args, platform_includes = get_platform_specific_args() -include_dirs = [pybind11.get_include(), pybind11.get_include(user=True)] -include_dirs.extend(platform_includes) - -ext_modules = [ - Extension( - "mesh_processor", - ["mesh_processor.cpp"], - include_dirs=include_dirs, - language='c++', - extra_compile_args=extra_compile_args, - extra_link_args=extra_link_args, - ), -] - -setup( - name="mesh_processor", - ext_modules=ext_modules, - install_requires=['pybind11>=2.6.0'], - python_requires='>=3.6', -) \ No newline at end of file diff --git a/hy3dgen/texgen/hunyuanpaint/__init__.py b/hy3dgen/texgen/hunyuanpaint/__init__.py deleted file mode 100644 index e307c3f8c1292da02f308e4b59ef0bcd6fe7305e..0000000000000000000000000000000000000000 --- a/hy3dgen/texgen/hunyuanpaint/__init__.py +++ /dev/null @@ -1,23 +0,0 @@ -# Open Source Model Licensed under the Apache License Version 2.0 -# and Other Licenses of the Third-Party Components therein: -# The below Model in this distribution may have been modified by THL A29 Limited -# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. - -# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. -# The below software and/or models in this distribution may have been -# modified by THL A29 Limited ("Tencent Modifications"). -# All Tencent Modifications are Copyright (C) THL A29 Limited. - -# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT -# except for the third-party components listed below. 
-# Hunyuan 3D does not impose any additional limitations beyond what is outlined -# in the repsective licenses of these third-party components. -# Users must comply with all terms and conditions of original licenses of these third-party -# components and must ensure that the usage of the third party components adheres to -# all relevant laws and regulations. - -# For avoidance of doubts, Hunyuan 3D means the large language models and -# their software and algorithms, including trained model weights, parameters (including -# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, -# fine-tuning enabling code and other elements of the foregoing made publicly available -# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. diff --git a/hy3dgen/texgen/hunyuanpaint/pipeline.py b/hy3dgen/texgen/hunyuanpaint/pipeline.py deleted file mode 100644 index 436ce34efb8bc40c3df2b3902b7a29dffa39ae91..0000000000000000000000000000000000000000 --- a/hy3dgen/texgen/hunyuanpaint/pipeline.py +++ /dev/null @@ -1,554 +0,0 @@ -# Open Source Model Licensed under the Apache License Version 2.0 -# and Other Licenses of the Third-Party Components therein: -# The below Model in this distribution may have been modified by THL A29 Limited -# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. - -# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. -# The below software and/or models in this distribution may have been -# modified by THL A29 Limited ("Tencent Modifications"). -# All Tencent Modifications are Copyright (C) THL A29 Limited. - -# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT -# except for the third-party components listed below. -# Hunyuan 3D does not impose any additional limitations beyond what is outlined -# in the repsective licenses of these third-party components. -# Users must comply with all terms and conditions of original licenses of these third-party -# components and must ensure that the usage of the third party components adheres to -# all relevant laws and regulations. - -# For avoidance of doubts, Hunyuan 3D means the large language models and -# their software and algorithms, including trained model weights, parameters (including -# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, -# fine-tuning enabling code and other elements of the foregoing made publicly available -# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
- -from typing import Any, Callable, Dict, List, Optional, Union - -import numpy -import numpy as np -import torch -import torch.distributed -import torch.utils.checkpoint -from PIL import Image -from diffusers import ( - AutoencoderKL, - DiffusionPipeline, - ImagePipelineOutput -) -from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback -from diffusers.image_processor import PipelineImageInput -from diffusers.image_processor import VaeImageProcessor -from diffusers.pipelines.stable_diffusion.pipeline_output import StableDiffusionPipelineOutput -from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import StableDiffusionPipeline, retrieve_timesteps, \ - rescale_noise_cfg -from diffusers.schedulers import KarrasDiffusionSchedulers -from diffusers.utils import deprecate -from einops import rearrange -from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer - -from .unet.modules import UNet2p5DConditionModel - - -def to_rgb_image(maybe_rgba: Image.Image): - if maybe_rgba.mode == 'RGB': - return maybe_rgba - elif maybe_rgba.mode == 'RGBA': - rgba = maybe_rgba - img = numpy.random.randint(127, 128, size=[rgba.size[1], rgba.size[0], 3], dtype=numpy.uint8) - img = Image.fromarray(img, 'RGB') - img.paste(rgba, mask=rgba.getchannel('A')) - return img - else: - raise ValueError("Unsupported image type.", maybe_rgba.mode) - - -class HunyuanPaintPipeline(StableDiffusionPipeline): - - def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2p5DConditionModel, - scheduler: KarrasDiffusionSchedulers, - feature_extractor: CLIPImageProcessor, - safety_checker=None, - use_torch_compile=False, - ): - DiffusionPipeline.__init__(self) - - safety_checker = None - self.register_modules( - vae=torch.compile(vae) if use_torch_compile else vae, - text_encoder=text_encoder, - tokenizer=tokenizer, - unet=unet, - scheduler=scheduler, - safety_checker=safety_checker, - feature_extractor=torch.compile(feature_extractor) if use_torch_compile else feature_extractor, - ) - self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) - self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) - - @torch.no_grad() - def encode_images(self, images): - B = images.shape[0] - images = rearrange(images, 'b n c h w -> (b n) c h w') - - dtype = next(self.vae.parameters()).dtype - images = (images - 0.5) * 2.0 - posterior = self.vae.encode(images.to(dtype)).latent_dist - latents = posterior.sample() * self.vae.config.scaling_factor - - latents = rearrange(latents, '(b n) c h w -> b n c h w', b=B) - return latents - - @torch.no_grad() - def __call__( - self, - image: Image.Image = None, - prompt=None, - negative_prompt='watermark, ugly, deformed, noisy, blurry, low contrast', - *args, - num_images_per_prompt: Optional[int] = 1, - guidance_scale=2.0, - output_type: Optional[str] = "pil", - width=512, - height=512, - num_inference_steps=28, - return_dict=True, - **cached_condition, - ): - if image is None: - raise ValueError("Inputting embeddings not supported for this pipeline. 
Please pass an image.") - assert not isinstance(image, torch.Tensor) - - image = to_rgb_image(image) - - image_vae = torch.tensor(np.array(image) / 255.0) - image_vae = image_vae.unsqueeze(0).permute(0, 3, 1, 2).unsqueeze(0) - image_vae = image_vae.to(device=self.vae.device, dtype=self.vae.dtype) - - batch_size = image_vae.shape[0] - assert batch_size == 1 - assert num_images_per_prompt == 1 - - ref_latents = self.encode_images(image_vae) - - def convert_pil_list_to_tensor(images): - bg_c = [1., 1., 1.] - images_tensor = [] - for batch_imgs in images: - view_imgs = [] - for pil_img in batch_imgs: - img = numpy.asarray(pil_img, dtype=numpy.float32) / 255. - if img.shape[2] > 3: - alpha = img[:, :, 3:] - img = img[:, :, :3] * alpha + bg_c * (1 - alpha) - img = torch.from_numpy(img).permute(2, 0, 1).unsqueeze(0).contiguous().half().to("cuda") - view_imgs.append(img) - view_imgs = torch.cat(view_imgs, dim=0) - images_tensor.append(view_imgs.unsqueeze(0)) - - images_tensor = torch.cat(images_tensor, dim=0) - return images_tensor - - if "normal_imgs" in cached_condition: - - if isinstance(cached_condition["normal_imgs"], List): - cached_condition["normal_imgs"] = convert_pil_list_to_tensor(cached_condition["normal_imgs"]) - - cached_condition['normal_imgs'] = self.encode_images(cached_condition["normal_imgs"]) - - if "position_imgs" in cached_condition: - - if isinstance(cached_condition["position_imgs"], List): - cached_condition["position_imgs"] = convert_pil_list_to_tensor(cached_condition["position_imgs"]) - - cached_condition["position_imgs"] = self.encode_images(cached_condition["position_imgs"]) - - if 'camera_info_gen' in cached_condition: - camera_info = cached_condition['camera_info_gen'] # B,N - if isinstance(camera_info, List): - camera_info = torch.tensor(camera_info) - camera_info = camera_info.to(image_vae.device).to(torch.int64) - cached_condition['camera_info_gen'] = camera_info - if 'camera_info_ref' in cached_condition: - camera_info = cached_condition['camera_info_ref'] # B,N - if isinstance(camera_info, List): - camera_info = torch.tensor(camera_info) - camera_info = camera_info.to(image_vae.device).to(torch.int64) - cached_condition['camera_info_ref'] = camera_info - - cached_condition['ref_latents'] = ref_latents - - if guidance_scale > 1: - negative_ref_latents = torch.zeros_like(cached_condition['ref_latents']) - cached_condition['ref_latents'] = torch.cat([negative_ref_latents, cached_condition['ref_latents']]) - cached_condition['ref_scale'] = torch.as_tensor([0.0, 1.0]).to(cached_condition['ref_latents']) - if "normal_imgs" in cached_condition: - cached_condition['normal_imgs'] = torch.cat( - (cached_condition['normal_imgs'], cached_condition['normal_imgs'])) - - if "position_imgs" in cached_condition: - cached_condition['position_imgs'] = torch.cat( - (cached_condition['position_imgs'], cached_condition['position_imgs'])) - - if 'position_maps' in cached_condition: - cached_condition['position_maps'] = torch.cat( - (cached_condition['position_maps'], cached_condition['position_maps'])) - - if 'camera_info_gen' in cached_condition: - cached_condition['camera_info_gen'] = torch.cat( - (cached_condition['camera_info_gen'], cached_condition['camera_info_gen'])) - if 'camera_info_ref' in cached_condition: - cached_condition['camera_info_ref'] = torch.cat( - (cached_condition['camera_info_ref'], cached_condition['camera_info_ref'])) - - prompt_embeds = self.unet.learned_text_clip_gen.repeat(num_images_per_prompt, 1, 1) - negative_prompt_embeds = 
torch.zeros_like(prompt_embeds) - - latents: torch.Tensor = self.denoise( - None, - *args, - cross_attention_kwargs=None, - guidance_scale=guidance_scale, - num_images_per_prompt=num_images_per_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - num_inference_steps=num_inference_steps, - output_type='latent', - width=width, - height=height, - **cached_condition - ).images - - if not output_type == "latent": - image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0] - else: - image = latents - - image = self.image_processor.postprocess(image, output_type=output_type) - if not return_dict: - return (image,) - - return ImagePipelineOutput(images=image) - - def denoise( - self, - prompt: Union[str, List[str]] = None, - height: Optional[int] = None, - width: Optional[int] = None, - num_inference_steps: int = 50, - timesteps: List[int] = None, - sigmas: List[float] = None, - guidance_scale: float = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - eta: float = 0.0, - generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.Tensor] = None, - prompt_embeds: Optional[torch.Tensor] = None, - negative_prompt_embeds: Optional[torch.Tensor] = None, - ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - cross_attention_kwargs: Optional[Dict[str, Any]] = None, - guidance_rescale: float = 0.0, - clip_skip: Optional[int] = None, - callback_on_step_end: Optional[ - Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks] - ] = None, - callback_on_step_end_tensor_inputs: List[str] = ["latents"], - **kwargs, - ): - r""" - The call function to the pipeline for generation. - - Args: - prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. - height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): - The height in pixels of the generated image. - width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): - The width in pixels of the generated image. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - timesteps (`List[int]`, *optional*): - Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument - in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is - passed will be used. Must be in descending order. - sigmas (`List[float]`, *optional*): - Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in - their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed - will be used. - guidance_scale (`float`, *optional*, defaults to 7.5): - A higher guidance scale value encourages the model to generate images closely linked to the text - `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide what to not include in image generation. 
If not defined, you need to - pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies - to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. - generator (`torch.Generator` or `List[torch.Generator]`, *optional*): - A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make - generation deterministic. - latents (`torch.Tensor`, *optional*): - Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not - provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If - not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. - ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): - Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of - IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should - contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not - provided, embeddings are computed from the `ip_adapter_image` input argument. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generated image. Choose between `PIL.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - cross_attention_kwargs (`dict`, *optional*): - A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in - [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). - guidance_rescale (`float`, *optional*, defaults to 0.0): - Guidance rescale factor from [Common Diffusion Noise Schedules and Sample Steps are - Flawed](https://arxiv.org/pdf/2305.08891.pdf). Guidance rescale factor should fix overexposure when - using zero terminal SNR. - clip_skip (`int`, *optional*): - Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that - the output of the pre-final layer will be used for computing the prompt embeddings. - callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*): - A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of - each denoising step during the inference. with the following arguments: `callback_on_step_end(self: - DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. 
`callback_kwargs` will include a - list of all tensors as specified by `callback_on_step_end_tensor_inputs`. - callback_on_step_end_tensor_inputs (`List`, *optional*): - The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list - will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the - `._callback_tensor_inputs` attribute of your pipeline class. - - Examples: - - Returns: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: - If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned, - otherwise a `tuple` is returned where the first element is a list with the generated images and the - second element is a list of `bool`s indicating whether the corresponding generated image contains - "not-safe-for-work" (nsfw) content. - """ - - callback = kwargs.pop("callback", None) - callback_steps = kwargs.pop("callback_steps", None) - - if callback is not None: - deprecate( - "callback", - "1.0.0", - "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`", - ) - if callback_steps is not None: - deprecate( - "callback_steps", - "1.0.0", - "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`", - ) - - if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)): - callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs - - # 0. Default height and width to unet - height = height or self.unet.config.sample_size * self.vae_scale_factor - width = width or self.unet.config.sample_size * self.vae_scale_factor - # to deal with lora scaling and other possible forward hooks - - # 1. Check inputs. Raise error if not correct - self.check_inputs( - prompt, - height, - width, - callback_steps, - negative_prompt, - prompt_embeds, - negative_prompt_embeds, - ip_adapter_image, - ip_adapter_image_embeds, - callback_on_step_end_tensor_inputs, - ) - - self._guidance_scale = guidance_scale - self._guidance_rescale = guidance_rescale - self._clip_skip = clip_skip - self._cross_attention_kwargs = cross_attention_kwargs - self._interrupt = False - - # 2. Define call parameters - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - device = self._execution_device - - # 3. Encode input prompt - lora_scale = ( - self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None - ) - - prompt_embeds, negative_prompt_embeds = self.encode_prompt( - prompt, - device, - num_images_per_prompt, - self.do_classifier_free_guidance, - negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - lora_scale=lora_scale, - clip_skip=self.clip_skip, - ) - - # For classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - if self.do_classifier_free_guidance: - prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) - - if ip_adapter_image is not None or ip_adapter_image_embeds is not None: - image_embeds = self.prepare_ip_adapter_image_embeds( - ip_adapter_image, - ip_adapter_image_embeds, - device, - batch_size * num_images_per_prompt, - self.do_classifier_free_guidance, - ) - - # 4. 
Prepare timesteps - timesteps, num_inference_steps = retrieve_timesteps( - self.scheduler, num_inference_steps, device, timesteps, sigmas - ) - assert num_images_per_prompt == 1 - # 5. Prepare latent variables - num_channels_latents = self.unet.config.in_channels - latents = self.prepare_latents( - batch_size * kwargs['num_in_batch'], # num_images_per_prompt, - num_channels_latents, - height, - width, - prompt_embeds.dtype, - device, - generator, - latents, - ) - - # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - # 6.1 Add image embeds for IP-Adapter - added_cond_kwargs = ( - {"image_embeds": image_embeds} - if (ip_adapter_image is not None or ip_adapter_image_embeds is not None) - else None - ) - - # 6.2 Optionally get Guidance Scale Embedding - timestep_cond = None - if self.unet.config.time_cond_proj_dim is not None: - guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt) - timestep_cond = self.get_guidance_scale_embedding( - guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim - ).to(device=device, dtype=latents.dtype) - - # 7. Denoising loop - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - self._num_timesteps = len(timesteps) - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps): - if self.interrupt: - continue - - # expand the latents if we are doing classifier free guidance - latents = rearrange(latents, '(b n) c h w -> b n c h w', n=kwargs['num_in_batch']) - latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents - latent_model_input = rearrange(latent_model_input, 'b n c h w -> (b n) c h w') - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - latent_model_input = rearrange(latent_model_input, '(b n) c h w ->b n c h w', n=kwargs['num_in_batch']) - - # predict the noise residual - - noise_pred = self.unet( - latent_model_input, - t, - encoder_hidden_states=prompt_embeds, - timestep_cond=timestep_cond, - cross_attention_kwargs=self.cross_attention_kwargs, - added_cond_kwargs=added_cond_kwargs, - return_dict=False, **kwargs - )[0] - latents = rearrange(latents, 'b n c h w -> (b n) c h w') - # perform guidance - if self.do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond) - - if self.do_classifier_free_guidance and self.guidance_rescale > 0.0: - # Based on 3.4. 
in https://arxiv.org/pdf/2305.08891.pdf - noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale) - - # compute the previous noisy sample x_t -> x_t-1 - latents = \ - self.scheduler.step(noise_pred, t, latents[:, :num_channels_latents, :, :], **extra_step_kwargs, - return_dict=False)[0] - - if callback_on_step_end is not None: - callback_kwargs = {} - for k in callback_on_step_end_tensor_inputs: - callback_kwargs[k] = locals()[k] - callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) - - latents = callback_outputs.pop("latents", latents) - prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) - negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - if callback is not None and i % callback_steps == 0: - step_idx = i // getattr(self.scheduler, "order", 1) - callback(step_idx, t, latents) - - if not output_type == "latent": - image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[ - 0 - ] - image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) - else: - image = latents - has_nsfw_concept = None - - if has_nsfw_concept is None: - do_denormalize = [True] * image.shape[0] - else: - do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] - - image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) - - # Offload all models - self.maybe_free_model_hooks() - - if not return_dict: - return (image, has_nsfw_concept) - - return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/hy3dgen/texgen/hunyuanpaint/unet/__init__.py b/hy3dgen/texgen/hunyuanpaint/unet/__init__.py deleted file mode 100644 index e307c3f8c1292da02f308e4b59ef0bcd6fe7305e..0000000000000000000000000000000000000000 --- a/hy3dgen/texgen/hunyuanpaint/unet/__init__.py +++ /dev/null @@ -1,23 +0,0 @@ -# Open Source Model Licensed under the Apache License Version 2.0 -# and Other Licenses of the Third-Party Components therein: -# The below Model in this distribution may have been modified by THL A29 Limited -# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. - -# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. -# The below software and/or models in this distribution may have been -# modified by THL A29 Limited ("Tencent Modifications"). -# All Tencent Modifications are Copyright (C) THL A29 Limited. - -# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT -# except for the third-party components listed below. -# Hunyuan 3D does not impose any additional limitations beyond what is outlined -# in the repsective licenses of these third-party components. -# Users must comply with all terms and conditions of original licenses of these third-party -# components and must ensure that the usage of the third party components adheres to -# all relevant laws and regulations. 
- -# For avoidance of doubts, Hunyuan 3D means the large language models and -# their software and algorithms, including trained model weights, parameters (including -# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, -# fine-tuning enabling code and other elements of the foregoing made publicly available -# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. diff --git a/hy3dgen/texgen/hunyuanpaint/unet/modules.py b/hy3dgen/texgen/hunyuanpaint/unet/modules.py deleted file mode 100644 index 5d16bc6b6bb1ebc72c602dcb298d122429fe847d..0000000000000000000000000000000000000000 --- a/hy3dgen/texgen/hunyuanpaint/unet/modules.py +++ /dev/null @@ -1,440 +0,0 @@ -# Open Source Model Licensed under the Apache License Version 2.0 -# and Other Licenses of the Third-Party Components therein: -# The below Model in this distribution may have been modified by THL A29 Limited -# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. - -# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. -# The below software and/or models in this distribution may have been -# modified by THL A29 Limited ("Tencent Modifications"). -# All Tencent Modifications are Copyright (C) THL A29 Limited. - -# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT -# except for the third-party components listed below. -# Hunyuan 3D does not impose any additional limitations beyond what is outlined -# in the repsective licenses of these third-party components. -# Users must comply with all terms and conditions of original licenses of these third-party -# components and must ensure that the usage of the third party components adheres to -# all relevant laws and regulations. - -# For avoidance of doubts, Hunyuan 3D means the large language models and -# their software and algorithms, including trained model weights, parameters (including -# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, -# fine-tuning enabling code and other elements of the foregoing made publicly available -# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. - - -import copy -import json -import os -from typing import Any, Dict, Optional - -import torch -import torch.nn as nn -from diffusers.models import UNet2DConditionModel -from diffusers.models.attention_processor import Attention -from diffusers.models.transformers.transformer_2d import BasicTransformerBlock -from einops import rearrange - - -def _chunked_feed_forward(ff: nn.Module, hidden_states: torch.Tensor, chunk_dim: int, chunk_size: int): - # "feed_forward_chunk_size" can be used to save memory - if hidden_states.shape[chunk_dim] % chunk_size != 0: - raise ValueError( - f"`hidden_states` dimension to be chunked: {hidden_states.shape[chunk_dim]} has to be divisible by chunk size: {chunk_size}. Make sure to set an appropriate `chunk_size` when calling `unet.enable_forward_chunking`." 
- ) - - num_chunks = hidden_states.shape[chunk_dim] // chunk_size - ff_output = torch.cat( - [ff(hid_slice) for hid_slice in hidden_states.chunk(num_chunks, dim=chunk_dim)], - dim=chunk_dim, - ) - return ff_output - - -class Basic2p5DTransformerBlock(torch.nn.Module): - def __init__(self, transformer: BasicTransformerBlock, layer_name, use_ma=True, use_ra=True) -> None: - super().__init__() - self.transformer = transformer - self.layer_name = layer_name - self.use_ma = use_ma - self.use_ra = use_ra - - # multiview attn - if self.use_ma: - self.attn_multiview = Attention( - query_dim=self.dim, - heads=self.num_attention_heads, - dim_head=self.attention_head_dim, - dropout=self.dropout, - bias=self.attention_bias, - cross_attention_dim=None, - upcast_attention=self.attn1.upcast_attention, - out_bias=True, - ) - - # ref attn - if self.use_ra: - self.attn_refview = Attention( - query_dim=self.dim, - heads=self.num_attention_heads, - dim_head=self.attention_head_dim, - dropout=self.dropout, - bias=self.attention_bias, - cross_attention_dim=None, - upcast_attention=self.attn1.upcast_attention, - out_bias=True, - ) - - def __getattr__(self, name: str): - try: - return super().__getattr__(name) - except AttributeError: - return getattr(self.transformer, name) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - encoder_hidden_states: Optional[torch.Tensor] = None, - encoder_attention_mask: Optional[torch.Tensor] = None, - timestep: Optional[torch.LongTensor] = None, - cross_attention_kwargs: Dict[str, Any] = None, - class_labels: Optional[torch.LongTensor] = None, - added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None, - ) -> torch.Tensor: - - # Notice that normalization is always applied before the real computation in the following blocks. - # 0. Self-Attention - batch_size = hidden_states.shape[0] - - cross_attention_kwargs = cross_attention_kwargs.copy() if cross_attention_kwargs is not None else {} - num_in_batch = cross_attention_kwargs.pop('num_in_batch', 1) - mode = cross_attention_kwargs.pop('mode', None) - mva_scale = cross_attention_kwargs.pop('mva_scale', 1.0) - ref_scale = cross_attention_kwargs.pop('ref_scale', 1.0) - condition_embed_dict = cross_attention_kwargs.pop("condition_embed_dict", None) - - if self.norm_type == "ada_norm": - norm_hidden_states = self.norm1(hidden_states, timestep) - elif self.norm_type == "ada_norm_zero": - norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1( - hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype - ) - elif self.norm_type in ["layer_norm", "layer_norm_i2vgen"]: - norm_hidden_states = self.norm1(hidden_states) - elif self.norm_type == "ada_norm_continuous": - norm_hidden_states = self.norm1(hidden_states, added_cond_kwargs["pooled_text_emb"]) - elif self.norm_type == "ada_norm_single": - shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = ( - self.scale_shift_table[None] + timestep.reshape(batch_size, 6, -1) - ).chunk(6, dim=1) - norm_hidden_states = self.norm1(hidden_states) - norm_hidden_states = norm_hidden_states * (1 + scale_msa) + shift_msa - else: - raise ValueError("Incorrect norm used") - - if self.pos_embed is not None: - norm_hidden_states = self.pos_embed(norm_hidden_states) - - # 1. 
Prepare GLIGEN inputs - cross_attention_kwargs = cross_attention_kwargs.copy() if cross_attention_kwargs is not None else {} - gligen_kwargs = cross_attention_kwargs.pop("gligen", None) - - attn_output = self.attn1( - norm_hidden_states, - encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None, - attention_mask=attention_mask, - **cross_attention_kwargs, - ) - - if self.norm_type == "ada_norm_zero": - attn_output = gate_msa.unsqueeze(1) * attn_output - elif self.norm_type == "ada_norm_single": - attn_output = gate_msa * attn_output - - hidden_states = attn_output + hidden_states - if hidden_states.ndim == 4: - hidden_states = hidden_states.squeeze(1) - - # 1.2 Reference Attention - if 'w' in mode: - condition_embed_dict[self.layer_name] = rearrange(norm_hidden_states, '(b n) l c -> b (n l) c', - n=num_in_batch) # B, (N L), C - - if 'r' in mode and self.use_ra: - condition_embed = condition_embed_dict[self.layer_name].unsqueeze(1).repeat(1, num_in_batch, 1, - 1) # B N L C - condition_embed = rearrange(condition_embed, 'b n l c -> (b n) l c') - - attn_output = self.attn_refview( - norm_hidden_states, - encoder_hidden_states=condition_embed, - attention_mask=None, - **cross_attention_kwargs - ) - ref_scale_timing = ref_scale - if isinstance(ref_scale, torch.Tensor): - ref_scale_timing = ref_scale.unsqueeze(1).repeat(1, num_in_batch).view(-1) - for _ in range(attn_output.ndim - 1): - ref_scale_timing = ref_scale_timing.unsqueeze(-1) - hidden_states = ref_scale_timing * attn_output + hidden_states - if hidden_states.ndim == 4: - hidden_states = hidden_states.squeeze(1) - - # 1.3 Multiview Attention - if num_in_batch > 1 and self.use_ma: - multivew_hidden_states = rearrange(norm_hidden_states, '(b n) l c -> b (n l) c', n=num_in_batch) - - attn_output = self.attn_multiview( - multivew_hidden_states, - encoder_hidden_states=multivew_hidden_states, - **cross_attention_kwargs - ) - - attn_output = rearrange(attn_output, 'b (n l) c -> (b n) l c', n=num_in_batch) - - hidden_states = mva_scale * attn_output + hidden_states - if hidden_states.ndim == 4: - hidden_states = hidden_states.squeeze(1) - - # 1.2 GLIGEN Control - if gligen_kwargs is not None: - hidden_states = self.fuser(hidden_states, gligen_kwargs["objs"]) - - # 3. Cross-Attention - if self.attn2 is not None: - if self.norm_type == "ada_norm": - norm_hidden_states = self.norm2(hidden_states, timestep) - elif self.norm_type in ["ada_norm_zero", "layer_norm", "layer_norm_i2vgen"]: - norm_hidden_states = self.norm2(hidden_states) - elif self.norm_type == "ada_norm_single": - # For PixArt norm2 isn't applied here: - # https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L70C1-L76C103 - norm_hidden_states = hidden_states - elif self.norm_type == "ada_norm_continuous": - norm_hidden_states = self.norm2(hidden_states, added_cond_kwargs["pooled_text_emb"]) - else: - raise ValueError("Incorrect norm") - - if self.pos_embed is not None and self.norm_type != "ada_norm_single": - norm_hidden_states = self.pos_embed(norm_hidden_states) - - attn_output = self.attn2( - norm_hidden_states, - encoder_hidden_states=encoder_hidden_states, - attention_mask=encoder_attention_mask, - **cross_attention_kwargs, - ) - - hidden_states = attn_output + hidden_states - - # 4. 
Feed-forward - # i2vgen doesn't have this norm 🤷‍♂️ - if self.norm_type == "ada_norm_continuous": - norm_hidden_states = self.norm3(hidden_states, added_cond_kwargs["pooled_text_emb"]) - elif not self.norm_type == "ada_norm_single": - norm_hidden_states = self.norm3(hidden_states) - - if self.norm_type == "ada_norm_zero": - norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None] - - if self.norm_type == "ada_norm_single": - norm_hidden_states = self.norm2(hidden_states) - norm_hidden_states = norm_hidden_states * (1 + scale_mlp) + shift_mlp - - if self._chunk_size is not None: - # "feed_forward_chunk_size" can be used to save memory - ff_output = _chunked_feed_forward(self.ff, norm_hidden_states, self._chunk_dim, self._chunk_size) - else: - ff_output = self.ff(norm_hidden_states) - - if self.norm_type == "ada_norm_zero": - ff_output = gate_mlp.unsqueeze(1) * ff_output - elif self.norm_type == "ada_norm_single": - ff_output = gate_mlp * ff_output - - hidden_states = ff_output + hidden_states - if hidden_states.ndim == 4: - hidden_states = hidden_states.squeeze(1) - - return hidden_states - - -class UNet2p5DConditionModel(torch.nn.Module): - def __init__(self, unet: UNet2DConditionModel) -> None: - super().__init__() - self.unet = unet - - self.use_ma = True - self.use_ra = True - self.use_camera_embedding = True - self.use_dual_stream = True - - if self.use_dual_stream: - self.unet_dual = copy.deepcopy(unet) - self.init_attention(self.unet_dual) - self.init_attention(self.unet, use_ma=self.use_ma, use_ra=self.use_ra) - self.init_condition() - self.init_camera_embedding() - - @staticmethod - def from_pretrained(pretrained_model_name_or_path, **kwargs): - torch_dtype = kwargs.pop('torch_dtype', torch.float32) - config_path = os.path.join(pretrained_model_name_or_path, 'config.json') - unet_ckpt_path = os.path.join(pretrained_model_name_or_path, 'diffusion_pytorch_model.bin') - with open(config_path, 'r', encoding='utf-8') as file: - config = json.load(file) - unet = UNet2DConditionModel(**config) - unet = UNet2p5DConditionModel(unet) - unet_ckpt = torch.load(unet_ckpt_path, map_location='cpu', weights_only=True) - unet.load_state_dict(unet_ckpt, strict=True) - unet = unet.to(torch_dtype) - return unet - - def init_condition(self): - self.unet.conv_in = torch.nn.Conv2d( - 12, - self.unet.conv_in.out_channels, - kernel_size=self.unet.conv_in.kernel_size, - stride=self.unet.conv_in.stride, - padding=self.unet.conv_in.padding, - dilation=self.unet.conv_in.dilation, - groups=self.unet.conv_in.groups, - bias=self.unet.conv_in.bias is not None) - - self.unet.learned_text_clip_gen = nn.Parameter(torch.randn(1, 77, 1024)) - self.unet.learned_text_clip_ref = nn.Parameter(torch.randn(1, 77, 1024)) - - def init_camera_embedding(self): - - if self.use_camera_embedding: - time_embed_dim = 1280 - self.max_num_ref_image = 5 - self.max_num_gen_image = 12 * 3 + 4 * 2 - self.unet.class_embedding = nn.Embedding(self.max_num_ref_image + self.max_num_gen_image, time_embed_dim) - - def init_attention(self, unet, use_ma=False, use_ra=False): - - for down_block_i, down_block in enumerate(unet.down_blocks): - if hasattr(down_block, "has_cross_attention") and down_block.has_cross_attention: - for attn_i, attn in enumerate(down_block.attentions): - for transformer_i, transformer in enumerate(attn.transformer_blocks): - if isinstance(transformer, BasicTransformerBlock): - attn.transformer_blocks[transformer_i] = Basic2p5DTransformerBlock(transformer, - 
f'down_{down_block_i}_{attn_i}_{transformer_i}', - use_ma, use_ra) - - if hasattr(unet.mid_block, "has_cross_attention") and unet.mid_block.has_cross_attention: - for attn_i, attn in enumerate(unet.mid_block.attentions): - for transformer_i, transformer in enumerate(attn.transformer_blocks): - if isinstance(transformer, BasicTransformerBlock): - attn.transformer_blocks[transformer_i] = Basic2p5DTransformerBlock(transformer, - f'mid_{attn_i}_{transformer_i}', - use_ma, use_ra) - - for up_block_i, up_block in enumerate(unet.up_blocks): - if hasattr(up_block, "has_cross_attention") and up_block.has_cross_attention: - for attn_i, attn in enumerate(up_block.attentions): - for transformer_i, transformer in enumerate(attn.transformer_blocks): - if isinstance(transformer, BasicTransformerBlock): - attn.transformer_blocks[transformer_i] = Basic2p5DTransformerBlock(transformer, - f'up_{up_block_i}_{attn_i}_{transformer_i}', - use_ma, use_ra) - - def __getattr__(self, name: str): - try: - return super().__getattr__(name) - except AttributeError: - return getattr(self.unet, name) - - def forward( - self, sample, timestep, encoder_hidden_states, - *args, down_intrablock_additional_residuals=None, - down_block_res_samples=None, mid_block_res_sample=None, - **cached_condition, - ): - B, N_gen, _, H, W = sample.shape - assert H == W - - if self.use_camera_embedding: - camera_info_gen = cached_condition['camera_info_gen'] + self.max_num_ref_image - camera_info_gen = rearrange(camera_info_gen, 'b n -> (b n)') - else: - camera_info_gen = None - - sample = [sample] - if 'normal_imgs' in cached_condition: - sample.append(cached_condition["normal_imgs"]) - if 'position_imgs' in cached_condition: - sample.append(cached_condition["position_imgs"]) - sample = torch.cat(sample, dim=2) - - sample = rearrange(sample, 'b n c h w -> (b n) c h w') - - encoder_hidden_states_gen = encoder_hidden_states.unsqueeze(1).repeat(1, N_gen, 1, 1) - encoder_hidden_states_gen = rearrange(encoder_hidden_states_gen, 'b n l c -> (b n) l c') - - if self.use_ra: - if 'condition_embed_dict' in cached_condition: - condition_embed_dict = cached_condition['condition_embed_dict'] - else: - condition_embed_dict = {} - ref_latents = cached_condition['ref_latents'] - N_ref = ref_latents.shape[1] - if self.use_camera_embedding: - camera_info_ref = cached_condition['camera_info_ref'] - camera_info_ref = rearrange(camera_info_ref, 'b n -> (b n)') - else: - camera_info_ref = None - - ref_latents = rearrange(ref_latents, 'b n c h w -> (b n) c h w') - - encoder_hidden_states_ref = self.unet.learned_text_clip_ref.unsqueeze(1).repeat(B, N_ref, 1, 1) - encoder_hidden_states_ref = rearrange(encoder_hidden_states_ref, 'b n l c -> (b n) l c') - - noisy_ref_latents = ref_latents - timestep_ref = 0 - - if self.use_dual_stream: - unet_ref = self.unet_dual - else: - unet_ref = self.unet - unet_ref( - noisy_ref_latents, timestep_ref, - encoder_hidden_states=encoder_hidden_states_ref, - class_labels=camera_info_ref, - # **kwargs - return_dict=False, - cross_attention_kwargs={ - 'mode': 'w', 'num_in_batch': N_ref, - 'condition_embed_dict': condition_embed_dict}, - ) - cached_condition['condition_embed_dict'] = condition_embed_dict - else: - condition_embed_dict = None - - mva_scale = cached_condition.get('mva_scale', 1.0) - ref_scale = cached_condition.get('ref_scale', 1.0) - - return self.unet( - sample, timestep, - encoder_hidden_states_gen, *args, - class_labels=camera_info_gen, - down_intrablock_additional_residuals=[ - sample.to(dtype=self.unet.dtype) for 
sample in down_intrablock_additional_residuals - ] if down_intrablock_additional_residuals is not None else None, - down_block_additional_residuals=[ - sample.to(dtype=self.unet.dtype) for sample in down_block_res_samples - ] if down_block_res_samples is not None else None, - mid_block_additional_residual=( - mid_block_res_sample.to(dtype=self.unet.dtype) - if mid_block_res_sample is not None else None - ), - return_dict=False, - cross_attention_kwargs={ - 'mode': 'r', 'num_in_batch': N_gen, - 'condition_embed_dict': condition_embed_dict, - 'mva_scale': mva_scale, - 'ref_scale': ref_scale, - }, - ) diff --git a/hy3dgen/texgen/pipelines.py b/hy3dgen/texgen/pipelines.py deleted file mode 100644 index cff817cc7aaaa45f420d099675940c2442b82517..0000000000000000000000000000000000000000 --- a/hy3dgen/texgen/pipelines.py +++ /dev/null @@ -1,227 +0,0 @@ -# Open Source Model Licensed under the Apache License Version 2.0 -# and Other Licenses of the Third-Party Components therein: -# The below Model in this distribution may have been modified by THL A29 Limited -# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. - -# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. -# The below software and/or models in this distribution may have been -# modified by THL A29 Limited ("Tencent Modifications"). -# All Tencent Modifications are Copyright (C) THL A29 Limited. - -# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT -# except for the third-party components listed below. -# Hunyuan 3D does not impose any additional limitations beyond what is outlined -# in the repsective licenses of these third-party components. -# Users must comply with all terms and conditions of original licenses of these third-party -# components and must ensure that the usage of the third party components adheres to -# all relevant laws and regulations. - -# For avoidance of doubts, Hunyuan 3D means the large language models and -# their software and algorithms, including trained model weights, parameters (including -# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, -# fine-tuning enabling code and other elements of the foregoing made publicly available -# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
-
-
-import logging
-import os
-
-import numpy as np
-import torch
-from PIL import Image
-
-from .differentiable_renderer.mesh_render import MeshRender
-from .utils.dehighlight_utils import Light_Shadow_Remover
-from .utils.multiview_utils import Multiview_Diffusion_Net
-from .utils.uv_warp_utils import mesh_uv_wrap
-
-logger = logging.getLogger(__name__)
-
-
-class Hunyuan3DTexGenConfig:
-
-    def __init__(self, light_remover_ckpt_path, multiview_ckpt_path):
-        self.device = 'cpu'
-        self.light_remover_ckpt_path = light_remover_ckpt_path
-        self.multiview_ckpt_path = multiview_ckpt_path
-
-        self.candidate_camera_azims = [0, 90, 180, 270, 0, 180]
-        self.candidate_camera_elevs = [0, 0, 0, 0, 90, -90]
-        self.candidate_view_weights = [1, 0.1, 0.5, 0.1, 0.05, 0.05]
-
-        self.render_size = 2048
-        self.texture_size = 1024
-        self.bake_exp = 4
-        self.merge_method = 'fast'
-
-
-class Hunyuan3DPaintPipeline:
-    @classmethod
-    def from_pretrained(cls, model_path):
-        original_model_path = model_path
-        if not os.path.exists(model_path):
-            # try local path
-            base_dir = os.environ.get('HY3DGEN_MODELS', '~/content/hy3dgen')
-            model_path = os.path.expanduser(os.path.join(base_dir, model_path))
-
-            delight_model_path = os.path.join(model_path, 'hunyuan3d-delight-v2-0')
-            multiview_model_path = os.path.join(model_path, 'hunyuan3d-paint-v2-0')
-
-            if not os.path.exists(delight_model_path) or not os.path.exists(multiview_model_path):
-                try:
-                    import huggingface_hub
-                    # download from huggingface
-                    model_path = huggingface_hub.snapshot_download(repo_id=original_model_path)
-                    delight_model_path = os.path.join(model_path, 'hunyuan3d-delight-v2-0')
-                    multiview_model_path = os.path.join(model_path, 'hunyuan3d-paint-v2-0')
-                    return cls(Hunyuan3DTexGenConfig(delight_model_path, multiview_model_path))
-                except ImportError:
-                    logger.warning(
-                        "You need to install HuggingFace Hub to load models from the hub."
-                    )
-                    raise RuntimeError(f"Model path {model_path} not found")
-            else:
-                return cls(Hunyuan3DTexGenConfig(delight_model_path, multiview_model_path))
-
-        raise FileNotFoundError(f"Model path {original_model_path} not found and it could not be downloaded from Hugging Face")
-
-    def __init__(self, config):
-        self.config = config
-        self.models = {}
-        self.render = MeshRender(
-            default_resolution=self.config.render_size,
-            texture_size=self.config.texture_size)
-
-        self.load_models()
-
-    def load_models(self):
-        # empty cuda cache
-        torch.cuda.empty_cache()
-        # Load model
-        self.models['delight_model'] = Light_Shadow_Remover(self.config)
-        self.models['multiview_model'] = Multiview_Diffusion_Net(self.config)
-
-    def render_normal_multiview(self, camera_elevs, camera_azims, use_abs_coor=True):
-        normal_maps = []
-        for elev, azim in zip(camera_elevs, camera_azims):
-            normal_map = self.render.render_normal(
-                elev, azim, use_abs_coor=use_abs_coor, return_type='pl')
-            normal_maps.append(normal_map)
-
-        return normal_maps
-
-    def render_position_multiview(self, camera_elevs, camera_azims):
-        position_maps = []
-        for elev, azim in zip(camera_elevs, camera_azims):
-            position_map = self.render.render_position(
-                elev, azim, return_type='pl')
-            position_maps.append(position_map)
-
-        return position_maps
-
-    def bake_from_multiview(self, views, camera_elevs,
-                            camera_azims, view_weights, method='graphcut'):
-        project_textures, project_weighted_cos_maps = [], []
-        project_boundary_maps = []
-        for view, camera_elev, camera_azim, weight in zip(
-                views, camera_elevs, camera_azims, view_weights):
-            project_texture, project_cos_map, project_boundary_map = self.render.back_project(
-                view, camera_elev, camera_azim)
-            project_cos_map = weight * (project_cos_map ** self.config.bake_exp)
-            project_textures.append(project_texture)
-            project_weighted_cos_maps.append(project_cos_map)
-            project_boundary_maps.append(project_boundary_map)
-
-        if method == 'fast':
-            texture, ori_trust_map = self.render.fast_bake_texture(
-                project_textures, project_weighted_cos_maps)
-        else:
-            raise ValueError(f'no method {method}')
-        return texture, ori_trust_map > 1E-8
-
-    def texture_inpaint(self, texture, mask):
-
-        texture_np = self.render.uv_inpaint(texture, mask)
-        texture = torch.tensor(texture_np / 255).float().to(texture.device)
-
-        return texture
-
-    def recenter_image(self, image, border_ratio=0.2):
-        if image.mode == 'RGB':
-            return image
-        elif image.mode == 'L':
-            image = image.convert('RGB')
-            return image
-
-        alpha_channel = np.array(image)[:, :, 3]
-        non_zero_indices = np.argwhere(alpha_channel > 0)
-        if non_zero_indices.size == 0:
-            raise ValueError("Image is fully transparent")
-
-        min_row, min_col = non_zero_indices.min(axis=0)
-        max_row, max_col = non_zero_indices.max(axis=0)
-
-        cropped_image = image.crop((min_col, min_row, max_col + 1, max_row + 1))
-
-        width, height = cropped_image.size
-        border_width = int(width * border_ratio)
-        border_height = int(height * border_ratio)
-
-        new_width = width + 2 * border_width
-        new_height = height + 2 * border_height
-
-        square_size = max(new_width, new_height)
-
-        new_image = Image.new('RGBA', (square_size, square_size), (255, 255, 255, 0))
-
-        paste_x = (square_size - new_width) // 2 + border_width
-        paste_y = (square_size - new_height) // 2 + border_height
-
-        new_image.paste(cropped_image, (paste_x, paste_y))
-        return new_image
-
-    @torch.no_grad()
-    def __call__(self, mesh, image):
-
-        if isinstance(image, str):
-            image_prompt = Image.open(image)
-        else:
-            image_prompt = image
-
-        image_prompt = self.recenter_image(image_prompt)
-
-        image_prompt = self.models['delight_model'](image_prompt)
-
-        mesh = mesh_uv_wrap(mesh)
-
-        self.render.load_mesh(mesh)
-
-        selected_camera_elevs, selected_camera_azims, selected_view_weights = \
-            self.config.candidate_camera_elevs, self.config.candidate_camera_azims, self.config.candidate_view_weights
-
-        normal_maps = self.render_normal_multiview(
-            selected_camera_elevs, selected_camera_azims, use_abs_coor=True)
-        position_maps = self.render_position_multiview(
-            selected_camera_elevs, selected_camera_azims)
-
-        # map each (azimuth, elevation) pair to a discrete camera index: azimuths are
-        # binned in 30-degree steps (coarser for top/bottom views), then offset by an
-        # elevation-dependent base index
-        camera_info = [(((azim // 30) + 9) % 12) // {-20: 1, 0: 1, 20: 1, -90: 3, 90: 3}[
-            elev] + {-20: 0, 0: 12, 20: 24, -90: 36, 90: 40}[elev] for azim, elev in
-                       zip(selected_camera_azims, selected_camera_elevs)]
-        multiviews = self.models['multiview_model'](image_prompt, normal_maps + position_maps, camera_info)
-
-        for i in range(len(multiviews)):
-            multiviews[i] = multiviews[i].resize(
-                (self.config.render_size, self.config.render_size))
-
-        texture, mask = self.bake_from_multiview(multiviews,
-                                                 selected_camera_elevs, selected_camera_azims, selected_view_weights,
-                                                 method=self.config.merge_method)
-
-        mask_np = (mask.squeeze(-1).cpu().numpy() * 255).astype(np.uint8)
-
-        texture = self.texture_inpaint(texture, mask_np)
-
-        self.render.set_texture(texture)
-        textured_mesh = self.render.save_mesh()
-
-        return textured_mesh
diff --git a/hy3dgen/texgen/utils/__init__.py b/hy3dgen/texgen/utils/__init__.py
deleted file mode 100644
index e307c3f8c1292da02f308e4b59ef0bcd6fe7305e..0000000000000000000000000000000000000000
--- a/hy3dgen/texgen/utils/__init__.py
+++ /dev/null
@@ -1,23 +0,0 @@
-# Open Source Model Licensed under the Apache License Version 2.0
-# and Other Licenses of the Third-Party Components therein:
-# The below Model in this distribution may have been modified by THL A29 Limited
-# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
-
-# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
-# The below software and/or models in this distribution may have been
-# modified by THL A29 Limited ("Tencent Modifications").
-# All Tencent Modifications are Copyright (C) THL A29 Limited.
-
-# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
-# except for the third-party components listed below.
-# Hunyuan 3D does not impose any additional limitations beyond what is outlined
-# in the respective licenses of these third-party components.
-# Users must comply with all terms and conditions of original licenses of these third-party
-# components and must ensure that the usage of the third party components adheres to
-# all relevant laws and regulations.
-
-# For avoidance of doubts, Hunyuan 3D means the large language models and
-# their software and algorithms, including trained model weights, parameters (including
-# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
-# fine-tuning enabling code and other elements of the foregoing made publicly available
-# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
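For orientation, a minimal sketch of how the Hunyuan3DPaintPipeline deleted above was typically driven; the repo id, the file names, and the assumption that save_mesh() yields a trimesh-compatible object are illustrative, not part of this diff:

import trimesh
from hy3dgen.texgen.pipelines import Hunyuan3DPaintPipeline

# from_pretrained() resolves a local checkout under HY3DGEN_MODELS first, then
# falls back to a Hugging Face snapshot download (the repo id below is assumed).
paint_pipeline = Hunyuan3DPaintPipeline.from_pretrained('tencent/Hunyuan3D-2')

mesh = trimesh.load('demo_shape.glb', force='mesh')           # hypothetical input mesh
textured_mesh = paint_pipeline(mesh, image='demo_image.png')  # __call__(mesh, image) as defined above
textured_mesh.export('demo_textured.glb')                     # assumes a trimesh-compatible return value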
diff --git a/hy3dgen/texgen/utils/alignImg4Tex_utils.py b/hy3dgen/texgen/utils/alignImg4Tex_utils.py
deleted file mode 100644
index 0a09c17cfe1a3f1ac850688e96b66341f0226418..0000000000000000000000000000000000000000
--- a/hy3dgen/texgen/utils/alignImg4Tex_utils.py
+++ /dev/null
@@ -1,132 +0,0 @@
-# Open Source Model Licensed under the Apache License Version 2.0
-# and Other Licenses of the Third-Party Components therein:
-# The below Model in this distribution may have been modified by THL A29 Limited
-# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
-
-# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
-# The below software and/or models in this distribution may have been
-# modified by THL A29 Limited ("Tencent Modifications").
-# All Tencent Modifications are Copyright (C) THL A29 Limited.
-
-# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
-# except for the third-party components listed below.
-# Hunyuan 3D does not impose any additional limitations beyond what is outlined
-# in the respective licenses of these third-party components.
-# Users must comply with all terms and conditions of original licenses of these third-party
-# components and must ensure that the usage of the third party components adheres to
-# all relevant laws and regulations.
-
-# For avoidance of doubts, Hunyuan 3D means the large language models and
-# their software and algorithms, including trained model weights, parameters (including
-# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
-# fine-tuning enabling code and other elements of the foregoing made publicly available
-# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
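# The deleted module below bundled two depth-guided helpers: Img2img_Control_Ip_adapter
# pairs SD 1.5 with a depth ControlNet and an IP-Adapter reference image, while
# HesModel does the same on SDXL img2img with the fp16-fix VAE; both condition
# generation on a depth control image plus an IP-Adapter reference.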
-
-
-import torch
-from diffusers import EulerAncestralDiscreteScheduler
-from diffusers import StableDiffusionControlNetPipeline, StableDiffusionXLControlNetImg2ImgPipeline, ControlNetModel, \
-    AutoencoderKL
-
-
-class Img2img_Control_Ip_adapter:
-    def __init__(self, device):
-        controlnet = ControlNetModel.from_pretrained('lllyasviel/control_v11f1p_sd15_depth', torch_dtype=torch.float16,
-                                                     variant="fp16", use_safetensors=True)
-        pipe = StableDiffusionControlNetPipeline.from_pretrained(
-            'runwayml/stable-diffusion-v1-5', controlnet=controlnet, torch_dtype=torch.float16, use_safetensors=True
-        )
-        pipe.load_ip_adapter('h94/IP-Adapter', subfolder="models", weight_name="ip-adapter-plus_sd15.safetensors")
-        pipe.set_ip_adapter_scale(0.7)
-
-        pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config)
-        # pipe.enable_model_cpu_offload()
-        self.pipe = pipe.to(device)
-
-    def __call__(
-        self,
-        prompt,
-        control_image,
-        ip_adapter_image,
-        negative_prompt,
-        height=512,
-        width=512,
-        num_inference_steps=20,
-        guidance_scale=8.0,
-        controlnet_conditioning_scale=1.0,
-        output_type="pil",
-        **kwargs,
-    ):
-        results = self.pipe(
-            prompt=prompt,
-            negative_prompt=negative_prompt,
-            image=control_image,
-            ip_adapter_image=ip_adapter_image,
-            # seeding goes through the generator; this text-to-image ControlNet
-            # pipeline accepts neither a raw seed nor a strength argument
-            generator=torch.manual_seed(42),
-            num_inference_steps=num_inference_steps,
-            guidance_scale=guidance_scale,
-            controlnet_conditioning_scale=controlnet_conditioning_scale,
-            # clip_skip=2,
-            height=height,
-            width=width,
-            output_type=output_type,
-            **kwargs,
-        ).images[0]
-        return results
-
-
-################################################################
-
-class HesModel:
-    def __init__(self):
-        controlnet_depth = ControlNetModel.from_pretrained(
-            'diffusers/controlnet-depth-sdxl-1.0',
-            torch_dtype=torch.float16,
-            variant="fp16",
-            use_safetensors=True
-        )
-        self.pipe = StableDiffusionXLControlNetImg2ImgPipeline.from_pretrained(
-            'stabilityai/stable-diffusion-xl-base-1.0',
-            torch_dtype=torch.float16,
-            variant="fp16",
-            controlnet=controlnet_depth,
-            use_safetensors=True,
-        )
-        self.pipe.vae = AutoencoderKL.from_pretrained(
-            'madebyollin/sdxl-vae-fp16-fix',
-            torch_dtype=torch.float16
-        )
-
-        self.pipe.load_ip_adapter('h94/IP-Adapter', subfolder="sdxl_models", weight_name="ip-adapter_sdxl.safetensors")
-        self.pipe.set_ip_adapter_scale(0.7)
-        self.pipe.to("cuda")
-
-    def __call__(self,
-                 init_image,
-                 control_image,
-                 ip_adapter_image=None,
-                 prompt='3D image',
-                 negative_prompt='2D image',
-                 seed=42,
-                 strength=0.8,
-                 num_inference_steps=40,
-                 guidance_scale=7.5,
-                 controlnet_conditioning_scale=0.5,
-                 **kwargs
-                 ):
-        image = self.pipe(
-            prompt=prompt,
-            image=init_image,
-            control_image=control_image,
-            ip_adapter_image=ip_adapter_image,
-            negative_prompt=negative_prompt,
-            num_inference_steps=num_inference_steps,
-            guidance_scale=guidance_scale,
-            strength=strength,
-            controlnet_conditioning_scale=controlnet_conditioning_scale,
-            generator=torch.manual_seed(seed),  # the pipeline takes a generator, not a raw seed
-            **kwargs
-        ).images[0]
-        return image
diff --git a/hy3dgen/texgen/utils/counter_utils.py b/hy3dgen/texgen/utils/counter_utils.py
deleted file mode 100644
index e0374fc327ad2127ec84bb0c267c19a3b9c8d738..0000000000000000000000000000000000000000
--- a/hy3dgen/texgen/utils/counter_utils.py
+++ /dev/null
@@ -1,58 +0,0 @@
-# Open Source Model Licensed under the Apache License Version 2.0
-# and Other Licenses of the Third-Party Components therein:
-# The below Model in this distribution may have been modified by THL A29 Limited
-# ("Tencent Modifications").
-# All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
-
-# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
-# The below software and/or models in this distribution may have been
-# modified by THL A29 Limited ("Tencent Modifications").
-# All Tencent Modifications are Copyright (C) THL A29 Limited.
-
-# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
-# except for the third-party components listed below.
-# Hunyuan 3D does not impose any additional limitations beyond what is outlined
-# in the respective licenses of these third-party components.
-# Users must comply with all terms and conditions of original licenses of these third-party
-# components and must ensure that the usage of the third party components adheres to
-# all relevant laws and regulations.
-
-# For avoidance of doubts, Hunyuan 3D means the large language models and
-# their software and algorithms, including trained model weights, parameters (including
-# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
-# fine-tuning enabling code and other elements of the foregoing made publicly available
-# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
-
-
-class RunningStats():
-    def __init__(self) -> None:
-        self.count = 0
-        self.sum = 0
-        self.mean = 0
-        self.min = None
-        self.max = None
-
-    def add_value(self, value):
-        self.count += 1
-        self.sum += value
-        self.mean = self.sum / self.count
-
-        if self.min is None or value < self.min:
-            self.min = value
-
-        if self.max is None or value > self.max:
-            self.max = value
-
-    def get_count(self):
-        return self.count
-
-    def get_sum(self):
-        return self.sum
-
-    def get_mean(self):
-        return self.mean
-
-    def get_min(self):
-        return self.min
-
-    def get_max(self):
-        return self.max
diff --git a/hy3dgen/texgen/utils/dehighlight_utils.py b/hy3dgen/texgen/utils/dehighlight_utils.py
deleted file mode 100644
index 089076b08f712ec0db882835f422183fd7f94457..0000000000000000000000000000000000000000
--- a/hy3dgen/texgen/utils/dehighlight_utils.py
+++ /dev/null
@@ -1,84 +0,0 @@
-# Open Source Model Licensed under the Apache License Version 2.0
-# and Other Licenses of the Third-Party Components therein:
-# The below Model in this distribution may have been modified by THL A29 Limited
-# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
-
-# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
-# The below software and/or models in this distribution may have been
-# modified by THL A29 Limited ("Tencent Modifications").
-# All Tencent Modifications are Copyright (C) THL A29 Limited.
-
-# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
-# except for the third-party components listed below.
-# Hunyuan 3D does not impose any additional limitations beyond what is outlined
-# in the respective licenses of these third-party components.
-# Users must comply with all terms and conditions of original licenses of these third-party
-# components and must ensure that the usage of the third party components adheres to
-# all relevant laws and regulations.
-
-# For avoidance of doubts, Hunyuan 3D means the large language models and
-# their software and algorithms, including trained model weights, parameters (including
-# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
-# fine-tuning enabling code and other elements of the foregoing made publicly available
-# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
-
-import cv2
-import numpy as np
-import torch
-from PIL import Image
-from diffusers import StableDiffusionInstructPix2PixPipeline, EulerAncestralDiscreteScheduler
-
-
-class Light_Shadow_Remover():
-    def __init__(self, config):
-        self.device = config.device
-        self.cfg_image = 1.5
-        self.cfg_text = 1.0
-
-        pipeline = StableDiffusionInstructPix2PixPipeline.from_pretrained(
-            config.light_remover_ckpt_path,
-            torch_dtype=torch.float16,
-            safety_checker=None,
-        )
-        pipeline.scheduler = EulerAncestralDiscreteScheduler.from_config(pipeline.scheduler.config)
-        pipeline.set_progress_bar_config(disable=True)
-
-        # self.pipeline = pipeline.to(self.device, torch.float16)
-        self.pipeline = pipeline  # not moved to the device here, to avoid displaying the warning
-
-    @torch.no_grad()
-    def __call__(self, image):
-
-        image = image.resize((512, 512))
-
-        if image.mode == 'RGBA':
-            image_array = np.array(image)
-            alpha_channel = image_array[:, :, 3]
-            erosion_size = 3
-            kernel = np.ones((erosion_size, erosion_size), np.uint8)
-            alpha_channel = cv2.erode(alpha_channel, kernel, iterations=1)
-            image_array[alpha_channel == 0, :3] = 255
-            image_array[:, :, 3] = alpha_channel
-            image = Image.fromarray(image_array)
-
-            # alpha and rgb_target are computed but unused in this variant
-            image_tensor = torch.tensor(np.array(image) / 255.0).to(self.device)
-            alpha = image_tensor[:, :, 3:]
-            rgb_target = image_tensor[:, :, :3]
-        else:
-            image_tensor = torch.tensor(np.array(image) / 255.0).to(self.device)
-            alpha = torch.ones_like(image_tensor)[:, :, :1]
-            rgb_target = image_tensor[:, :, :3]
-
-        image = image.convert('RGB')
-
-        image = self.pipeline(
-            prompt="",
-            image=image,
-            generator=torch.manual_seed(42),
-            height=512,
-            width=512,
-            num_inference_steps=50,
-            image_guidance_scale=self.cfg_image,
-            guidance_scale=self.cfg_text,
-        ).images[0]
-
-        return image
diff --git a/hy3dgen/texgen/utils/multiview_utils.py b/hy3dgen/texgen/utils/multiview_utils.py
deleted file mode 100644
index ba5708b617e0d58d6d37025fcb94a75324b9e5a9..0000000000000000000000000000000000000000
--- a/hy3dgen/texgen/utils/multiview_utils.py
+++ /dev/null
@@ -1,86 +0,0 @@
-# Open Source Model Licensed under the Apache License Version 2.0
-# and Other Licenses of the Third-Party Components therein:
-# The below Model in this distribution may have been modified by THL A29 Limited
-# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
-
-# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
-# The below software and/or models in this distribution may have been
-# modified by THL A29 Limited ("Tencent Modifications").
-# All Tencent Modifications are Copyright (C) THL A29 Limited.
-
-# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
-# except for the third-party components listed below.
-# Hunyuan 3D does not impose any additional limitations beyond what is outlined
-# in the respective licenses of these third-party components.
-# Users must comply with all terms and conditions of original licenses of these third-party
-# components and must ensure that the usage of the third party components adheres to
-# all relevant laws and regulations.
-
-# For avoidance of doubts, Hunyuan 3D means the large language models and
-# their software and algorithms, including trained model weights, parameters (including
-# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
-# fine-tuning enabling code and other elements of the foregoing made publicly available
-# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
-
-import os
-import random
-
-import numpy as np
-import torch
-from diffusers import DiffusionPipeline
-from diffusers import EulerAncestralDiscreteScheduler
-
-
-class Multiview_Diffusion_Net():
-    def __init__(self, config) -> None:
-        self.device = config.device
-        self.view_size = 512
-        multiview_ckpt_path = config.multiview_ckpt_path
-
-        current_file_path = os.path.abspath(__file__)
-        custom_pipeline_path = os.path.join(os.path.dirname(current_file_path), '..', 'hunyuanpaint')
-
-        pipeline = DiffusionPipeline.from_pretrained(
-            multiview_ckpt_path,
-            custom_pipeline=custom_pipeline_path, torch_dtype=torch.float16)
-
-        pipeline.scheduler = EulerAncestralDiscreteScheduler.from_config(pipeline.scheduler.config,
-                                                                         timestep_spacing='trailing')
-
-        pipeline.set_progress_bar_config(disable=True)
-        self.pipeline = pipeline  # .to(self.device) omitted; purely cosmetic, avoids displaying the warning
-
-    def seed_everything(self, seed):
-        random.seed(seed)
-        np.random.seed(seed)
-        torch.manual_seed(seed)
-        os.environ["PL_GLOBAL_SEED"] = str(seed)
-
-    def __call__(self, input_image, control_images, camera_info):
-
-        self.seed_everything(0)
-
-        input_image = input_image.resize((self.view_size, self.view_size))
-        for i in range(len(control_images)):
-            control_images[i] = control_images[i].resize((self.view_size, self.view_size))
-            if control_images[i].mode == 'L':
-                control_images[i] = control_images[i].point(lambda x: 255 if x > 1 else 0, mode='1')
-
-        kwargs = dict(generator=torch.Generator(device=self.pipeline.device).manual_seed(0))
-
-        num_view = len(control_images) // 2
-        normal_image = [[control_images[i] for i in range(num_view)]]
-        position_image = [[control_images[i + num_view] for i in range(num_view)]]
-
-        camera_info_gen = [camera_info]
-        camera_info_ref = [[0]]
-        kwargs['width'] = self.view_size
-        kwargs['height'] = self.view_size
-        kwargs['num_in_batch'] = num_view
-        kwargs['camera_info_gen'] = camera_info_gen
-        kwargs['camera_info_ref'] = camera_info_ref
-        kwargs["normal_imgs"] = normal_image
-        kwargs["position_imgs"] = position_image
-
-        mvd_image = self.pipeline(input_image, num_inference_steps=30, **kwargs).images
-        return mvd_image
diff --git a/hy3dgen/texgen/utils/simplify_mesh_utils.py b/hy3dgen/texgen/utils/simplify_mesh_utils.py
deleted file mode 100644
index 915284d337e648c57fae886dee3333c0203856b6..0000000000000000000000000000000000000000
--- a/hy3dgen/texgen/utils/simplify_mesh_utils.py
+++ /dev/null
@@ -1,46 +0,0 @@
-# Open Source Model Licensed under the Apache License Version 2.0
-# and Other Licenses of the Third-Party Components therein:
-# The below Model in this distribution may have been modified by THL A29 Limited
-# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
-
-# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
-# The below software and/or models in this distribution may have been
-# modified by THL A29 Limited ("Tencent Modifications").
-# All Tencent Modifications are Copyright (C) THL A29 Limited.
-
-# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
-# except for the third-party components listed below.
-# Hunyuan 3D does not impose any additional limitations beyond what is outlined
-# in the respective licenses of these third-party components.
-# Users must comply with all terms and conditions of original licenses of these third-party
-# components and must ensure that the usage of the third party components adheres to
-# all relevant laws and regulations.
-
-# For avoidance of doubts, Hunyuan 3D means the large language models and
-# their software and algorithms, including trained model weights, parameters (including
-# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
-# fine-tuning enabling code and other elements of the foregoing made publicly available
-# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
-
-import trimesh
-
-
-def remesh_mesh(mesh_path, remesh_path, method='trimesh'):
-    if method == 'trimesh':
-        mesh_simplify_trimesh(mesh_path, remesh_path)
-    else:
-        raise NotImplementedError(f'Method {method} has not been implemented.')
-
-
-def mesh_simplify_trimesh(inputpath, outputpath):
-    import pymeshlab
-    ms = pymeshlab.MeshSet()
-    ms.load_new_mesh(inputpath, load_in_a_single_layer=True)
-    ms.save_current_mesh(outputpath.replace('.glb', '.obj'), save_textures=False)
-
-    current = trimesh.load(outputpath.replace('.glb', '.obj'), force='mesh')
-    face_num = current.faces.shape[0]
-
-    if face_num > 100000:
-        current = current.simplify_quadric_decimation(40000)
-    current.export(outputpath)
diff --git a/hy3dgen/texgen/utils/uv_warp_utils.py b/hy3dgen/texgen/utils/uv_warp_utils.py
deleted file mode 100644
index b4f4082274b900aebcdbfcf29a7d6a9532dfa8cb..0000000000000000000000000000000000000000
--- a/hy3dgen/texgen/utils/uv_warp_utils.py
+++ /dev/null
@@ -1,42 +0,0 @@
-# Open Source Model Licensed under the Apache License Version 2.0
-# and Other Licenses of the Third-Party Components therein:
-# The below Model in this distribution may have been modified by THL A29 Limited
-# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
-
-# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
-# The below software and/or models in this distribution may have been
-# modified by THL A29 Limited ("Tencent Modifications").
-# All Tencent Modifications are Copyright (C) THL A29 Limited.
-
-# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
-# except for the third-party components listed below.
-# Hunyuan 3D does not impose any additional limitations beyond what is outlined
-# in the respective licenses of these third-party components.
-# Users must comply with all terms and conditions of original licenses of these third-party
-# components and must ensure that the usage of the third party components adheres to
-# all relevant laws and regulations.
-
-# For avoidance of doubts, Hunyuan 3D means the large language models and
-# their software and algorithms, including trained model weights, parameters (including
-# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
-# fine-tuning enabling code and other elements of the foregoing made publicly available
-# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
-
-import trimesh
-import xatlas
-
-
-def mesh_uv_wrap(mesh):
-    if isinstance(mesh, trimesh.Scene):
-        mesh = mesh.dump(concatenate=True)
-
-    # if len(mesh.faces) > 50000:
-    #     raise ValueError("The mesh has more than 50,000 faces, which is not supported.")
-
-    vmapping, indices, uvs = xatlas.parametrize(mesh.vertices, mesh.faces)
-
-    mesh.vertices = mesh.vertices[vmapping]
-    mesh.faces = indices
-    mesh.visual.uv = uvs
-
-    return mesh
diff --git a/hy3dgen/text2image.py b/hy3dgen/text2image.py
deleted file mode 100644
index be920672cb72238cbe49cba930e3e02a7c287b82..0000000000000000000000000000000000000000
--- a/hy3dgen/text2image.py
+++ /dev/null
@@ -1,93 +0,0 @@
-# Open Source Model Licensed under the Apache License Version 2.0
-# and Other Licenses of the Third-Party Components therein:
-# The below Model in this distribution may have been modified by THL A29 Limited
-# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
-
-# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
-# The below software and/or models in this distribution may have been
-# modified by THL A29 Limited ("Tencent Modifications").
-# All Tencent Modifications are Copyright (C) THL A29 Limited.
-
-# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
-# except for the third-party components listed below.
-# Hunyuan 3D does not impose any additional limitations beyond what is outlined
-# in the respective licenses of these third-party components.
-# Users must comply with all terms and conditions of original licenses of these third-party
-# components and must ensure that the usage of the third party components adheres to
-# all relevant laws and regulations.
-
-# For avoidance of doubts, Hunyuan 3D means the large language models and
-# their software and algorithms, including trained model weights, parameters (including
-# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
-# fine-tuning enabling code and other elements of the foregoing made publicly available
-# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
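# The deleted module below provided text-to-image prompting: HunyuanDiTPipeline wraps
# diffusers' AutoPipelineForText2Image with perturbed-attention guidance (PAG),
# prefixes the user prompt with a fixed Chinese style string ("white background,
# 3D style, best quality"), and samples 1024x1024 images against a standard
# negative prompt.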
-
-
-import os
-import random
-
-import numpy as np
-import torch
-from diffusers import AutoPipelineForText2Image
-
-
-def seed_everything(seed):
-    random.seed(seed)
-    np.random.seed(seed)
-    torch.manual_seed(seed)
-    os.environ["PL_GLOBAL_SEED"] = str(seed)
-
-
-class HunyuanDiTPipeline:
-    def __init__(
-        self,
-        model_path="Tencent-Hunyuan/HunyuanDiT-v1.1-Diffusers-Distilled",
-        device='cpu'
-    ):
-        torch.set_default_device('cpu')
-        self.device = device
-        self.pipe = AutoPipelineForText2Image.from_pretrained(
-            model_path,
-            torch_dtype=torch.float16,
-            enable_pag=True,
-            pag_applied_layers=["blocks.(16|17|18|19)"]
-        )  # .to(device) omitted to avoid displaying the warning
-        self.pos_txt = ",白色背景,3D风格,最佳质量"  # ", white background, 3D style, best quality"
-        # negative prompt: "text, close-up, cropped, out of frame, worst quality, low quality,
-        # JPEG artifacts, PGLY, duplicate, morbid, mutilated, extra fingers, mutated hands,
-        # poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated,
-        # bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions,
-        # malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers,
-        # too many fingers, long neck"
-        self.neg_txt = "文本,特写,裁剪,出框,最差质量,低质量,JPEG伪影,PGLY,重复,病态," \
-                       "残缺,多余的手指,变异的手,画得不好的手,画得不好的脸,变异,畸形,模糊,脱水,糟糕的解剖学," \
-                       "糟糕的比例,多余的肢体,克隆的脸,毁容,恶心的比例,畸形的肢体,缺失的手臂,缺失的腿," \
-                       "额外的手臂,额外的腿,融合的手指,手指太多,长脖子"
-
-    def compile(self):
-        # accelerate the Hunyuan-DiT transformer; the first inference will take a long time
-        torch.set_float32_matmul_precision('high')
-        self.pipe.transformer = torch.compile(self.pipe.transformer, fullgraph=True)
-        # self.pipe.vae.decode = torch.compile(self.pipe.vae.decode, fullgraph=True)
-        generator = torch.Generator(device=self.pipe.device)  # infer once for hot-start
-        out_img = self.pipe(
-            prompt='美少女战士',  # "Sailor Moon", warm-up prompt
-            negative_prompt='模糊',  # "blurry"
-            num_inference_steps=25,
-            pag_scale=1.3,
-            width=1024,
-            height=1024,
-            generator=generator,
-            return_dict=False
-        )[0][0]
-
-    @torch.no_grad()
-    def __call__(self, prompt, seed=0):
-        seed_everything(seed)
-        generator = torch.Generator(device="cuda")  # hard-coded "cuda" rather than self.pipe.device
-        generator = generator.manual_seed(int(seed))
-        out_img = self.pipe(
-            prompt=self.pos_txt + prompt,
-            negative_prompt=self.neg_txt,
-            num_inference_steps=20,
-            pag_scale=1.3,
-            width=1024,
-            height=1024,
-            generator=generator,
-            return_dict=False
-        )[0][0]
-        return out_img
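For reference, a minimal sketch of how the deleted HunyuanDiTPipeline was driven; the prompt and output path are illustrative, and a CUDA device is assumed because __call__ seeds a "cuda" generator:

from hy3dgen.text2image import HunyuanDiTPipeline

t2i = HunyuanDiTPipeline(device='cuda')  # model_path defaults to the distilled HunyuanDiT checkpoint
t2i.compile()                            # optional torch.compile warm-up; the first call is slow
image = t2i('a plush owl toy', seed=0)   # illustrative prompt
image.save('owl.png')                    # illustrative output path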