Spaces:

Roblox
/

cube3d-interactive

Running on L40S

App Files Files Community

akashkgarg

captaincobb committed 5 days ago

Commit

cb88aea

verified ·

1 Parent(s): 0964e01

cubev0.5 (#5)

Browse files

- updates for cube v0.5 (ce16420814099de27a21e91450b755f4d804d5b8)
- bbox sliders always present (3ee2e412ae88cade005c7324b456a2b0ebd331ac)
- adding model v0.5 yaml (0b489dd3391bd560f0646aa0ab9fb14c5d3f9f09)
- removing shared url on launch (978db53a12658ece2e2b58d96944a3cf394f7a5f)

Co-authored-by: Akash Garg <captaincobb@users.noreply.huggingface.co>

Files changed (15) hide show

Dockerfile +1 -1
app.py +68 -10
cube/README.md +14 -9
cube/cube3d/colab_cube3d.ipynb +1 -1
cube/cube3d/configs/open_model_v0.5.yaml +33 -0
cube/cube3d/generate.py +18 -2
cube/cube3d/inference/engine.py +86 -17
cube/cube3d/inference/utils.py +27 -4
cube/cube3d/mesh_utils/postprocessing.py +3 -1
cube/cube3d/model/gpt/dual_stream_roformer.py +5 -0
cube/cube3d/model/transformers/cache.py +30 -3
cube/cube3d/model/transformers/dual_stream_attention.py +1 -2
cube/cube3d/model/transformers/roformer.py +1 -2
cube/cube3d/vq_vae_encode_decode.py +2 -2
requirements.txt +2 -1

Dockerfile CHANGED Viewed

@@ -30,6 +30,6 @@ RUN git clone https://github.com/Roblox/cube.git
 WORKDIR /home/user/app/cube
 RUN pip install .[meshlab]
-RUN huggingface-cli download Roblox/cube3d-v0.1 --local-dir ./model_weights
 WORKDIR /home/user/app

 WORKDIR /home/user/app/cube
 RUN pip install .[meshlab]
+RUN huggingface-cli download Roblox/cube3d-v0.5 --local-dir ./model_weights
 WORKDIR /home/user/app

app.py CHANGED Viewed

@@ -6,6 +6,7 @@ import torch
 import trimesh
 import sys
 from pathlib import Path
 pathdir = Path(__file__).parent / 'cube'
 sys.path.append(pathdir.as_posix())
@@ -16,11 +17,19 @@ sys.path.append(pathdir.as_posix())
 # print(pathdir.as_posix())
 from cube3d.inference.engine import EngineFast, Engine
 from pathlib import Path
 import uuid
 import shutil
 from huggingface_hub import snapshot_download
 GLOBAL_STATE = {}
@@ -41,8 +50,8 @@ def gen_save_folder(max_size=200):
     return new_folder
 @spaces.GPU
-def handle_text_prompt(input_prompt, variance = 0):
-    print(f"prompt: {input_prompt}, variance: {variance}")
     if "engine_fast" not in GLOBAL_STATE:
         config_path = GLOBAL_STATE["config_path"]
@@ -56,13 +65,38 @@ def handle_text_prompt(input_prompt, variance = 0):
         )
         GLOBAL_STATE["engine_fast"] = engine_fast
-    top_p = None if variance == 0 else (100 - variance) / 100.0
-    mesh_v_f = GLOBAL_STATE["engine_fast"].t2s([input_prompt], use_kv_cache=True, resolution_base=8.0, top_p=top_p)
     # save output
     vertices, faces = mesh_v_f[0][0], mesh_v_f[0][1]
     save_folder = gen_save_folder()
     output_path = os.path.join(save_folder, "output.glb")
-    trimesh.Trimesh(vertices=vertices, faces=faces).export(output_path)
     return output_path
 def build_interface():
@@ -85,7 +119,28 @@ def build_interface():
                         label="Prompt",
                         lines=2,
                     )
-                    variance = gr.Slider(minimum=0, maximum=99, step=1, value=0, label="Variance")
                 with gr.Row():
                     submit_button = gr.Button("Submit", variant="primary")
             with gr.Column(scale=3):
@@ -97,7 +152,11 @@ def build_interface():
             handle_text_prompt,
             inputs=[
                 input_text_box,
-                variance
             ],
             outputs=[
                 model3d
@@ -105,7 +164,6 @@ def build_interface():
         )
     return interface
 def generate(args):
     GLOBAL_STATE["config_path"] = args.config_path
     GLOBAL_STATE["SAVE_DIR"] = args.save_dir
@@ -122,7 +180,7 @@ if __name__=="__main__":
         "--config_path",
         type=str,
         help="Path to the config file",
-        default="cube/cube3d/configs/open_model.yaml",
     )
     parser.add_argument(
         "--gpt_ckpt_path",
@@ -144,7 +202,7 @@ if __name__=="__main__":
     args = parser.parse_args()
     snapshot_download(
-        repo_id="Roblox/cube3d-v0.1",
         local_dir="./model_weights"
     )
     generate(args)

 import trimesh
 import sys
 from pathlib import Path
+import numpy as np
 pathdir = Path(__file__).parent / 'cube'
 sys.path.append(pathdir.as_posix())
 # print(pathdir.as_posix())
 from cube3d.inference.engine import EngineFast, Engine
+from cube3d.inference.utils import normalize_bbox
 from pathlib import Path
 import uuid
 import shutil
 from huggingface_hub import snapshot_download
+from cube3d.mesh_utils.postprocessing import (
+    PYMESHLAB_AVAILABLE,
+    create_pymeshset,
+    postprocess_mesh,
+    save_mesh,
+)
 GLOBAL_STATE = {}
     return new_folder
 @spaces.GPU
+def handle_text_prompt(input_prompt, use_bbox = True, bbox_x=1.0, bbox_y=1.0, bbox_z=1.0, hi_res=False):
+    print(f"prompt: {input_prompt}, use_bbox: {use_bbox}, bbox_x: {bbox_x}, bbox_y: {bbox_y}, bbox_z: {bbox_z}, hi_res: {hi_res}")
     if "engine_fast" not in GLOBAL_STATE:
         config_path = GLOBAL_STATE["config_path"]
         )
         GLOBAL_STATE["engine_fast"] = engine_fast
+    # Determine bounding box size based on option
+    bbox_size = None
+    if use_bbox:
+        bbox_size = [bbox_x, bbox_y, bbox_z]
+    # For "No Bounding Box", bbox_size remains None
+    normalized_bbox = normalize_bbox(bbox_size) if bbox_size is not None else None
+    resolution_base = 9.0 if hi_res else 8.0
+    mesh_v_f = GLOBAL_STATE["engine_fast"].t2s([input_prompt], use_kv_cache=True, resolution_base=resolution_base, bounding_box_xyz=normalized_bbox)
     # save output
     vertices, faces = mesh_v_f[0][0], mesh_v_f[0][1]
+    ms = create_pymeshset(vertices, faces)
+    target_face_num = max(10000, int(faces.shape[0] * 0.1))
+    print(f"Postprocessing mesh to {target_face_num} faces")
+    postprocess_mesh(ms, target_face_num)
+    mesh = ms.current_mesh()
+    vertices = mesh.vertex_matrix()
+    faces = mesh.face_matrix()
+    min_extents = np.min(mesh.vertex_matrix(), axis = 0)
+    max_extents = np.max(mesh.vertex_matrix(), axis = 0)
+    mesh = trimesh.Trimesh(vertices=vertices, faces=faces)
+    scene = trimesh.scene.Scene()
+    scene.add_geometry(mesh)
     save_folder = gen_save_folder()
     output_path = os.path.join(save_folder, "output.glb")
+    # trimesh.Trimesh(vertices=vertices, faces=faces).export(output_path)
+    scene.export(output_path)
     return output_path
 def build_interface():
                         label="Prompt",
                         lines=2,
                     )
+                    use_bbox = gr.Checkbox(label="Use Bbox", value=False)
+                    with gr.Group() as bbox_group:
+                        bbox_x = gr.Slider(minimum=0.1, maximum=2.0, step=0.1, value=1.0, label="Length", interactive=False)
+                        bbox_y = gr.Slider(minimum=0.1, maximum=2.0, step=0.1, value=1.0, label="Height", interactive=False)
+                        bbox_z = gr.Slider(minimum=0.1, maximum=2.0, step=0.1, value=1.0, label="Depth", interactive=False)
+                    # Enable/disable bbox sliders based on use_bbox checkbox
+                    def toggle_bbox_interactivity(use_bbox):
+                        """Return one slider update per bbox axis; each is interactive only when `use_bbox` is set."""
+                        # One fresh update object for each of the three outputs wired to this callback.
+                        return tuple(gr.Slider(interactive=use_bbox) for _ in range(3))
+                    use_bbox.change(
+                        toggle_bbox_interactivity,
+                        inputs=[use_bbox],
+                        outputs=[bbox_x, bbox_y, bbox_z]
+                    )
+                    hi_res = gr.Checkbox(label="Hi-Res", value=False)
                 with gr.Row():
                     submit_button = gr.Button("Submit", variant="primary")
             with gr.Column(scale=3):
             handle_text_prompt,
             inputs=[
                 input_text_box,
+                use_bbox,
+                bbox_x,
+                bbox_y,
+                bbox_z,
+                hi_res
             ],
             outputs=[
                 model3d
         )
     return interface
 def generate(args):
     GLOBAL_STATE["config_path"] = args.config_path
     GLOBAL_STATE["SAVE_DIR"] = args.save_dir
         "--config_path",
         type=str,
         help="Path to the config file",
+        default="cube/cube3d/configs/open_model_v0.5.yaml",
     )
     parser.add_argument(
         "--gpt_ckpt_path",
     args = parser.parse_args()
     snapshot_download(
+        repo_id="Roblox/cube3d-v0.5",
         local_dir="./model_weights"
     )
     generate(args)

cube/README.md CHANGED Viewed

@@ -6,9 +6,10 @@
 <div align="center">
   <a href=https://corp.roblox.com/newsroom/2025/03/introducing-roblox-cube target="_blank"><img src=https://img.shields.io/badge/Roblox-Blog-000000.svg?logo=Roblox height=22px></a>
-  <a href=https://huggingface.co/Roblox/cube3d-0.1 target="_blank"><img src=https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-Models-d96902.svg height=22px></a>
   <a href=https://arxiv.org/abs/2503.15475 target="_blank"><img src=https://img.shields.io/badge/ArXiv-Report-b5212f.svg?logo=arxiv height=22px></a>
-  <a href=https://colab.research.google.com/drive/1ZvTj49pjDCD_crX5WPZNTAoTTzL6-E5t target="_blank"><img src=https://img.shields.io/badge/Google-Open_In_Colab-blue.svg?logo=googlecolab height=22px></a>
 </div>
@@ -27,7 +28,10 @@ towards this vision, we hope to engage others in the research community to addre
 Cube 3D is our first step towards 3D intelligence, which involves a shape tokenizer and a text-to-shape generation model. We are unlocking the power of generating 3D assets and enhancing creativity for all artists. Our latest version of Cube 3D is now accessible to individuals, creators, researchers and businesses of all sizes so that they can experiment, innovate and scale their ideas responsibly. This release includes model weights and starting code for using our text-to-shape model to create 3D assets.
-### Try it out on [Google Colab](https://colab.research.google.com/drive/1ZvTj49pjDCD_crX5WPZNTAoTTzL6-E5t)
 ### Install Requirements
@@ -41,6 +45,8 @@ pip install -e .[meshlab]
 > **CUDA**: If you are using a Windows machine, you may need to install the [CUDA](https://developer.nvidia.com/cuda-downloads) toolkit as well as `torch` with cuda support via `pip install torch --index-url https://download.pytorch.org/whl/cu124 --force-reinstall`
 > **Note**: `[meshlab]` is an optional dependency and can be removed by simply running `pip install -e .` for better compatibility but mesh simplification will be disabled.
 ### Download Models from Huggingface 🤗
@@ -75,7 +81,7 @@ and save it as `turntable.gif` in the specified `output` directory.
 We provide several example output objects and their corresponding text prompts in the `examples` folder.
-> **Note**: You must have Blender installed and available in your system's PATH to render the turntable GIF. You can download it from [Blender's official website](https://www.blender.org/). Ensure that the Blender executable is accessible from the command line.
 > **Note**: If shape decoding is slow, you can try to specify a lower resolution using the `--resolution-base` flag. A lower resolution will create a coarser and lower quality output mesh but faster decoding. Values between 4.0 and 9.0 are recommended.
@@ -118,16 +124,15 @@ engine_fast = EngineFast( # only supported on CUDA devices, replace with Engine
     config_path,
     gpt_ckpt_path,
     shape_ckpt_path,
-    device=torch.device("cuda"),
 )
 # inference
 input_prompt = "A pair of noise-canceling headphones"
 # NOTE: Reduce `resolution_base` for faster inference and lower VRAM usage
-# The `top_k` parameter controls randomness between inferences:
-#   - A value of 1 yields deterministic results.
-#   - Higher values introduce more randomness.
-mesh_v_f = engine_fast.t2s([input_prompt], use_kv_cache=True, resolution_base=8.0, top_k=5)
 # save output
 vertices, faces = mesh_v_f[0][0], mesh_v_f[0][1]

 <div align="center">
   <a href=https://corp.roblox.com/newsroom/2025/03/introducing-roblox-cube target="_blank"><img src=https://img.shields.io/badge/Roblox-Blog-000000.svg?logo=Roblox height=22px></a>
   <a href=https://arxiv.org/abs/2503.15475 target="_blank"><img src=https://img.shields.io/badge/ArXiv-Report-b5212f.svg?logo=arxiv height=22px></a>
+  <a href=https://huggingface.co/Roblox/cube3d-v0.5 target="_blank"><img src=https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-Models-d96902.svg height=22px></a>
+  <a href=https://huggingface.co/spaces/Roblox/cube3d-interactive target="_blank"><img src=https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-Demo-blue.svg height=22px></a>
+  <a href=https://colab.research.google.com/drive/1ZvTj49pjDCD_crX5WPZNTAoTTzL6-E5t target="_blank"><img src=https://img.shields.io/badge/Colab-Demo-blue.svg?logo=googlecolab height=22px></a>
 </div>
 Cube 3D is our first step towards 3D intelligence, which involves a shape tokenizer and a text-to-shape generation model. We are unlocking the power of generating 3D assets and enhancing creativity for all artists. Our latest version of Cube 3D is now accessible to individuals, creators, researchers and businesses of all sizes so that they can experiment, innovate and scale their ideas responsibly. This release includes model weights and starting code for using our text-to-shape model to create 3D assets.
+### Try it out on
+- [Google Colab](https://colab.research.google.com/drive/1ZvTj49pjDCD_crX5WPZNTAoTTzL6-E5t)
+- [Hugging Face Interactive Demo](https://huggingface.co/spaces/Roblox/cube3d-interactive)
 ### Install Requirements
 > **CUDA**: If you are using a Windows machine, you may need to install the [CUDA](https://developer.nvidia.com/cuda-downloads) toolkit as well as `torch` with cuda support via `pip install torch --index-url https://download.pytorch.org/whl/cu124 --force-reinstall`
+> **macOS**: Systems with Apple Silicon or AMD GPUs can leverage the Metal Performance Shaders (MPS) backend for PyTorch.
 > **Note**: `[meshlab]` is an optional dependency and can be removed by simply running `pip install -e .` for better compatibility but mesh simplification will be disabled.
 ### Download Models from Huggingface 🤗
 We provide several example output objects and their corresponding text prompts in the `examples` folder.
+> **Note**: You must have Blender (version >= 4.3) installed and available in your system's PATH to render the turntable GIF. You can download it from [Blender's official website](https://www.blender.org/). Ensure that the Blender executable is accessible from the command line.
 > **Note**: If shape decoding is slow, you can try to specify a lower resolution using the `--resolution-base` flag. A lower resolution will create a coarser and lower quality output mesh but faster decoding. Values between 4.0 and 9.0 are recommended.
     config_path,
     gpt_ckpt_path,
     shape_ckpt_path,
+    device=torch.device("cuda"), # Replace with "mps" on Metal-compatible devices
 )
 # inference
 input_prompt = "A pair of noise-canceling headphones"
 # NOTE: Reduce `resolution_base` for faster inference and lower VRAM usage
+# The `top_p` parameter controls randomness between inferences:
+#   Float < 1: Keep smallest set of tokens with cumulative probability ≥ top_p. Default None: deterministic generation.
+mesh_v_f = engine_fast.t2s([input_prompt], use_kv_cache=True, resolution_base=8.0, top_p=0.9)
 # save output
 vertices, faces = mesh_v_f[0][0], mesh_v_f[0][1]

cube/cube3d/colab_cube3d.ipynb CHANGED Viewed

@@ -4345,7 +4345,7 @@
       "cell_type": "code",
       "source": [
         "input_prompt = \"vintage couch\"\n",
-        "# Use a lower resolution_base to accomodate limited GPU VRAM on Colab notebooks\n",
         "mesh_v_f = engine.t2s([input_prompt], use_kv_cache=True, resolution_base=5.0)"
       ],
       "metadata": {

       "cell_type": "code",
       "source": [
         "input_prompt = \"vintage couch\"\n",
+        "# Use a lower resolution_base to accommodate limited GPU VRAM on Colab notebooks\n",
         "mesh_v_f = engine.t2s([input_prompt], use_kv_cache=True, resolution_base=5.0)"
       ],
       "metadata": {

cube/cube3d/configs/open_model_v0.5.yaml ADDED Viewed

	@@ -0,0 +1,33 @@

+gpt_model:
+  n_layer: 23
+  n_single_layer: 1
+  rope_theta: 10000
+  n_head: 12
+  n_embd: 1536
+  bias: true
+  eps: 1.e-6
+  shape_model_vocab_size: 16384
+  text_model_embed_dim: 768
+  use_pooled_text_embed: False
+  shape_model_embed_dim: 32
+  encoder_with_cls_token: true
+  use_bbox: true
+shape_model:
+  encoder_with_cls_token: true
+  num_encoder_latents: 1024
+  num_decoder_latents: 0
+  embed_dim: 32
+  width: 768
+  num_heads: 12
+  out_dim: 1
+  eps: 1.e-6
+  num_freqs: 128
+  point_feats: 3
+  embed_point_feats: false
+  num_encoder_layers: 13
+  encoder_cross_attention_levels: [0, 2, 4, 8]
+  num_decoder_layers: 24
+  num_codes: 16384
+text_model_pretrained_model_name_or_path: "openai/clip-vit-large-patch14"

cube/cube3d/generate.py CHANGED Viewed

@@ -5,6 +5,7 @@ import torch
 import trimesh
 from cube3d.inference.engine import Engine, EngineFast
 from cube3d.mesh_utils.postprocessing import (
     PYMESHLAB_AVAILABLE,
     create_pymeshset,
@@ -13,6 +14,7 @@ from cube3d.mesh_utils.postprocessing import (
 )
 from cube3d.renderer import renderer
 def generate_mesh(
     engine,
     prompt,
@@ -21,12 +23,14 @@ def generate_mesh(
     resolution_base=8.0,
     disable_postprocess=False,
     top_p=None,
 ):
     mesh_v_f = engine.t2s(
         [prompt],
         use_kv_cache=True,
         resolution_base=resolution_base,
         top_p=top_p,
     )
     vertices, faces = mesh_v_f[0][0], mesh_v_f[0][1]
     obj_path = os.path.join(output_dir, f"{output_name}.obj")
@@ -92,6 +96,14 @@ if __name__ == "__main__":
         default=None,
         help="Float < 1: Keep smallest set of tokens with cumulative probability ≥ top_p. Default None: deterministic generation.",
     )
     parser.add_argument(
         "--render-gif",
         help="Render a turntable gif of the mesh",
@@ -112,7 +124,7 @@ if __name__ == "__main__":
     )
     args = parser.parse_args()
     os.makedirs(args.output_dir, exist_ok=True)
-    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
     print(f"Using device: {device}")
     # Initialize engine based on fast_inference flag
     if args.fast_inference:
@@ -127,7 +139,10 @@ if __name__ == "__main__":
         engine = Engine(
             args.config_path, args.gpt_ckpt_path, args.shape_ckpt_path, device=device
         )
     # Generate meshes based on input source
     obj_path = generate_mesh(
         engine,
@@ -137,6 +152,7 @@ if __name__ == "__main__":
         args.resolution_base,
         args.disable_postprocessing,
         args.top_p,
     )
     if args.render_gif:
         gif_path = renderer.render_turntable(obj_path, args.output_dir)

 import trimesh
 from cube3d.inference.engine import Engine, EngineFast
+from cube3d.inference.utils import normalize_bbox, select_device
 from cube3d.mesh_utils.postprocessing import (
     PYMESHLAB_AVAILABLE,
     create_pymeshset,
 )
 from cube3d.renderer import renderer
 def generate_mesh(
     engine,
     prompt,
     resolution_base=8.0,
     disable_postprocess=False,
     top_p=None,
+    bounding_box_xyz=None,
 ):
     mesh_v_f = engine.t2s(
         [prompt],
         use_kv_cache=True,
         resolution_base=resolution_base,
         top_p=top_p,
+        bounding_box_xyz=bounding_box_xyz,
     )
     vertices, faces = mesh_v_f[0][0], mesh_v_f[0][1]
     obj_path = os.path.join(output_dir, f"{output_name}.obj")
         default=None,
         help="Float < 1: Keep smallest set of tokens with cumulative probability ≥ top_p. Default None: deterministic generation.",
     )
+    parser.add_argument(
+        "--bounding_box_xyz",
+        nargs=3,
+        type=float,
+        help="Three float values for x, y, z bounding box",
+        default=None,
+        required=False,
+    )
     parser.add_argument(
         "--render-gif",
         help="Render a turntable gif of the mesh",
     )
     args = parser.parse_args()
     os.makedirs(args.output_dir, exist_ok=True)
+    device = select_device()
     print(f"Using device: {device}")
     # Initialize engine based on fast_inference flag
     if args.fast_inference:
         engine = Engine(
             args.config_path, args.gpt_ckpt_path, args.shape_ckpt_path, device=device
         )
+    if args.bounding_box_xyz is not None:
+        args.bounding_box_xyz = normalize_bbox(tuple(args.bounding_box_xyz))
     # Generate meshes based on input source
     obj_path = generate_mesh(
         engine,
         args.resolution_base,
         args.disable_postprocessing,
         args.top_p,
+        args.bounding_box_xyz,
     )
     if args.render_gif:
         gif_path = renderer.render_turntable(obj_path, args.output_dir)

cube/cube3d/inference/engine.py CHANGED Viewed

@@ -1,3 +1,5 @@
 import torch
 from tqdm import tqdm
 from transformers import CLIPTextModelWithProjection, CLIPTokenizerFast
@@ -77,12 +79,54 @@ class Engine:
         self.max_id = self.shape_model.cfg.num_codes
     @torch.inference_mode()
-    def prepare_inputs(self, prompts: list[str], guidance_scale: float):
         """
         Prepares the input embeddings for the model based on the provided prompts and guidance scale.
         Args:
             prompts (list[str]): A list of prompt strings to be encoded.
             guidance_scale (float): A scaling factor for guidance. If greater than 0.0, additional processing is applied.
         Returns:
             tuple: A tuple containing:
                 - embed (torch.Tensor): The encoded input embeddings.
@@ -94,11 +138,19 @@ class Engine:
         with torch.autocast(self.device.type, dtype=torch.bfloat16):
             embed = self.encode_input(prompt_embeds, self.gpt_model.shape_bos_id)
-        cond = prompt_embeds
         if guidance_scale > 0.0:
             embed = torch.cat([embed, embed], dim=0)
             uncond_embeds = self.run_clip([""] * len(prompts))
-            cond = torch.cat([prompt_embeds, uncond_embeds], dim=0)
         return embed, cond
@@ -161,6 +213,7 @@ class Engine:
         use_kv_cache: bool,
         guidance_scale: float = 3.0,
         top_p: float = None,
     ):
         """
         Generates text using a GPT model based on the provided prompts.
@@ -169,11 +222,14 @@ class Engine:
             use_kv_cache (bool): Whether to use key-value caching for faster generation.
             guidance_scale (float, optional): The scale for guidance during generation. Default is 3.0.
             top_p (float, optional): The cumulative probability threshold for nucleus sampling.
-            If None, argmax selection is performed (deterministic generation). Otherwise, smallest set of tokens with cumulative probability ≥ top_p are kept (stochastic generation).
         Returns:
             torch.Tensor: A tensor containing the generated token IDs.
         """
-        embed, cond = self.prepare_inputs(prompts, guidance_scale)
         output_ids = []
@@ -267,6 +323,7 @@ class Engine:
         resolution_base: float = 8.0,
         chunk_size: int = 100_000,
         top_p: float = None,
     ):
         """
         Generates a 3D mesh from text prompts using a GPT model and shape decoder.
@@ -276,12 +333,17 @@ class Engine:
             guidance_scale (float, optional): The scale of guidance for the GPT model. Default is 3.0.
             resolution_base (float, optional): The base resolution for the shape decoder. Default is 8.0.
             chunk_size (int, optional): The chunk size for processing the shape decoding. Default is 100,000.
-            top_p (float, optional): The cumulative probability threshold for nucleus sampling.
-                                    If None, argmax selection is performed (deterministic generation). Otherwise, smallest set of tokens with cumulative probability ≥ top_p are kept (stochastic generation).
         Returns:
             mesh_v_f: The generated 3D mesh vertices and faces.
         """
-        output_ids = self.run_gpt(prompts, use_kv_cache, guidance_scale, top_p)
         with torch.autocast(self.device.type, dtype=torch.bfloat16):
             mesh_v_f = self.run_shape_decode(output_ids, resolution_base, chunk_size)
         return mesh_v_f
@@ -304,6 +366,10 @@ class EngineFast(Engine):
             device (torch.device): The device to run the inference on (e.g., CPU or CUDA).
         """
         super().__init__(config_path, gpt_ckpt_path, shape_ckpt_path, device)
         # CUDA Graph params
@@ -424,11 +490,12 @@ class EngineFast(Engine):
         )
     def run_gpt(
-        self,
-        prompts: list[str],
-        use_kv_cache: bool,
         guidance_scale: float = 3.0,
-        top_p: float = None
     ):
         """
         Runs the GPT model to generate text based on the provided prompts.
@@ -437,14 +504,18 @@ class EngineFast(Engine):
             use_kv_cache (bool): Flag indicating whether to use key-value caching. (Currently not used)
             guidance_scale (float, optional): The scale factor for guidance. Default is 3.0.
             top_p (float, optional): The cumulative probability threshold for nucleus sampling.
-            If None, argmax selection is performed. Otherwise, smallest set of tokens with cumulative probability ≥ top_p are kept.
         Returns:
             torch.Tensor: A tensor containing the generated output token IDs.
         Raises:
             AssertionError: If the batch size is greater than 1.
         """
-        embed, cond = self.prepare_inputs(prompts, guidance_scale)
         assert len(prompts) == 1, "batch size > 1 not support for EngineFast"
         batch_size, input_seq_len, _ = embed.shape
@@ -475,9 +546,7 @@ class EngineFast(Engine):
             next_embed = next_embed.repeat(2, 1, 1)
             self.embed_buffer[:, input_seq_len, :].copy_(next_embed.squeeze(1))
-            for i in tqdm(
-                range(1, self.max_new_tokens), desc=f"generating"
-            ):
                 self._set_curr_pos_id(i)
                 self.graph.replay()

+from typing import Optional, Tuple
 import torch
 from tqdm import tqdm
 from transformers import CLIPTextModelWithProjection, CLIPTokenizerFast
         self.max_id = self.shape_model.cfg.num_codes
     @torch.inference_mode()
+    def prepare_conditions_with_bbox(
+        self,
+        cond: torch.Tensor,
+        bounding_box_tensor: Optional[torch.Tensor] = None,
+    ):
+        """
+        Prepares condition embeddings by incorporating bounding box information.
+        Projects the bounding box through `self.gpt_model.bbox_proj` and appends the result to
+        the condition tensor as one extra sequence position. If no bounding box is provided,
+        an all-zero bounding box is projected instead.
+        Args:
+            cond (torch.Tensor): The input condition embeddings tensor of shape (B, seq_len, dim).
+            bounding_box_tensor (Optional[torch.Tensor], optional): The size of the bounding box
+                as (x, y, z) dimensions represented as a tensor. If None, a zero tensor of
+                shape (B, 3) is used in its place.
+        Returns:
+            torch.Tensor: The condition tensor with the bounding box embedding concatenated along
+                the sequence dimension (dim=1) if the GPT model has a `bbox_proj` attribute,
+                otherwise the original condition tensor unchanged.
+        """
+        # A GPT model without a `bbox_proj` attribute takes no bounding box conditioning.
+        if not hasattr(self.gpt_model, "bbox_proj"):
+            return cond
+        if bounding_box_tensor is None:
+            # No bounding box requested: substitute zeros, one row per batch element.
+            B = cond.shape[0]
+            bounding_box_tensor = torch.zeros((B, 3), dtype=cond.dtype, device=self.device)
+        # Add a sequence axis so the embedding can be appended along dim=1.
+        bbox_emb = self.gpt_model.bbox_proj(bounding_box_tensor).unsqueeze(dim=1)
+        cond = torch.cat([cond, bbox_emb], dim=1)
+        return cond
+    @torch.inference_mode()
+    def prepare_inputs(
+        self,
+        prompts: list[str],
+        guidance_scale: float,
+        bounding_box_xyz: Optional[Tuple[float]] = None,
+    ):
         """
         Prepares the input embeddings for the model based on the provided prompts and guidance scale.
         Args:
             prompts (list[str]): A list of prompt strings to be encoded.
             guidance_scale (float): A scaling factor for guidance. If greater than 0.0, additional processing is applied.
+            bounding_box_xyz (Optional[Tuple[float]], optional): The size of the bounding box for generation
+                as (x, y, z) dimensions. Each value must be between 0 and 1.925. If None,
+                uses default bounding box sizing.
         Returns:
             tuple: A tuple containing:
                 - embed (torch.Tensor): The encoded input embeddings.
         with torch.autocast(self.device.type, dtype=torch.bfloat16):
             embed = self.encode_input(prompt_embeds, self.gpt_model.shape_bos_id)
+        if bounding_box_xyz is not None:
+            cond_bbox = torch.atleast_2d(torch.tensor(bounding_box_xyz)).to(self.device)
+            uncond_bbox = torch.zeros_like(cond_bbox).to(self.device)
+        else:
+            cond_bbox = None
+            uncond_bbox = None
+        cond = self.prepare_conditions_with_bbox(prompt_embeds, cond_bbox)
         if guidance_scale > 0.0:
             embed = torch.cat([embed, embed], dim=0)
             uncond_embeds = self.run_clip([""] * len(prompts))
+            uncond = self.prepare_conditions_with_bbox(uncond_embeds, uncond_bbox)
+            cond = torch.cat([cond, uncond], dim=0)
         return embed, cond
         use_kv_cache: bool,
         guidance_scale: float = 3.0,
         top_p: float = None,
+        bounding_box_xyz: Optional[Tuple[float]] = None,
     ):
         """
         Generates text using a GPT model based on the provided prompts.
             use_kv_cache (bool): Whether to use key-value caching for faster generation.
             guidance_scale (float, optional): The scale for guidance during generation. Default is 3.0.
             top_p (float, optional): The cumulative probability threshold for nucleus sampling.
+                If None, argmax selection is performed (deterministic generation). Otherwise, smallest set of tokens with cumulative probability ≥ top_p are kept (stochastic generation).
+            bounding_box_xyz (Optional[Tuple[float]], optional): The size of the bounding box for generation
+                as (x, y, z) dimensions. Each value must be between 0 and 1.925. If None,
+                uses default bounding box sizing.
         Returns:
             torch.Tensor: A tensor containing the generated token IDs.
         """
+        embed, cond = self.prepare_inputs(prompts, guidance_scale, bounding_box_xyz)
         output_ids = []
         resolution_base: float = 8.0,
         chunk_size: int = 100_000,
         top_p: float = None,
+        bounding_box_xyz: Optional[Tuple[float]] = None,
     ):
         """
         Generates a 3D mesh from text prompts using a GPT model and shape decoder.
             guidance_scale (float, optional): The scale of guidance for the GPT model. Default is 3.0.
             resolution_base (float, optional): The base resolution for the shape decoder. Default is 8.0.
             chunk_size (int, optional): The chunk size for processing the shape decoding. Default is 100,000.
+            top_p (float, optional): The cumulative probability threshold for nucleus sampling.
+                If None, argmax selection is performed (deterministic generation). Otherwise, smallest set of tokens with cumulative probability ≥ top_p are kept (stochastic generation).
+            bounding_box_xyz (Tuple[float] | None, optional): The size of the bounding box for the generated mesh
+                as (x, y, z) dimensions. Each value must be between 0 and 1.925. If None,
+                uses default bounding box sizing.
         Returns:
             mesh_v_f: The generated 3D mesh vertices and faces.
         """
+        output_ids = self.run_gpt(
+            prompts, use_kv_cache, guidance_scale, top_p, bounding_box_xyz
+        )
         with torch.autocast(self.device.type, dtype=torch.bfloat16):
             mesh_v_f = self.run_shape_decode(output_ids, resolution_base, chunk_size)
         return mesh_v_f
             device (torch.device): The device to run the inference on (e.g., CPU or CUDA).
         """
+        assert (
+            device.type == "cuda"
+        ), "EngineFast is only supported on cuda devices, please use Engine on non-cuda devices"
         super().__init__(config_path, gpt_ckpt_path, shape_ckpt_path, device)
         # CUDA Graph params
         )
     def run_gpt(
+        self,
+        prompts: list[str],
+        use_kv_cache: bool,
         guidance_scale: float = 3.0,
+        top_p: float = None,
+        bounding_box_xyz: Optional[Tuple[float]] = None,
     ):
         """
         Runs the GPT model to generate text based on the provided prompts.
             use_kv_cache (bool): Flag indicating whether to use key-value caching. (Currently not used)
             guidance_scale (float, optional): The scale factor for guidance. Default is 3.0.
             top_p (float, optional): The cumulative probability threshold for nucleus sampling.
+                If None, argmax selection is performed. Otherwise, smallest
+                set of tokens with cumulative probability ≥ top_p are kept.
+            bounding_box_xyz (Tuple[float] | None, optional): The size of the bounding box for the generated mesh
+                as (x, y, z) dimensions. Each value must be between 0 and 1.925. If None,
+                uses default bounding box sizing.
         Returns:
             torch.Tensor: A tensor containing the generated output token IDs.
         Raises:
             AssertionError: If the batch size is greater than 1.
         """
+        embed, cond = self.prepare_inputs(prompts, guidance_scale, bounding_box_xyz)
         assert len(prompts) == 1, "batch size > 1 not support for EngineFast"
         batch_size, input_seq_len, _ = embed.shape
             next_embed = next_embed.repeat(2, 1, 1)
             self.embed_buffer[:, input_seq_len, :].copy_(next_embed.squeeze(1))
+            for i in tqdm(range(1, self.max_new_tokens), desc=f"generating"):
                 self._set_curr_pos_id(i)
                 self.graph.replay()

cube/cube3d/inference/utils.py CHANGED Viewed

@@ -1,10 +1,17 @@
 import logging
-from typing import Any, Optional
 import torch
 from omegaconf import DictConfig, OmegaConf
 from safetensors.torch import load_model
 def load_config(cfg_path: str) -> Any:
     """
@@ -49,8 +56,24 @@ def load_model_weights(model: torch.nn.Module, ckpt_path: str) -> None:
     Returns:
         None
     """
-    assert ckpt_path.endswith(".safetensors"), (
-        f"Checkpoint path '{ckpt_path}' is not a safetensors file"
-    )
     load_model(model, ckpt_path)

 import logging
+from typing import Any, Optional, Tuple
 import torch
 from omegaconf import DictConfig, OmegaConf
 from safetensors.torch import load_model
BOUNDING_BOX_MAX_SIZE = 1.925


def normalize_bbox(bounding_box_xyz: Tuple[float]):
    """
    Rescales a bounding box so that its longest side equals BOUNDING_BOX_MAX_SIZE.

    Every side is multiplied by the same factor, so the proportions between
    the sides are preserved.

    Args:
        bounding_box_xyz (Tuple[float]): Side lengths of the bounding box.
    Returns:
        list[float]: The rescaled side lengths, in the same order as the input.
    Raises:
        ValueError: If the input is empty or its largest side is not positive.
    """
    # Materialize once so one-shot iterables are not exhausted by `max` below.
    sides = tuple(bounding_box_xyz)
    if not sides:
        raise ValueError("bounding_box_xyz must contain at least one side length")
    longest = max(sides)
    # A zero longest side would divide by zero; a negative one would flip signs.
    if longest <= 0:
        raise ValueError(
            f"bounding_box_xyz must have a positive largest side, got {sides}"
        )
    return [BOUNDING_BOX_MAX_SIZE * side / longest for side in sides]
 def load_config(cfg_path: str) -> Any:
     """
     Returns:
         None
     """
+    assert ckpt_path.endswith(
+        ".safetensors"
+    ), f"Checkpoint path '{ckpt_path}' is not a safetensors file"
     load_model(model, ckpt_path)
def select_device() -> torch.device:
    """
    Selects the appropriate PyTorch device for tensor allocation.

    Devices are tried in order of preference: CUDA, then MPS, then CPU.

    Returns:
        torch.device: The first available device in the order above.
    """
    if torch.cuda.is_available():
        return torch.device("cuda")
    # Guard the attribute lookup: PyTorch builds that do not expose
    # `torch.backends.mps` should fall through to CPU instead of raising.
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        return torch.device("mps")
    return torch.device("cpu")

cube/cube3d/mesh_utils/postprocessing.py CHANGED Viewed

@@ -75,10 +75,12 @@ def save_mesh(ms: pymeshlab.MeshSet, output_path: str):
     logging.info(f"Mesh saved to {output_path}.")
-def postprocess_mesh(ms: pymeshlab.MeshSet, target_face_num: int, output_path: str):
     """
     Postprocess the mesh to the target number of faces.
     """
     cleanup(ms)
     remove_floaters(ms)
     simplify_mesh(ms, target_face_num)

     logging.info(f"Mesh saved to {output_path}.")
def postprocess_mesh(ms: pymeshlab.MeshSet, target_face_num: int):
    """
    Clean up a mesh set and simplify it to the requested number of faces.

    Runs `cleanup`, `remove_floaters` and `simplify_mesh` on `ms`, in that order.

    Args:
        ms (pymeshlab.MeshSet): The mesh set to process.
        target_face_num (int): Face count forwarded to `simplify_mesh`.
    Returns:
        tuple: The vertex matrix and the face matrix of the mesh set's current mesh.
    """
    for step in (cleanup, remove_floaters):
        step(ms)
    simplify_mesh(ms, target_face_num)

    result = ms.current_mesh()
    vertices = result.vertex_matrix()
    faces = result.face_matrix()
    return vertices, faces

cube/cube3d/model/gpt/dual_stream_roformer.py CHANGED Viewed

@@ -34,6 +34,8 @@ class DualStreamRoformer(nn.Module):
         encoder_with_cls_token: bool = True
     def __init__(self, cfg: Config) -> None:
         """
         Initializes the DualStreamRoFormer model.
@@ -108,6 +110,9 @@ class DualStreamRoformer(nn.Module):
         self.lm_head = nn.Linear(self.cfg.n_embd, self.vocab_size, bias=False)
     def encode_text(self, text_embed):
         """
         Encodes the given text embeddings by projecting them through a linear transformation.

         encoder_with_cls_token: bool = True
+        use_bbox: bool = False
     def __init__(self, cfg: Config) -> None:
         """
         Initializes the DualStreamRoFormer model.
         self.lm_head = nn.Linear(self.cfg.n_embd, self.vocab_size, bias=False)
+        if self.cfg.use_bbox:
+            self.bbox_proj = nn.Linear(3, self.cfg.n_embd)
     def encode_text(self, text_embed):
         """
         Encodes the given text embeddings by projecting them through a linear transformation.

cube/cube3d/model/transformers/cache.py CHANGED Viewed

@@ -1,9 +1,36 @@
-from dataclasses import dataclass
 import torch
 @dataclass
 class Cache:
     key_states: torch.Tensor
     value_states: torch.Tensor

import functools
from dataclasses import dataclass, field

import torch


@functools.lru_cache(maxsize=None)
def _index_copy_supported(device: torch.device) -> bool:
    """Probes, once per device, whether `index_copy_` is implemented there."""
    try:
        dummy = torch.tensor([0, 0], device=device)
        dummy.index_copy_(
            0, torch.tensor([0], device=device), torch.tensor([1], device=device)
        )
        return True
    except NotImplementedError:
        return False


@dataclass
class Cache:
    """
    Key/value cache used by the attention layers.

    Attributes:
        key_states (torch.Tensor): Cached keys; `update` writes along dim 2.
        value_states (torch.Tensor): Cached values; `update` writes along dim 2.
    """

    key_states: torch.Tensor
    value_states: torch.Tensor
    _supports_index_copy: bool = field(init=False)  # For CUDA graph support

    def __post_init__(self):
        self._supports_index_copy = self._check_index_copy_support()

    def _check_index_copy_support(self) -> bool:
        """Verifies support for `index_copy_` on the device holding `key_states`."""
        # The probe result is memoized per device, so building one cache per
        # layer does not repeat the tensor allocations of the probe.
        return _index_copy_supported(self.key_states.device)

    def update(self, curr_pos_id: torch.Tensor, k: torch.Tensor, v: torch.Tensor) -> None:
        """
        Updates the cache based on device operator support.

        Args:
            curr_pos_id (torch.Tensor): Current position indices for decoding.
            k (torch.Tensor): The keys to update
            v (torch.Tensor): The values to update
        """
        if self._supports_index_copy:  # CUDA/CPU
            self.key_states.index_copy_(2, curr_pos_id, k)
            self.value_states.index_copy_(2, curr_pos_id, v)
        else:  # MPS
            # Slice-based fallback; presumably `curr_pos_id` holds a single
            # position here, since it is used as a slice bound - TODO confirm.
            self.key_states[:, :, curr_pos_id : curr_pos_id + 1, ...].copy_(k)
            self.value_states[:, :, curr_pos_id : curr_pos_id + 1, ...].copy_(v)

cube/cube3d/model/transformers/dual_stream_attention.py CHANGED Viewed

@@ -198,8 +198,7 @@ class DualStreamAttentionWithRotaryEmbedding(nn.Module):
                 kv_cache.value_states[:, :, : k.shape[2], :].copy_(v)
             else:
                 assert curr_pos_id is not None
-                kv_cache.key_states.index_copy_(2, curr_pos_id, k)
-                kv_cache.value_states.index_copy_(2, curr_pos_id, v)
             k = kv_cache.key_states
             v = kv_cache.value_states

                 kv_cache.value_states[:, :, : k.shape[2], :].copy_(v)
             else:
                 assert curr_pos_id is not None
+                kv_cache.update(curr_pos_id, k, v)
             k = kv_cache.key_states
             v = kv_cache.value_states

cube/cube3d/model/transformers/roformer.py CHANGED Viewed

@@ -115,8 +115,7 @@ class SelfAttentionWithRotaryEmbedding(nn.Module):
                 kv_cache.value_states[:, :, : k.shape[2], :].copy_(v)
             else:
                 assert curr_pos_id is not None
-                kv_cache.key_states.index_copy_(2, curr_pos_id, k)
-                kv_cache.value_states.index_copy_(2, curr_pos_id, v)
             k = kv_cache.key_states
             v = kv_cache.value_states

                 kv_cache.value_states[:, :, : k.shape[2], :].copy_(v)
             else:
                 assert curr_pos_id is not None
+                kv_cache.update(curr_pos_id, k, v)
             k = kv_cache.key_states
             v = kv_cache.value_states

cube/cube3d/vq_vae_encode_decode.py CHANGED Viewed

@@ -5,7 +5,7 @@ import numpy as np
 import torch
 import trimesh
-from cube3d.inference.utils import load_config, load_model_weights, parse_structured
 from cube3d.model.autoencoder.one_d_autoencoder import OneDAutoEncoder
 MESH_SCALE = 0.96
@@ -125,7 +125,7 @@ if __name__ == "__main__":
         help="Path to save the recovered mesh file.",
     )
     args = parser.parse_args()
-    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
     logging.info(f"Using device: {device}")
     cfg = load_config(args.config_path)

 import torch
 import trimesh
+from cube3d.inference.utils import load_config, load_model_weights, parse_structured, select_device
 from cube3d.model.autoencoder.one_d_autoencoder import OneDAutoEncoder
 MESH_SCALE = 0.96
         help="Path to save the recovered mesh file.",
     )
     args = parser.parse_args()
+    device = select_device()
     logging.info(f"Using device: {device}")
     cfg = load_config(args.config_path)

requirements.txt CHANGED Viewed

@@ -1,4 +1,5 @@
 gradio
 torch
 trimesh
-git+https://github.com/Roblox/cube.git

 gradio
 torch
 trimesh
+pymeshlab
+git+https://github.com/Roblox/cube.git