StableSpann3R

Running

App Files Community

aca2024 committed on Oct 18, 2024

Commit

045e960

1 Parent(s): e28f1ec

update app.py

Browse files

Files changed (1) hide show

app.py +57 -215

app.py CHANGED Viewed

@@ -12,15 +12,9 @@ from spann3r.datasets import Demo
 from torch.utils.data import DataLoader
 import trimesh
 from scipy.spatial.transform import Rotation
-from transformers import AutoModelForImageSegmentation
-from torchvision import transforms
-from PIL import Image
-import open3d as o3d
-from backend_utils import improved_multiway_registration, pts2normal, point2mesh, combine_and_clean_point_clouds
 # Default values
-DEFAULT_CKPT_PATH = './checkpoints/spann3r.pth'
 DEFAULT_DUST3R_PATH = 'https://huggingface.co/camenduru/dust3r/resolve/main/DUSt3R_ViTLarge_BaseDecoder_512_dpt.pth'
 DEFAULT_DEVICE = 'cuda:0' if torch.cuda.is_available() else 'cpu'
@@ -29,45 +23,15 @@ OPENGL = np.array([[1, 0, 0, 0],
                    [0, 0, -1, 0],
                    [0, 0, 0, 1]])
-def export_geometry(geometry, as_pointcloud=False):
-    if as_pointcloud:
-        if not isinstance(geometry, o3d.geometry.PointCloud):
-            raise ValueError("Expected an Open3D PointCloud object when as_pointcloud is True")
-        output_path = tempfile.mktemp(suffix='.ply')
-    else:
-        if not isinstance(geometry, o3d.geometry.TriangleMesh):
-            raise ValueError("Expected an Open3D TriangleMesh object when as_pointcloud is False")
-        output_path = tempfile.mktemp(suffix='.obj')
-    # Apply rotation
-    rot = np.eye(4)
-    rot[:3, :3] = Rotation.from_euler('y', np.deg2rad(180)).as_matrix()
-    transform = np.linalg.inv(OPENGL @ rot)
-    geometry.transform(transform)
-    # Export the geometry
-    if as_pointcloud:
-        o3d.io.write_point_cloud(output_path, geometry, write_ascii=False, compressed=True)
-    else:
-        o3d.io.write_triangle_mesh(output_path, geometry, write_ascii=False, compressed=True)
-    return output_path
-def extract_frames(video_path: str, duration: float = 20.0, fps: float = 3.0) -> str:
     temp_dir = tempfile.mkdtemp()
     output_path = os.path.join(temp_dir, "%03d.jpg")
-    filter_complex = f"select='if(lt(t,{duration}),1,0)',fps={fps}"
     command = [
         "ffmpeg",
         "-i", video_path,
-        "-vf", filter_complex,
-        "-vsync", "0",
         output_path
     ]
     subprocess.run(command, check=True)
     return temp_dir
@@ -141,42 +105,9 @@ def pts3d_to_trimesh(img, pts3d, valid=None):
     return dict(vertices=vertices, face_colors=face_colors, faces=faces)
 model = load_model(DEFAULT_CKPT_PATH, DEFAULT_DEVICE)
-birefnet = AutoModelForImageSegmentation.from_pretrained('zhengpeng7/BiRefNet', trust_remote_code=True)
-birefnet.to(DEFAULT_DEVICE)
-birefnet.eval()
-def extract_object(birefnet, image):
-    # Data settings
-    image_size = (1024, 1024)
-    transform_image = transforms.Compose([
-        transforms.Resize(image_size),
-        transforms.ToTensor(),
-        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
-    ])
-    input_images = transform_image(image).unsqueeze(0).to(DEFAULT_DEVICE)
-    # Prediction
-    with torch.no_grad():
-        preds = birefnet(input_images)[-1].sigmoid().cpu()
-    pred = preds[0].squeeze()
-    pred_pil = transforms.ToPILImage()(pred)
-    mask = pred_pil.resize(image.size)
-    return mask
-def generate_mask(image: np.ndarray):
-    # Convert numpy array to PIL Image
-    pil_image = Image.fromarray((image * 255).astype(np.uint8))
-    # Extract object and get mask
-    mask = extract_object(birefnet, pil_image)
-    # Convert mask to numpy array
-    mask_np = np.array(mask) / 255.0
-    return mask_np
 @torch.no_grad()
-def reconstruct(video_path, conf_thresh, kf_every,
-                as_pointcloud=False, remove_background=False, refine=False):
     # Extract frames from video
     demo_path = extract_frames(video_path)
@@ -197,156 +128,67 @@ def reconstruct(video_path, conf_thresh, kf_every,
     fps = len(batch) / (end - start)
     print(f'Finished reconstruction for {demo_name}, FPS: {fps:.2f}')
-    try:
-        # Process results
-        pcds = []
-        for j, view in enumerate(batch):
-            image = view['img'].permute(0, 2, 3, 1).cpu().numpy()[0]
-            image = (image + 1) / 2
-            pts = preds[j]['pts3d' if j==0 else 'pts3d_in_other_view'].detach().cpu().numpy()[0]
-            pts_normal = pts2normal(preds[j]['pts3d' if j==0 else 'pts3d_in_other_view'][0]).cpu().numpy()
-            conf = preds[j]['conf'][0].cpu().data.numpy()
-            conf_sig = (conf - 1) / conf
-            if remove_background:
-                mask = generate_mask(image)
-            else:
-                mask = np.ones_like(conf)
-            combined_mask = (conf_sig > conf_thresh) & (mask > 0.5)
-            pcd = o3d.geometry.PointCloud()
-            pcd.points = o3d.utility.Vector3dVector(pts[combined_mask])
-            pcd.colors = o3d.utility.Vector3dVector(image[combined_mask])
-            pcd.normals = o3d.utility.Vector3dVector(pts_normal[combined_mask])
-            pcds.append(pcd)
-    except Exception as e:
-        print(repr(e))
-    print(f'Finished Process results {demo_name}')
-    pcd_combined = combine_and_clean_point_clouds(pcds, voxel_size=0.001)
     if as_pointcloud:
-        o3d_geometry = pcd_combined
     else:
-        o3d_geometry = point2mesh(pcd_combined)
-    # Create coarse result
-    print(f'Create coarse result {demo_name}')
-    coarse_output_path = export_geometry(o3d_geometry, as_pointcloud)
-    print(f'Finished Create coarse result {demo_name}')
-    yield coarse_output_path, None
-    if refine:
-        # Perform global optimization
-        print("Performing global registration...")
-        transformed_pcds, _, _ = improved_multiway_registration(pcds, voxel_size=0.001)
-        if as_pointcloud:
-            o3d_geometry = transformed_pcds
-        else:
-            o3d_geometry = point2mesh(transformed_pcds)
-        # Create coarse result
-        refined_output_path = export_geometry(o3d_geometry, as_pointcloud)
-        print(f'Perform global optimization  {demo_name}')
-        yield coarse_output_path, refined_output_path
     # Clean up temporary directory
     os.system(f"rm -rf {demo_path}")
-# Update the Gradio interface with improved layout
-with gr.Blocks(
-        title="StableSpann3r: Making Spann3r stable with Odometry Backend",
-        css="""
-            #download {
-                height: 118px;
-            }
-            .slider .inner {
-                width: 5px;
-                background: #FFF;
-            }
-            .viewport {
-                aspect-ratio: 4/3;
-            }
-            .tabs button.selected {
-                font-size: 20px !important;
-                color: crimson !important;
-            }
-            h1 {
-                text-align: center;
-                display: block;
-            }
-            h2 {
-                text-align: center;
-                display: block;
-            }
-            h3 {
-                text-align: center;
-                display: block;
-            }
-            .md_feedback li {
-                margin-bottom: 0px !important;
-            }
-        """,
-        head="""
-            <script async src="https://www.googletagmanager.com/gtag/js?id=G-1FWSVCGZTG"></script>
-            <script>
-                window.dataLayer = window.dataLayer || [];
-                function gtag() {dataLayer.push(arguments);}
-                gtag('js', new Date());
-                gtag('config', 'G-1FWSVCGZTG');
-            </script>
-        """,
-    ) as iface:
-    gr.Markdown(
-        """
-        # StableSpann3r: Making Spann3r stable with Odometry Backend
-        <p align="center">
-            <a title="Website" href="https://stable-x.github.io/StableSpann3r/" target="_blank" rel="noopener noreferrer" style="display: inline-block;">
-                <img src="https://www.obukhov.ai/img/badges/badge-website.svg">
-            </a>
-            <a title="arXiv" href="https://arxiv.org/abs/XXXX.XXXXX" target="_blank" rel="noopener noreferrer" style="display: inline-block;">
-                <img src="https://www.obukhov.ai/img/badges/badge-pdf.svg">
-            </a>
-            <a title="Github" href="https://github.com/Stable-X/StableSpann3r" target="_blank" rel="noopener noreferrer" style="display: inline-block;">
-                <img src="https://img.shields.io/github/stars/Stable-X/StableSpann3r?label=GitHub%20%E2%98%85&logo=github&color=C8C" alt="badge-github-stars">
-            </a>
-            <a title="Social" href="https://x.com/ychngji6" target="_blank" rel="noopener noreferrer" style="display: inline-block;">
-                <img src="https://www.obukhov.ai/img/badges/badge-social.svg" alt="social">
-            </a>
-        </p>
-        """
-    )
-    with gr.Row():
-        with gr.Column(scale=1):
-            video_input = gr.Video(label="Input Video")
-            with gr.Row():
-                conf_thresh = gr.Slider(0, 1, value=1e-3, label="Confidence Threshold")
-                kf_every = gr.Slider(1, 30, step=1, value=1, label="Keyframe Interval")
-            with gr.Row():
-                remove_background = gr.Checkbox(label="Remove Background", value=False)
-                refine = gr.Checkbox(label="Enable Backend", value=False)
-                as_pointcloud = gr.Checkbox(label="As Pointcloud", value=False)
-            reconstruct_btn = gr.Button("Reconstruct")
-        with gr.Column(scale=2):
-            with gr.Tab("Coarse Model"):
-                coarse_model = gr.Model3D(label="Coarse 3D Model", display_mode="solid", clear_color=[0.0, 0.0, 0.0, 0.0])
-            with gr.Tab("Refined Model"):
-                refined_model = gr.Model3D(label="Refined 3D Model", display_mode="solid", clear_color=[0.0, 0.0, 0.0, 0.0])
-    reconstruct_btn.click(
-        fn=reconstruct,
-        inputs=[video_input, conf_thresh, kf_every, as_pointcloud, remove_background, refine],
-        outputs=[coarse_model, refined_model]
-    )
 if __name__ == "__main__":
-    iface.launch(server_name="0.0.0.0")

 from torch.utils.data import DataLoader
 import trimesh
 from scipy.spatial.transform import Rotation
 # Default values
+DEFAULT_CKPT_PATH = 'https://huggingface.co/spaces/Stable-X/StableSpann3R/resolve/main/checkpoints/spann3r.pth'
 DEFAULT_DUST3R_PATH = 'https://huggingface.co/camenduru/dust3r/resolve/main/DUSt3R_ViTLarge_BaseDecoder_512_dpt.pth'
 DEFAULT_DEVICE = 'cuda:0' if torch.cuda.is_available() else 'cpu'
                    [0, 0, -1, 0],
                    [0, 0, 0, 1]])
+def extract_frames(video_path: str) -> str:
     temp_dir = tempfile.mkdtemp()
     output_path = os.path.join(temp_dir, "%03d.jpg")
     command = [
         "ffmpeg",
         "-i", video_path,
+        "-vf", "fps=1",
         output_path
     ]
     subprocess.run(command, check=True)
     return temp_dir
     return dict(vertices=vertices, face_colors=face_colors, faces=faces)
 model = load_model(DEFAULT_CKPT_PATH, DEFAULT_DEVICE)
 @torch.no_grad()
+def reconstruct(video_path, conf_thresh, kf_every, as_pointcloud=False):
     # Extract frames from video
     demo_path = extract_frames(video_path)
     fps = len(batch) / (end - start)
     print(f'Finished reconstruction for {demo_name}, FPS: {fps:.2f}')
+    # Process results
+    pts_all, images_all, conf_all = [], [], []
+    for j, view in enumerate(batch):
+        image = view['img'].permute(0, 2, 3, 1).cpu().numpy()[0]
+        pts = preds[j]['pts3d' if j==0 else 'pts3d_in_other_view'].detach().cpu().numpy()[0]
+        conf = preds[j]['conf'][0].cpu().data.numpy()
+        images_all.append((image[None, ...] + 1.0)/2.0)
+        pts_all.append(pts[None, ...])
+        conf_all.append(conf[None, ...])
+    images_all = np.concatenate(images_all, axis=0)
+    pts_all = np.concatenate(pts_all, axis=0) * 10
+    conf_all = np.concatenate(conf_all, axis=0)
+    # Create point cloud or mesh
+    conf_sig_all = (conf_all-1) / conf_all
+    mask = conf_sig_all > conf_thresh
+    scene = trimesh.Scene()
     if as_pointcloud:
+        pcd = trimesh.PointCloud(
+            vertices=pts_all[mask].reshape(-1, 3),
+            colors=images_all[mask].reshape(-1, 3)
+        )
+        scene.add_geometry(pcd)
     else:
+        meshes = []
+        for i in range(len(images_all)):
+            meshes.append(pts3d_to_trimesh(images_all[i], pts_all[i], mask[i]))
+        mesh = trimesh.Trimesh(**cat_meshes(meshes))
+        scene.add_geometry(mesh)
+    rot = np.eye(4)
+    rot[:3, :3] = Rotation.from_euler('y', np.deg2rad(180)).as_matrix()
+    scene.apply_transform(np.linalg.inv(OPENGL @ rot))
+    # Save the scene as GLB
+    output_path = tempfile.mktemp(suffix='.glb')
+    scene.export(output_path)
     # Clean up temporary directory
     os.system(f"rm -rf {demo_path}")
+    return output_path, f"Reconstruction completed. FPS: {fps:.2f}"
+iface = gr.Interface(
+    fn=reconstruct,
+    inputs=[
+        gr.Video(label="Input Video"),
+        gr.Slider(0, 1, value=1e-3, label="Confidence Threshold"),
+        gr.Slider(1, 30, step=1, value=5, label="Keyframe Interval"),
+        gr.Checkbox(label="As Pointcloud", value=False)
+    ],
+    outputs=[
+        gr.Model3D(label="3D Model (GLB)", display_mode="solid"),
+        gr.Textbox(label="Status")
+    ],
+    title="3D Reconstruction with Spatial Memory",
+)
 if __name__ == "__main__":
+    iface.launch()