yonigozlan (HF Staff) committed
Commit 51c9688 · 1 Parent(s): ec83bbc

initial commit

Files changed (8)
  1. .gitattributes +4 -0
  2. app.py +332 -0
  3. basket.mp4 +3 -0
  4. football.mp4 +3 -0
  5. hurdles.mp4 +3 -0
  6. render.py +125 -0
  7. requirements.txt +6 -0
  8. tennis.mp4 +3 -0
.gitattributes CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ basket.mp4 filter=lfs diff=lfs merge=lfs -text
+ football.mp4 filter=lfs diff=lfs merge=lfs -text
+ hurdles.mp4 filter=lfs diff=lfs merge=lfs -text
+ tennis.mp4 filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,332 @@
+ import os
+
+ import gradio as gr
+ import numpy as np
+ import spaces
+ import supervision as sv
+ import torch
+ from render import draw_links, draw_points, keypoint_colors, link_colors
+ from tqdm import tqdm
+
+ from transformers import (
+     AutoProcessor,
+     RTDetrForObjectDetection,
+     VitPoseForPoseEstimation,
+ )
+
+ css = """
+ .feedback textarea {font-size: 24px !important}
+ """
+
+ device = "cuda"
+
+
+ def calculate_end_frame_index(source_video_path):
+     video_info = sv.VideoInfo.from_video_path(source_video_path)
+     return video_info.total_frames
+
+
+ @spaces.GPU
+ def process_image(
+     input_image,
+     model_variant,
+     progress=gr.Progress(track_tqdm=True),
+ ):
+     # Any person detector can be used here; RT-DETR is the default
+     person_image_processor = AutoProcessor.from_pretrained(
+         "PekingU/rtdetr_r50vd_coco_o365"
+     )
+     person_model = RTDetrForObjectDetection.from_pretrained(
+         "PekingU/rtdetr_r50vd_coco_o365", device_map=device
+     )
+
+     if model_variant == "Base":
+         model_name = "yonigozlan/synthpose-vitpose-base-hf"
+     else:
+         model_name = "yonigozlan/synthpose-vitpose-huge-hf"
+
+     image_processor = AutoProcessor.from_pretrained(model_name)
+     model = VitPoseForPoseEstimation.from_pretrained(model_name, device_map=device)
+
+     keypoint_edges = model.config.edges
+
+     # ------------------------------------------------------------------------
+     # Stage 1. Detect humans on the image
+     # ------------------------------------------------------------------------
+
+     frame = np.array(input_image)
+     inputs = person_image_processor(images=frame, return_tensors="pt").to(device)
+
+     with torch.no_grad():
+         outputs = person_model(**inputs)
+
+     results = person_image_processor.post_process_object_detection(
+         outputs,
+         target_sizes=torch.tensor([(frame.shape[0], frame.shape[1])]),
+         threshold=0.4,
+     )
+     result = results[0]  # take first image results
+
+     # The "person" class corresponds to label index 0 in the COCO label set
+     person_boxes = result["boxes"][result["labels"] == 0]
+     person_boxes = person_boxes.cpu().numpy()
+
+     # Convert boxes from VOC (x1, y1, x2, y2) to COCO (x1, y1, w, h) format
+     person_boxes[:, 2] = person_boxes[:, 2] - person_boxes[:, 0]
+     person_boxes[:, 3] = person_boxes[:, 3] - person_boxes[:, 1]
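+     # e.g. a VOC box [100, 50, 300, 450] becomes [100, 50, 200, 400] (width 200, height 400)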
+
+     # ------------------------------------------------------------------------
+     # Stage 2. Detect keypoints for each person found
+     # ------------------------------------------------------------------------
+
+     inputs = image_processor(frame, boxes=[person_boxes], return_tensors="pt").to(
+         device
+     )
+
+     with torch.no_grad():
+         outputs = model(**inputs)
+
+     pose_results = image_processor.post_process_pose_estimation(
+         outputs, boxes=[person_boxes]
+     )
+     image_pose_result = pose_results[0]  # results for first image
+
+     for pose_result in image_pose_result:
+         scores = np.array(pose_result["scores"])
+         keypoints = np.array(pose_result["keypoints"])
+
+         # draw each point on image
+         draw_points(
+             frame,
+             keypoints,
+             scores,
+             keypoint_colors,
+             keypoint_score_threshold=0.3,
+             radius=max(2, int(max(frame.shape[0], frame.shape[1]) / 500)),
+             show_keypoint_weight=False,
+         )
+
+         # draw links
+         draw_links(
+             frame,
+             keypoints,
+             scores,
+             keypoint_edges,
+             link_colors,
+             keypoint_score_threshold=0.3,
+             thickness=max(2, int(max(frame.shape[0], frame.shape[1]) / 1000)),
+             show_keypoint_weight=False,
+         )
+
+     return frame
+
+
+ @spaces.GPU
+ def process_video(
+     input_video,
+     model_variant,
+     progress=gr.Progress(track_tqdm=True),
+ ):
+     video_info = sv.VideoInfo.from_video_path(input_video)
+     total = calculate_end_frame_index(input_video)
+     frame_generator = sv.get_video_frames_generator(source_path=input_video, end=total)
+
+     result_file_name = "output.mp4"
+     result_file_path = os.path.join(os.getcwd(), result_file_name)
+     # Any person detector can be used here; RT-DETR is the default
+     person_image_processor = AutoProcessor.from_pretrained(
+         "PekingU/rtdetr_r50vd_coco_o365"
+     )
+     person_model = RTDetrForObjectDetection.from_pretrained(
+         "PekingU/rtdetr_r50vd_coco_o365", device_map=device
+     )
+     if model_variant == "Base":
+         model_name = "yonigozlan/synthpose-vitpose-base-hf"
+     else:
+         model_name = "yonigozlan/synthpose-vitpose-huge-hf"
+
+     image_processor = AutoProcessor.from_pretrained(model_name)
+     model = VitPoseForPoseEstimation.from_pretrained(model_name, device_map=device)
+
+     keypoint_edges = model.config.edges
+
+     with sv.VideoSink(result_file_path, video_info=video_info) as sink:
+         for _ in tqdm(range(total), desc="Processing video..."):
+             try:
+                 frame = next(frame_generator)
+             except StopIteration:
+                 break
+             # ------------------------------------------------------------------------
+             # Stage 1. Detect humans on the image
+             # ------------------------------------------------------------------------
+
+             inputs = person_image_processor(images=frame, return_tensors="pt").to(
+                 device
+             )
+
+             with torch.no_grad():
+                 outputs = person_model(**inputs)
+
+             results = person_image_processor.post_process_object_detection(
+                 outputs,
+                 target_sizes=torch.tensor([(frame.shape[0], frame.shape[1])]),
+                 threshold=0.4,
+             )
+             result = results[0]  # take first image results
+
+             # The "person" class corresponds to label index 0 in the COCO label set
+             person_boxes = result["boxes"][result["labels"] == 0]
+             person_boxes = person_boxes.cpu().numpy()
+
+             # Convert boxes from VOC (x1, y1, x2, y2) to COCO (x1, y1, w, h) format
+             person_boxes[:, 2] = person_boxes[:, 2] - person_boxes[:, 0]
+             person_boxes[:, 3] = person_boxes[:, 3] - person_boxes[:, 1]
+
+             # ------------------------------------------------------------------------
+             # Stage 2. Detect keypoints for each person found
+             # ------------------------------------------------------------------------
+
+             inputs = image_processor(
+                 frame, boxes=[person_boxes], return_tensors="pt"
+             ).to(device)
+
+             with torch.no_grad():
+                 outputs = model(**inputs)
+
+             pose_results = image_processor.post_process_pose_estimation(
+                 outputs, boxes=[person_boxes]
+             )
+             image_pose_result = pose_results[0]  # results for first image
+
+             for pose_result in image_pose_result:
+                 scores = np.array(pose_result["scores"])
+                 keypoints = np.array(pose_result["keypoints"])
+
+                 # draw each point on image
+                 draw_points(
+                     frame,
+                     keypoints,
+                     scores,
+                     keypoint_colors,
+                     keypoint_score_threshold=0.3,
+                     radius=max(2, int(frame.shape[0] / 500)),
+                     show_keypoint_weight=False,
+                 )
+
+                 # draw links
+                 draw_links(
+                     frame,
+                     keypoints,
+                     scores,
+                     keypoint_edges,
+                     link_colors,
+                     keypoint_score_threshold=0.3,
+                     thickness=max(1, int(frame.shape[0] / 1000)),
+                     show_keypoint_weight=False,
+                 )
+
+             sink.write_frame(frame)
+
+     return result_file_path
+
+
+ with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
+     gr.Markdown("## Markerless Motion Capture with SynthPose")
+     gr.Markdown(
+         """
+ SynthPose is a new approach that uses synthetic data to finetune pre-trained 2D human pose models so they predict an arbitrarily denser set of keypoints for accurate kinematic analysis.
+ More details are available in [OpenCapBench: A Benchmark to Bridge Pose Estimation and Biomechanics](https://arxiv.org/abs/2406.09788).
+ This particular variant was finetuned on a set of keypoints typically used in motion capture setups, and it includes the COCO keypoints as well.<br />
+ The keypoints connected by the skeleton are the COCO keypoints; the pink ones are the anatomical markers.
+ """
+     )
+     gr.Markdown(
+         "Simply upload a video and press Run to start the inference! You can also try the examples below. 👇"
+     )
+
+     with gr.Row():
+         with gr.Column():
+             input_choice = gr.Radio(
+                 ["Video", "Image"], label="Input Type", value="Video", interactive=True
+             )
+             model_variant = gr.Radio(
+                 ["Base", "Huge"], label="Model Variant", value="Base", interactive=True
+             )
+             input_video = gr.Video(label="Input Video")
+             input_image = gr.Image(label="Input Image", visible=False)
+         with gr.Column():
+             output_video = gr.Video(label="Output Video")
+             output_image = gr.Image(label="Output Image", visible=False)
+
+     with gr.Row():
+         submit_video = gr.Button(variant="primary")
+         submit_image = gr.Button(variant="primary", visible=False)
+
+     def switch_input_type(input_choice):
+         # Toggle visibility of the video/image widgets and their submit buttons
+         if input_choice == "Video":
+             return [
+                 gr.update(visible=True),
+                 gr.update(visible=False),
+                 gr.update(visible=True),
+                 gr.update(visible=False),
+                 gr.update(visible=True),
+                 gr.update(visible=False),
+             ]
+         else:
+             return [
+                 gr.update(visible=False),
+                 gr.update(visible=True),
+                 gr.update(visible=False),
+                 gr.update(visible=True),
+                 gr.update(visible=False),
+                 gr.update(visible=True),
+             ]
+
+     input_choice.change(
+         switch_input_type,
+         inputs=input_choice,
+         outputs=[
+             input_video,
+             input_image,
+             output_video,
+             output_image,
+             submit_video,
+             submit_image,
+         ],
+     )
+
+     example = gr.Examples(
+         examples=[
+             ["./tennis.mp4"],
+             ["./football.mp4"],
+             ["./basket.mp4"],
+             ["./hurdles.mp4"],
+         ],
+         inputs=[input_video],
+         outputs=output_video,
+     )
+
+     submit_video.click(
+         fn=process_video,
+         inputs=[input_video, model_variant],
+         outputs=[output_video],
+     )
+     submit_image.click(
+         fn=process_image,
+         inputs=[input_image, model_variant],
+         outputs=[output_image],
+     )
+
+ if __name__ == "__main__":
+     demo.launch(show_error=True)
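
For reference, the same two-stage pipeline (RT-DETR person detection followed by SynthPose keypoint estimation) can also be run outside Gradio. The sketch below is a minimal adaptation of `process_image` above, assuming the same checkpoints, a hypothetical local image `frame.jpg`, and that Pillow is installed; it skips the drawing helpers from `render.py`.

import numpy as np
import torch
from PIL import Image

from transformers import (
    AutoProcessor,
    RTDetrForObjectDetection,
    VitPoseForPoseEstimation,
)

device = "cuda" if torch.cuda.is_available() else "cpu"

# Stage 1: person detection with RT-DETR
det_processor = AutoProcessor.from_pretrained("PekingU/rtdetr_r50vd_coco_o365")
det_model = RTDetrForObjectDetection.from_pretrained(
    "PekingU/rtdetr_r50vd_coco_o365", device_map=device
)

# Stage 2: dense keypoints with the SynthPose "Base" checkpoint
pose_processor = AutoProcessor.from_pretrained("yonigozlan/synthpose-vitpose-base-hf")
pose_model = VitPoseForPoseEstimation.from_pretrained(
    "yonigozlan/synthpose-vitpose-base-hf", device_map=device
)

frame = np.array(Image.open("frame.jpg").convert("RGB"))  # hypothetical input image

inputs = det_processor(images=frame, return_tensors="pt").to(device)
with torch.no_grad():
    outputs = det_model(**inputs)
detections = det_processor.post_process_object_detection(
    outputs, target_sizes=torch.tensor([frame.shape[:2]]), threshold=0.4
)[0]

# Keep "person" boxes (label 0) and convert (x1, y1, x2, y2) -> (x, y, w, h)
boxes = detections["boxes"][detections["labels"] == 0].cpu().numpy()
boxes[:, 2] -= boxes[:, 0]
boxes[:, 3] -= boxes[:, 1]

inputs = pose_processor(frame, boxes=[boxes], return_tensors="pt").to(device)
with torch.no_grad():
    outputs = pose_model(**inputs)
poses = pose_processor.post_process_pose_estimation(outputs, boxes=[boxes])[0]

for person in poses:
    print(person["keypoints"].shape, person["scores"].shape)  # expected: 52 keypoints per person
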
basket.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:52ade15f3ec0cb1838627090d646c2c12a21dedbe70d4bd60d9ca3fa6ff45e37
+ size 9347210
football.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:56a85c5c7d5d6e0825f76a71e5e3ee2ce35c8ffbe841ef4bfa544af1089259aa
+ size 2855852
hurdles.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6ee5aa420ea2629dcefd9bb3a26221f30b4639f6de001c372d6c2f84e79b0b66
+ size 6714353
render.py ADDED
@@ -0,0 +1,125 @@
+ ### Visualization utilities for advanced users
+ import math
+
+ import cv2
+ import numpy as np
+
+
+ def draw_points(
+     image,
+     keypoints,
+     scores,
+     pose_keypoint_color,
+     keypoint_score_threshold,
+     radius,
+     show_keypoint_weight,
+ ):
+     if pose_keypoint_color is not None:
+         assert len(pose_keypoint_color) == len(keypoints)
+     for kid, (kpt, kpt_score) in enumerate(zip(keypoints, scores)):
+         x_coord, y_coord = int(kpt[0]), int(kpt[1])
+         if kpt_score > keypoint_score_threshold:
+             color = tuple(int(c) for c in pose_keypoint_color[kid])
+             if show_keypoint_weight:
+                 cv2.circle(image, (int(x_coord), int(y_coord)), radius, color, -1)
+                 transparency = max(0, min(1, kpt_score))
+                 cv2.addWeighted(
+                     image, transparency, image, 1 - transparency, 0, dst=image
+                 )
+             else:
+                 cv2.circle(image, (int(x_coord), int(y_coord)), radius, color, -1)
+
+
+ def draw_links(
+     image,
+     keypoints,
+     scores,
+     keypoint_edges,
+     link_colors,
+     keypoint_score_threshold,
+     thickness,
+     show_keypoint_weight,
+     stick_width=2,
+ ):
+     height, width, _ = image.shape
+     if keypoint_edges is not None and link_colors is not None:
+         assert len(link_colors) == len(keypoint_edges)
+         for sk_id, sk in enumerate(keypoint_edges):
+             x1, y1, score1 = (
+                 int(keypoints[sk[0], 0]),
+                 int(keypoints[sk[0], 1]),
+                 scores[sk[0]],
+             )
+             x2, y2, score2 = (
+                 int(keypoints[sk[1], 0]),
+                 int(keypoints[sk[1], 1]),
+                 scores[sk[1]],
+             )
+             if (
+                 x1 > 0
+                 and x1 < width
+                 and y1 > 0
+                 and y1 < height
+                 and x2 > 0
+                 and x2 < width
+                 and y2 > 0
+                 and y2 < height
+                 and score1 > keypoint_score_threshold
+                 and score2 > keypoint_score_threshold
+             ):
+                 color = tuple(int(c) for c in link_colors[sk_id])
+                 if show_keypoint_weight:
+                     # Draw the link as a filled, rotated ellipse between the two
+                     # joints, blended according to the mean keypoint score
+                     X = (x1, x2)
+                     Y = (y1, y2)
+                     mean_x = np.mean(X)
+                     mean_y = np.mean(Y)
+                     length = ((Y[0] - Y[1]) ** 2 + (X[0] - X[1]) ** 2) ** 0.5
+                     angle = math.degrees(math.atan2(Y[0] - Y[1], X[0] - X[1]))
+                     polygon = cv2.ellipse2Poly(
+                         (int(mean_x), int(mean_y)),
+                         (int(length / 2), int(stick_width)),
+                         int(angle),
+                         0,
+                         360,
+                         1,
+                     )
+                     cv2.fillConvexPoly(image, polygon, color)
+                     transparency = max(0, min(1, 0.5 * (score1 + score2)))
+                     cv2.addWeighted(
+                         image, transparency, image, 1 - transparency, 0, dst=image
+                     )
+                 else:
+                     cv2.line(image, (x1, y1), (x2, y2), color, thickness=thickness)
+
+
+ palette = np.array(
+     [
+         [255, 128, 0],
+         [255, 153, 51],
+         [255, 178, 102],
+         [230, 230, 0],
+         [255, 153, 255],
+         [153, 204, 255],
+         [255, 102, 255],
+         [255, 51, 255],
+         [102, 178, 255],
+         [51, 153, 255],
+         [255, 153, 153],
+         [255, 102, 102],
+         [255, 51, 51],
+         [153, 255, 153],
+         [102, 255, 102],
+         [51, 255, 51],
+         [0, 255, 0],
+         [0, 0, 255],
+         [255, 0, 0],
+         [255, 255, 255],
+     ]
+ )
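+
+ # keypoint_colors: the first 17 entries cover the standard COCO keypoints, while the
+ # remaining 35 SynthPose anatomical markers (indices 17-51) all use palette[4], the
+ # pink mentioned in the demo description; link_colors provides one color per
+ # skeleton edge in model.config.edges.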
+ link_colors = palette[[0, 0, 0, 0, 7, 7, 7, 9, 9, 9, 9, 9, 16, 16, 16, 16, 16, 16, 16]]
+ keypoint_colors = palette[
+     [16, 16, 16, 16, 16, 9, 9, 9, 9, 9, 9, 0, 0, 0, 0, 0, 0] + [4] * (52 - 17)
+ ]
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ torch
+ timm
+ numpy==1.26.3
+ git+https://github.com/huggingface/transformers.git@main
+ supervision
+ spaces
tennis.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cc0868023eb6fa2d68338406964396b2cb1123610fdc6af05ba37c539ee9e92a
+ size 6586057