Commit ec24258
Parent(s): 1a1a15e

Add code.

Files changed:
- .gitattributes +1 -0
- app.py +26 -0
- models/movenet/movenet_tf/1/saved_model.pb +3 -0
- models/movenet/movenet_tf/1/variables/variables.data-00000-of-00001 +3 -0
- models/movenet/movenet_tf/1/variables/variables.index +0 -0
- models/yolo/yolov5x.pt +3 -0
- requirements.txt +140 -0
- src/correspondence.py +139 -0
- src/crop_video.py +230 -0
- src/evaluation.py +139 -0
- src/fastdtw_file.py +86 -0
- src/pose_detection.py +123 -0
- src/run_chain.py +56 -0
- src/utils.py +315 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+models/movenet/movenet_tf/1/variables/variables.data-00000-of-00001 filter=lfs diff=lfs merge=lfs -text
app.py
ADDED
@@ -0,0 +1,26 @@
+import os
+import cv2
+import gradio as gr
+from src import run_chain
+
+
+def video_process(ref_video, test_video, crop_method):
+
+    run_chain.main(
+        ref_video,
+        test_video,
+        'output_video.mp4',
+        crop_method=crop_method
+    )
+
+    return 'output_video.mp4'
+
+
+demo = gr.Interface(video_process,
+                    inputs=[gr.Video(label='Reference Video'), gr.Video(label='Test Video'), gr.Radio(["YOLO", "Tracker"], label="Crop Method")],
+                    outputs=[gr.PlayableVideo()]
+                    )
+
+if __name__ == "__main__":
+    demo.launch()
+
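For context, the callback above just forwards the two uploaded videos and the selected crop method to run_chain.main and hands the resulting file path back to the gr.PlayableVideo output. A minimal sketch of exercising it outside the UI (the input file names are placeholders, not files shipped with this commit):

# Sketch: call the Gradio callback directly; it writes and returns
# 'output_video.mp4' in the working directory.
from app import video_process

out_path = video_process('reference.mp4', 'attempt.mp4', 'YOLO')
print(out_path)  # 'output_video.mp4'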
models/movenet/movenet_tf/1/saved_model.pb
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ea126ce61a8d11321a5523a5f0b07bb22184a848d193df7a5d0a42068f0ac15f
+size 10954887
models/movenet/movenet_tf/1/variables/variables.data-00000-of-00001
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8bfcfc4f2267cbc32327ad9433ba93ee37b58cbf561b5b45bf9ed37f98215295
+size 25390986
models/movenet/movenet_tf/1/variables/variables.index
ADDED
Binary file (6.09 kB)
models/yolo/yolov5x.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9f27a794fa0308e2606f90565164571d6f0a0ba18a3ce2e5e5d323b71c157859
+size 174114333
requirements.txt
ADDED
@@ -0,0 +1,140 @@
+absl-py==1.4.0
+aiofiles==23.2.1
+altair
+annotated-types==0.5.0
+anyio==4.0.0
+astunparse==1.6.3
+attrs==23.1.0
+boto3==1.28.39
+botocore==1.31.39
+cachetools==5.3.1
+certifi==2022.12.7
+chardet==4.0.0
+charset-normalizer==3.2.0
+click==8.1.7
+cmake==3.27.2
+contourpy==1.1.0
+cycler==0.10.0
+decorator==4.4.2
+exceptiongroup==1.1.3
+fastapi
+ffmpy==0.3.1
+filelock
+fire==0.5.0
+flatbuffers==23.5.26
+fonttools==4.42.1
+fsspec==2023.6.0
+gast==0.4.0
+gitdb==4.0.10
+GitPython==3.1.33
+google-auth==2.22.0
+google-auth-oauthlib==1.0.0
+google-pasta==0.2.0
+gradio
+gradio_client
+grpcio==1.57.0
+h11==0.14.0
+h5py==3.9.0
+httpcore==0.17.3
+httpx==0.24.1
+huggingface-hub
+idna==2.10
+imageio==2.31.2
+imageio-ffmpeg==0.4.8
+importlib-resources==6.0.1
+Jinja2==3.1.2
+jmespath==1.0.1
+joblib==1.3.2
+jsonschema==4.19.0
+jsonschema-specifications==2023.7.1
+keras==2.13.1
+kiwisolver==1.4.5
+libclang==16.0.6
+lit==16.0.6
+Markdown==3.4.4
+MarkupSafe==2.1.3
+matplotlib==3.7.2
+moviepy==1.0.3
+mpmath==1.3.0
+networkx==3.1
+numpy==1.24.3
+nvidia-cublas-cu11==11.10.3.66
+nvidia-cuda-cupti-cu11==11.7.101
+nvidia-cuda-nvrtc-cu11==11.7.99
+nvidia-cuda-runtime-cu11==11.7.99
+nvidia-cudnn-cu11==8.5.0.96
+nvidia-cufft-cu11==10.9.0.58
+nvidia-curand-cu11==10.2.10.91
+nvidia-cusolver-cu11==11.4.0.1
+nvidia-cusparse-cu11==11.7.4.91
+nvidia-nccl-cu11==2.14.3
+nvidia-nvtx-cu11==11.7.91
+oauthlib==3.2.2
+opencv-python==4.8.0.76
+opencv-python-headless==4.8.0.76
+opt-einsum==3.3.0
+orjson==3.9.5
+packaging==23.1
+pandas==2.1.0
+Pillow==9.5.0
+proglog==0.1.10
+protobuf==4.24.2
+psutil==5.9.5
+py-cpuinfo==9.0.0
+pyasn1==0.5.0
+pyasn1-modules==0.3.0
+pybboxes==0.1.6
+pydantic
+pydantic_core
+pydub==0.25.1
+pyparsing==2.4.7
+python-dateutil==2.8.2
+python-dotenv==1.0.0
+python-multipart==0.0.6
+pytz==2023.3
+PyYAML==6.0.1
+referencing==0.30.2
+requests==2.31.0
+requests-oauthlib==1.3.1
+requests-toolbelt==1.0.0
+roboflow==1.1.4
+rpds-py==0.10.0
+rsa==4.9
+s3transfer==0.6.2
+sahi==0.11.14
+scikit-learn==1.3.0
+scipy==1.11.2
+seaborn==0.12.2
+semantic-version==2.10.0
+shapely==2.0.1
+six==1.16.0
+smmap==5.0.0
+sniffio==1.3.0
+starlette==0.27.0
+supervision==0.14.0
+sympy==1.12
+tensorboard==2.13.0
+tensorboard-data-server==0.7.1
+tensorflow
+tensorflow-estimator==2.13.0
+tensorflow-hub==0.14.0
+tensorflow-io-gcs-filesystem==0.33.0
+termcolor==2.3.0
+terminaltables==3.1.10
+thop==0.1.1.post2209072238
+threadpoolctl==3.2.0
+toolz==0.12.0
+torch==2.0.1
+torchvision==0.15.2
+tqdm==4.66.1
+triton==2.0.0
+typing_extensions
+tzdata==2023.3
+ultralytics==8.0.168
+urllib3==1.26.16
+uvicorn==0.23.2
+websockets==11.0.3
+Werkzeug==2.3.7
+wget==3.2
+wrapt==1.15.0
+yolov5==7.0.12
src/correspondence.py
ADDED
@@ -0,0 +1,139 @@
+# from __future__ import absolute_import, division
+from collections import defaultdict
+
+import numpy as np
+
+from src.evaluation import AngleMAE, MAE
+
+
+class DTW:
+    """Class for correspondence between two sequences of keypoints detected
+    from videos
+
+    Parameters:
+        cost_weightage : dictionary containing weightage of MAE and AngleMAE
+        to compute cost for DTW
+
+    """
+
+    def __init__(self, cost_weightage=None):
+        self.anglemae_calculator = AngleMAE()
+
+        self.cost_weightage = cost_weightage
+
+    def cost(self, x, y):
+        """computes cost for a set of keypoints through MAE or AngleMAE
+
+        Args:
+            x: A numpy array representing the keypoints of a reference frame
+            y: A numpy array representing the keypoints of a test frame
+
+        Returns:
+            A float value representing the mae/angle mae score between
+            reference and test pose
+        """
+        cost = 0
+        if self.cost_weightage['angle_mae']:
+            mae_angle = self.anglemae_calculator.mean_absolute_error(x, y) * \
+                self.cost_weightage['angle_mae']
+            cost += mae_angle
+
+        if self.cost_weightage['mae']:
+            mae = MAE.mean_absolute_error(x, y) * \
+                self.cost_weightage['mae']
+            cost += mae
+
+        return cost
+
+    def find_correspondence(self, x, y):
+        """applies the Dynamic Time Warping algorithm to find correspondence
+        between reference video and test video
+
+        Args:
+            x: A numpy array representing the keypoints of a reference video
+            y: A numpy array representing the keypoints of a test video
+
+        Returns:
+            A tuple containing ref_frame_indices, test_frame_indices and costs
+            where
+            ref_frame_indices: A list of indices for reference video frames
+            test_frame_indices : A list of indices for test video frames
+            costs : A list of costs between reference and test keypoints
+        """
+
+        x = np.asanyarray(x, dtype='float')
+        y = np.asanyarray(y, dtype='float')
+        len_x, len_y = len(x), len(y)
+
+        dtw_mapping = defaultdict(lambda: (float('inf'),))
+        similarity_score = defaultdict(lambda: float('inf'), )
+
+        dtw_mapping[0, 0] = (0, 0, 0)
+
+        similarity_score[0, 0] = 0
+
+        for i in range(1, len_x + 1):
+            for j in range(1, len_y + 1):
+                dt = self.cost(x[i - 1], y[j - 1])
+
+                dtw_mapping[i, j] = min(
+                    (dtw_mapping[i - 1, j][0] + dt, i - 1, j),
+                    (dtw_mapping[i, j - 1][0] + dt, i, j - 1),
+                    (dtw_mapping[i - 1, j - 1][0] + dt, i - 1, j - 1),
+                    key=lambda a: a[0]
+                )
+                similarity_score[i, j] = dt
+
+        path = []
+        i, j = len_x, len_y
+        while not (i == j == 0):
+            path.append(
+                (i - 1, j - 1, dtw_mapping[i - 1, j - 1][0],
+                 similarity_score[i - 1, j - 1])
+            )
+            i, j = dtw_mapping[i, j][1], dtw_mapping[i, j][2]
+        path.reverse()
+
+        ref_frame_idx, test_frame_idx, _, costs = DTW.get_ref_test_mapping(path)
+        return (ref_frame_idx, test_frame_idx, costs)
+
+    @staticmethod
+    def get_ref_test_mapping(paths):
+        """converts a DTW warping path into a mapping from each reference
+        frame to its best-matching test frame
+
+        Args:
+            paths : list of lists which consists of [i, j, ps, c] where i and
+            j are indices of the x and y time series respectively which have
+            the correspondence, ps is cumulative cost and c is cost between
+            these two instances
+
+        Returns:
+            A tuple containing ref_frame_indices, test_frame_indices, path_score
+            and costs where
+            ref_frame_indices: A list of indices for reference video frames
+            test_frame_indices : A list of indices for test video frames
+            path_score : A list of path scores calculated by DTW between
+            reference and test keypoints
+            costs : A list of costs between reference and test keypoints
+        """
+
+        path = np.array(paths)
+        ref_2_test = {}
+        for i in range(path.shape[0]):
+            ref_2_test_val = ref_2_test.get(path[i][0], [])
+            ref_2_test_val.append([path[i][1], path[i][2], path[i][3]])
+            ref_2_test[path[i][0]] = ref_2_test_val
+        ref_frames = []
+        test_frames = []
+        path_score = []
+        costs = []
+
+        for ref_frame, test_frame_list in ref_2_test.items():
+            ref_frames.append(int(ref_frame))
+            test_frame_list.sort(key=lambda x: x[2])
+            test_frames.append(int(test_frame_list[0][0]))
+            path_score.append(test_frame_list[0][1])
+            costs.append(test_frame_list[0][2])
+
+        return ref_frames, test_frames, path_score, costs
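To make the flow above concrete, here is a small usage sketch of the DTW class with random (17, 2) keypoint arrays standing in for real MoveNet output; only the angle-based term is enabled, matching how run_chain.py constructs the object:

# Sketch: align two dummy keypoint sequences with the DTW class above.
import numpy as np
from src.correspondence import DTW

ref_keypoints = np.random.rand(30, 17, 2)   # 30 reference frames, 17 joints each
test_keypoints = np.random.rand(45, 17, 2)  # 45 test frames

dtw = DTW(cost_weightage={'mae': 0, 'angle_mae': 1})
ref_idx, test_idx, costs = dtw.find_correspondence(ref_keypoints, test_keypoints)

# Each reference frame index is paired with its cheapest matching test frame.
for r, t, c in zip(ref_idx[:5], test_idx[:5], costs[:5]):
    print(f"ref frame {r} <-> test frame {t}, cost {c:.2f}")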
src/crop_video.py
ADDED
@@ -0,0 +1,230 @@
+import cv2
+import numpy as np
+import yolov5
+
+
+class CropVideo:
+    """Base class for cropping a video frame-by-frame using various object
+    detection methods such as YOLO or cv2.Tracker
+
+    Warning: This class should not be used directly.
+    Use derived classes instead.
+
+    Parameters:
+        method : name of the object detection method
+        model_path : path to object detection model
+
+    """
+
+    def __init__(self, method=None):
+        self.method = method
+
+    def video_crop(self, video_frames):
+        """Crops given list of frames by detecting the object using different
+        methods such as YOLO or cv2.Tracker.
+
+        Args:
+            video_frames: A list of numpy arrays representing the input images
+
+        Returns:
+            A numpy array containing cropped frames
+        """
+        raise NotImplementedError
+
+
+class YOLOCrop(CropVideo):
+
+    """Class for cropping a video frame-by-frame using the YOLO object
+    detection method
+
+    Parameters :
+        model_path : path to object detection model
+
+    """
+
+    def __init__(self, method=None, model_path=None):
+        super().__init__('yolo')
+        self.model_path = model_path or '../data/models/yolo/yolov5x.pt'
+        self.load_model(self.model_path)
+
+    def load_model(self, model_path):
+        """Loads object detection model.
+        """
+        self.model = yolov5.load(model_path)
+        self.model.classes = 0
+
+    def get_yolo_bbox(self, frame):
+        """Runs YOLO object detection on an input image.
+
+        Args:
+            frame: A [height, width, 3] numpy array representing the input image
+
+        Returns:
+            A list containing bounding box parameters [x_min, y_min, x_max, y_max]
+        """
+
+        results = self.model(frame)
+        predictions = results.pred[0]
+
+        boxes = predictions[:, :4].numpy().astype(np.int32)
+        if len(boxes) == 0:
+            return []
+        elif len(boxes) == 1:
+            return list(boxes[0])
+        else:
+            area = []
+            for i in boxes:
+                area.append(cv2.contourArea(np.array([[i[:2]], [i[2:]]])))
+            largest_bbox = boxes[np.argmax(np.array(area))]
+            return list(largest_bbox)
+
+    def video_crop(self, video_frames):
+        """Crops given list of frames by detecting the object using YOLO
+
+        Args:
+            video_frames: A list of numpy arrays representing the input images
+
+        Returns:
+            A numpy array containing cropped frames
+        """
+
+        x_width_start = []
+        y_height_start = []
+        x_width_end = []
+        y_height_end = []
+        frame_height, frame_width = 0, 0
+
+        widths = []
+        heights = []
+        for frame in video_frames:
+            frame_height, frame_width, _ = frame.shape
+            bbox = self.get_yolo_bbox(frame)
+
+            if len(bbox) == 0:
+                continue
+            else:
+                x_width_start.append(int(max(bbox[0] - 100, 0)))
+                y_height_start.append(int(max(bbox[1] - 100, 0)))
+                x_width_end.append(int(min(bbox[2] + 100, frame.shape[1])))
+                y_height_end.append(int(min(bbox[3] + 100, frame.shape[0])))
+
+                widths.append(x_width_end[-1] - x_width_start[-1])
+                heights.append(y_height_end[-1] - y_height_start[-1])
+
+        width = np.percentile(np.array(widths), 95)
+        height = np.percentile(np.array(heights), 95)
+        box_len = int(max(width, height))
+
+        cropped_frames = []
+
+        for i in range(len(widths)):
+            frame = video_frames[i]
+            xs = x_width_start[i]
+            xe = x_width_start[i] + box_len
+            ys = y_height_start[i]
+            ye = y_height_start[i] + box_len
+
+            if ye > frame_height:
+                ye = frame_height
+                ys = max(0, ye - box_len)
+
+            if xe > frame_width:
+                xe = frame_width
+                xs = max(0, xe - box_len)
+
+            cropped = frame[int(ys): int(ye), int(xs): int(xe), :]
+            cropped_frames.append(np.array(cropped))
+
+        return np.array(cropped_frames)
+
+
+class TrackerCrop(YOLOCrop):
+    def __init__(self, model_path=None):
+        super().__init__(method='yolo')
+        self.tracker = cv2.TrackerMIL.create()
+
+    @staticmethod
+    def expand_bbox(bbox, frame_shape):
+        """Expands given bounding box by 50 pixels
+
+        Args:
+            bbox: A list [x, y, width, height] consisting of the bounding box
+            parameters of the object
+            frame_shape: (height, width) of a frame
+
+        """
+        bbox[0] = max(bbox[0] - 50, 0)
+        bbox[1] = max(bbox[1] - 50, 0)
+        bbox[2] = min(bbox[3] + 50, frame_shape[1] - bbox[0] - 1)
+        bbox[3] = min(bbox[3] + 50, frame_shape[0] - bbox[1] - 1)
+
+    @staticmethod
+    def pad_bbox(crop_frame, box_len):
+        """Pads given cropped frame
+
+        Args:
+            crop_frame: A numpy array representing the cropped frame
+            box_len: An integer value representing the maximum of width and height
+
+        Returns:
+            A numpy array containing the cropped frame with padding
+        """
+        if box_len > crop_frame.shape[0] or box_len > crop_frame.shape[1]:
+            crop_frame = np.pad(
+                crop_frame, pad_width=(
+                    (0, box_len - crop_frame.shape[0]),
+                    (0, box_len - crop_frame.shape[1]), (0, 0))
+            )
+        return crop_frame
+
+    @staticmethod
+    def clip_coordinates(x, y, box_len, frame_shape):
+        """Clips (x,y) coordinates representing the centre of bounding box
+
+        Args:
+            x: x-coordinate of the centre of bounding box
+            y: y-coordinate of the centre of bounding box
+            box_len: An integer value representing the maximum of width and height
+            frame_shape: (height, width) of a frame
+
+        Returns:
+            (x,y) clipped coordinates
+        """
+        if x + box_len > frame_shape[1]:
+            diff = x + box_len - frame_shape[1]
+            x = max(0, x - diff)
+        if y + box_len > frame_shape[0]:
+            diff = y + box_len - frame_shape[0]
+            y = max(0, y - diff)
+
+        return (x, y)
+
+    def video_crop(self, video_frames):
+        """Crops given list of frames by detecting the object using cv2.Tracker
+
+        Args:
+            video_frames: A list of numpy arrays representing the input images
+
+        Returns:
+            A numpy array containing cropped frames
+        """
+
+        frame = video_frames[0]
+        bbox = self.get_yolo_bbox(frame)
+        TrackerCrop.expand_bbox(bbox, frame.shape)
+        self.tracker.init(frame, bbox)
+        output_frame_list = []
+        for frame in video_frames:
+            _, bbox = self.tracker.update(frame)
+            x, y, w, h = bbox
+            box_len = max(w, h)
+            x, y = TrackerCrop.clip_coordinates(x, y, box_len, frame.shape)
+            crop_frame = np.array(frame[y:y + box_len, x:x + box_len, :])
+            crop_frame = TrackerCrop.pad_bbox(crop_frame, box_len)
+            output_frame_list.append(crop_frame)
+
+        output_frame_array = np.array(output_frame_list)
+
+        return output_frame_array
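A hedged usage sketch for the cropping classes above; it assumes the YOLOv5 weights committed under models/yolo/yolov5x.pt (note that the YOLOCrop default path is '../data/models/yolo/yolov5x.pt', so the committed path is passed explicitly here) and a placeholder input video:

# Sketch: crop every frame of a video around the largest detected person.
from src.crop_video import YOLOCrop
from src.utils import get_video_frames

frames = get_video_frames('reference.mp4')               # placeholder path
cropper = YOLOCrop(model_path='models/yolo/yolov5x.pt')
cropped = cropper.video_crop(frames)                      # stacked square crops
print(cropped.shape)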
src/evaluation.py
ADDED
@@ -0,0 +1,139 @@
+import numpy as np
+from sklearn import metrics
+
+
+class AngleMAE:
+    """Class for the AngleMAE evaluation metric, which computes the mean
+    absolute error between the joint angles of reference and test video
+    keypoints
+
+    Parameters:
+        ref_keypoints : A numpy array [n, 1, 17, 3] containing keypoints
+        representing poses in reference video
+        test_keypoints : A numpy array [n, 1, 17, 3] containing keypoints
+        representing poses in test video
+    """
+
+    def __init__(self):
+
+        self.joints_array = np.array(
+            [[11, 5, 7],
+             [12, 6, 8],
+             [6, 8, 10],
+             [5, 7, 9],
+             [11, 12, 14],
+             [12, 11, 13],
+             [12, 14, 16],
+             [11, 13, 15],
+             [5, 11, 13]]
+        )
+
+        self.joints_dict = {
+            'left_shoulder_joint': ['left_hip', 'left_shoulder', 'left_elbow'],
+            'right_shoulder_joint': ['right_hip', 'right_shoulder',
+                                     'right_elbow'],
+            'right_elbow_joint': ['right_shoulder', 'right_elbow',
+                                  'right_wrist'],
+            'left_elbow_joint': ['left_shoulder', 'left_elbow', 'left_wrist'],
+            'right_hip_joint': ['left_hip', 'right_hip', 'right_knee'],
+            'left_hip_joint': ['right_hip', 'left_hip', 'left_knee'],
+            'right_knee_joint': ['right_hip', 'right_knee', 'right_ankle'],
+            'left_knee_joint': ['left_hip', 'left_knee', 'left_ankle'],
+            'waist_joint': ['left_shoulder', 'left_hip', 'left_knee']
+        }
+
+        self.angle_mae_joints_weightage_array = ([1, 1, 1, 1, 1, 1, 1, 1, 1])
+
+    def mean_absolute_error(self, ref_keypoints, test_keypoints) -> object:
+        """
+        Calculates MAE of the given joint angles between reference and test
+        frames
+
+        Args:
+            ref_keypoints: ndarray of shape (17,2) containing reference frame
+            x, y coordinates
+            test_keypoints: ndarray of shape (17,2) containing test frame x,
+            y coordinates
+
+        Returns:
+            MAE: A float value representing angle based MAE
+        """
+
+        ref_angle = self.calculate_angle_atan2(ref_keypoints)
+        test_angle = self.calculate_angle_atan2(test_keypoints)
+
+        diff = np.abs(ref_angle - test_angle)
+
+        mae = np.sum(diff * self.angle_mae_joints_weightage_array) / sum(
+            self.angle_mae_joints_weightage_array
+        )
+
+        return mae
+
+    def calculate_angle_atan2(self, kpts):
+        """
+        Calculates the angle of each joint
+
+        Args:
+            kpts: ndarray of shape (17,2) containing x, y coordinates
+
+        Returns:
+            A numpy array of joint angles in degrees
+        """
+        a = np.zeros((9, 2))
+        b = np.zeros((9, 2))
+        c = np.zeros((9, 2))
+
+        for i, j in enumerate(self.joints_array):
+
+            a[i] = kpts[j[0]]
+            b[i] = kpts[j[1]]
+            c[i] = kpts[j[2]]
+
+        vector_b_a = b - a
+        vector_b_c = b - c
+
+        angle_0 = np.arctan2(
+            vector_b_a[:, 1],
+            vector_b_a[:, 0]
+        )
+
+        angle_2 = np.arctan2(
+            vector_b_c[:, 1],
+            vector_b_c[:, 0]
+        )
+
+        determinant = vector_b_a[:, 0] * vector_b_c[:, 1] - \
+            vector_b_a[:, 1] * vector_b_c[:, 0]
+
+        angle_diff = (angle_0 - angle_2)
+
+        angle = np.degrees(angle_diff)
+        joints_angle_array = angle * (determinant < 0) + (360 + angle) * (
+                determinant > 0)
+
+        return joints_angle_array % 360
+
+
+class MAE:
+    def __init__(self):
+        pass
+
+    @staticmethod
+    def mean_absolute_error(ref_keypoints, test_keypoints):
+        """
+        Calculates MAE of the given keypoints between reference and test frames
+
+        Args:
+            ref_keypoints: ndarray of shape (17,2) containing reference frame
+            x, y coordinates
+            test_keypoints: ndarray of shape (17,2) containing test frame x,
+            y coordinates
+
+        Returns:
+            MAE: A float value representing MAE
+        """
+        return metrics.mean_absolute_error(
+            ref_keypoints.flatten(),
+            test_keypoints.flatten(),
+        )
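A short sketch of the two metrics above, again with random (17, 2) arrays standing in for real poses; AngleMAE compares the nine joint angles while MAE compares raw coordinates:

# Sketch: score one reference pose against one test pose.
import numpy as np
from src.evaluation import AngleMAE, MAE

ref_pose = np.random.rand(17, 2)
test_pose = np.random.rand(17, 2)

angle_mae = AngleMAE().mean_absolute_error(ref_pose, test_pose)  # in degrees
coord_mae = MAE.mean_absolute_error(ref_pose, test_pose)         # in coordinate units

print(f"AngleMAE: {angle_mae:.1f} deg, MAE: {coord_mae:.3f}")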
src/fastdtw_file.py
ADDED
@@ -0,0 +1,86 @@
+from __future__ import absolute_import, division
+
+from collections import defaultdict
+
+import numpy as np
+from pose_estimation import joints_angle
+
+
+def fastdtw(x, y, radius=1, dist=None, method={}, angle_comp_method=''):
+    ''' return the approximate distance between 2 time series with O(N)
+    time and memory complexity
+
+    Parameters
+    ----------
+    x : array_like
+        input array 1
+    y : array_like
+        input array 2
+    radius : int
+        size of neighborhood when expanding the path. A higher value will
+        increase the accuracy of the calculation but also increase time
+        and memory consumption. A radius equal to the size of x and y
+        will yield an exact dynamic time warping calculation.
+    dist : function or int
+        The method for calculating the distance between x[i] and y[j]. If
+        dist is an int of value p > 0, then the p-norm will be used. If
+        dist is a function then dist(x[i], y[j]) will be used. If dist is
+        None then abs(x[i] - y[j]) will be used.
+
+    Returns
+    -------
+    distance : float
+        the approximate distance between the 2 time series
+    path : list
+        list of indexes for the inputs x and y
+
+    '''
+    x = np.asanyarray(x, dtype='float')
+    y = np.asanyarray(y, dtype='float')
+
+    return __dtw(x, y, None, angle_comp_method=angle_comp_method)
+
+
+def cost(x, y, method, angle_comp_method=''):
+
+    mae_angle = joints_angle.mean_absolute_error(
+        x, y
+    )
+    cost = mae_angle
+
+    return cost
+
+
+def __dtw(x, y, method, angle_comp_method=''):
+    len_x, len_y = len(x), len(y)
+
+    dtw_mapping = defaultdict(lambda: (float('inf'),))
+    similarity_score = defaultdict(lambda: float('inf'), )
+
+    dtw_mapping[0, 0] = (0, 0, 0)
+
+    similarity_score[0, 0] = 0
+
+    for i in range(1, len_x + 1):
+        for j in range(1, len_y + 1):
+            dt = cost(
+                x[i - 1], y[j - 1], method, angle_comp_method=angle_comp_method
+            )
+            dtw_mapping[i, j] = min(
+                (dtw_mapping[i - 1, j][0] + dt, i - 1, j),
+                (dtw_mapping[i, j - 1][0] + dt, i, j - 1),
+                (dtw_mapping[i - 1, j - 1][0] + dt, i - 1, j - 1),
+                key=lambda a: a[0]
+            )
+            similarity_score[i, j] = dt
+
+    path = []
+    i, j = len_x, len_y
+    while not (i == j == 0):
+        path.append(
+            (i - 1, j - 1, dtw_mapping[i - 1, j - 1][0],
+             similarity_score[i - 1, j - 1])
+        )
+        i, j = dtw_mapping[i, j][1], dtw_mapping[i, j][2]
+    path.reverse()
+    return (dtw_mapping[len_x, len_y][0], path)
src/pose_detection.py
ADDED
@@ -0,0 +1,123 @@
+import numpy as np
+import tensorflow as tf
+import tensorflow_hub as hub
+import os
+
+
+class PoseDetection:
+    """Base class for pose detection in images using various algorithms such
+    as MoveNet
+
+    Warning: This class should not be used directly.
+    Use derived classes instead.
+
+    Parameters:
+        model_name : name of the pose detection method
+        input_size : image size of input required for model
+        model_path : path to pose detection model
+
+    """
+
+    def __init__(self, model_name=None, input_size=None, model_path=None):
+        self.model_name = model_name
+        self.input_size = input_size
+        self.model_path = model_path
+        self.load_model()
+
+    def load_model(self):
+        """Abstract method to load the pose detection model.
+        """
+        raise NotImplementedError
+
+    def preprocess_image(self, frame):
+        """Abstract method to preprocess an image before running pose
+        detection inference on it.
+        """
+        raise NotImplementedError
+
+    def run_inference(self, frames):
+        """Abstract method to run pose detection inference.
+        """
+        raise NotImplementedError
+
+
+class MovenetPoseDetection(PoseDetection):
+    """Class for pose detection in images using the MoveNet model
+
+    Parameters:
+        model_name : name of the pose detection method
+        input_size : image size of input required for model
+        model_path : path to pose detection model
+
+    """
+
+    def __init__(self, model_name=None, input_size=None, model_path=None):
+        model_name = model_name or 'movenet'
+        input_size = input_size or 256
+        model_path = model_path or 'models/movenet/movenet_tf/1'
+        super().__init__(model_name, input_size, model_path)
+
+    def load_model(self):
+        """Loads the pose detection model.
+
+        """
+        if self.model_name == "movenet":
+            module = hub.load(self.model_path)
+            self.model = module.signatures['serving_default']
+
+    def preprocess_image(self, frame):
+        """Preprocesses an image to transform it into the required format for
+        pose detection
+
+        Args:
+            frame: A numpy array representing the input image
+
+        Returns:
+            A resized input image as a tensor of 'int32' data type.
+        """
+        input_image = tf.expand_dims(frame, axis=0)
+        input_image = tf.image.resize_with_pad(
+            input_image, self.input_size, self.input_size
+        )
+        input_image = tf.cast(input_image, dtype=tf.int32)
+
+        return input_image
+
+    def run_inference(self, frames):
+        """Applies the pose detection model to a list of frames
+
+        Args:
+            frames: A list of numpy arrays representing the input images
+
+        Returns:
+            A [n, 17, 2] float numpy array representing the keypoint
+            coordinates predicted by the MoveNet model for the list of images
+        """
+        keypoints = np.zeros((len(frames), 17, 2))
+        for i, frame in enumerate(frames):
+            frame = self.preprocess_image(frame)
+            keypoints[i, :, :] = self.run_model(frame)
+        return keypoints
+
+    def run_model(self, input_image):
+        """Runs detection on an input image.
+
+        Args:
+            input_image: A [1, height, width, 3] tensor representing the input
+            image pixels. Note that the height/width should already be resized
+            to match the expected input resolution of the model before passing
+            into this function.
+
+        Returns:
+            A [17, 2] float numpy array representing the keypoint coordinates
+            predicted by the MoveNet model
+        """
+
+        outputs = self.model(input_image)
+        outputs = outputs['output_0'].numpy()
+        outputs[:, :, :, [0, 1]] = outputs[:, :, :, [1, 0]]
+
+        return outputs[0, 0, :, :2]
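A sketch of running the MoveNet wrapper above on a single frame; it assumes the SavedModel committed under models/movenet/movenet_tf/1 (the class default) is reachable from the working directory, and uses a blank image as a stand-in for a real cropped frame:

# Sketch: estimate 17 keypoints per frame with the MoveNet wrapper.
import numpy as np
from src.pose_detection import MovenetPoseDetection

movenet = MovenetPoseDetection()                     # loads models/movenet/movenet_tf/1
frames = [np.zeros((720, 720, 3), dtype=np.uint8)]   # stand-in cropped frame
keypoints = movenet.run_inference(frames)
print(keypoints.shape)  # (1, 17, 2), normalized x/y per joint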
src/run_chain.py
ADDED
@@ -0,0 +1,56 @@
+import argparse
+import os
+os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
+
+import cv2
+from src.correspondence import DTW
+from src.crop_video import YOLOCrop, TrackerCrop
+from src.pose_detection import MovenetPoseDetection
+from src import utils
+
+
+def main(ref_video_path, test_video_path, output_video_path, crop_method="yolo"):
+
+    ref_frames = utils.get_video_frames(ref_video_path)
+    test_frames = utils.get_video_frames(test_video_path)
+
+    if crop_method == 'Tracker':
+        crop_object = TrackerCrop()
+    else:
+        crop_object = YOLOCrop()
+
+    ref_crop_frames = crop_object.video_crop(ref_frames)
+    test_crop_frames = crop_object.video_crop(test_frames)
+
+    movenet = MovenetPoseDetection()
+    ref_keypoints = movenet.run_inference(ref_crop_frames)
+    test_keypoints = movenet.run_inference(test_crop_frames)
+
+    dtw = DTW(cost_weightage={'mae': 0, 'angle_mae': 1})
+    ref_frame_idx, test_frame_idx, costs = dtw.find_correspondence(ref_keypoints, test_keypoints)
+
+    utils.Plot.plot_matching(ref_crop_frames, test_crop_frames, ref_keypoints, test_keypoints, ref_frame_idx, test_frame_idx, costs, output_video_path)
+
+
+if __name__ == "__main__":
+
+    parser = argparse.ArgumentParser(
+        description="run chained process",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+    parser.add_argument('--Ref_video', type=str, required=True)
+    parser.add_argument('--Test_video', type=str, required=True)
+    parser.add_argument('--Output_path', type=str, required=True)
+
+    args = parser.parse_args()
+
+    try:
+        main(args.Ref_video, args.Test_video, args.Output_path)
+    except NameError:
+        print("Video file is not appropriate.")
+    except ValueError:
+        print("YOLO couldn't detect bounding box for given video.")
+    except cv2.error:
+        print(
+            "Can not convert color from BGR to RGB. Please check the input frame."
+        )
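The entry point above is presumably run from the repository root; a hedged sketch of both invocation styles follows (all paths are placeholders, and the CLI always falls back to the default 'yolo' crop method since no crop-method flag is defined):

# Command-line form (shown as a comment; it relies on the src package being
# importable from the repository root):
#   python -m src.run_chain --Ref_video reference.mp4 \
#       --Test_video attempt.mp4 --Output_path comparison.mp4
#
# Programmatic form, which also allows selecting the tracker-based crop:
from src import run_chain

run_chain.main('reference.mp4', 'attempt.mp4', 'comparison.mp4',
               crop_method='Tracker')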
src/utils.py
ADDED
@@ -0,0 +1,315 @@
+import os
+
+import cv2
+import numpy as np
+from moviepy.editor import ImageSequenceClip
+
+
+def get_video_frames(video_path):
+    """Reads a video frame by frame
+
+    Args:
+        video_path: A video path
+
+    Returns:
+        A list of numpy arrays representing video frames
+    """
+
+    vid_cap = cv2.VideoCapture(video_path)
+    frames = []
+    while True:
+        ret, frame = vid_cap.read()
+        if ret:
+            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+            frames.append(frame)
+        else:
+            break
+
+    return frames
+
+
+class Plot:
+    """Class for plotting keypoints on video frames and creating a video
+
+    """
+    KEYPOINT_EDGE_INDS_TO_COLOR = {
+        (0, 1): "m",
+        (0, 2): "c",
+        (1, 3): "m",
+        (2, 4): "c",
+        (0, 5): "m",
+        (0, 6): "c",
+        (5, 7): "m",
+        (7, 9): "m",
+        (6, 8): "c",
+        (8, 10): "c",
+        (5, 6): "y",
+        (5, 11): "m",
+        (6, 12): "c",
+        (11, 12): "y",
+        (11, 13): "m",
+        (13, 15): "m",
+        (12, 14): "c",
+        (14, 16): "c",
+    }
+
+    @staticmethod
+    def add_keypoints_to_image(
+            images_array, keypoints_list
+    ):
+        """
+        Adds keypoints to the images
+
+        Args:
+            images_array: list of images on which to draw the keypoints
+            keypoints_list : list of keypoints
+
+        Returns:
+            A numpy array of images with keypoints overlaid on them
+        """
+
+        output_overlay_array = images_array.astype(np.int32)
+        output_overlay_list = Plot.draw_prediction_on_image(
+            output_overlay_array, keypoints_list
+        )
+        return np.array(output_overlay_list)
+
+    @staticmethod
+    def draw_prediction_on_image(
+            image_list, keypoints_list
+    ):
+
+        """Draws the keypoint predictions on image.
+
+        Args:
+            image_list: A numpy array with shape [n, height, width, channel]
+            representing the pixel values of the input images where n is the
+            number of images.
+            keypoints_list: A numpy array with shape [n, 17, 2] representing
+            the coordinates of 17 keypoints where n is the number of images.
+
+        Returns:
+            A numpy array with shape [n, out_height, out_width, channel]
+            representing the list of images overlaid with keypoint
+            predictions.
+        """
+        height, width, channel = image_list[0].shape
+
+        keypoint_locs, keypoint_edges = Plot._keypoints_and_edges_for_display(
+            keypoints_list,
+            height, width
+        )
+        for img_i in range(keypoint_locs.shape[0]):
+            for edge in keypoint_edges[img_i]:
+                image = cv2.line(
+                    image_list[img_i], (int(edge[0]), int(edge[1])),
+                    (int(edge[2]), int(edge[3])), color=(0, 0, 255), thickness=3
+                )
+
+            for center_x, center_y in keypoint_locs[img_i]:
+                image = cv2.circle(
+                    image_list[img_i], (int(center_x), int(center_y)), radius=5,
+                    color=(255, 0, 0), thickness=-1
+                )
+
+            image_list[img_i] = image
+
+        return image_list
+
+    @staticmethod
+    def _keypoints_and_edges_for_display(
+            keypoints_list, height, width,
+    ):
+        """Returns keypoints and edges for visualization.
+
+        Args:
+            keypoints_list: A numpy array with shape [n, 17, 2] representing
+            the keypoint coordinates returned from the MoveNet model.
+            height: height of the image in pixels.
+            width: width of the image in pixels.
+
+        Returns:
+            A (kpts_absolute_xy, edges_xy) containing:
+            * array with shape [n, 17, 2] representing the coordinates of all
+            keypoints of all detected entities in n images;
+            * array with shape [n, 18, 4] representing the coordinates of all
+            skeleton edges of all detected entities in n images;
+        """
+        kpts_x = width * keypoints_list[:, :, 0]
+        kpts_y = height * keypoints_list[:, :, 1]
+
+        edge_pair = np.array(list(Plot.KEYPOINT_EDGE_INDS_TO_COLOR.keys()))
+        kpts_absolute_xy = np.stack(
+            [kpts_x, kpts_y], axis=-1
+        )
+
+        x_start = kpts_x[:, edge_pair[:, 0]]
+        y_start = kpts_y[:, edge_pair[:, 0]]
+        x_end = kpts_x[:, edge_pair[:, 1]]
+        y_end = kpts_y[:, edge_pair[:, 1]]
+
+        edges = np.stack([x_start, y_start, x_end, y_end], axis=2)
+        return kpts_absolute_xy, edges
+
+    @staticmethod
+    def resize_and_concat(ref_image_list, test_image_list):
+        """Pads either the reference frames or the test frames so that both
+        stacks have equal shape, then merges the frames side by side
+
+        Args:
+            ref_image_list: A list of numpy arrays representing reference
+            video frames
+            test_image_list: A list of numpy arrays representing test video
+            frames
+
+        Returns:
+            concat_img_list: A list of numpy arrays representing merged video
+            frames
+
+        """
+
+        def pad_image(image_list, pad_axis, pad_len, odd_len_diff):
+            """pads given number of pixels to image_list on given axis
+
+            Args:
+                image_list: A list of numpy arrays representing video frames
+                pad_axis: 0 to pad along the height, 1 to pad along the width
+                pad_len: number of pixels to pad on either side of frame
+                odd_len_diff: 1 if difference between reference and test is
+                odd else 0
+
+            Returns:
+                padded video frames
+
+            """
+            if pad_axis == 0:
+                return np.pad(
+                    image_list, (
+                        (0, 0), (pad_len, pad_len + odd_len_diff), (0, 0),
+                        (0, 0)), 'constant',
+                    constant_values=(0)
+                )
+            elif pad_axis == 1:
+                return np.pad(
+                    image_list, (
+                        (0, 0), (0, 0), (pad_len, pad_len + odd_len_diff),
+                        (0, 0)), 'constant',
+                    constant_values=(0)
+                )
+
+        ref_height, ref_width, _ = ref_image_list[0].shape
+        test_height, test_width, _ = test_image_list[0].shape
+
+        pad_height = abs(test_height - ref_height) // 2
+        odd_height_diff = (test_height - ref_height) % 2
+
+        if ref_height < test_height:
+            ref_image_list = pad_image(
+                ref_image_list, 0, pad_height, odd_height_diff
+            )
+        elif ref_height > test_height:
+            test_image_list = pad_image(
+                test_image_list, 0, pad_height, odd_height_diff
+            )
+
+        pad_width = abs(test_width - ref_width) // 2
+        odd_width_diff = (test_width - ref_width) % 2
+
+        if ref_width < test_width:
+            ref_image_list = pad_image(
+                ref_image_list, 1, pad_width, odd_width_diff
+            )
+        elif ref_width > test_width:
+            test_image_list = pad_image(
+                test_image_list, 1, pad_width, odd_width_diff
+            )
+
+        concat_img_list = np.concatenate(
+            (ref_image_list, test_image_list), axis=2
+        )
+        return concat_img_list
+
+    @staticmethod
+    def overlay_score_on_images(image_list, scores):
+        """writes score on given image list
+
+        Args:
+            image_list: A list of numpy arrays representing video frames
+            scores: A list of scores between reference and test keypoints
+
+        Returns:
+            A list of numpy arrays with the score overlaid on each frame
+
+        """
+        for i in range(len(image_list)):
+            image = image_list[i, :, :, :]
+
+            txt = f"Score : {scores[i]}"
+
+            image = cv2.putText(
+                image, txt, (5, 10), cv2.FONT_HERSHEY_SIMPLEX,
+                0.4, (255, 0, 0), 1, cv2.LINE_AA
+            )
+
+            image_list[i, :, :, :] = image
+
+        return image_list
+
+    def plot_matching(
+            ref_frames, test_frames, ref_keypoints, test_keypoints, ref_frames_idx,
+            test_frames_idx, costs, output_path
+    ):
+        """creates a video of reference and test video frames with keypoints
+        overlaid on them
+
+        Args:
+            ref_frames: A list of numpy arrays representing reference video
+            frames
+            test_frames: A list of numpy arrays representing test video frames
+            ref_keypoints: A list of numpy arrays representing reference video
+            keypoints
+            test_keypoints: A list of numpy arrays representing test video
+            keypoints
+            ref_frames_idx: A list of reference frame indices
+            test_frames_idx: A list of test frame indices
+            costs: A list of scores between reference and test keypoints
+            output_path: path at which the output video is to be stored
+
+        """
+        if (ref_frames_idx is not None) and (
+                test_frames_idx is not None) and len(ref_frames_idx) == len(
+            test_frames_idx
+        ):
+
+            ref_frames = ref_frames[ref_frames_idx]
+            test_frames = test_frames[test_frames_idx]
+
+            ref_keypoints = ref_keypoints[ref_frames_idx]
+            test_keypoints = test_keypoints[test_frames_idx]
+
+        if costs is None:
+            costs = ['N/A'] * len(ref_frames)
+
+        display_line = [
+            f"Cost: {costs[i]} Ref frame: {ref_frames_idx[i]} Test " \
+            f"frame: {test_frames_idx[i]}"
+            for i in range(len(ref_frames_idx))]
+
+        ref_image_list = Plot.add_keypoints_to_image(
+            ref_frames, ref_keypoints
+        )
+        test_image_list = Plot.add_keypoints_to_image(
+            test_frames, test_keypoints
+        )
+
+        comparison_img_list = Plot.resize_and_concat(
+            ref_image_list, test_image_list
+        )
+
+        comparison_img_list = Plot.overlay_score_on_images(
+            comparison_img_list, display_line
+        )
+
+        video = ImageSequenceClip(list(comparison_img_list), fps=5)
+        video.write_videofile(output_path, fps=5)
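To illustrate the padding logic in resize_and_concat, here is a small sketch with two dummy frame stacks of different sizes, mirroring what plot_matching does before writing the comparison video:

# Sketch: pad two frame stacks to a common size and join them side by side.
import numpy as np
from src.utils import Plot

ref = np.zeros((4, 200, 300, 3), dtype=np.int32)   # 4 frames of 200x300
test = np.zeros((4, 240, 260, 3), dtype=np.int32)  # 4 frames of 240x260

merged = Plot.resize_and_concat(ref, test)
print(merged.shape)  # (4, 240, 600, 3): both stacks padded to 240x300, then concatenated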