File size: 5,433 Bytes
8f7ac8f
aedc519
6c2ffca
11e2014
 
 
 
 
 
 
 
 
 
 
 
aedc519
 
6c2ffca
aedc519
 
 
 
11e2014
 
 
aedc519
 
11e2014
 
aedc519
 
 
 
 
11e2014
 
 
 
 
 
aedc519
 
11e2014
 
 
 
 
 
 
 
aedc519
 
11e2014
 
 
aedc519
11e2014
 
 
aedc519
11e2014
 
 
 
 
 
 
 
 
 
 
 
aedc519
11e2014
 
 
 
 
 
 
 
 
 
 
 
320fc26
 
11e2014
aedc519
 
11e2014
 
 
 
aedc519
11e2014
 
aedc519
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11e2014
aedc519
 
11e2014
aedc519
 
 
11e2014
aedc519
 
 
11e2014
aedc519
 
 
 
11e2014
aedc519
 
 
11e2014
aedc519
 
 
11e2014
aedc519
 
 
 
11e2014
aedc519
 
 
 
 
 
11e2014
 
aedc519
11e2014
 
aedc519
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
import json
import urllib
import urllib.request

import torch
import gradio as gr
from torchvision.transforms import Compose, Lambda
from torchvision.transforms._transforms_video import (
    CenterCropVideo,
    NormalizeVideo,
)
from pytorchvideo.data.encoded_video import EncodedVideo
from pytorchvideo.transforms import (
    ApplyTransformToKey,
    ShortSideScale,
    UniformTemporalSubsample,
    UniformCropVideo
)
import numpy as np # Explicitly add numpy import

# Inference runs on the CPU (no GPU is assumed to be available).
device = "cpu"

# Fetch the pretrained `slowfast_r50` network from the PyTorchVideo hub, then
# put it in evaluation mode and place it on the chosen device.
model = torch.hub.load(
    "facebookresearch/pytorchvideo", "slowfast_r50", pretrained=True
)
model = model.eval().to(device)

# --- Class Name Loading (from notebook) ---
json_url = "https://dl.fbaipublicfiles.com/pyslowfast/dataset/class_names/kinetics_classnames.json"
json_filename = "kinetics_classnames.json"

# `urllib.URLopener` only existed in Python 2, so on Python 3 the old
# try/except always fell through to `urlretrieve` (and its bare `except:`
# also swallowed KeyboardInterrupt/SystemExit). Call `urlretrieve` directly
# and let a genuine download failure surface with its real traceback.
urllib.request.urlretrieve(json_url, json_filename)

with open(json_filename, "r", encoding="utf-8") as f:
    kinetics_classnames = json.load(f)

# Invert the loaded mapping so that its values become the lookup keys and its
# keys (with any double-quote characters stripped) become the class names.
kinetics_id_to_classname = {
    class_id: str(class_name).replace('"', "")
    for class_name, class_id in kinetics_classnames.items()
}

# --- Define Input Transform (from notebook) ---
# Temporal settings: frames sampled per clip, plus the sampling rate and
# source frame rate that together determine the clip length in seconds.
num_frames = 32
sampling_rate = 2
frames_per_second = 30

# Divisor applied to the frame count when PackPathway builds the slow pathway.
slowfast_alpha = 4

# Spatial settings: short-side resize target and centre-crop size.
side_size = 256
crop_size = 256

# Normalisation statistics handed to NormalizeVideo (one entry per channel).
mean = [0.45] * 3
std = [0.225] * 3

# NOTE: the notebook's num_clips (10) and num_crops (3) settings are
# intentionally left out; the inference function does not use them.

class PackPathway(torch.nn.Module):
    """
    Transform that packs a clip tensor into the list ``[slow, fast]``.

    The fast pathway is the input tensor unchanged. The slow pathway keeps
    ``frames.shape[1] // alpha`` evenly spaced entries along dimension 1
    (presumably the temporal axis of a (C, T, H, W) clip -- confirm against
    the preceding transforms).

    Args:
        alpha: Ratio between the fast and slow pathway lengths. When left as
            ``None`` the module-level ``slowfast_alpha`` is read at call
            time, which matches the previous behaviour.

    Raises:
        ValueError: If ``alpha`` is given and is not a positive number.
    """
    def __init__(self, alpha=None):
        super().__init__()
        if alpha is not None and alpha <= 0:
            raise ValueError(f"alpha must be positive, got {alpha!r}")
        self.alpha = alpha

    def forward(self, frames: torch.Tensor):
        """Return ``[slow_pathway, fast_pathway]`` built from ``frames``."""
        alpha = slowfast_alpha if self.alpha is None else self.alpha
        total = frames.shape[1]

        # Evenly spaced positions from the first to the last entry, truncated
        # to integer indices. Built on CPU exactly as before, then moved to
        # the input's device so index_select also works for non-CPU tensors.
        slow_indices = torch.linspace(0, total - 1, total // alpha).long()
        slow_pathway = torch.index_select(
            frames, 1, slow_indices.to(frames.device)
        )

        fast_pathway = frames
        return [slow_pathway, fast_pathway]

# Preprocessing applied, in order, to the "video" entry of a clip dict:
# temporal subsampling, scaling pixel values by 1/255, normalisation,
# short-side resize, centre crop, and packing into the two-pathway list.
video_pipeline = Compose([
    UniformTemporalSubsample(num_frames),
    Lambda(lambda x: x/255.0),
    NormalizeVideo(mean, std),
    ShortSideScale(size=side_size),
    CenterCropVideo(crop_size),
    PackPathway(),
])
transform = ApplyTransformToKey(key="video", transform=video_pipeline)

# Length in seconds of the clip that gets loaded from each input video.
clip_duration = (num_frames * sampling_rate) / frames_per_second

# Download example video (for local testing and for Gradio examples)
url_link = "https://dl.fbaipublicfiles.com/pytorchvideo/projects/archery.mp4"
video_path = 'archery.mp4'
# `urllib.URLopener` is Python 2 only, so the former try/except always ended
# up in `urlretrieve` on Python 3 while its bare `except:` hid unrelated
# errors. Download directly and let real failures propagate.
urllib.request.urlretrieve(url_link, video_path)


def inference(in_vid):
    """Classify a video and report its five highest-scoring class labels.

    Args:
        in_vid: Path of the video to classify, or ``None`` when no video
            was supplied.

    Returns:
        A message string: the comma-separated top-5 labels on success, a
        prompt when ``in_vid`` is ``None``, a notice when the video is
        shorter than ``clip_duration``, or an error description when any
        exception is raised while processing.
    """
    if in_vid is None:
        return "Please upload a video or use the webcam."

    try:
        encoded = EncodedVideo.from_path(in_vid)

        # Reject videos that cannot supply a full clip.
        if encoded.duration < clip_duration:
            return f"Video is too short. Minimum duration is {clip_duration:.2f} seconds."

        # Only the opening `clip_duration` seconds of the video are used.
        clip_start = 0
        clip = encoded.get_clip(
            start_sec=clip_start, end_sec=clip_start + clip_duration
        )

        # Preprocess, then add a leading batch dimension to each pathway
        # tensor after moving it to the inference device.
        pathways = transform(clip)["video"]
        batch = [pathway.to(device).unsqueeze(0) for pathway in pathways]

        # Gradients are not needed for a forward-only pass.
        with torch.no_grad():
            scores = model(batch)

        probabilities = torch.softmax(scores, dim=1)
        top_ids = probabilities.topk(k=5).indices[0]

        # Translate class ids into their readable names.
        labels = []
        for class_id in top_ids:
            labels.append(kinetics_id_to_classname[int(class_id)])
        return "Top 5 predicted labels: %s" % ", ".join(labels)

    except Exception as err:
        # Surface any failure (decoding, preprocessing, model) as text in the UI.
        return f"An error occurred during inference: {err}"

# --- UPDATED GRADIO INTERFACE SYNTAX ---
# Components come straight from the top-level `gr` namespace
# (gr.inputs / gr.outputs are no longer used).
inputs_gradio = gr.Video(
    label="Upload Video or Use Webcam",
    sources=["upload", "webcam"],
    format="mp4",
)
outputs_gradio = gr.Textbox(label="Top 5 Predicted Labels")

# Text shown around the demo.
title = "PyTorchVideo SlowFast Action Recognition"
description = """
Demo for PyTorchVideo's SlowFast model, pretrained on the Kinetics 400 dataset for action recognition.
Upload your video or use your webcam to classify the action.
"""
article = "<p style='text-align: center'><a href='https://arxiv.org/abs/1812.03982' target='_blank'>SlowFast Networks for Video Recognition</a> | <a href='https://github.com/facebookresearch/pytorchvideo' target='_blank'>PyTorchVideo GitHub Repo</a></p>"

# The downloaded archery.mp4 is offered as a ready-made example input.
examples = [[video_path]]

# Build the interface around `inference` and start serving it.
demo = gr.Interface(
    fn=inference,
    inputs=inputs_gradio,
    outputs=outputs_gradio,
    title=title,
    description=description,
    article=article,
    examples=examples,
    analytics_enabled=False,
)
demo.launch()