"""Gradio demo: Kinetics action recognition with PyTorchVideo's SlowFast R50.

On start-up the script loads the pretrained `slowfast_r50` model from torch
hub, downloads the Kinetics class-name file and an example clip, and then
launches a Gradio interface whose callback (`inference`) classifies the first
`clip_duration` seconds of an uploaded or recorded video.
"""
import torch
import gradio as gr
import json
import urllib
import urllib.request  # `import urllib` alone does not load the `request` submodule
from torchvision.transforms import Compose, Lambda
from torchvision.transforms._transforms_video import (
    CenterCropVideo,
    NormalizeVideo,
)
from pytorchvideo.data.encoded_video import EncodedVideo
from pytorchvideo.transforms import (
    ApplyTransformToKey,
    ShortSideScale,
    UniformTemporalSubsample,
    UniformCropVideo
)
import numpy as np  # Explicitly add numpy import

# Choose the `slowfast_r50` model
model = torch.hub.load('facebookresearch/pytorchvideo', 'slowfast_r50', pretrained=True)

# Inference runs on the CPU; the model is put in eval mode before use.
device = "cpu"
model = model.eval()
model = model.to(device)

# --- Class Name Loading (from notebook) ---
json_url = "https://dl.fbaipublicfiles.com/pyslowfast/dataset/class_names/kinetics_classnames.json"
json_filename = "kinetics_classnames.json"
# `urllib.URLopener` does not exist on Python 3, so the old try/bare-except
# always ended up in this call anyway; call it directly so real download
# errors are no longer masked.
urllib.request.urlretrieve(json_url, json_filename)

with open(json_filename, "r", encoding="utf-8") as f:
    kinetics_classnames = json.load(f)

# Invert the JSON mapping (key -> value) into value -> key, stripping any
# literal double quotes from the key so it can be shown as a plain label.
kinetics_id_to_classname = {
    class_id: str(class_name).replace('"', "")
    for class_name, class_id in kinetics_classnames.items()
}

# --- Define Input Transform (from notebook) ---
side_size = 256
mean = [0.45, 0.45, 0.45]
std = [0.225, 0.225, 0.225]
crop_size = 256
num_frames = 32
sampling_rate = 2
frames_per_second = 30
slowfast_alpha = 4
# num_clips = 10 # Not used in inference function
# num_crops = 3 # Not used in inference function


class PackPathway(torch.nn.Module):
    """Split a frame tensor into the [slow, fast] pair of model inputs.

    The fast pathway is the input unchanged. The slow pathway keeps
    ``frames.shape[1] // slowfast_alpha`` evenly spaced indices along
    dimension 1 (presumably the temporal axis of a (C, T, H, W) clip --
    TODO confirm against the upstream transforms).
    """

    def __init__(self):
        super().__init__()

    def forward(self, frames: torch.Tensor):
        """Return ``[slow_pathway, fast_pathway]`` built from ``frames``."""
        fast_pathway = frames
        # Evenly spaced indices over dimension 1, truncated to integers.
        slow_indices = torch.linspace(
            0, frames.shape[1] - 1, frames.shape[1] // slowfast_alpha
        ).long()
        slow_pathway = torch.index_select(frames, 1, slow_indices)
        return [slow_pathway, fast_pathway]


# Applied to the "video" entry of the dict returned by `get_clip`.
transform = ApplyTransformToKey(
    key="video",
    transform=Compose(
        [
            UniformTemporalSubsample(num_frames),
            Lambda(lambda x: x / 255.0),
            NormalizeVideo(mean, std),
            ShortSideScale(size=side_size),
            CenterCropVideo(crop_size),
            PackPathway(),
        ]
    ),
)

# Length, in seconds, of the clip fed to the model (32 * 2 / 30 ~= 2.13 s).
clip_duration = (num_frames * sampling_rate) / frames_per_second

# Download example video (for local testing and for Gradio examples)
url_link = "https://dl.fbaipublicfiles.com/pytorchvideo/projects/archery.mp4"
video_path = 'archery.mp4'
urllib.request.urlretrieve(url_link, video_path)


def inference(in_vid):
    """Classify the first `clip_duration` seconds of a video.

    Args:
        in_vid: Path of the video file handed over by the Gradio video
            component, or ``None`` when nothing was provided.

    Returns:
        A human-readable string: the top-5 predicted labels on success, or
        a message explaining why no prediction could be made (no input,
        video shorter than `clip_duration`, or any exception raised while
        decoding / running the model).
    """
    if in_vid is None:
        return "Please upload a video or use the webcam."

    try:
        # Initialize an EncodedVideo helper class and load the video
        video = EncodedVideo.from_path(in_vid)

        # Ensure we have enough frames for the clip duration
        if video.duration < clip_duration:
            return f"Video is too short. Minimum duration is {clip_duration:.2f} seconds."

        # Always classify the opening `clip_duration` seconds.
        start_sec = 0
        end_sec = start_sec + clip_duration

        # Load the desired clip
        video_data = video.get_clip(start_sec=start_sec, end_sec=end_sec)

        # Apply a transform to normalize the video input
        video_data = transform(video_data)

        # Move each pathway to the device and add a leading batch dimension.
        inputs = [pathway.to(device)[None, ...] for pathway in video_data["video"]]

        # Pass the input clip through the model without tracking gradients.
        with torch.no_grad():
            preds = model(inputs)

        # Convert scores to probabilities and take the five best classes.
        preds = torch.softmax(preds, dim=1)
        pred_classes = preds.topk(k=5).indices[0]

        # Map the predicted classes to the label names
        pred_class_names = [kinetics_id_to_classname[int(i)] for i in pred_classes]
        return f"Top 5 predicted labels: {', '.join(pred_class_names)}"

    except Exception as e:
        # UI boundary: report the failure as text instead of raising, so
        # decoding problems or short/odd inputs show up in the output box.
        return f"An error occurred during inference: {e}"


# --- UPDATED GRADIO INTERFACE SYNTAX ---
# Removed gr.inputs and gr.outputs
inputs_gradio = gr.Video(label="Upload Video or Use Webcam", sources=["upload", "webcam"], format="mp4")
outputs_gradio = gr.Textbox(label="Top 5 Predicted Labels")

title = "PyTorchVideo SlowFast Action Recognition"
description = """
Demo for PyTorchVideo's SlowFast model, pretrained on the Kinetics 400 dataset for action recognition.
Upload your video or use your webcam to classify the action.
"""
# NOTE(review): this footer was a double-quoted literal broken across several
# lines (a syntax error). The visible text is preserved with explicit
# newlines; any link markup it may once have had is not visible here --
# restore it if the footer is supposed to contain hyperlinks.
article = "\n\nSlowFast Networks for Video Recognition | PyTorchVideo GitHub Repo\n\n"

examples = [
    [video_path]  # Use the downloaded archery.mp4 as an example
]

demo = gr.Interface(
    fn=inference,
    inputs=inputs_gradio,
    outputs=outputs_gradio,
    title=title,
    description=description,
    article=article,
    examples=examples,
    analytics_enabled=False,
)
demo.launch()