owinymarvin committed
Commit aedc519 · 1 Parent(s): 11e2014

latest changes

Files changed (1)
  1. app.py +80 -46
app.py CHANGED
@@ -1,7 +1,5 @@
  import torch
- # Choose the `slowfast_r50` model
- model = torch.hub.load('facebookresearch/pytorchvideo', 'slowfast_r50', pretrained=True)
- from typing import Dict
+ import gradio as gr
  import json
  import urllib
  from torchvision.transforms import Compose, Lambda
@@ -15,24 +13,33 @@ from pytorchvideo.transforms import (
      ShortSideScale,
      UniformTemporalSubsample,
      UniformCropVideo
- )
+ )
+ import numpy as np  # Explicitly add numpy import

- import gradio as gr
- # Set to GPU or CPU
+ # Choose the `slowfast_r50` model
+ model = torch.hub.load('facebookresearch/pytorchvideo', 'slowfast_r50', pretrained=True)
+
+ # Set to CPU since you don't have a GPU
  device = "cpu"
  model = model.eval()
  model = model.to(device)
+
+ # --- Class Name Loading (from notebook) ---
  json_url = "https://dl.fbaipublicfiles.com/pyslowfast/dataset/class_names/kinetics_classnames.json"
  json_filename = "kinetics_classnames.json"
- try: urllib.URLopener().retrieve(json_url, json_filename)
- except: urllib.request.urlretrieve(json_url, json_filename)
+ try:
+     urllib.URLopener().retrieve(json_url, json_filename)
+ except:
+     urllib.request.urlretrieve(json_url, json_filename)
+
  with open(json_filename, "r") as f:
      kinetics_classnames = json.load(f)

- # Create an id to label name mapping
  kinetics_id_to_classname = {}
  for k, v in kinetics_classnames.items():
      kinetics_id_to_classname[v] = str(k).replace('"', "")
+
+ # --- Define Input Transform (from notebook) ---
  side_size = 256
  mean = [0.45, 0.45, 0.45]
  std = [0.225, 0.225, 0.225]
@@ -41,19 +48,18 @@ num_frames = 32
  sampling_rate = 2
  frames_per_second = 30
  slowfast_alpha = 4
- num_clips = 10
- num_crops = 3
+ # num_clips = 10  # Not used in inference function
+ # num_crops = 3  # Not used in inference function

  class PackPathway(torch.nn.Module):
      """
-     Transform for converting video frames as a list of tensors.
+     Transform for converting video frames as a list of tensors.
      """
      def __init__(self):
          super().__init__()
-
+
      def forward(self, frames: torch.Tensor):
          fast_pathway = frames
-         # Perform temporal sampling from the fast pathway.
          slow_pathway = torch.index_select(
              frames,
              1,
@@ -64,7 +70,7 @@ class PackPathway(torch.nn.Module):
          frame_list = [slow_pathway, fast_pathway]
          return frame_list

- transform = ApplyTransformToKey(
+ transform = ApplyTransformToKey(
      key="video",
      transform=Compose(
          [
@@ -79,53 +85,81 @@ transform = ApplyTransformToKey(
          ]
      ),
  )
-
- # The duration of the input clip is also specific to the model.
  clip_duration = (num_frames * sampling_rate)/frames_per_second
+
+ # Download example video (for local testing and for Gradio examples)
  url_link = "https://dl.fbaipublicfiles.com/pytorchvideo/projects/archery.mp4"
  video_path = 'archery.mp4'
  try: urllib.URLopener().retrieve(url_link, video_path)
  except: urllib.request.urlretrieve(url_link, video_path)
- # Select the duration of the clip to load by specifying the start and end duration
- # The start_sec should correspond to where the action occurs in the video
+

  def inference(in_vid):
-     start_sec = 0
-     end_sec = start_sec + clip_duration
+     if in_vid is None:
+         return "Please upload a video or use the webcam."
+
+     try:
+         # Initialize an EncodedVideo helper class and load the video
+         video = EncodedVideo.from_path(in_vid)
+
+         # Ensure we have enough frames for the clip duration
+         if video.duration < clip_duration:
+             return f"Video is too short. Minimum duration is {clip_duration:.2f} seconds."
+
+         # Select the duration of the clip to load by specifying the start and end duration
+         start_sec = 0
+         end_sec = start_sec + clip_duration
+
+         # Load the desired clip
+         video_data = video.get_clip(start_sec=start_sec, end_sec=end_sec)

-     # Initialize an EncodedVideo helper class and load the video
-     video = EncodedVideo.from_path(in_vid)
+         # Apply a transform to normalize the video input
+         video_data = transform(video_data)

-     # Load the desired clip
-     video_data = video.get_clip(start_sec=start_sec, end_sec=end_sec)
+         # Move the inputs to the desired device
+         inputs = video_data["video"]
+         inputs = [i.to(device)[None, ...] for i in inputs]

-     # Apply a transform to normalize the video input
-     video_data = transform(video_data)
+         # Pass the input clip through the model
+         with torch.no_grad():  # Ensure no gradient computation for inference
+             preds = model(inputs)

-     # Move the inputs to the desired device
-     inputs = video_data["video"]
-     inputs = [i.to(device)[None, ...] for i in inputs]
-     # Pass the input clip through the model
-     preds = model(inputs)
+         # Get the predicted classes
+         post_act = torch.nn.Softmax(dim=1)
+         preds = post_act(preds)
+         pred_classes = preds.topk(k=5).indices[0]

-     # Get the predicted classes
-     post_act = torch.nn.Softmax(dim=1)
-     preds = post_act(preds)
-     pred_classes = preds.topk(k=5).indices[0]
+         # Map the predicted classes to the label names
+         pred_class_names = [kinetics_id_to_classname[int(i)] for i in pred_classes]
+         return "Top 5 predicted labels: %s" % ", ".join(pred_class_names)

-     # Map the predicted classes to the label names
-     pred_class_names = [kinetics_id_to_classname[int(i)] for i in pred_classes]
-     return "%s" % ", ".join(pred_class_names)
+     except Exception as e:
+         # Catch common errors like video decoding issues or insufficient frames
+         return f"An error occurred during inference: {e}"

- inputs = gr.inputs.Video(label="Input Video")
- outputs = gr.outputs.Textbox(label="Top 5 predicted labels")
+ # --- UPDATED GRADIO INTERFACE SYNTAX ---
+ # Removed gr.inputs and gr.outputs
+ inputs_gradio = gr.Video(label="Upload Video or Use Webcam", sources=["upload", "webcam"], format="mp4")
+ outputs_gradio = gr.Textbox(label="Top 5 Predicted Labels")

- title = "SLOWFAST"
- description = "demo for SLOWFAST, SlowFast networks pretrained on the Kinetics 400 dataset. To use it, simply upload your video, or click one of the examples to load them. Read more at the links below."
- article = "<p style='text-align: center'><a href='https://arxiv.org/abs/1812.03982'>SlowFast Networks for Video Recognition</a> | <a href='https://github.com/facebookresearch/pytorchvideo'>Github Repo</a></p>"
+ title = "PyTorchVideo SlowFast Action Recognition"
+ description = """
+ Demo for PyTorchVideo's SlowFast model, pretrained on the Kinetics 400 dataset for action recognition.
+ Upload your video or use your webcam to classify the action.
+ """
+ article = "<p style='text-align: center'><a href='https://arxiv.org/abs/1812.03982' target='_blank'>SlowFast Networks for Video Recognition</a> | <a href='https://github.com/facebookresearch/pytorchvideo' target='_blank'>PyTorchVideo GitHub Repo</a></p>"

  examples = [
-     ['archery.mp4']
+     [video_path]  # Use the downloaded archery.mp4 as an example
  ]

- gr.Interface(inference, inputs, outputs, title=title, description=description, article=article, examples=examples, analytics_enabled=False).launch(debug=True)
+ gr.Interface(
+     fn=inference,
+     inputs=inputs_gradio,
+     outputs=outputs_gradio,
+     title=title,
+     description=description,
+     article=article,
+     examples=examples,
+     analytics_enabled=False
+ ).launch()
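
One pattern this commit keeps is the two-step download (try: urllib.URLopener()... except: urllib.request.urlretrieve(...)). In Python 3, urllib.URLopener is not an attribute of the urllib package, so the try branch always raises AttributeError and the bare except falls through to urllib.request.urlretrieve. A minimal sketch of an equivalent, more direct helper (the name download_if_missing is ours, not the app's):

import os
import urllib.request

def download_if_missing(url: str, filename: str) -> str:
    # urllib.URLopener() raises AttributeError on Python 3, so the diff's
    # try/except always ends up calling urllib.request.urlretrieve anyway;
    # calling it directly (and skipping files already on disk) is equivalent.
    if not os.path.exists(filename):
        urllib.request.urlretrieve(url, filename)
    return filename

# Usage mirroring the commit:
# download_if_missing("https://dl.fbaipublicfiles.com/pytorchvideo/projects/archery.mp4", "archery.mp4")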
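The PackPathway hunk cuts off mid-call, so the index tensor passed to torch.index_select is not visible in the diff. In the PyTorchVideo SlowFast tutorial this app follows, those indices come from a torch.linspace that keeps roughly every slowfast_alpha-th frame; the sketch below assumes that, using the commit's settings (num_frames = 32, slowfast_alpha = 4):

import torch

# Standalone sketch of the slow-pathway subsampling, assuming a (C, T, H, W)
# frames tensor as produced by the transform pipeline in app.py.
frames = torch.randn(3, 32, 8, 8)   # T = num_frames = 32
slowfast_alpha = 4
indices = torch.linspace(0, frames.shape[1] - 1, frames.shape[1] // slowfast_alpha).long()
slow_pathway = torch.index_select(frames, 1, indices)
fast_pathway = frames  # the fast pathway keeps all frames
print(slow_pathway.shape)  # torch.Size([3, 8, 8, 8]): 8 of the 32 frames survive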
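On the inference path, the clip length the new duration check guards is clip_duration = (num_frames * sampling_rate) / frames_per_second = (32 * 2) / 30 ≈ 2.13 seconds. The softmax/top-k decoding added in this commit can also be exercised on its own; a sketch assuming a (1, 400) output tensor (one clip, 400 Kinetics classes) and a dummy id-to-label mapping standing in for the one built from the class-name JSON:

import torch

preds = torch.randn(1, 400)          # stand-in for model(inputs)
post_act = torch.nn.Softmax(dim=1)
probs = post_act(preds)
pred_classes = probs.topk(k=5).indices[0]
kinetics_id_to_classname = {i: f"class_{i}" for i in range(400)}  # dummy mapping
pred_class_names = [kinetics_id_to_classname[int(i)] for i in pred_classes]
print("Top 5 predicted labels: %s" % ", ".join(pred_class_names))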
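Finally, the interface rewrite targets the Gradio 4.x component API, where gr.inputs/gr.outputs no longer exist and components are top-level classes. A minimal sketch of the same wiring, assuming Gradio 4.x (where gr.Video accepts sources and format); the classify stub stands in for the commit's inference function:

import gradio as gr

def classify(video_path):
    # Gradio passes the uploaded video's file path as a string (or None).
    if video_path is None:
        return "Please upload a video or use the webcam."
    return f"received: {video_path}"

demo = gr.Interface(
    fn=classify,
    inputs=gr.Video(label="Upload Video or Use Webcam", sources=["upload", "webcam"], format="mp4"),
    outputs=gr.Textbox(label="Top 5 Predicted Labels"),
    analytics_enabled=False,
)

if __name__ == "__main__":
    demo.launch()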