Andy1621 committed
Commit 8f49d2e · 1 Parent(s): 89eb4bc

Create app.py

Files changed (1):
  1. app.py +101 -0
app.py ADDED
@@ -0,0 +1,101 @@
+ import os
+
+ import torch
+ import numpy as np
+ import torch.nn.functional as F
+ import torchvision.transforms as T
+ from PIL import Image
+ from decord import VideoReader
+ from decord import cpu
+ from uniformer import uniformer_small
+ from kinetics_class_index import kinetics_classnames
+ from transforms import (
+     GroupNormalize, GroupScale, GroupCenterCrop,
+     Stack, ToTorchFormatTensor
+ )
+
+ import gradio as gr
+
+ # Device on which to run the model
+ # Set to "cuda" to run on the GPU
+ device = "cpu"
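+ # Download the pretrained checkpoint (the file is named by its LFS object hash)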
+ os.system("wget https://cdn-lfs.huggingface.co/Andy1621/uniformer/d5fd7b0c49ee6a5422ef5d0c884d962c742003bfbd900747485eb99fa269d0db")
+ # Pick a pretrained model
+ model = uniformer_small()
+ state_dict = torch.load('d5fd7b0c49ee6a5422ef5d0c884d962c742003bfbd900747485eb99fa269d0db', map_location='cpu')
+ model.load_state_dict(state_dict['model'])
+
+ # Set to eval mode and move to desired device
+ model = model.to(device)
+ model = model.eval()
+
+ # Create an id to label name mapping
+ kinetics_id_to_classname = {}
+ for k, v in kinetics_classnames.items():
+     kinetics_id_to_classname[k] = v
+
+
+ def get_index(num_frames, num_segments=16, dense_sample_rate=8):
+     # Densely sample num_segments frame indices, dense_sample_rate apart,
+     # centred within the clip
+     sample_range = num_segments * dense_sample_rate
+     sample_pos = max(1, 1 + num_frames - sample_range)
+     t_stride = dense_sample_rate
+     start_idx = 0 if sample_pos == 1 else sample_pos // 2
+     offsets = np.array([
+         (idx * t_stride + start_idx) %
+         num_frames for idx in range(num_segments)
+     ])
+     return offsets + 1
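+ # Illustration: for a 300-frame clip, get_index(300, 16, 16) samples frames
+ # 23, 39, ..., 263: sixteen indices spaced 16 frames apart, centred in the clip.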
+
+
+ def load_video(video_path):
+     vr = VideoReader(video_path, ctx=cpu(0))
+     num_frames = len(vr)
+     frame_indices = get_index(num_frames, 16, 16)
+
+     # ImageNet-style preprocessing: scale, centre-crop, stack, normalize
+     crop_size = 224
+     scale_size = 256
+     input_mean = [0.485, 0.456, 0.406]
+     input_std = [0.229, 0.224, 0.225]
+
+     transform = T.Compose([
+         GroupScale(int(scale_size)),
+         GroupCenterCrop(crop_size),
+         Stack(),
+         ToTorchFormatTensor(),
+         GroupNormalize(input_mean, input_std)
+     ])
+
+     images_group = list()
+     for frame_index in frame_indices:
+         img = Image.fromarray(vr[frame_index].asnumpy())
+         images_group.append(img)
+     torch_imgs = transform(images_group)
+     return torch_imgs
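+ # Stack()/ToTorchFormatTensor() concatenate the 16 RGB frames along the
+ # channel axis, so load_video returns a single (16*3, 224, 224) tensor.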
+
+
+ def inference(video):
+     vid = load_video(video)
+
+     # The model expects inputs of shape: B x C x T x H x W
+     # (T*C, H, W) -> (1, T, C, H, W) -> (1, C, T, H, W)
+     TC, H, W = vid.shape
+     inputs = vid.reshape(1, TC//3, 3, H, W).permute(0, 2, 1, 3, 4)
+
+     with torch.no_grad():
+         prediction = model(inputs)
+     prediction = F.softmax(prediction, dim=1).flatten()
+
+     return {kinetics_id_to_classname[str(i)]: float(prediction[i]) for i in range(len(kinetics_id_to_classname))}
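+ # The {label: probability} dict is the format gr.outputs.Label expects;
+ # it renders the most confident classes (num_top_classes=5 below).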
+
+
+ inputs = gr.inputs.Video()
+ label = gr.outputs.Label(num_top_classes=5)
+
+ title = "UniFormer-S"
+ description = "Gradio demo for UniFormer: to use it, simply upload your video, or click one of the examples to load it. Read more at the links below."
+ article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2201.09450' target='_blank'>UniFormer: Unifying Convolution and Self-attention for Visual Recognition</a> | <a href='https://github.com/Sense-X/UniFormer' target='_blank'>Github Repo</a></p>"
+
+ gr.Interface(
+     inference, inputs, outputs=label,
+     title=title, description=description, article=article,
+     examples=[['hitting_baseball.mp4'], ['hoverboarding.mp4'], ['yoga.mp4']]
+ ).launch(enable_queue=True, cache_examples=True)
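
Running `python app.py` serves the demo through Gradio. For a quick sanity check without the web UI, the classifier can also be called directly (before the launch() call, or in an interactive session); a minimal sketch, assuming the bundled hitting_baseball.mp4 example clip is on disk:

# Sketch: run the pipeline on one example clip and print the top-5 labels.
preds = inference('hitting_baseball.mp4')
for name, prob in sorted(preds.items(), key=lambda kv: kv[1], reverse=True)[:5]:
    print(f'{name}: {prob:.4f}')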