Create app.py
app.py
ADDED
import os

import torch
import numpy as np
import torch.nn.functional as F
import torchvision.transforms as T
from PIL import Image
from decord import VideoReader
from decord import cpu
from uniformer import uniformer_small
from kinetics_class_index import kinetics_classnames
from transforms import (
    GroupNormalize, GroupScale, GroupCenterCrop,
    Stack, ToTorchFormatTensor
)

import gradio as gr
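
# Note: uniformer, kinetics_class_index, and transforms are not pip
# packages; they are expected to be local helper modules sitting alongside
# app.py in this Space (the UniFormer model definition, the Kinetics
# id-to-name table, and TSN-style group transforms).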

# Device on which to run the model
# Set to cuda to load on GPU
device = "cpu"

# Download the pretrained checkpoint; wget saves the LFS blob under its
# hash as the local filename
os.system("wget https://cdn-lfs.huggingface.co/Andy1621/uniformer/d5fd7b0c49ee6a5422ef5d0c884d962c742003bfbd900747485eb99fa269d0db")

# Pick a pretrained model
model = uniformer_small()
state_dict = torch.load('d5fd7b0c49ee6a5422ef5d0c884d962c742003bfbd900747485eb99fa269d0db', map_location='cpu')
model.load_state_dict(state_dict['model'])

# Set to eval mode and move to desired device
model = model.to(device)
model = model.eval()

# Create an id to label name mapping
kinetics_id_to_classname = {}
for k, v in kinetics_classnames.items():
    kinetics_id_to_classname[k] = v


def get_index(num_frames, num_segments=16, dense_sample_rate=8):
    # Dense sampling: pick num_segments frames, dense_sample_rate apart,
    # from a window centered in the clip (or starting at frame 0 when the
    # clip is shorter than the window)
    sample_range = num_segments * dense_sample_rate
    sample_pos = max(1, 1 + num_frames - sample_range)
    t_stride = dense_sample_rate
    start_idx = 0 if sample_pos == 1 else sample_pos // 2
    offsets = np.array([
        (idx * t_stride + start_idx) %
        num_frames for idx in range(num_segments)
    ])
    # decord frame indices are 0-based; offsets already lie in
    # [0, num_frames - 1]
    return offsets
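
# Worked example, matching the get_index(num_frames, 16, 16) call in
# load_video below: a 300-frame clip gives a 16 * 16 = 256-frame window,
# so sample_pos = 45, start_idx = 22, and the sampled frames are
# [22, 38, 54, ..., 262] -- every 16th frame from a centered window.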


def load_video(video_path):
    vr = VideoReader(video_path, ctx=cpu(0))
    num_frames = len(vr)
    frame_indices = get_index(num_frames, 16, 16)

    # TSN-style group transform: resize, center-crop, stack, normalize
    crop_size = 224
    scale_size = 256
    input_mean = [0.485, 0.456, 0.406]
    input_std = [0.229, 0.224, 0.225]

    transform = T.Compose([
        GroupScale(int(scale_size)),
        GroupCenterCrop(crop_size),
        Stack(),
        ToTorchFormatTensor(),
        GroupNormalize(input_mean, input_std)
    ])

    images_group = list()
    for frame_index in frame_indices:
        img = Image.fromarray(vr[frame_index].asnumpy())
        images_group.append(img)
    torch_imgs = transform(images_group)
    return torch_imgs
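
# Note: Stack + ToTorchFormatTensor collapse the 16 PIL frames into a
# single tensor of shape (T*C, H, W) rather than a list of per-frame
# tensors; inference below unpacks it into the 5-D clip the model expects.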


def inference(video):
    vid = load_video(video)

    # The model expects inputs of shape: B x C x T x H x W
    TC, H, W = vid.shape
    inputs = vid.reshape(1, TC//3, 3, H, W).permute(0, 2, 1, 3, 4)
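
    # Shape walk-through (16 frames at 224x224): vid is (48, 224, 224);
    # reshape gives (1, 16, 3, 224, 224) and permute reorders it to
    # (1, 3, 16, 224, 224), i.e. B x C x T x H x W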

    prediction = model(inputs)
    prediction = F.softmax(prediction, dim=1).flatten()

    # Kinetics-400 has 400 classes
    return {kinetics_id_to_classname[str(i)]: float(prediction[i]) for i in range(400)}
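

# A quick sanity check one could run locally, bypassing the Gradio UI
# (assumes one of the example clips listed below is present):
#   top5 = sorted(inference('hitting_baseball.mp4').items(),
#                 key=lambda kv: -kv[1])[:5]
#   print(top5)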


inputs = gr.inputs.Video()
label = gr.outputs.Label(num_top_classes=5)
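
# gr.inputs / gr.outputs (and enable_queue below) are the legacy,
# pre-3.x Gradio API; on current Gradio these would roughly be
# gr.Video(), gr.Label(num_top_classes=5), and Interface(...).queue().launch()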

title = "UniFormer-S"
description = "Gradio demo for UniFormer: To use it, simply upload your video, or click one of the examples to load it. Read more at the links below."
article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2201.09450' target='_blank'>UniFormer: Unifying Convolution and Self-attention for Visual Recognition</a> | <a href='https://github.com/Sense-X/UniFormer' target='_blank'>GitHub Repo</a></p>"

gr.Interface(
    inference, inputs, outputs=label,
    title=title, description=description, article=article,
    examples=[['hitting_baseball.mp4'], ['hoverboarding.mp4'], ['yoga.mp4']]
).launch(enable_queue=True, cache_examples=True)