import gradio as gr
import torch
import numpy as np
from transformers import AutoProcessor, AutoModel
from PIL import Image
from decord import VideoReader, cpu
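
# The imports above imply the Space's dependencies: gradio, torch, numpy,
# transformers, pillow, and decord (typically pinned in a requirements.txt).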

def sample_uniform_frame_indices(clip_len, seg_len):
    """
    Samples `clip_len` uniformly spaced frame indices from a video of length `seg_len`.
    Handles edge cases where `seg_len` might be less than `clip_len`.
    """
    if seg_len < clip_len:
        repeat_factor = np.ceil(clip_len / seg_len).astype(int)
        indices = np.arange(seg_len).tolist() * repeat_factor
        indices = indices[:clip_len]
    else:
        spacing = seg_len // clip_len
        indices = [i * spacing for i in range(clip_len)]
    return np.array(indices).astype(np.int64)
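
# Quick illustrative sanity checks (cheap, pure NumPy; safe to remove).
# With 10 frames available and an 8-frame clip, spacing = 10 // 8 = 1, so the
# first 8 indices are taken; with only 3 frames available, the indices are
# repeated until the requested clip length is reached.
assert sample_uniform_frame_indices(8, seg_len=10).tolist() == [0, 1, 2, 3, 4, 5, 6, 7]
assert sample_uniform_frame_indices(8, seg_len=3).tolist() == [0, 1, 2, 0, 1, 2, 0, 1]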

def read_video_decord(file_path, indices):
    """
    Decodes only the frames at `indices` from the video at `file_path` using decord.
    Returns a uint8 NumPy array of shape (len(indices), height, width, 3).
    """
    vr = VideoReader(file_path, num_threads=1, ctx=cpu(0))
    video = vr.get_batch(indices).asnumpy()
    return video

def concatenate_frames(frames, clip_len):
    assert len(frames) == clip_len, f"The function expects {clip_len} frames as input."
    # Grid layout (rows, cols) used to tile the sampled frames into one preview image.
    layout = {
        32: (4, 8),
        16: (4, 4),
        8: (2, 4)
    }
    rows, cols = layout[clip_len]
    combined_image = Image.new('RGB', (frames[0].shape[1] * cols, frames[0].shape[0] * rows))
    frame_iter = iter(frames)
    y_offset = 0
    for i in range(rows):
        x_offset = 0
        for j in range(cols):
            img = Image.fromarray(next(frame_iter))
            combined_image.paste(img, (x_offset, y_offset))
            x_offset += frames[0].shape[1]
        y_offset += frames[0].shape[0]
    return combined_image
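
# Illustrative example (assuming hypothetical 224x224 RGB frames): with
# clip_len=8 the grid is 2 rows x 4 columns, so the preview image returned
# above would be 4 * 224 = 896 pixels wide and 2 * 224 = 448 pixels tall.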

def model_interface(uploaded_video, model_choice, activities):
    # Number of frames each checkpoint expects.
    clip_len = {
        "microsoft/xclip-base-patch16-zero-shot": 32,
        "microsoft/xclip-base-patch32-16-frames": 16,
        "microsoft/xclip-base-patch32": 8
    }.get(model_choice, 32)
    indices = sample_uniform_frame_indices(clip_len, seg_len=len(VideoReader(uploaded_video)))
    video = read_video_decord(uploaded_video, indices)
    concatenated_image = concatenate_frames(video, clip_len)

    processor = AutoProcessor.from_pretrained(model_choice)
    model = AutoModel.from_pretrained(model_choice)

    activities_list = [activity.strip() for activity in activities.split(",")]
    inputs = processor(
        text=activities_list,
        videos=list(video),
        return_tensors="pt",
        padding=True,
    )

    with torch.no_grad():
        outputs = model(**inputs)

    logits_per_video = outputs.logits_per_video
    probs = logits_per_video.softmax(dim=1)

    results_probs = []
    results_logits = []
    for i, activity in enumerate(activities_list):
        prob = float(probs[0][i])
        logit = float(logits_per_video[0][i])
        results_probs.append((activity, f"Probability: {prob * 100:.2f}%"))
        results_logits.append((activity, f"Raw Score: {logit:.2f}"))

    # Retrieve the most likely predicted label and its probability.
    max_prob_idx = probs[0].argmax().item()
    most_likely_activity = activities_list[max_prob_idx]
    most_likely_prob = float(probs[0][max_prob_idx])

    return concatenated_image, results_probs, results_logits, (most_likely_activity, f"Probability: {most_likely_prob * 100:.2f}%")
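
# Rough usage sketch outside the UI (the file path and labels below are
# hypothetical, and the printed result is only illustrative):
#   image, probs, logits, top = model_interface(
#       "example.mp4",
#       "microsoft/xclip-base-patch32",
#       "dancing, cooking, playing guitar",
#   )
#   print(top)  # e.g. ("dancing", "Probability: 87.42%")
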
iface = gr.Interface(
    fn=model_interface,
    inputs=[
        gr.components.Video(label="Upload a video file"),
        gr.components.Dropdown(choices=[
            "microsoft/xclip-base-patch16-zero-shot",
            "microsoft/xclip-base-patch32-16-frames",
            "microsoft/xclip-base-patch32"
        ], label="Model Choice"),
        gr.components.Textbox(lines=4, label="Enter activities (comma-separated)"),
    ],
    outputs=[
        gr.components.Image(type="pil", label="Sampled Frames"),
        gr.components.Textbox(type="text", label="Probabilities"),
        gr.components.Textbox(type="text", label="Raw Scores"),
        gr.components.Textbox(type="text", label="Most Likely Prediction")
    ],
    live=False
)

iface.launch()
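
# When running locally, Gradio can also expose a temporary public link via
# iface.launch(share=True); on Hugging Face Spaces the plain launch() above is enough.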