owinymarvin committed
Commit 11e2014 · 1 Parent(s): 320fc26

latest changes

Files changed (2)
  1. app.py +124 -157
  2. requirements.txt +3 -1
app.py CHANGED
@@ -1,164 +1,131 @@
-import gradio as gr
 import torch
-import cv2
-import numpy as np
-import os
 import json
-from PIL import Image
-from torchvision import transforms
-from huggingface_hub import hf_hub_download
-import time
-
-# --- 1. Define Model Architecture ---
-class SmallVideoClassifier(torch.nn.Module):
-    def __init__(self, num_classes=2, num_frames=8):
-        super(SmallVideoClassifier, self).__init__()
-        from torchvision.models import mobilenet_v3_small, MobileNet_V3_Small_Weights
-        try:
-            weights = MobileNet_V3_Small_Weights.IMAGENET1K_V1
-        except Exception:
-            print("Warning: MobileNet_V3_Small_Weights.IMAGENET1K_V1 not found, initializing without pre-trained weights.")
-            weights = None
-
-        self.feature_extractor = mobilenet_v3_small(weights=weights)
-        self.feature_extractor.classifier = torch.nn.Identity()
-        self.num_spatial_features = 576
-        self.temporal_aggregator = torch.nn.AdaptiveAvgPool1d(1)
-        self.classifier = torch.nn.Sequential(
-            torch.nn.Linear(self.num_spatial_features, 512),
-            torch.nn.ReLU(),
-            torch.nn.Dropout(0.2),
-            torch.nn.Linear(512, num_classes)
-        )

-    def forward(self, pixel_values):
-        batch_size, num_frames, channels, height, width = pixel_values.shape
-        x = pixel_values.view(batch_size * num_frames, channels, height, width)
-        spatial_features = self.feature_extractor(x)
-        spatial_features = spatial_features.view(batch_size, num_frames, self.num_spatial_features)
-        temporal_features = self.temporal_aggregator(spatial_features.permute(0, 2, 1)).squeeze(-1)
-        logits = self.classifier(temporal_features)
-        return logits
-
-# --- 2. Configuration and Model Loading ---
-HF_USERNAME = "owinymarvin"
-NEW_MODEL_REPO_ID_SHORT = "timesformer-violence-detector"
-NEW_MODEL_REPO_ID = f"{HF_USERNAME}/{NEW_MODEL_REPO_ID_SHORT}"
-
-print(f"Downloading config.json from {NEW_MODEL_REPO_ID}...")
-config_path = hf_hub_download(repo_id=NEW_MODEL_REPO_ID, filename="config.json")
-with open(config_path, 'r') as f:
-    model_config = json.load(f)
-
-NUM_FRAMES = model_config.get('num_frames', 8)
-IMAGE_SIZE = tuple(model_config.get('image_size', [224, 224]))
-NUM_CLASSES = model_config.get('num_classes', 2)
-
-CLASS_LABELS = ["Non-violence", "Violence"]
-if NUM_CLASSES != len(CLASS_LABELS):
-    print(f"Warning: NUM_CLASSES in config ({NUM_CLASSES}) does not match hardcoded CLASS_LABELS length ({len(CLASS_LABELS)}). Adjust CLASS_LABELS if needed.")
-
-device = torch.device("cpu")
-print(f"Using device: {device}")
-
-model = SmallVideoClassifier(num_classes=NUM_CLASSES, num_frames=NUM_FRAMES)
-
-print(f"Downloading model weights from {NEW_MODEL_REPO_ID}...")
-model_weights_path = hf_hub_download(repo_id=NEW_MODEL_REPO_ID, filename="small_violence_classifier.pth")
-model.load_state_dict(torch.load(model_weights_path, map_location=device))
-model.to(device)
-model.eval()
-
-print(f"Model loaded successfully with {NUM_FRAMES} frames and image size {IMAGE_SIZE}.")
-
-# --- 3. Define Preprocessing Transform ---
-transform = transforms.Compose([
-    transforms.Resize(IMAGE_SIZE),
-    transforms.ToTensor(),
-    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
-])
-
-# --- Global state for the generator function ---
-frame_buffer = []
-current_prediction_label = "Initializing..."
-current_probabilities = {label: 0.0 for label in CLASS_LABELS}
-
-# --- 4. Gradio Live Inference Function (Generator) ---
-# This function will receive individual frames from the webcam as a NumPy array (H, W, C, RGB)
-def predict_live_frames(input_frame):
-    global frame_buffer, current_prediction_label, current_probabilities
-
-    if input_frame is None:
-        dummy_frame = np.zeros((200, 400, 3), dtype=np.uint8)
-        cv2.putText(dummy_frame, "Waiting for webcam input...", (10, 100), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 2)
-        yield dummy_frame
-        return
-
-    pil_image = Image.fromarray(input_frame)
-    processed_frame_tensor = transform(pil_image)
-    frame_buffer.append(processed_frame_tensor)
-
-    slide_window_by = 1
-
-    if len(frame_buffer) >= NUM_FRAMES:
-        input_tensor = torch.stack(frame_buffer[-NUM_FRAMES:], dim=0).unsqueeze(0).to(device)
-
-        with torch.no_grad():
-            outputs = model(input_tensor)
-            probabilities = torch.softmax(outputs, dim=1)
-            predicted_class_idx = torch.argmax(probabilities, dim=1).item()
-
-        current_prediction_label = f"Class: {CLASS_LABELS[predicted_class_idx]}"
-        current_probabilities = {CLASS_LABELS[i]: prob.item() for i, prob in enumerate(probabilities[0])}

-        frame_buffer = frame_buffer[slide_window_by:]
-
-    display_frame = cv2.cvtColor(input_frame, cv2.COLOR_RGB2BGR)
-
-    # Draw the main prediction label
-    text_color = (0, 255, 0)  # Green (BGR)
-    text_outline_color = (0, 0, 0)  # Black
-    font_scale = 1.0
-    font_thickness = 2
-
-    cv2.putText(display_frame, current_prediction_label, (10, 40),
-                cv2.FONT_HERSHEY_SIMPLEX, font_scale, text_outline_color, font_thickness + 2, cv2.LINE_AA)
-    cv2.putText(display_frame, current_prediction_label, (10, 40),
-                cv2.FONT_HERSHEY_SIMPLEX, font_scale, text_color, font_thickness, cv2.LINE_AA)
-
-    # Draw probabilities for all classes
-    y_offset = 80
-    for label, prob in current_probabilities.items():
-        prob_text = f"{label}: {prob:.2f}"
-        cv2.putText(display_frame, prob_text, (10, y_offset),
-                    cv2.FONT_HERSHEY_SIMPLEX, 0.7, text_outline_color, 2, cv2.LINE_AA)
-        cv2.putText(display_frame, prob_text, (10, y_offset),
-                    cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 0), 1, cv2.LINE_AA)
-        y_offset += 30
-
-    yield cv2.cvtColor(display_frame, cv2.COLOR_BGR2RGB)
-
-# --- 5. Gradio Interface Setup (with hidden buttons) ---
-iface = gr.Interface(
-    fn=predict_live_frames,
-    # Input: Live webcam feed, configure for streaming
-    inputs=gr.Video(sources=["webcam"], streaming=True, label="Live Webcam Feed"),
-    # Output: Image component to display processed frames
-    outputs=gr.Image(type="numpy", label="Processed Feed with Predictions"),
-
-    title="Real-time Violence Detection with SmallVideoClassifier (Webcam)",
-    description=(
-        "This model analyzes your live webcam feed for violence, displaying the predicted class and probabilities on the screen. "
-        "Please grant webcam access when prompted by your browser."
     ),
-
-    # --- IMPORTANT: Hide the default submit/clear buttons ---
-    submit_btn=None,
-    clear_btn=None,
-
-    allow_flagging="never",
-    # No examples needed for live webcam
-    examples=None
 )

-iface.launch()
 import torch
+# Choose the `slowfast_r50` model
+model = torch.hub.load('facebookresearch/pytorchvideo', 'slowfast_r50', pretrained=True)
+from typing import Dict
 import json
+import urllib.request
+from torchvision.transforms import Compose, Lambda
+from torchvision.transforms._transforms_video import (
+    CenterCropVideo,
+    NormalizeVideo,
+)
+from pytorchvideo.data.encoded_video import EncodedVideo
+from pytorchvideo.transforms import (
+    ApplyTransformToKey,
+    ShortSideScale,
+    UniformTemporalSubsample,
+    UniformCropVideo
+)

+import gradio as gr
+# Set to GPU or CPU
+device = "cpu"
+model = model.eval()
+model = model.to(device)
+
+# Download the Kinetics 400 class-name mapping
+json_url = "https://dl.fbaipublicfiles.com/pyslowfast/dataset/class_names/kinetics_classnames.json"
+json_filename = "kinetics_classnames.json"
+urllib.request.urlretrieve(json_url, json_filename)
+with open(json_filename, "r") as f:
+    kinetics_classnames = json.load(f)
+
+# Create an id to label name mapping
+kinetics_id_to_classname = {}
+for k, v in kinetics_classnames.items():
+    kinetics_id_to_classname[v] = str(k).replace('"', "")
+
+# Input transform parameters specific to slowfast_r50
+side_size = 256
+mean = [0.45, 0.45, 0.45]
+std = [0.225, 0.225, 0.225]
+crop_size = 256
+num_frames = 32
+sampling_rate = 2
+frames_per_second = 30
+slowfast_alpha = 4
+num_clips = 10
+num_crops = 3
+
+class PackPathway(torch.nn.Module):
+    """
+    Transform for packing a video clip into a list of [slow, fast] pathway tensors.
+    """
+    def __init__(self):
+        super().__init__()

+    def forward(self, frames: torch.Tensor):
+        fast_pathway = frames
+        # Perform temporal sampling from the fast pathway to build the slow pathway.
+        slow_pathway = torch.index_select(
+            frames,
+            1,
+            torch.linspace(
+                0, frames.shape[1] - 1, frames.shape[1] // slowfast_alpha
+            ).long(),
+        )
+        frame_list = [slow_pathway, fast_pathway]
+        return frame_list
+
+transform = ApplyTransformToKey(
+    key="video",
+    transform=Compose(
+        [
+            UniformTemporalSubsample(num_frames),
+            Lambda(lambda x: x / 255.0),
+            NormalizeVideo(mean, std),
+            ShortSideScale(
+                size=side_size
+            ),
+            CenterCropVideo(crop_size),
+            PackPathway()
+        ]
     ),
 )

+# The duration of the input clip is also specific to the model.
+clip_duration = (num_frames * sampling_rate) / frames_per_second
+url_link = "https://dl.fbaipublicfiles.com/pytorchvideo/projects/archery.mp4"
+video_path = 'archery.mp4'
+urllib.request.urlretrieve(url_link, video_path)
+# Select the duration of the clip to load by specifying the start and end duration.
+# The start_sec should correspond to where the action occurs in the video.
+
+def inference(in_vid):
+    start_sec = 0
+    end_sec = start_sec + clip_duration
+
+    # Initialize an EncodedVideo helper class and load the video
+    video = EncodedVideo.from_path(in_vid)
+
+    # Load the desired clip
+    video_data = video.get_clip(start_sec=start_sec, end_sec=end_sec)
+
+    # Apply a transform to normalize the video input
+    video_data = transform(video_data)
+
+    # Move the inputs to the desired device
+    inputs = video_data["video"]
+    inputs = [i.to(device)[None, ...] for i in inputs]
+    # Pass the input clip through the model
+    preds = model(inputs)
+
+    # Get the predicted classes
+    post_act = torch.nn.Softmax(dim=1)
+    preds = post_act(preds)
+    pred_classes = preds.topk(k=5).indices[0]
+
+    # Map the predicted classes to the label names
+    pred_class_names = [kinetics_id_to_classname[int(i)] for i in pred_classes]
+    return ", ".join(pred_class_names)
+
+inputs = gr.Video(label="Input Video")
+outputs = gr.Textbox(label="Top 5 predicted labels")
+
+title = "SlowFast"
+description = "Demo of SlowFast networks pretrained on the Kinetics 400 dataset. To use it, upload a video or click the example to load it. Read more at the links below."
+article = "<p style='text-align: center'><a href='https://arxiv.org/abs/1812.03982'>SlowFast Networks for Video Recognition</a> | <a href='https://github.com/facebookresearch/pytorchvideo'>Github Repo</a></p>"
+
+examples = [
+    ['archery.mp4']
+]
+
+gr.Interface(inference, inputs, outputs, title=title, description=description, article=article, examples=examples, analytics_enabled=False).launch(debug=True)
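A note on the quote-stripping in the class-name loop above: the keys in kinetics_classnames.json carry embedded double quotes, so the loop removes them while inverting the name-to-id JSON into an id-to-name table. A minimal sketch; the sample entry is illustrative, not copied from the file:

# Hypothetical entry shaped like those in kinetics_classnames.json
kinetics_classnames = {'"abseiling"': 0}
kinetics_id_to_classname = {v: str(k).replace('"', "") for k, v in kinetics_classnames.items()}
print(kinetics_id_to_classname[0])  # abseiling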
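The PackPathway transform is the SlowFast-specific step: the model takes a list of two clips, a temporally subsampled "slow" pathway and the full-rate "fast" pathway. A minimal sketch of the shapes involved, assuming the (C, T, H, W) tensor the Compose pipeline above emits:

import torch

slowfast_alpha = 4
clip = torch.randn(3, 32, 256, 256)  # (C, T, H, W) after UniformTemporalSubsample and CenterCropVideo

fast_pathway = clip                  # all 32 frames
slow_pathway = torch.index_select(   # every slowfast_alpha-th frame
    clip,
    1,
    torch.linspace(0, clip.shape[1] - 1, clip.shape[1] // slowfast_alpha).long(),
)
print(fast_pathway.shape)  # torch.Size([3, 32, 256, 256])
print(slow_pathway.shape)  # torch.Size([3, 8, 256, 256])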
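The clip_duration arithmetic is also worth a quick check: with num_frames = 32, sampling_rate = 2 and frames_per_second = 30,

clip_duration = (32 * 2) / 30  # ~2.13 seconds

so inference always scores roughly the first 2.13 seconds of the uploaded video, since start_sec is fixed at 0.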
requirements.txt CHANGED
@@ -4,4 +4,6 @@ opencv-python-headless # Use headless for server environments to avoid GUI depe
 gradio
 huggingface_hub
 Pillow
-numpy
+numpy
+av
+fvcore
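On the two new dependencies: av (PyAV) is the decoder that pytorchvideo's EncodedVideo uses by default, and fvcore is a library the hub-loaded pytorchvideo code depends on at runtime; both claims follow upstream pytorchvideo packaging and are worth re-checking against pinned versions. A quick local check of the decode path, reusing the archery.mp4 file the app downloads:

from pytorchvideo.data.encoded_video import EncodedVideo  # needs `av` (PyAV) installed

video = EncodedVideo.from_path("archery.mp4")  # decoded via PyAV by default
print(video.duration)  # clip length in seconds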