Commit 429fbf1 · Parent: 998f789 · latest changes
app.py CHANGED
@@ -7,9 +7,9 @@ import json
 from PIL import Image
 from torchvision import transforms
 from huggingface_hub import hf_hub_download
-import time
+import time
 
-# --- 1. Define Model Architecture
+# --- 1. Define Model Architecture ---
 class SmallVideoClassifier(torch.nn.Module):
     def __init__(self, num_classes=2, num_frames=8):
         super(SmallVideoClassifier, self).__init__()
@@ -58,7 +58,7 @@ CLASS_LABELS = ["Non-violence", "Violence"]
 if NUM_CLASSES != len(CLASS_LABELS):
     print(f"Warning: NUM_CLASSES in config ({NUM_CLASSES}) does not match hardcoded CLASS_LABELS length ({len(CLASS_LABELS)}). Adjust CLASS_LABELS if needed.")
 
-device = torch.device("cpu")
+device = torch.device("cpu")
 print(f"Using device: {device}")
 
 model = SmallVideoClassifier(num_classes=NUM_CLASSES, num_frames=NUM_FRAMES)
@@ -78,34 +78,30 @@ transform = transforms.Compose([
     transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
 ])
 
-# ---
-
-# Initialize global state for the generator function (before the predict function)
-frame_buffer = [] # Buffer for collecting frames for model input
+# --- Global state for the generator function ---
+frame_buffer = []
 current_prediction_label = "Initializing..."
-current_probabilities = {label: 0.0 for label in CLASS_LABELS}
+current_probabilities = {label: 0.0 for label in CLASS_LABELS}
 
+# --- 4. Gradio Live Inference Function (Generator) ---
 def predict_live_frames(input_frame):
-    global frame_buffer, current_prediction_label, current_probabilities
+    global frame_buffer, current_prediction_label, current_probabilities
 
     if input_frame is None:
-        # If no frame is received (e.g., webcam not active
+        # If no frame is received (e.g., webcam not active or disconnected)
        dummy_frame = np.zeros((200, 400, 3), dtype=np.uint8)
        cv2.putText(dummy_frame, "Waiting for webcam input...", (10, 100), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 2)
        yield dummy_frame
-        return
+        return
 
-    # Gradio Webcam gives NumPy array (H, W, C) in RGB
     pil_image = Image.fromarray(input_frame)
-
-    # Apply transformations (outputs C, H, W tensor)
     processed_frame_tensor = transform(pil_image)
     frame_buffer.append(processed_frame_tensor)
 
-
-
-
-    input_tensor = torch.stack(frame_buffer, dim=0).unsqueeze(0).to(device)
+    slide_window_by = 1
+
+    if len(frame_buffer) >= NUM_FRAMES:
+        input_tensor = torch.stack(frame_buffer[-NUM_FRAMES:], dim=0).unsqueeze(0).to(device)
 
     with torch.no_grad():
         outputs = model(input_tensor)
@@ -115,15 +111,8 @@ def predict_live_frames(input_frame):
         current_prediction_label = f"Class: {CLASS_LABELS[predicted_class_idx]}"
         current_probabilities = {CLASS_LABELS[i]: prob.item() for i, prob in enumerate(probabilities[0])}
 
-
-
-    slide_window_by = 1 # Predict every frame (most "real-time" feel but highest compute)
-    # Or: NUM_FRAMES // 2 (e.g., predict every 4 frames for NUM_FRAMES=8)
-    # Or: NUM_FRAMES (non-overlapping windows, less frequent updates)
-    frame_buffer = frame_buffer[slide_window_by:]
-
-    # --- Draw Prediction on the current input frame ---
-    # Convert the input_frame (RGB NumPy array) to BGR for OpenCV drawing
+        frame_buffer = frame_buffer[slide_window_by:]
+
     display_frame = cv2.cvtColor(input_frame, cv2.COLOR_RGB2BGR)
 
     # Draw the main prediction label
@@ -132,42 +121,66 @@ def predict_live_frames(input_frame):
     font_scale = 1.0
     font_thickness = 2
 
-    # Draw outline first for better readability
     cv2.putText(display_frame, current_prediction_label, (10, 40),
                 cv2.FONT_HERSHEY_SIMPLEX, font_scale, text_outline_color, font_thickness + 2, cv2.LINE_AA)
-    # Draw actual text
     cv2.putText(display_frame, current_prediction_label, (10, 40),
                 cv2.FONT_HERSHEY_SIMPLEX, font_scale, text_color, font_thickness, cv2.LINE_AA)
 
-    # Draw probabilities for all classes
-    y_offset = 80
+    # Draw probabilities for all classes
+    y_offset = 80
     for label, prob in current_probabilities.items():
         prob_text = f"{label}: {prob:.2f}"
         cv2.putText(display_frame, prob_text, (10, y_offset),
                     cv2.FONT_HERSHEY_SIMPLEX, 0.7, text_outline_color, 2, cv2.LINE_AA)
         cv2.putText(display_frame, prob_text, (10, y_offset),
-                    cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 0), 1, cv2.LINE_AA)
-        y_offset += 30
+                    cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 0), 1, cv2.LINE_AA)
+        y_offset += 30
 
-    # Yield the processed frame back to Gradio for display
-    # Gradio expects RGB NumPy array for video/image components
     yield cv2.cvtColor(display_frame, cv2.COLOR_BGR2RGB)
 
+# --- 5. Gradio Blocks Interface Setup ---
+with gr.Blocks(
+    title="Real-time Violence Detection", # Title for the browser tab
+    theme=gr.themes.Default(primary_hue=gr.Color(c50='#e0f7fa', c100='#b2ebf2', c200='#80deea', c300='#4dd0e1', c400='#26c6da', c500='#00bcd4', c600='#00acc1', c700='#0097a7', c800='#00838f', c900='#006064', ca50='#84ffff', ca100='#18ffff', ca200='#00e5ff', ca400='#00b8d4')) # Optional: A subtle theme change
+) as demo:
+    # Optional: Display a title and description clearly, even without buttons
+    gr.Markdown(
+        """
+        # 🎬 Real-time Violence Detection
+        **Live Feed with Constant Predictions**
+
+        This model analyzes your live webcam feed for violence, displaying the predicted class and probabilities on the screen.
+        Please grant webcam access when prompted by your browser.
+        """
+    )
+
+    with gr.Row():
+        # Input: Live webcam feed
+        # We need to set a minimum height and width to ensure the video feed is displayed reasonably
+        video_input = gr.Video(
+            sources=["webcam"],
+            streaming=True,
+            label="Live Webcam Feed",
+            # Optional: Set dimensions for the video display
+            height=480, # or None for auto
+            width=640 # or None for auto
+        )
+
+        # Output: Image component to display processed frames
+        video_output = gr.Image(
+            type="numpy",
+            label="Processed Feed with Predictions",
+            # Optional: Set dimensions to match input or your preference
+            height=480, # or None for auto
+            width=640 # or None for auto
+        )
+
+    # Connect the video stream directly to the prediction function
+    # The 'stream' event on gr.Video is triggered as new frames arrive from the webcam.
+    video_input.stream(
+        predict_live_frames, # The function to call for each frame
        inputs=video_input, # Pass the video_input component itself as input
+        outputs=video_output # Update the video_output component
+    )
 
-
-iface = gr.Interface(
-    fn=predict_live_frames,
-    # CORRECTED: Use gr.Video with sources=["webcam"] for webcam input
-    inputs=gr.Video(sources=["webcam"], streaming=True, label="Live Webcam Feed for Violence Detection"),
-    # Outputs are updated continuously by the generator
-    outputs=gr.Image(type="numpy", label="Live Prediction Output"), # Using Image as output for continuous frames
-    title="Real-time Violence Detection with SmallVideoClassifier (Webcam)",
-    description=(
-        "This model detects violence in a live webcam feed. "
-        "Predictions (Class and Probabilities) will be displayed on each frame. "
-        "Please allow webcam access when prompted."
-    ),
-    allow_flagging="never", # Disable flagging on Hugging Face Spaces
-)
-
-iface.launch()
+demo.launch()
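Note on the core change: the old version stacked the entire frame_buffer on every call, so the model could receive fewer than NUM_FRAMES frames during warm-up; the new version guards inference behind len(frame_buffer) >= NUM_FRAMES, stacks the most recent NUM_FRAMES frames into a (1, T, C, H, W) clip, and then slides the window forward by slide_window_by. A minimal self-contained sketch of that windowing logic, with the model stubbed out (push_frame_and_maybe_predict and stub_model are illustrative names, not part of the commit):

import torch

NUM_FRAMES = 8        # window length, as in the Space's config
slide_window_by = 1   # advance by one frame per prediction, as in the commit

frame_buffer = []     # per-frame tensors of shape (C, H, W)

def push_frame_and_maybe_predict(frame_chw, model):
    """Append one preprocessed frame; run the model once the window is full."""
    global frame_buffer
    frame_buffer.append(frame_chw)
    if len(frame_buffer) < NUM_FRAMES:
        return None  # still warming up; nothing to predict yet
    # Stack the most recent NUM_FRAMES frames into a (1, T, C, H, W) clip.
    clip = torch.stack(frame_buffer[-NUM_FRAMES:], dim=0).unsqueeze(0)
    with torch.no_grad():
        logits = model(clip)
    # Slide the window: consecutive clips overlap by NUM_FRAMES - slide_window_by frames.
    frame_buffer = frame_buffer[slide_window_by:]
    return torch.softmax(logits, dim=1)

# Quick check with a stub model (illustrative only):
stub_model = lambda clip: torch.zeros(clip.shape[0], 2)
for i in range(10):
    probs = push_frame_and_maybe_predict(torch.rand(3, 224, 224), stub_model)
    print(i, None if probs is None else tuple(probs.shape))  # None for i < 7, then (1, 2)

With slide_window_by = 1 a prediction lands on every frame at the cost of one forward pass per frame; the comments removed by this commit noted the alternatives (NUM_FRAMES // 2 for half-overlapping windows, NUM_FRAMES for non-overlapping ones).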
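One caveat on the wiring (an observation about the new code, not part of the commit): whether gr.Video emits per-frame stream events depends on the Gradio version, and predict_live_frames expects a single RGB NumPy frame per call. Gradio's documented pattern for per-frame webcam streaming uses gr.Image(sources=["webcam"], streaming=True) with its .stream() event, which matches that signature. A minimal sketch under that assumption (flip_frame is a stand-in for the real callback):

import gradio as gr
import numpy as np

def flip_frame(frame):
    # Stand-in per-frame callback; the Space would use predict_live_frames here.
    return None if frame is None else np.fliplr(frame)

with gr.Blocks() as demo:
    webcam = gr.Image(sources=["webcam"], streaming=True, label="Webcam")
    processed = gr.Image(type="numpy", label="Processed")
    # The stream event fires repeatedly while the webcam is capturing.
    webcam.stream(flip_frame, inputs=webcam, outputs=processed)

demo.launch()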