Commit 4ad907b · Parent(s): e1f8629
latest changes
app.py
CHANGED
@@ -15,17 +15,15 @@ class SmallVideoClassifier(torch.nn.Module):
     def __init__(self, num_classes=2, num_frames=8):
         super(SmallVideoClassifier, self).__init__()
         from torchvision.models import mobilenet_v3_small, MobileNet_V3_Small_Weights
-        # Load weights only if they are available (they should be for IMAGENET1K_V1)
-        # Use a check to prevent error if weights are not found in specific environments
         try:
             weights = MobileNet_V3_Small_Weights.IMAGENET1K_V1
         except Exception:
             print("Warning: MobileNet_V3_Small_Weights.IMAGENET1K_V1 not found, initializing without pre-trained weights.")
-            weights = None
+            weights = None
 
         self.feature_extractor = mobilenet_v3_small(weights=weights)
         self.feature_extractor.classifier = torch.nn.Identity()
-        self.num_spatial_features = 576
+        self.num_spatial_features = 576
         self.temporal_aggregator = torch.nn.AdaptiveAvgPool1d(1)
         self.classifier = torch.nn.Sequential(
             torch.nn.Linear(self.num_spatial_features, 512),
@@ -48,7 +46,6 @@ HF_USERNAME = "owinymarvin"
 NEW_MODEL_REPO_ID_SHORT = "timesformer-violence-detector"
 NEW_MODEL_REPO_ID = f"{HF_USERNAME}/{NEW_MODEL_REPO_ID_SHORT}"
 
-# Download config.json to get model parameters
 print(f"Downloading config.json from {NEW_MODEL_REPO_ID}...")
 config_path = hf_hub_download(repo_id=NEW_MODEL_REPO_ID, filename="config.json")
 with open(config_path, 'r') as f:
@@ -58,24 +55,20 @@ NUM_FRAMES = model_config.get('num_frames', 8)
 IMAGE_SIZE = tuple(model_config.get('image_size', [224, 224]))
 NUM_CLASSES = model_config.get('num_classes', 2)
 
-# Define class labels (adjust if your dataset had different labels/order)
 CLASS_LABELS = ["Non-violence", "Violence"]
 if NUM_CLASSES != len(CLASS_LABELS):
     print(f"Warning: NUM_CLASSES in config ({NUM_CLASSES}) does not match hardcoded CLASS_LABELS length ({len(CLASS_LABELS)}). Adjust CLASS_LABELS if needed.")
 
-
-# Initialize the model
-device = torch.device("cpu") # Explicitly use CPU as requested
+device = torch.device("cpu")
 print(f"Using device: {device}")
 
 model = SmallVideoClassifier(num_classes=NUM_CLASSES, num_frames=NUM_FRAMES)
 
-# Download model weights
 print(f"Downloading model weights from {NEW_MODEL_REPO_ID}...")
 model_weights_path = hf_hub_download(repo_id=NEW_MODEL_REPO_ID, filename="small_violence_classifier.pth")
 model.load_state_dict(torch.load(model_weights_path, map_location=device))
 model.to(device)
-model.eval()
+model.eval()
 
 print(f"Model loaded successfully with {NUM_FRAMES} frames and image size {IMAGE_SIZE}.")
 
@@ -89,7 +82,7 @@ transform = transforms.Compose([
 # --- 4. Gradio Inference Function ---
 def predict_video(video_path):
     if video_path is None:
-        return None
+        return None
 
     cap = cv2.VideoCapture(video_path)
 
@@ -97,50 +90,47 @@ def predict_video(video_path):
         print(f"Error: Could not open video file {video_path}.")
         raise ValueError(f"Could not open video file {video_path}. Please ensure it's a valid video format.")
 
-    # Get video properties
     frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
     frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
     fps = cap.get(cv2.CAP_PROP_FPS)
+    # Ensure FPS is not zero to avoid division by zero errors, default to 25 if needed
+    if fps <= 0:
+        fps = 25.0
+        print(f"Warning: Original video FPS was 0 or less, defaulting to {fps}.")
+
     total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
 
-    # Create a temporary output video file
-    # Use tempfile to ensure proper cleanup on Hugging Face Spaces
     temp_output_file = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
     output_video_path = temp_output_file.name
-    temp_output_file.close()
+    temp_output_file.close()
 
-    #
-    #
-    fourcc = cv2.VideoWriter_fourcc(*'
+    # --- CHANGED: Use XVID codec for better browser compatibility ---
+    # This might prevent Gradio's internal re-encoding.
+    fourcc = cv2.VideoWriter_fourcc(*'XVID')
     out = cv2.VideoWriter(output_video_path, fourcc, fps, (frame_width, frame_height))
 
     print(f"Processing video: {video_path}")
     print(f"Total frames: {total_frames}, FPS: {fps}")
     print(f"Output video will be saved to: {output_video_path}")
 
-    frame_buffer = []
-    current_prediction_label = "Processing..."
+    frame_buffer = []
+    current_prediction_label = "Processing..."
 
     frame_idx = 0
     while True:
         ret, frame = cap.read()
         if not ret:
-            break
+            break
 
         frame_idx += 1
 
-        # Convert frame from BGR (OpenCV) to RGB (PIL/PyTorch)
         frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
         pil_image = Image.fromarray(frame_rgb)
 
-
-        processed_frame = transform(pil_image) # shape: (C, H, W)
+        processed_frame = transform(pil_image)
         frame_buffer.append(processed_frame)
 
-        # Perform prediction when the buffer is full
         if len(frame_buffer) == NUM_FRAMES:
-            # Stack the buffered frames and add a batch dimension
-            # Resulting shape: (1, NUM_FRAMES, C, H, W)
             input_tensor = torch.stack(frame_buffer, dim=0).unsqueeze(0).to(device)
 
             with torch.no_grad():
@@ -149,39 +139,45 @@ def predict_video(video_path):
                 predicted_class_idx = torch.argmax(probabilities, dim=1).item()
                 current_prediction_label = f"Prediction: {CLASS_LABELS[predicted_class_idx]} (Prob: {probabilities[0, predicted_class_idx]:.2f})"
 
-            # Reset buffer for the next non-overlapping window
             frame_buffer = []
-            #
-            # frame_buffer = frame_buffer[
+            # If you want a sliding window, you would do something like:
+            # frame_buffer = frame_buffer[int(NUM_FRAMES * 0.5):] # Slide by half the window size
 
         # Draw prediction text on the current frame
-        #
-        #
-
-
+        # Ensure text color is clearly visible (e.g., white or bright green)
+        # Add a black outline for better readability
+        text_color = (0, 255, 0) # Green (BGR format for OpenCV)
+        text_outline_color = (0, 0, 0) # Black
+        font_scale = 1.0 # Increased font size
+        font_thickness = 2
+
+        # Draw outline first for better readability
+        cv2.putText(frame, current_prediction_label, (10, 40), # Slightly lower position
+                    cv2.FONT_HERSHEY_SIMPLEX, font_scale, text_outline_color, font_thickness + 2, cv2.LINE_AA)
+        # Draw actual text
+        cv2.putText(frame, current_prediction_label, (10, 40),
+                    cv2.FONT_HERSHEY_SIMPLEX, font_scale, text_color, font_thickness, cv2.LINE_AA)
 
-        # Write the processed frame to the output video
         out.write(frame)
 
-    # Release resources
     cap.release()
     out.release()
    print(f"Video processing complete. Output saved to: {output_video_path}")
 
-    return output_video_path
+    return output_video_path
 
 # --- 5. Gradio Interface Setup ---
 iface = gr.Interface(
     fn=predict_video,
-    # Corrected: Removed 'type="filepath"' as it's not a valid argument for gr.Video
     inputs=gr.Video(label="Upload Video for Violence Detection (MP4 recommended)"),
     outputs=gr.Video(label="Processed Video with Predictions"),
     title="Real-time Violence Detection with SmallVideoClassifier",
     description="Upload a video, and the model will analyze it for violence, displaying the predicted class and confidence on each frame.",
-    allow_flagging="never",
+    allow_flagging="never",
     examples=[
-        #
-        #
+        # Add example videos here for easier testing and demonstration
+        # E.g., a sample video that's publicly accessible:
+        # "https://huggingface.co/datasets/gradio/test-files/resolve/main/video.mp4"
    ]
 )
 
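The two writer-related changes in predict_video (the FPS fallback and the switch to the XVID fourcc) can be summarized in a small self-contained sketch; the helper name open_writer, the placeholder path, and the default values below are illustrative assumptions, not part of the committed app.py.

import cv2

def open_writer(path, fps, frame_size, fourcc_str="XVID"):
    # Mirror the commit's guard: cap.get(cv2.CAP_PROP_FPS) can return 0 for some
    # containers, which would leave VideoWriter with an invalid frame rate.
    if fps is None or fps <= 0:
        fps = 25.0
    fourcc = cv2.VideoWriter_fourcc(*fourcc_str)
    writer = cv2.VideoWriter(path, fourcc, fps, frame_size)
    # VideoWriter fails silently when the codec/container pair is unsupported,
    # so checking isOpened() up front is clearer than discovering a 0-byte file later.
    if not writer.isOpened():
        raise RuntimeError(f"Could not open VideoWriter for {path} with codec {fourcc_str}")
    return writer

# Hypothetical usage with placeholder values:
# writer = open_writer("/tmp/out.avi", fps=0, frame_size=(640, 480))

Whether XVID inside an .mp4 container actually avoids Gradio's re-encoding is the commit's own assumption; the isOpened() check at least makes a codec mismatch visible in the Space logs.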
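The new comment after the buffer reset hints at a sliding window instead of the non-overlapping one the commit keeps. A minimal standalone sketch of the difference, with NUM_FRAMES and the 50% stride as illustrative assumptions:

NUM_FRAMES = 8  # matches the num_frames default used in app.py

def prediction_frames(frames, stride=NUM_FRAMES):
    """Yield the frame index at which a prediction would fire."""
    buffer = []
    for idx, frame in enumerate(frames, start=1):
        buffer.append(frame)
        if len(buffer) == NUM_FRAMES:
            yield idx                  # a prediction happens here
            buffer = buffer[stride:]   # stride=NUM_FRAMES     -> non-overlapping (as committed)
                                       # stride=NUM_FRAMES // 2 -> 50% overlap (the comment's idea)

print(list(prediction_frames(range(24))))                           # [8, 16, 24]
print(list(prediction_frames(range(24), stride=NUM_FRAMES // 2)))   # [8, 12, 16, 20, 24]

With overlap, the label drawn on each frame updates roughly twice as often, at the cost of about twice as many model calls.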
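The two-pass cv2.putText block added in the last hunk (thick black outline first, then green text) could also be factored into a helper; this is only a sketch, and draw_label with these defaults is not in the committed code.

import cv2

def draw_label(frame, text, org=(10, 40), font_scale=1.0, thickness=2):
    # Thicker black pass first, then the colored text on top, so the label
    # stays readable over both bright and dark video content.
    cv2.putText(frame, text, org, cv2.FONT_HERSHEY_SIMPLEX, font_scale,
                (0, 0, 0), thickness + 2, cv2.LINE_AA)
    cv2.putText(frame, text, org, cv2.FONT_HERSHEY_SIMPLEX, font_scale,
                (0, 255, 0), thickness, cv2.LINE_AA)  # BGR green, as in the commit
    return frame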