Commit 7cc8973
Parent(s): 8f7ac8f
Add application file
app.py CHANGED
@@ -12,7 +12,7 @@ from collections import deque
 HF_MODEL_REPO_ID = "owinymarvin/timesformer-crime-detection"
 
 # These must match the values used during your training
-NUM_FRAMES =
+NUM_FRAMES = 8
 TARGET_IMAGE_HEIGHT = 224
 TARGET_IMAGE_WIDTH = 224
 
@@ -37,12 +37,13 @@ print(f"Model's class labels: {model.config.id2label}")
 frame_buffer = deque(maxlen=NUM_FRAMES)
 last_inference_time = time.time()
 inference_interval = 1.0 # Predict every 1 second (1.0 / INFERENCE_FPS)
-current_prediction_text = "Buffering frames..."
+current_prediction_text = "Buffering frames..." # Initialize global text
 
 def predict_video_frame(image_np_array):
     global frame_buffer, last_inference_time, current_prediction_text
 
     # Gradio sends frames as numpy arrays (RGB)
+    # The image_processor will handle the resizing to TARGET_IMAGE_HEIGHT x TARGET_IMAGE_WIDTH
     pil_image = Image.fromarray(image_np_array)
     frame_buffer.append(pil_image)
 
@@ -53,6 +54,7 @@ def predict_video_frame(image_np_array):
         last_inference_time = current_time
 
         # Preprocess the frames. processor expects a list of PIL Images or numpy arrays
+        # It will handle resizing and normalization based on its config
         processed_input = processor(images=list(frame_buffer), return_tensors="pt")
         pixel_values = processed_input.pixel_values.to(device)
 
@@ -68,6 +70,7 @@ def predict_video_frame(image_np_array):
         print(current_prediction_text) # Print to Space logs
 
     # Return the current prediction text for display in the UI
+    # Gradio's streaming will update this textbox asynchronously
    return current_prediction_text
 
 # --- Gradio Interface ---
@@ -75,19 +78,15 @@ def predict_video_frame(image_np_array):
 webcam_input = gr.Image(
     sources=["webcam"], # Allows webcam input
     streaming=True, # Enables continuous streaming of frames
-    shape=(TARGET_IMAGE_WIDTH, TARGET_IMAGE_HEIGHT), #
+    # REMOVED: shape=(TARGET_IMAGE_WIDTH, TARGET_IMAGE_HEIGHT), # This was causing the TypeError
     label="Live Webcam Feed"
 )
 
 # Output text box for predictions
-prediction_output = gr.Textbox(label="Real-time Prediction")
+prediction_output = gr.Textbox(label="Real-time Prediction", value="Buffering frames...")
 
 
 # Define the Gradio Interface
-# We use Blocks for more control over layout if needed, but Interface works too.
-# For simplicity, we'll stick to a basic Interface
-# For streaming, gr.Interface.load() is more common, but let's define from scratch.
-
 demo = gr.Interface(
     fn=predict_video_frame,
     inputs=webcam_input,
@@ -96,8 +95,6 @@ demo = gr.Interface(
     allow_flagging="never", # Disable flagging on public demo
     title="TimesFormer Crime Detection Live Demo",
     description=f"This demo uses a finetuned TimesFormer model ({HF_MODEL_REPO_ID}) to predict crime actions from a live webcam feed. The model processes {NUM_FRAMES} frames at a time and makes a prediction every {inference_interval} seconds. Please allow webcam access.",
-    # You might want to add examples for file uploads if you also want to support video files.
-    # examples=["path/to/your/test_video.mp4"] # If you add video upload input
 )
 
 if __name__ == "__main__":
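
The diff only shows fragments of app.py: the model/processor loading, the time-gated inference block, and the Interface output wiring are all outside the shown hunks. Below is a minimal sketch of how the changed lines plausibly fit together after this commit. The AutoImageProcessor / TimesformerForVideoClassification loading calls, the buffer-full-and-interval `if` condition, the torch.no_grad() argmax decoding, and the `outputs=prediction_output` / `live=True` wiring are all assumptions not shown in the commit; only the deque buffer, the interval gate, the processor call, and the Gradio components come from the diff itself.

import time
from collections import deque

import gradio as gr
import torch
from PIL import Image
from transformers import AutoImageProcessor, TimesformerForVideoClassification

HF_MODEL_REPO_ID = "owinymarvin/timesformer-crime-detection"
NUM_FRAMES = 8
device = "cuda" if torch.cuda.is_available() else "cpu"

# Assumption: the Space loads the processor and model roughly like this.
processor = AutoImageProcessor.from_pretrained(HF_MODEL_REPO_ID)
model = TimesformerForVideoClassification.from_pretrained(HF_MODEL_REPO_ID).to(device).eval()

frame_buffer = deque(maxlen=NUM_FRAMES)
last_inference_time = time.time()
inference_interval = 1.0  # seconds between predictions
current_prediction_text = "Buffering frames..."

def predict_video_frame(image_np_array):
    global last_inference_time, current_prediction_text

    # Gradio streams webcam frames as RGB numpy arrays; the processor resizes them later.
    frame_buffer.append(Image.fromarray(image_np_array))

    current_time = time.time()
    # Assumption: run the model only once the buffer is full and the interval has elapsed.
    if len(frame_buffer) == NUM_FRAMES and current_time - last_inference_time >= inference_interval:
        last_inference_time = current_time
        inputs = processor(images=list(frame_buffer), return_tensors="pt")
        pixel_values = inputs.pixel_values.to(device)
        with torch.no_grad():
            logits = model(pixel_values=pixel_values).logits
        predicted_id = logits.argmax(-1).item()
        current_prediction_text = model.config.id2label[predicted_id]
        print(current_prediction_text)  # Print to Space logs

    # Gradio's streaming updates the textbox with whatever was last predicted.
    return current_prediction_text

webcam_input = gr.Image(sources=["webcam"], streaming=True, label="Live Webcam Feed")
prediction_output = gr.Textbox(label="Real-time Prediction", value="Buffering frames...")

demo = gr.Interface(
    fn=predict_video_frame,
    inputs=webcam_input,
    outputs=prediction_output,
    live=True,
)

if __name__ == "__main__":
    demo.launch()

The interval gate keeps the per-frame callback cheap: every streamed frame only appends to the deque, and the TimesFormer forward pass runs at most once per inference_interval.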
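
Since the shape argument was dropped from gr.Image (recent Gradio versions no longer accept it, hence the TypeError the comment mentions), frames now arrive at the webcam's native resolution and the image processor handles resizing. If a fixed size is still wanted before buffering, a hedged alternative is to resize in the callback; resize_frame below is a hypothetical helper reusing the TARGET_IMAGE_WIDTH / TARGET_IMAGE_HEIGHT constants from the diff.

from PIL import Image

TARGET_IMAGE_WIDTH = 224
TARGET_IMAGE_HEIGHT = 224

def resize_frame(image_np_array):
    # Resize an incoming RGB numpy frame to the training resolution (optional;
    # the image processor would otherwise do this itself).
    return Image.fromarray(image_np_array).resize((TARGET_IMAGE_WIDTH, TARGET_IMAGE_HEIGHT))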