Spaces:

owinymarvin
/

SW_AI_deployment

Running

App Files Files Community

owinymarvin committed on May 23

Commit

4784ef2

1 Parent(s): bc79b5b

latest changes

Browse files

Files changed (1) hide show

app.py +133 -69

app.py CHANGED Viewed

@@ -12,17 +12,22 @@ from collections import deque
 HF_MODEL_REPO_ID = "owinymarvin/timesformer-crime-detection"
 # These must match the values used during your training
-NUM_FRAMES = 8 # Still using 8 frames, as that was your original training setup
 TARGET_IMAGE_HEIGHT = 224
 TARGET_IMAGE_WIDTH = 224
-# --- Prediction Timing ---
-# How long to record (in seconds) before making a prediction
-RECORDING_DURATION_SECONDS = 10.0 # CHANGED: Now records for 10 seconds
-# How often the model should predict (after the recording duration)
-# Setting this to a very high number (like 9999) means it essentially predicts only once
-# after the recording is done until reset. Or you can leave it at 1.0 if you want it to trigger often.
-INFERENCE_INTERVAL_SECONDS = 1.0 # This will be the minimum time between predictions if not controlled by reset.
 # --- Load Model and Processor ---
@@ -41,40 +46,83 @@ print(f"Model loaded successfully on {device}.")
 print(f"Model's class labels: {model.config.id2label}")
 # --- Global State Variables ---
-# Use a global deque to store captured frames
-captured_frames_buffer = deque(maxlen=NUM_FRAMES)
-recording_start_time = None # To track when recording for a clip started
-last_prediction_time = time.time() # To control prediction frequency after recording
-# --- Functions for Gradio Interface ---
-def process_frame_and_predict(image_np_array):
-    global captured_frames_buffer, recording_start_time, last_prediction_time
-    # Initialize recording_start_time if it's the first frame for a new recording cycle
-    if recording_start_time is None:
-        recording_start_time = time.time()
-        captured_frames_buffer.clear() # Clear buffer to start a new clip
-    # Convert Gradio's numpy array (RGB) to PIL Image
-    pil_image = Image.fromarray(image_np_array)
-    captured_frames_buffer.append(pil_image)
     current_time = time.time()
-    elapsed_recording_time = current_time - recording_start_time
-    output_status = f"Recording: {elapsed_recording_time:.1f}/{RECORDING_DURATION_SECONDS}s | Frames: {len(captured_frames_buffer)}/{NUM_FRAMES}"
-    prediction_text = "Recording..." # Default text while recording
-    # Check if enough time has passed and we have enough frames
-    if elapsed_recording_time >= RECORDING_DURATION_SECONDS and len(captured_frames_buffer) >= NUM_FRAMES:
-        if (current_time - last_prediction_time) >= INFERENCE_INTERVAL_SECONDS: # Limit prediction frequency
-            # --- Perform Inference ---
-            print(f"Triggered inference on {len(captured_frames_buffer)} frames after {RECORDING_DURATION_SECONDS}s recording...")
-            frames_for_prediction = list(captured_frames_buffer) # Take a snapshot
-            # The image_processor will handle the resizing to TARGET_IMAGE_HEIGHT x TARGET_IMAGE_WIDTH
-            processed_input = processor(images=frames_for_prediction, return_tensors="pt")
             pixel_values = processed_input.pixel_values.to(device)
             with torch.no_grad():
@@ -85,38 +133,54 @@ def process_frame_and_predict(image_np_array):
             predicted_label = model.config.id2label[predicted_class_id]
             confidence = torch.nn.functional.softmax(logits, dim=-1)[0][predicted_class_id].item()
-            prediction_text = f"Predicted: {predicted_label} ({confidence:.2f})"
-            print(prediction_text) # Print to Space logs
-            last_prediction_time = current_time # Update time of last successful prediction
-            # Reset recording_start_time to allow a new recording cycle
-            recording_start_time = None
-            captured_frames_buffer.clear() # Clear buffer for next clip
         else:
-            prediction_text = "Prediction done. Waiting for next interval..." # Message if prediction recently made
-    return output_status, prediction_text
-def reset_app_state():
-    """Resets the global state variables to start a new recording/prediction cycle."""
-    global captured_frames_buffer, recording_start_time, last_prediction_time
-    captured_frames_buffer.clear()
-    recording_start_time = None
-    last_prediction_time = time.time()
-    print("App state reset.")
-    # Return initial messages for the UI
-    return "Ready to record...", "Ready for new prediction."
 # --- Gradio Interface ---
 with gr.Blocks() as demo:
     gr.Markdown(
         f"""
-        # TimesFormer Crime Detection Live Demo (Auto-Triggered Clip Prediction)
-        This demo uses a finetuned TimesFormer model ({HF_MODEL_REPO_ID}) to predict crime actions from a live webcam feed.
-        It records **{RECORDING_DURATION_SECONDS} seconds** of video, then automatically triggers a prediction.
-        The model processes **{NUM_FRAMES} frames** per prediction.
-        Click 'Reset' to start a new video recording.
         Please allow webcam access.
         """
     )
@@ -128,27 +192,27 @@ with gr.Blocks() as demo:
                 label="Live Webcam Feed"
             )
             # Textboxes for status and prediction
-            status_output = gr.Textbox(label="Status", value="Ready to record...")
             # Reset Button
-            reset_button = gr.Button("Reset / Start New Video")
         with gr.Column():
-            prediction_output = gr.Textbox(label="Prediction Result", value="Recording will start automatically.")
     # Define actions
     # This continuously processes frames from the webcam
     webcam_input.stream(
-        process_frame_and_predict,
         inputs=[webcam_input],
-        outputs=[status_output, prediction_output] # Now outputs both status and prediction
     )
     # This triggers the reset function when the button is clicked
     reset_button.click(
-        reset_app_state,
         inputs=[],
-        outputs=[status_output, prediction_output] # Updates both output textboxes
     )
 if __name__ == "__main__":

 HF_MODEL_REPO_ID = "owinymarvin/timesformer-crime-detection"
 # These must match the values used during your training
+# IMPORTANT: The model was trained with NUM_FRAMES = 8.
+# Feeding it 20 frames would not match the training setup, so the model will likely NOT perform well.
+# If 20-frame input is truly needed, the model should be retrained with 20 frames.
+# For now the model input stays at 8 (as in training); 20 frames are only sampled from the raw capture as an intermediate step.
+MODEL_INPUT_NUM_FRAMES = 8 # This is the 'NUM_FRAMES' the model expects
 TARGET_IMAGE_HEIGHT = 224
 TARGET_IMAGE_WIDTH = 224
+# --- Video Capture & Prediction Timing ---
+RAW_RECORDING_DURATION_SECONDS = 10.0 # Capture raw frames for this duration for each clip
+FRAMES_TO_SAMPLE_PER_CLIP = 20 # Number of frames to hypothetically sample from the raw 10s clip
+                               # NOTE: The model will only use MODEL_INPUT_NUM_FRAMES (8) of these.
+# The delay *after* a prediction is made before the next prediction cycle starts.
+# Set to 120.0 seconds (2 minutes) for CPU testing. Change this for GPU.
+DELAY_BETWEEN_PREDICTIONS_SECONDS = 120.0 # CHANGED: Variable for delay between predictions
 # --- Load Model and Processor ---
 print(f"Model's class labels: {model.config.id2label}")
 # --- Global State Variables ---
+# Buffer to store raw frames from the webcam for the current 10-second segment
+raw_frames_buffer = deque() # No maxlen, we manage size based on time
+current_clip_start_time = time.time() # Time when the current 10-second clip started
+last_prediction_completion_time = time.time() # Time when the last prediction finished
+# State machine for the app's workflow
+# States: "recording", "processing_delay", "predicting"
+app_state = "recording"
+# --- Helper function to sample frames ---
+def sample_frames(frames_list, target_count):
+    """Pick `target_count` evenly spaced frames out of `frames_list`.
+
+    Args:
+        frames_list: Sequence of frames to choose from.
+        target_count: Number of frames wanted.
+
+    Returns:
+        An empty list when `frames_list` is empty. When `frames_list` holds
+        `target_count` frames or fewer, the very same `frames_list` object is
+        handed back untouched. Otherwise a new list of `target_count` frames
+        taken at indices spread evenly between 0 and the last index
+        (converted to integers).
+    """
+    if not frames_list:
+        return []
+    total = len(frames_list)
+    if total > target_count:
+        # Evenly spaced integer positions across the whole clip.
+        positions = np.linspace(0, total - 1, target_count, dtype=int)
+        return [frames_list[pos] for pos in positions]
+    # Too few frames to thin out: give back what we have.
+    return frames_list
+# --- Main processing function for Gradio Stream ---
+def live_predict_stream(image_np_array):
+    """Advance the record / predict / delay state machine by one webcam frame.
+
+    Wired as the webcam stream callback, so it runs once per streamed frame.
+    What happens depends on the module-level `app_state`:
+      - "recording": the frame is buffered until RAW_RECORDING_DURATION_SECONDS
+        have elapsed, then the state becomes "predicting" (and the prediction
+        branch below runs within this same call).
+      - "predicting": frames are sampled from the buffer, passed through the
+        processor and model, the buffer is cleared and the state becomes
+        "processing_delay".
+      - "processing_delay": nothing is buffered until
+        DELAY_BETWEEN_PREDICTIONS_SECONDS have passed, then the state returns
+        to "recording".
+
+    Args:
+        image_np_array: Current webcam frame (presumably an RGB numpy array
+            supplied by Gradio; verify against the webcam component settings).
+
+    Returns:
+        A (status_message, prediction_result) tuple of strings for the two
+        output textboxes.
+    """
+    global raw_frames_buffer, current_clip_start_time, last_prediction_completion_time, app_state
+    # Timestamp taken once at the start of the call and reused by every branch below.
     current_time = time.time()
+    # NOTE(review): the frame is channel-swapped (RGB -> BGR) before being wrapped
+    # in a PIL image, and that swapped image is what later reaches `processor`.
+    # No cv2 drawing is visible in this function, so the model is probably being
+    # fed inverted colour channels. Confirm, and consider using the array as-is.
+    pil_image = Image.fromarray(cv2.cvtColor(image_np_array, cv2.COLOR_RGB2BGR)) # Convert RGB to BGR if using cv2.putText later, otherwise RGB is fine
+    # Both outputs start empty; a branch that does not assign them returns "".
+    status_message = ""
+    prediction_result = ""
+    if app_state == "recording":
+        raw_frames_buffer.append(pil_image)
+        elapsed_recording_time = current_time - current_clip_start_time
+        if elapsed_recording_time < RAW_RECORDING_DURATION_SECONDS:
+            status_message = f"Recording: {elapsed_recording_time:.1f}/{RAW_RECORDING_DURATION_SECONDS}s. Total raw frames: {len(raw_frames_buffer)}"
+            prediction_result = "Buffering for next clip..."
+        else:
+            # Done recording, now move to predicting state
+            app_state = "predicting"
+            status_message = f"Finished recording {RAW_RECORDING_DURATION_SECONDS}s. Preparing for prediction..."
+            prediction_result = "Processing clip..."
+            print(f"DEBUG: Entering 'predicting' state. Raw frames collected: {len(raw_frames_buffer)}")
+    # Deliberately a fresh `if`, not `elif`: a clip that just finished recording
+    # above is predicted within the same call.
+    if app_state == "predicting":
+        # Ensure prediction logic runs only once per clip
+        if raw_frames_buffer: # Check if there are frames to process
+            print(f"DEBUG: Performing prediction.")
+            # 1. Sample FRAMES_TO_SAMPLE_PER_CLIP from the raw buffer
+            # Note: Your model was trained on MODEL_INPUT_NUM_FRAMES.
+            # We'll sample 20 from the raw, but then further sample 8 for the model.
+            sampled_raw_frames = sample_frames(list(raw_frames_buffer), FRAMES_TO_SAMPLE_PER_CLIP)
+            # 2. Select MODEL_INPUT_NUM_FRAMES from the sampled frames for the model
+            frames_for_model = sample_frames(sampled_raw_frames, MODEL_INPUT_NUM_FRAMES)
+            if len(frames_for_model) < MODEL_INPUT_NUM_FRAMES:
+                # This should ideally not happen if RAW_RECORDING_DURATION_SECONDS is long enough
+                # and camera FPS is stable.
+                prediction_result = "Not enough frames for model input. Waiting for more..."
+                status_message = "Error: Not enough frames for model."
+                print(f"WARNING: Insufficient frames for model input: {len(frames_for_model)}/{MODEL_INPUT_NUM_FRAMES}")
+                # Reset state if we can't predict
+                app_state = "recording"
+                raw_frames_buffer.clear()
+                current_clip_start_time = time.time()
+                last_prediction_completion_time = time.time() # Reset delay counter too
+                return status_message, prediction_result
+            # Preprocess and predict
+            # NOTE(review): this diff view does not show the statements inside the
+            # `torch.no_grad()` block that define `logits` and `predicted_class_id`;
+            # check them in the full file.
+            processed_input = processor(images=frames_for_model, return_tensors="pt")
             pixel_values = processed_input.pixel_values.to(device)
             with torch.no_grad():
             predicted_label = model.config.id2label[predicted_class_id]
             confidence = torch.nn.functional.softmax(logits, dim=-1)[0][predicted_class_id].item()
+            prediction_result = f"Predicted: {predicted_label} ({confidence:.2f})"
+            print(f"DEBUG: {prediction_result}")
+            # Clear raw buffer as this clip has been processed
+            raw_frames_buffer.clear()
+            # NOTE(review): `current_time` was captured at the top of this call,
+            # i.e. before inference ran, so the delay is measured from the start
+            # of the prediction rather than from its completion.
+            last_prediction_completion_time = current_time # Mark time prediction finished
+            app_state = "processing_delay" # Move to delay state
+            status_message = f"Prediction complete. Waiting for {DELAY_BETWEEN_PREDICTIONS_SECONDS}s delay."
         else:
+             # This means app_state is predicting but raw_frames_buffer is empty, should not happen in normal flow
+             status_message = "Waiting for frames to process..."
+             prediction_result = "..."
+    elif app_state == "processing_delay":
+        elapsed_delay = current_time - last_prediction_completion_time
+        if elapsed_delay < DELAY_BETWEEN_PREDICTIONS_SECONDS:
+            status_message = f"Delaying next prediction: {int(elapsed_delay)}/{DELAY_BETWEEN_PREDICTIONS_SECONDS}s"
+            # Keep showing the last prediction result during the delay
+            # NOTE(review): `prediction_result` is still the empty string set at
+            # the top of the function here, so an empty value is what gets
+            # returned for the prediction textbox. Verify that the last result
+            # really stays visible in the UI.
+        else:
+            # Delay is over, reset for next recording cycle
+            app_state = "recording"
+            current_clip_start_time = current_time # Start new recording clip
+            status_message = "Delay finished. Starting new recording..."
+            prediction_result = "Recording for next clip..."
+            print(f"DEBUG: Delay finished. Entering 'recording' state.")
+    return status_message, prediction_result
+def reset_app_state_manual():
+    """Restart the capture cycle from scratch (reset-button callback).
+
+    Empties the raw frame buffer, stamps the clip-start and last-prediction
+    timers with the current time, and forces the state machine back to
+    "recording".
+
+    Returns:
+        A (status, prediction) pair of strings for the two output textboxes.
+    """
+    global raw_frames_buffer, current_clip_start_time, last_prediction_completion_time, app_state
+    status_text = "Ready to record..."
+    prediction_text = "Ready for new prediction cycle."
+    # Drop whatever was captured for the clip in progress.
+    raw_frames_buffer.clear()
+    # Restart both timers so neither the recording window nor the delay carries over.
+    current_clip_start_time = time.time()
+    last_prediction_completion_time = time.time()
+    # Skip any pending delay and go straight back to capturing.
+    app_state = "recording"
+    print("Manual reset: App state reset and starting new recording cycle.")
+    return status_text, prediction_text
 # --- Gradio Interface ---
 with gr.Blocks() as demo:
     gr.Markdown(
         f"""
+        # TimesFormer Crime Detection Live Demo (Segmented Auto-Prediction)
+        This demo continuously captures live webcam feed.
+        It records raw video for **{RAW_RECORDING_DURATION_SECONDS} seconds**.
+        From this, it samples **{FRAMES_TO_SAMPLE_PER_CLIP} frames** (for context) and then extracts **{MODEL_INPUT_NUM_FRAMES} frames**
+        for the TimesFormer model to make a prediction.
+        After each prediction, there's a **{DELAY_BETWEEN_PREDICTIONS_SECONDS/60:.0f} minute delay** before the next prediction cycle begins.
         Please allow webcam access.
         """
     )
                 label="Live Webcam Feed"
             )
             # Textboxes for status and prediction
+            status_output = gr.Textbox(label="Current Status", value="Initializing...")
             # Reset Button
+            reset_button = gr.Button("Manual Reset / Start New Cycle Immediately")
         with gr.Column():
+            prediction_output = gr.Textbox(label="Prediction Result", value="Waiting for recording to start...")
     # Define actions
     # This continuously processes frames from the webcam
     webcam_input.stream(
+        live_predict_stream,
         inputs=[webcam_input],
+        outputs=[status_output, prediction_output]
     )
     # This triggers the reset function when the button is clicked
     reset_button.click(
+        reset_app_state_manual,
         inputs=[],
+        outputs=[status_output, prediction_output]
     )
 if __name__ == "__main__":