Spaces · Running
Commit 4873e8b · latest changes
Parent: 4784ef2

app.py CHANGED
Old version (removed lines marked "-"):

@@ -6,77 +6,49 @@ from PIL import Image

  import numpy as np
  import time
  from collections import deque

- # --- Configuration ---
- # Your Hugging Face model repository ID
  HF_MODEL_REPO_ID = "owinymarvin/timesformer-crime-detection"
-
- # These must match the values used during your training.
- # IMPORTANT: Your model was trained on NUM_FRAMES = 8.
- # If you want to use 20 frames, this model will likely NOT perform well,
- # as that is a mismatch. If you truly need 20 frames, retrain the model with 20.
- # For now, keep it at 8 as per training; we can still capture 20 frames for sampling.
- MODEL_INPUT_NUM_FRAMES = 8  # the 'NUM_FRAMES' the model expects
  TARGET_IMAGE_HEIGHT = 224
  TARGET_IMAGE_WIDTH = 224

- RAW_RECORDING_DURATION_SECONDS = 10.0  # capture raw frames for this duration for each clip
- FRAMES_TO_SAMPLE_PER_CLIP = 20  # number of frames to sample from the raw 10 s clip
- # NOTE: the model will only use MODEL_INPUT_NUM_FRAMES (8) of these.
-
- # The delay *after* a prediction is made before the next prediction cycle starts.
- # Set to 120.0 seconds (2 minutes) for CPU testing. Change this for GPU.
- DELAY_BETWEEN_PREDICTIONS_SECONDS = 120.0  # CHANGED: delay between predictions
-
- # --- Load Model and Processor ---
- print(f"Loading model and image processor from {HF_MODEL_REPO_ID}...")
  try:
      processor = AutoImageProcessor.from_pretrained(HF_MODEL_REPO_ID)
      model = TimesformerForVideoClassification.from_pretrained(HF_MODEL_REPO_ID)
  except Exception as e:
-     print(f"Error loading model: {e}")
      exit()

- model.eval()
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  model.to(device)
- print(f"Model loaded on {device}.")
- print(f"Model's class labels: {model.config.id2label}")

- raw_frames_buffer = deque()
- current_clip_start_time = time.time()  # time when the current 10-second clip started
- last_prediction_completion_time = time.time()  # time when the last prediction finished
-
- # State machine for the app's workflow.
- # States: "recording", "processing_delay", "predicting"
  app_state = "recording"

- # --- Helper function to sample frames ---
  def sample_frames(frames_list, target_count):
-     """
-     Samples target_count frames evenly from a list of frames.
-     If frames_list has fewer than target_count, it returns all frames.
-     """
      if not frames_list:
          return []
      if len(frames_list) <= target_count:
          return frames_list
      indices = np.linspace(0, len(frames_list) - 1, target_count, dtype=int)
-     sampled = [frames_list[int(i)] for i in indices]
      return sampled

- # --- Main processing function for the Gradio stream ---
  def live_predict_stream(image_np_array):
      global raw_frames_buffer, current_clip_start_time, last_prediction_completion_time, app_state

      current_time = time.time()
-     pil_image = Image.fromarray(image_np_array)

      status_message = ""
      prediction_result = ""
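Aside: the configuration comments above stress that the model was trained with NUM_FRAMES = 8, so whatever is captured, exactly eight frames must reach TimesFormer. A minimal shape check, as a sketch only (the zero-filled frames are placeholders, and it assumes the model repo is reachable):

    import numpy as np
    from PIL import Image
    from transformers import AutoImageProcessor

    # Hypothetical sanity check: eight 224x224 RGB frames in, one video tensor out.
    processor = AutoImageProcessor.from_pretrained("owinymarvin/timesformer-crime-detection")
    frames = [Image.fromarray(np.zeros((224, 224, 3), dtype=np.uint8)) for _ in range(8)]
    inputs = processor(images=frames, return_tensors="pt")
    # TimesFormer expects (batch, num_frames, channels, height, width), with num_frames == 8 here.
    print(inputs.pixel_values.shape)  # expected: torch.Size([1, 8, 3, 224, 224])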
@@ -84,136 +56,131 @@ def live_predict_stream(image_np_array):

      if app_state == "recording":
          raw_frames_buffer.append(pil_image)
          elapsed_recording_time = current_time - current_clip_start_time
-         … (removed lines 87–89 not shown)
-         prediction_result = "Buffering for next clip..."
-     else:
-         # Done recording; now move to the predicting state
          app_state = "predicting"
-         status_message = …
-         prediction_result = "Processing…
-         print(…
-         … (removed lines 97–118 not shown)
          raw_frames_buffer.clear()
-         … (removed lines 120–128 not shown)
-         outputs = model(pixel_values)
-         logits = outputs.logits
-
-         predicted_class_id = logits.argmax(-1).item()
-         predicted_label = model.config.id2label[predicted_class_id]
-         confidence = torch.nn.functional.softmax(logits, dim=-1)[0][predicted_class_id].item()
-
-         prediction_result = f"Predicted: {predicted_label} ({confidence:.2f})"
-         print(f"DEBUG: {prediction_result}")
-
-         # Clear the raw buffer, as this clip has been processed
-         raw_frames_buffer.clear()
-         last_prediction_completion_time = current_time  # mark the time the prediction finished
-         app_state = "processing_delay"  # move to the delay state
-         status_message = f"Prediction complete. Waiting for {DELAY_BETWEEN_PREDICTIONS_SECONDS}s delay."
      else:
-         … (removed lines 145–146 not shown)
-         prediction_result = "..."

      elif app_state == "processing_delay":
          elapsed_delay = current_time - last_prediction_completion_time
-         … (removed lines 151–152 not shown)
-         # Keep showing the last prediction result during the delay
-     else:
-         # Delay is over; reset for the next recording cycle
          app_state = "recording"
-         current_clip_start_time = current_time
-         status_message = "…
-         prediction_result = "…
-         print(…

      return status_message, prediction_result

  def reset_app_state_manual():
-     """Resets the global state variables and starts a new recording cycle immediately."""
      global raw_frames_buffer, current_clip_start_time, last_prediction_completion_time, app_state
      raw_frames_buffer.clear()
      current_clip_start_time = time.time()
      last_prediction_completion_time = time.time()
-     app_state = "recording"
-     print("…
-     return "Ready to record...", "Ready for new prediction…

- # --- Gradio Interface ---
  with gr.Blocks() as demo:
      gr.Markdown(
          f"""
-         # TimesFormer Crime Detection
-         This … (truncated in the viewer)
-         From this, it samples **{FRAMES_TO_SAMPLE_PER_CLIP} frames** (for context) and then extracts **{MODEL_INPUT_NUM_FRAMES} frames**
-         for the TimesFormer model to make a prediction.
-         After each prediction, there is a **{DELAY_BETWEEN_PREDICTIONS_SECONDS/60:.0f} minute delay** before the next prediction cycle begins.
-         Please allow webcam access.
          """
      )
-     … (removed lines 187–216, the old interface layout, not shown)

  if __name__ == "__main__":
      demo.launch()
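Both versions share the same two-stage sampling strategy: np.linspace picks evenly spaced indices, so 20 frames are first drawn from the ~10 s buffer and 8 of those are then handed to the model. A self-contained sketch of that helper, with plain integers standing in for frames:

    import numpy as np

    def sample_frames(frames_list, target_count):
        # Evenly spaced indices over the whole clip; short clips are returned as-is.
        if not frames_list:
            return []
        if len(frames_list) <= target_count:
            return frames_list
        indices = np.linspace(0, len(frames_list) - 1, target_count, dtype=int)
        return [frames_list[int(i)] for i in indices]

    raw = list(range(100))                   # stand-in for ~100 captured frames
    context = sample_frames(raw, 20)         # 20 evenly spaced frames from the clip
    model_input = sample_frames(context, 8)  # 8 of those go to TimesFormer
    print(model_input)                       # [0, 10, 26, 41, 52, 67, 83, 99]

Even spacing gives the model temporal coverage of the whole clip instead of just its first eight frames.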
New version (added lines marked "+"):

  import numpy as np
  import time
  from collections import deque
+ import base64
+ import io

  HF_MODEL_REPO_ID = "owinymarvin/timesformer-crime-detection"
+ MODEL_INPUT_NUM_FRAMES = 8
  TARGET_IMAGE_HEIGHT = 224
  TARGET_IMAGE_WIDTH = 224
+ RAW_RECORDING_DURATION_SECONDS = 10.0
+ FRAMES_TO_SAMPLE_PER_CLIP = 20
+ DELAY_BETWEEN_PREDICTIONS_SECONDS = 120.0

+ print(f"Loading model and processor from {HF_MODEL_REPO_ID}...")
  try:
      processor = AutoImageProcessor.from_pretrained(HF_MODEL_REPO_ID)
      model = TimesformerForVideoClassification.from_pretrained(HF_MODEL_REPO_ID)
  except Exception as e:
+     print(f"Error loading model: {e}")
      exit()

+ model.eval()
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  model.to(device)
+ print(f"Model loaded on {device}.")

+ raw_frames_buffer = deque()
+ current_clip_start_time = time.time()
+ last_prediction_completion_time = time.time()
  app_state = "recording"

  def sample_frames(frames_list, target_count):
      if not frames_list:
          return []
      if len(frames_list) <= target_count:
          return frames_list
      indices = np.linspace(0, len(frames_list) - 1, target_count, dtype=int)
+     sampled = [frames_list[int(i)] for i in indices]  # corrected sampling (list indexing, not a call)
      return sampled

  def live_predict_stream(image_np_array):
      global raw_frames_buffer, current_clip_start_time, last_prediction_completion_time, app_state

      current_time = time.time()
+     pil_image = Image.fromarray(image_np_array)

      status_message = ""
      prediction_result = ""

      if app_state == "recording":
          raw_frames_buffer.append(pil_image)
          elapsed_recording_time = current_time - current_clip_start_time
+         status_message = f"Recording: {elapsed_recording_time:.1f}/{RAW_RECORDING_DURATION_SECONDS}s. Raw frames: {len(raw_frames_buffer)}"
+         prediction_result = "Buffering..."
+         if elapsed_recording_time >= RAW_RECORDING_DURATION_SECONDS:
              app_state = "predicting"
+             status_message = "Preparing to predict..."
+             prediction_result = "Processing..."
+             print("DEBUG: Transitioning to 'predicting' state.")

+     elif app_state == "predicting":
+         if raw_frames_buffer:
+             print("DEBUG: Starting prediction.")
+             try:
+                 sampled_raw_frames = sample_frames(list(raw_frames_buffer), FRAMES_TO_SAMPLE_PER_CLIP)
+                 frames_for_model = sample_frames(sampled_raw_frames, MODEL_INPUT_NUM_FRAMES)

+                 if len(frames_for_model) < MODEL_INPUT_NUM_FRAMES:
+                     prediction_result = "Error: Not enough frames for model."
+                     status_message = "Error during frame sampling."
+                     app_state = "recording"
+                     raw_frames_buffer.clear()
+                     current_clip_start_time = time.time()
+                     last_prediction_completion_time = time.time()
+                     return status_message, prediction_result

+                 processed_input = processor(images=frames_for_model, return_tensors="pt")
+                 pixel_values = processed_input.pixel_values.to(device)

+                 with torch.no_grad():
+                     outputs = model(pixel_values)
+                     logits = outputs.logits

+                 predicted_class_id = logits.argmax(-1).item()
+                 predicted_label = model.config.id2label.get(predicted_class_id, "Unknown")  # handle a potentially missing label
+                 confidence = torch.nn.functional.softmax(logits, dim=-1)[0][predicted_class_id].item()

+                 prediction_result = f"Predicted: {predicted_label} (Confidence: {confidence:.2f})"
+                 status_message = "Prediction complete."
+                 print(f"DEBUG: Prediction Result: {prediction_result}")

                  raw_frames_buffer.clear()
+                 last_prediction_completion_time = current_time
+                 app_state = "processing_delay"
+                 print("DEBUG: Transitioning to 'processing_delay' state.")

+             except Exception as e:
+                 prediction_result = f"Error during prediction: {e}"
+                 status_message = "Prediction error."
+                 print(f"ERROR during prediction: {e}")
+                 app_state = "processing_delay"  # move to the delay state to avoid continuous errors
          else:
+             status_message = "Waiting for frames..."
+             prediction_result = "..."

      elif app_state == "processing_delay":
          elapsed_delay = current_time - last_prediction_completion_time
+         status_message = f"Delaying next prediction: {int(elapsed_delay)}/{int(DELAY_BETWEEN_PREDICTIONS_SECONDS)}s"
+         if elapsed_delay >= DELAY_BETWEEN_PREDICTIONS_SECONDS:
              app_state = "recording"
+             current_clip_start_time = current_time
+             status_message = "Starting new recording..."
+             prediction_result = "Ready..."
+             print("DEBUG: Transitioning back to 'recording' state.")

      return status_message, prediction_result

  def reset_app_state_manual():
      global raw_frames_buffer, current_clip_start_time, last_prediction_completion_time, app_state
      raw_frames_buffer.clear()
      current_clip_start_time = time.time()
      last_prediction_completion_time = time.time()
+     app_state = "recording"
+     print("DEBUG: Manual reset triggered.")
+     return "Ready to record...", "Ready for new prediction."

  with gr.Blocks() as demo:
      gr.Markdown(
          f"""
+         # TimesFormer Crime Detection - Hugging Face Space Host
+         This Space hosts the `owinymarvin/timesformer-crime-detection` model.
+         Live webcam demo with recording and prediction phases.
          """
      )

+     with gr.Tab("Live Webcam Demo"):
+         gr.Markdown(
+             f"""
+             Continuously captures the live webcam feed for **{RAW_RECORDING_DURATION_SECONDS} seconds**,
+             then makes a prediction. There is a **{DELAY_BETWEEN_PREDICTIONS_SECONDS/60:.0f} minute delay** afterwards.
+             """
+         )
+         with gr.Row():
+             with gr.Column():
+                 webcam_input = gr.Image(
+                     sources=["webcam"],
+                     streaming=True,
+                     label="Live Webcam Feed"
+                 )
+                 status_output = gr.Textbox(label="Current Status", value="Initializing...")
+                 reset_button = gr.Button("Reset / Start New Cycle")
+             with gr.Column():
+                 prediction_output = gr.Textbox(label="Prediction Result", value="Waiting...")

+         webcam_input.stream(
+             live_predict_stream,
+             inputs=[webcam_input],
+             outputs=[status_output, prediction_output]
+         )
+         reset_button.click(
+             reset_app_state_manual,
+             inputs=[],
+             outputs=[status_output, prediction_output]
+         )

+     with gr.Tab("API Endpoint for External Clients"):
+         gr.Markdown(
+             """
+             Use this API endpoint to send base64-encoded frames for prediction.
+             """
+         )
+         gr.Interface(
+             fn=lambda x: "API endpoint is active",
+             inputs=gr.Textbox(label="Input (Base64 JSON)"),
+             outputs=gr.Textbox(label="Status"),
+             title="API Status (Details in app.py)"  # minimal UI for the API tab
+         )

  if __name__ == "__main__":
      demo.launch()
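The new base64/io imports and the "API Endpoint for External Clients" tab point at external callers posting base64-encoded frames, but the tab in this commit is only a status stub. A hedged client sketch, assuming gradio_client is installed; the Space ID, the api_name, and the JSON payload shape are guesses, not part of this commit:

    import base64
    import io
    import json

    from PIL import Image
    from gradio_client import Client

    # Hypothetical client for the stub API tab above. The Space ID is assumed
    # to match the model repo; the payload format is an assumption.
    client = Client("owinymarvin/timesformer-crime-detection")

    def encode_frame(image: Image.Image) -> str:
        # JPEG-compress the frame and base64-encode it for transport.
        buf = io.BytesIO()
        image.save(buf, format="JPEG")
        return base64.b64encode(buf.getvalue()).decode("utf-8")

    frame = Image.new("RGB", (224, 224))
    payload = json.dumps({"frames": [encode_frame(frame)]})
    print(client.predict(payload, api_name="/predict"))  # the stub returns its status text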