Spaces:
Running
Running
Commit
·
4784ef2
1
Parent(s):
bc79b5b
latest changes
Browse files
app.py
CHANGED
@@ -12,17 +12,22 @@ from collections import deque
|
|
12 |
HF_MODEL_REPO_ID = "owinymarvin/timesformer-crime-detection"
|
13 |
|
14 |
# These must match the values used during your training
|
15 |
-
|
|
|
|
|
|
|
|
|
16 |
TARGET_IMAGE_HEIGHT = 224
|
17 |
TARGET_IMAGE_WIDTH = 224
|
18 |
|
19 |
-
# --- Prediction Timing ---
|
20 |
-
#
|
21 |
-
|
22 |
-
#
|
23 |
-
|
24 |
-
# after
|
25 |
-
|
|
|
26 |
|
27 |
|
28 |
# --- Load Model and Processor ---
|
@@ -41,40 +46,83 @@ print(f"Model loaded successfully on {device}.")
|
|
41 |
print(f"Model's class labels: {model.config.id2label}")
|
42 |
|
43 |
# --- Global State Variables ---
|
44 |
-
#
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
#
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
62 |
|
63 |
current_time = time.time()
|
64 |
-
|
65 |
|
66 |
-
|
67 |
-
|
68 |
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
# --- Perform Inference ---
|
73 |
-
print(f"Triggered inference on {len(captured_frames_buffer)} frames after {RECORDING_DURATION_SECONDS}s recording...")
|
74 |
-
frames_for_prediction = list(captured_frames_buffer) # Take a snapshot
|
75 |
|
76 |
-
|
77 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
78 |
pixel_values = processed_input.pixel_values.to(device)
|
79 |
|
80 |
with torch.no_grad():
|
@@ -85,38 +133,54 @@ def process_frame_and_predict(image_np_array):
|
|
85 |
predicted_label = model.config.id2label[predicted_class_id]
|
86 |
confidence = torch.nn.functional.softmax(logits, dim=-1)[0][predicted_class_id].item()
|
87 |
|
88 |
-
|
89 |
-
print(
|
90 |
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
else:
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
110 |
|
111 |
# --- Gradio Interface ---
|
112 |
with gr.Blocks() as demo:
|
113 |
gr.Markdown(
|
114 |
f"""
|
115 |
-
# TimesFormer Crime Detection Live Demo (Auto-
|
116 |
-
This demo
|
117 |
-
It records **{
|
118 |
-
|
119 |
-
|
|
|
120 |
Please allow webcam access.
|
121 |
"""
|
122 |
)
|
@@ -128,27 +192,27 @@ with gr.Blocks() as demo:
|
|
128 |
label="Live Webcam Feed"
|
129 |
)
|
130 |
# Textboxes for status and prediction
|
131 |
-
status_output = gr.Textbox(label="Status", value="
|
132 |
|
133 |
# Reset Button
|
134 |
-
reset_button = gr.Button("Reset / Start New
|
135 |
|
136 |
with gr.Column():
|
137 |
-
prediction_output = gr.Textbox(label="Prediction Result", value="
|
138 |
|
139 |
# Define actions
|
140 |
# This continuously processes frames from the webcam
|
141 |
webcam_input.stream(
|
142 |
-
|
143 |
inputs=[webcam_input],
|
144 |
-
outputs=[status_output, prediction_output]
|
145 |
)
|
146 |
|
147 |
# This triggers the reset function when the button is clicked
|
148 |
reset_button.click(
|
149 |
-
|
150 |
inputs=[],
|
151 |
-
outputs=[status_output, prediction_output]
|
152 |
)
|
153 |
|
154 |
if __name__ == "__main__":
|
|
|
12 |
HF_MODEL_REPO_ID = "owinymarvin/timesformer-crime-detection"
|
13 |
|
14 |
# These must match the values used during your training
|
15 |
+
# IMPORTANT: Your model was trained on NUM_FRAMES = 8.
|
16 |
+
# If you want to use 20 frames, this model will likely NOT perform well
|
17 |
+
# as it's a mismatch. If you truly need 20 frames, the model should be retrained with 20.
|
18 |
+
# For now, keep it at 8 to match training; 20 frames are first sampled from the raw clip and then reduced to 8 for the model.
|
19 |
+
MODEL_INPUT_NUM_FRAMES = 8 # This is the 'NUM_FRAMES' the model expects
|
20 |
TARGET_IMAGE_HEIGHT = 224
|
21 |
TARGET_IMAGE_WIDTH = 224
|
22 |
|
23 |
+
# --- Video Capture & Prediction Timing ---
|
24 |
+
RAW_RECORDING_DURATION_SECONDS = 10.0 # Capture raw frames for this duration for each clip
|
25 |
+
FRAMES_TO_SAMPLE_PER_CLIP = 20 # Number of frames to hypothetically sample from the raw 10s clip
|
26 |
+
# NOTE: The model will only use MODEL_INPUT_NUM_FRAMES (8) of these.
|
27 |
+
|
28 |
+
# The delay *after* a prediction is made before the next prediction cycle starts.
|
29 |
+
# Set to 120.0 seconds (2 minutes) for CPU testing. Change this for GPU.
|
30 |
+
DELAY_BETWEEN_PREDICTIONS_SECONDS = 120.0 # CHANGED: Variable for delay between predictions
|
31 |
|
32 |
|
33 |
# --- Load Model and Processor ---
|
|
|
46 |
print(f"Model's class labels: {model.config.id2label}")
|
47 |
|
48 |
# --- Global State Variables ---
|
49 |
+
# Buffer to store raw frames from the webcam for the current 10-second segment
|
50 |
+
raw_frames_buffer = deque() # No maxlen, we manage size based on time
|
51 |
+
current_clip_start_time = time.time() # Time when the current 10-second clip started
|
52 |
+
last_prediction_completion_time = time.time() # Time when the last prediction finished
|
53 |
+
|
54 |
+
# State machine for the app's workflow
|
55 |
+
# States: "recording", "processing_delay", "predicting"
|
56 |
+
app_state = "recording"
|
57 |
+
|
58 |
+
# --- Helper function to sample frames ---
|
59 |
+
def sample_frames(frames_list, target_count):
    """Pick ``target_count`` frames spread evenly across ``frames_list``.

    Args:
        frames_list: Indexable, sized sequence of frames in capture order.
        target_count: Number of frames wanted.

    Returns:
        A new list. It is empty when ``frames_list`` is empty or
        ``target_count`` is not positive. It is a shallow copy of every frame
        when there are no more than ``target_count`` of them. Otherwise it
        holds ``target_count`` frames whose indices run evenly from the first
        frame to the last one (both ends included when ``target_count`` >= 2).
    """
    total = len(frames_list)
    if total == 0 or target_count <= 0:
        # A non-positive request has nothing to return; do not raise.
        return []
    if total <= target_count:
        # Copy so the result never aliases the caller's list.
        return list(frames_list)
    if target_count == 1:
        return [frames_list[0]]

    # Exact integer form of floor(linspace(0, total - 1, target_count)):
    # no float rounding can push an index one below its intended value.
    last_index = total - 1
    intervals = target_count - 1
    return [frames_list[(i * last_index) // intervals] for i in range(target_count)]
|
72 |
+
|
73 |
+
|
74 |
+
# --- Main processing function for Gradio Stream ---
|
75 |
+
def live_predict_stream(image_np_array):
|
76 |
+
global raw_frames_buffer, current_clip_start_time, last_prediction_completion_time, app_state
|
77 |
|
78 |
current_time = time.time()
|
79 |
+
pil_image = Image.fromarray(cv2.cvtColor(image_np_array, cv2.COLOR_RGB2BGR)) # Convert RGB to BGR if using cv2.putText later, otherwise RGB is fine
|
80 |
|
81 |
+
status_message = ""
|
82 |
+
prediction_result = ""
|
83 |
|
84 |
+
if app_state == "recording":
|
85 |
+
raw_frames_buffer.append(pil_image)
|
86 |
+
elapsed_recording_time = current_time - current_clip_start_time
|
|
|
|
|
|
|
87 |
|
88 |
+
if elapsed_recording_time < RAW_RECORDING_DURATION_SECONDS:
|
89 |
+
status_message = f"Recording: {elapsed_recording_time:.1f}/{RAW_RECORDING_DURATION_SECONDS}s. Total raw frames: {len(raw_frames_buffer)}"
|
90 |
+
prediction_result = "Buffering for next clip..."
|
91 |
+
else:
|
92 |
+
# Done recording, now move to predicting state
|
93 |
+
app_state = "predicting"
|
94 |
+
status_message = f"Finished recording {RAW_RECORDING_DURATION_SECONDS}s. Preparing for prediction..."
|
95 |
+
prediction_result = "Processing clip..."
|
96 |
+
print(f"DEBUG: Entering 'predicting' state. Raw frames collected: {len(raw_frames_buffer)}")
|
97 |
+
|
98 |
+
if app_state == "predicting":
|
99 |
+
# Ensure prediction logic runs only once per clip
|
100 |
+
if raw_frames_buffer: # Check if there are frames to process
|
101 |
+
print(f"DEBUG: Performing prediction.")
|
102 |
+
|
103 |
+
# 1. Sample FRAMES_TO_SAMPLE_PER_CLIP from the raw buffer
|
104 |
+
# Note: Your model was trained on MODEL_INPUT_NUM_FRAMES.
|
105 |
+
# We'll sample 20 from the raw, but then further sample 8 for the model.
|
106 |
+
sampled_raw_frames = sample_frames(list(raw_frames_buffer), FRAMES_TO_SAMPLE_PER_CLIP)
|
107 |
+
|
108 |
+
# 2. Select MODEL_INPUT_NUM_FRAMES from the sampled frames for the model
|
109 |
+
frames_for_model = sample_frames(sampled_raw_frames, MODEL_INPUT_NUM_FRAMES)
|
110 |
+
|
111 |
+
if len(frames_for_model) < MODEL_INPUT_NUM_FRAMES:
|
112 |
+
# This should ideally not happen if RAW_RECORDING_DURATION_SECONDS is long enough
|
113 |
+
# and camera FPS is stable.
|
114 |
+
prediction_result = "Not enough frames for model input. Waiting for more..."
|
115 |
+
status_message = "Error: Not enough frames for model."
|
116 |
+
print(f"WARNING: Insufficient frames for model input: {len(frames_for_model)}/{MODEL_INPUT_NUM_FRAMES}")
|
117 |
+
# Reset state if we can't predict
|
118 |
+
app_state = "recording"
|
119 |
+
raw_frames_buffer.clear()
|
120 |
+
current_clip_start_time = time.time()
|
121 |
+
last_prediction_completion_time = time.time() # Reset delay counter too
|
122 |
+
return status_message, prediction_result
|
123 |
+
|
124 |
+
# Preprocess and predict
|
125 |
+
processed_input = processor(images=frames_for_model, return_tensors="pt")
|
126 |
pixel_values = processed_input.pixel_values.to(device)
|
127 |
|
128 |
with torch.no_grad():
|
|
|
133 |
predicted_label = model.config.id2label[predicted_class_id]
|
134 |
confidence = torch.nn.functional.softmax(logits, dim=-1)[0][predicted_class_id].item()
|
135 |
|
136 |
+
prediction_result = f"Predicted: {predicted_label} ({confidence:.2f})"
|
137 |
+
print(f"DEBUG: {prediction_result}")
|
138 |
|
139 |
+
# Clear raw buffer as this clip has been processed
|
140 |
+
raw_frames_buffer.clear()
|
141 |
+
last_prediction_completion_time = current_time # Mark time prediction finished
|
142 |
+
app_state = "processing_delay" # Move to delay state
|
143 |
+
status_message = f"Prediction complete. Waiting for {DELAY_BETWEEN_PREDICTIONS_SECONDS}s delay."
|
144 |
else:
|
145 |
+
# This means app_state is predicting but raw_frames_buffer is empty, should not happen in normal flow
|
146 |
+
status_message = "Waiting for frames to process..."
|
147 |
+
prediction_result = "..."
|
148 |
+
|
149 |
+
elif app_state == "processing_delay":
|
150 |
+
elapsed_delay = current_time - last_prediction_completion_time
|
151 |
+
if elapsed_delay < DELAY_BETWEEN_PREDICTIONS_SECONDS:
|
152 |
+
status_message = f"Delaying next prediction: {int(elapsed_delay)}/{DELAY_BETWEEN_PREDICTIONS_SECONDS}s"
|
153 |
+
# Keep showing the last prediction result during the delay
|
154 |
+
else:
|
155 |
+
# Delay is over, reset for next recording cycle
|
156 |
+
app_state = "recording"
|
157 |
+
current_clip_start_time = current_time # Start new recording clip
|
158 |
+
status_message = "Delay finished. Starting new recording..."
|
159 |
+
prediction_result = "Recording for next clip..."
|
160 |
+
print(f"DEBUG: Delay finished. Entering 'recording' state.")
|
161 |
+
|
162 |
+
return status_message, prediction_result
|
163 |
+
|
164 |
+
def reset_app_state_manual():
    """Discard the buffered clip and restart the capture cycle right away.

    Empties the raw frame buffer, restamps both the clip start time and the
    last-prediction time with the current time, and forces the state machine
    back to "recording".

    Returns:
        A ``(status, prediction)`` pair of placeholder strings.
    """
    global raw_frames_buffer, current_clip_start_time, last_prediction_completion_time, app_state

    # Drop any frames left over from the interrupted cycle.
    raw_frames_buffer.clear()

    # Restart both timers so neither the recording window nor the delay
    # window carries over from the previous cycle.
    current_clip_start_time = time.time()
    last_prediction_completion_time = time.time()

    # Force state to recording regardless of where the cycle was.
    app_state = "recording"

    print("Manual reset: App state reset and starting new recording cycle.")

    status_text = "Ready to record..."
    prediction_text = "Ready for new prediction cycle."
    return status_text, prediction_text
|
173 |
|
174 |
# --- Gradio Interface ---
|
175 |
with gr.Blocks() as demo:
|
176 |
gr.Markdown(
|
177 |
f"""
|
178 |
+
# TimesFormer Crime Detection Live Demo (Segmented Auto-Prediction)
|
179 |
+
This demo continuously captures live webcam feed.
|
180 |
+
It records raw video for **{RAW_RECORDING_DURATION_SECONDS} seconds**.
|
181 |
+
From this, it samples **{FRAMES_TO_SAMPLE_PER_CLIP} frames** (for context) and then extracts **{MODEL_INPUT_NUM_FRAMES} frames**
|
182 |
+
for the TimesFormer model to make a prediction.
|
183 |
+
After each prediction, there's a **{DELAY_BETWEEN_PREDICTIONS_SECONDS/60:.0f} minute delay** before the next prediction cycle begins.
|
184 |
Please allow webcam access.
|
185 |
"""
|
186 |
)
|
|
|
192 |
label="Live Webcam Feed"
|
193 |
)
|
194 |
# Textboxes for status and prediction
|
195 |
+
status_output = gr.Textbox(label="Current Status", value="Initializing...")
|
196 |
|
197 |
# Reset Button
|
198 |
+
reset_button = gr.Button("Manual Reset / Start New Cycle Immediately")
|
199 |
|
200 |
with gr.Column():
|
201 |
+
prediction_output = gr.Textbox(label="Prediction Result", value="Waiting for recording to start...")
|
202 |
|
203 |
# Define actions
|
204 |
# This continuously processes frames from the webcam
|
205 |
webcam_input.stream(
|
206 |
+
live_predict_stream,
|
207 |
inputs=[webcam_input],
|
208 |
+
outputs=[status_output, prediction_output]
|
209 |
)
|
210 |
|
211 |
# This triggers the reset function when the button is clicked
|
212 |
reset_button.click(
|
213 |
+
reset_app_state_manual,
|
214 |
inputs=[],
|
215 |
+
outputs=[status_output, prediction_output]
|
216 |
)
|
217 |
|
218 |
if __name__ == "__main__":
|