owinymarvin committed
Commit 89ce7bf · 1 Parent(s): 028d725

latest changes

Files changed (2):
  1. app.py (+11 -17)
  2. good copy.py (+212 -0)

app.py CHANGED

@@ -10,13 +10,15 @@ import base64
 import io
 
 # --- Configuration ---
-HF_MODEL_REPO_ID = "owinymarvin/timesformer-crime-detection"
+# CHANGED: Using a public Facebook TimesFormer model fine-tuned on Kinetics
+HF_MODEL_REPO_ID = "facebook/timesformer-base-finetuned-kinetics"
+
 MODEL_INPUT_NUM_FRAMES = 8
 TARGET_IMAGE_HEIGHT = 224
 TARGET_IMAGE_WIDTH = 224
 RAW_RECORDING_DURATION_SECONDS = 10.0
 FRAMES_TO_SAMPLE_PER_CLIP = 20
-DELAY_BETWEEN_PREDICTIONS_SECONDS = 120.0  # 2 minutes for CPU
+DELAY_BETWEEN_PREDICTIONS_SECONDS = 120.0  # 2 minutes for CPU, adjust for GPU
 
 # --- Load Model and Processor ---
 print(f"Loading model and processor from {HF_MODEL_REPO_ID}...")
@@ -31,6 +33,7 @@ model.eval()
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 model.to(device)
 print(f"Model loaded on {device}.")
+print(f"Model's class labels (Kinetics): {model.config.id2label}")  # Print new labels
 
 # --- Global State Variables for Live Demo ---
 raw_frames_buffer = deque()
@@ -121,8 +124,7 @@ def live_predict_stream(image_np_array):
 
         if elapsed_delay < DELAY_BETWEEN_PREDICTIONS_SECONDS:
             # Continue yielding the delay message and the last prediction result
-            # Assuming prediction_result from previous state is still held by UI
-            yield f"Delaying next prediction: {int(elapsed_delay)}/{int(DELAY_BETWEEN_PREDICTIONS_SECONDS)}s", gr.NO_VALUE  # NO_VALUE keeps previous prediction visible
+            yield f"Delaying next prediction: {int(elapsed_delay)}/{int(DELAY_BETWEEN_PREDICTIONS_SECONDS)}s", gr.NO_VALUE
         else:
             # Delay is over, reset for new recording cycle
             app_state = "recording"
@@ -130,13 +132,7 @@
             print("DEBUG: Transitioning back to 'recording' state.")
             yield "Starting new recording...", "Ready for new prediction."
 
-    # If for some reason nothing is yielded, return the current state to prevent UI freeze.
-    # This acts as a fallback if no state transition happens.
-    # However, with the yield statements, this might be less critical.
-    # For streaming, yielding is the preferred way to update.
-    # If the function ends without yielding, Gradio will just keep the last state.
-    # We always yield in every branch.
-    pass  # No explicit return needed at the end if all paths yield
+    pass
 
 def reset_app_state_manual():
     global raw_frames_buffer, current_clip_start_time, last_prediction_completion_time, app_state
@@ -152,9 +148,10 @@ def reset_app_state_manual():
 with gr.Blocks() as demo:
     gr.Markdown(
         f"""
-        # TimesFormer Crime Detection - Hugging Face Space Host
-        This Space hosts the `owinymarvin/timesformer-crime-detection` model.
+        # TimesFormer Action Recognition - Using Facebook Kinetics Model
+        This Space hosts the `{HF_MODEL_REPO_ID}` model.
         Live webcam demo with recording and prediction phases.
+        **NOTE: This model predicts general human actions (e.g., 'playing guitar', 'walking'), not crime events.**
         """
     )
 
@@ -177,15 +174,12 @@ with gr.Blocks() as demo:
             with gr.Column():
                 prediction_output = gr.Textbox(label="Prediction Result", value="Waiting...")
 
-        # IMPORTANT: Use webcam_input.stream() with a generator function (live_predict_stream)
-        # to enable progressive updates via 'yield'.
         webcam_input.stream(
            live_predict_stream,
            inputs=[webcam_input],
            outputs=[status_output, prediction_output]
        )
 
-        # The reset button is a regular click event, not a stream
         reset_button.click(
            reset_app_state_manual,
            inputs=[],
@@ -196,9 +190,9 @@ with gr.Blocks() as demo:
         gr.Markdown(
             """
             Use this API endpoint to send base64-encoded frames for prediction.
+            (Currently uses the Kinetics model.)
             """
         )
-        # Placeholder for the API tab. The actual API calls target /run/predict_from_frames_api
         gr.Interface(
             fn=lambda frames_list: "API endpoint is active for programmatic calls. See documentation in app.py.",
             inputs=gr.Json(label="List of Base64-encoded image strings"),
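
The substantive change above is the checkpoint swap from the private `owinymarvin/timesformer-crime-detection` repo to the public `facebook/timesformer-base-finetuned-kinetics` one, plus a startup print of its Kinetics label map. A minimal sketch for sanity-checking the swapped-in checkpoint outside the Space, assuming `torch` and `transformers` are installed, with dummy frames that mirror `MODEL_INPUT_NUM_FRAMES = 8` and the 224x224 target size from the config block:

import numpy as np
import torch
from transformers import AutoImageProcessor, TimesformerForVideoClassification

# Same repo ID the commit switches to.
REPO_ID = "facebook/timesformer-base-finetuned-kinetics"

processor = AutoImageProcessor.from_pretrained(REPO_ID)
model = TimesformerForVideoClassification.from_pretrained(REPO_ID).eval()

# 8 dummy RGB frames at 224x224, mirroring MODEL_INPUT_NUM_FRAMES and TARGET_IMAGE_*.
frames = [np.random.randint(0, 256, (224, 224, 3), dtype=np.uint8) for _ in range(8)]
inputs = processor(images=frames, return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits

top_id = logits.argmax(-1).item()
print(f"{len(model.config.id2label)} labels; top prediction: {model.config.id2label[top_id]}")

If the repo ID resolves and the label map lists the Kinetics action classes, the Space's own startup log line (`Model's class labels (Kinetics): ...`) should show the same mapping.
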
good copy.py ADDED
@@ -0,0 +1,212 @@
+import gradio as gr
+import torch
+from transformers import AutoImageProcessor, TimesformerForVideoClassification
+import cv2
+from PIL import Image
+import numpy as np
+import time
+from collections import deque
+import base64
+import io
+
+# --- Configuration ---
+HF_MODEL_REPO_ID = "owinymarvin/timesformer-crime-detection"
+MODEL_INPUT_NUM_FRAMES = 8
+TARGET_IMAGE_HEIGHT = 224
+TARGET_IMAGE_WIDTH = 224
+RAW_RECORDING_DURATION_SECONDS = 10.0
+FRAMES_TO_SAMPLE_PER_CLIP = 20
+DELAY_BETWEEN_PREDICTIONS_SECONDS = 120.0  # 2 minutes for CPU
+
+# --- Load Model and Processor ---
+print(f"Loading model and processor from {HF_MODEL_REPO_ID}...")
+try:
+    processor = AutoImageProcessor.from_pretrained(HF_MODEL_REPO_ID)
+    model = TimesformerForVideoClassification.from_pretrained(HF_MODEL_REPO_ID)
+except Exception as e:
+    print(f"Error loading model: {e}")
+    exit()
+
+model.eval()
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model.to(device)
+print(f"Model loaded on {device}.")
+
+# --- Global State Variables for Live Demo ---
+raw_frames_buffer = deque()
+current_clip_start_time = time.time()
+last_prediction_completion_time = time.time()
+app_state = "recording"  # States: "recording", "predicting", "processing_delay"
+
+# --- Helper function to sample frames ---
+def sample_frames(frames_list, target_count):
+    if not frames_list:
+        return []
+    if len(frames_list) <= target_count:
+        return frames_list
+    indices = np.linspace(0, len(frames_list) - 1, target_count, dtype=int)
+    sampled = [frames_list[int(i)] for i in indices]
+    return sampled
+
+# --- Main processing function for Live Demo Stream ---
+def live_predict_stream(image_np_array):
+    global raw_frames_buffer, current_clip_start_time, last_prediction_completion_time, app_state
+
+    current_time = time.time()
+    pil_image = Image.fromarray(image_np_array)
+
+    if app_state == "recording":
+        raw_frames_buffer.append(pil_image)
+        elapsed_recording_time = current_time - current_clip_start_time
+
+        yield f"Recording: {elapsed_recording_time:.1f}/{RAW_RECORDING_DURATION_SECONDS}s. Raw frames: {len(raw_frames_buffer)}", "Buffering..."
+
+        if elapsed_recording_time >= RAW_RECORDING_DURATION_SECONDS:
+            # Transition to predicting state
+            app_state = "predicting"
+            yield "Preparing to predict...", "Processing..."
+            print("DEBUG: Transitioning to 'predicting' state.")
+
+    elif app_state == "predicting":
+        # Ensure this prediction block only runs once per cycle
+        if raw_frames_buffer:  # Only proceed if there are frames to process
+            print("DEBUG: Starting prediction.")
+            try:
+                sampled_raw_frames = sample_frames(list(raw_frames_buffer), FRAMES_TO_SAMPLE_PER_CLIP)
+                frames_for_model = sample_frames(sampled_raw_frames, MODEL_INPUT_NUM_FRAMES)
+
+                if len(frames_for_model) < MODEL_INPUT_NUM_FRAMES:
+                    yield "Error during frame sampling.", f"Error: Not enough frames ({len(frames_for_model)}/{MODEL_INPUT_NUM_FRAMES}). Resetting."
+                    print(f"ERROR: Insufficient frames for model input: {len(frames_for_model)}/{MODEL_INPUT_NUM_FRAMES}. Resetting state.")
+                    app_state = "recording"  # Reset state to start a new recording
+                    raw_frames_buffer.clear()
+                    current_clip_start_time = time.time()
+                    last_prediction_completion_time = time.time()
+                    return  # Exit this stream call to wait for next frame or reset
+
+                processed_input = processor(images=frames_for_model, return_tensors="pt")
+                pixel_values = processed_input.pixel_values.to(device)
+
+                with torch.no_grad():
+                    outputs = model(pixel_values)
+                    logits = outputs.logits
+
+                predicted_class_id = logits.argmax(-1).item()
+                predicted_label = model.config.id2label.get(predicted_class_id, "Unknown")
+                confidence = torch.nn.functional.softmax(logits, dim=-1)[0][predicted_class_id].item()
+
+                prediction_result = f"Predicted: {predicted_label} (Confidence: {confidence:.2f})"
+                status_message = "Prediction complete."
+                print(f"DEBUG: Prediction Result: {prediction_result}")
+
+                # Yield the prediction result immediately to ensure UI update
+                yield status_message, prediction_result
+
+                # Clear buffer and transition to delay AFTER yielding the prediction
+                raw_frames_buffer.clear()
+                last_prediction_completion_time = current_time
+                app_state = "processing_delay"
+                print("DEBUG: Transitioning to 'processing_delay' state.")
+
+            except Exception as e:
+                error_message = f"Error during prediction: {e}"
+                print(f"ERROR during prediction: {e}")
+                # Yield error to UI
+                yield "Prediction error.", error_message
+                app_state = "processing_delay"  # Still go to delay state to prevent constant errors
+                raw_frames_buffer.clear()  # Clear buffer to prevent re-processing same problematic frames
+
+    elif app_state == "processing_delay":
+        elapsed_delay = current_time - last_prediction_completion_time
+
+        if elapsed_delay < DELAY_BETWEEN_PREDICTIONS_SECONDS:
+            # Continue yielding the delay message and the last prediction result
+            # Assuming prediction_result from previous state is still held by UI
+            yield f"Delaying next prediction: {int(elapsed_delay)}/{int(DELAY_BETWEEN_PREDICTIONS_SECONDS)}s", gr.NO_VALUE  # NO_VALUE keeps previous prediction visible
+        else:
+            # Delay is over, reset for new recording cycle
+            app_state = "recording"
+            current_clip_start_time = current_time
+            print("DEBUG: Transitioning back to 'recording' state.")
+            yield "Starting new recording...", "Ready for new prediction."
+
+    # If for some reason nothing is yielded, return the current state to prevent UI freeze.
+    # This acts as a fallback if no state transition happens.
+    # However, with the yield statements, this might be less critical.
+    # For streaming, yielding is the preferred way to update.
+    # If the function ends without yielding, Gradio will just keep the last state.
+    # We always yield in every branch.
+    pass  # No explicit return needed at the end if all paths yield
+
+def reset_app_state_manual():
+    global raw_frames_buffer, current_clip_start_time, last_prediction_completion_time, app_state
+    raw_frames_buffer.clear()
+    current_clip_start_time = time.time()
+    last_prediction_completion_time = time.time()
+    app_state = "recording"
+    print("DEBUG: Manual reset triggered.")
+    # Return initial values immediately upon reset
+    return "Ready to record...", "Ready for new prediction."
+
+# --- Gradio UI Layout ---
+with gr.Blocks() as demo:
+    gr.Markdown(
+        f"""
+        # TimesFormer Crime Detection - Hugging Face Space Host
+        This Space hosts the `owinymarvin/timesformer-crime-detection` model.
+        Live webcam demo with recording and prediction phases.
+        """
+    )
+
+    with gr.Tab("Live Webcam Demo"):
+        gr.Markdown(
+            f"""
+            Continuously captures live webcam feed for **{RAW_RECORDING_DURATION_SECONDS} seconds**,
+            then makes a prediction. There is a **{DELAY_BETWEEN_PREDICTIONS_SECONDS/60:.0f} minute delay** afterwards.
+            """
+        )
+        with gr.Row():
+            with gr.Column():
+                webcam_input = gr.Image(
+                    sources=["webcam"],
+                    streaming=True,
+                    label="Live Webcam Feed"
+                )
+                status_output = gr.Textbox(label="Current Status", value="Initializing...")
+                reset_button = gr.Button("Reset / Start New Cycle")
+            with gr.Column():
+                prediction_output = gr.Textbox(label="Prediction Result", value="Waiting...")
+
+        # IMPORTANT: Use webcam_input.stream() with a generator function (live_predict_stream)
+        # to enable progressive updates via 'yield'.
+        webcam_input.stream(
+            live_predict_stream,
+            inputs=[webcam_input],
+            outputs=[status_output, prediction_output]
+        )
+
+        # The reset button is a regular click event, not a stream
+        reset_button.click(
+            reset_app_state_manual,
+            inputs=[],
+            outputs=[status_output, prediction_output]
+        )
+
+    with gr.Tab("API Endpoint for External Clients"):
+        gr.Markdown(
+            """
+            Use this API endpoint to send base64-encoded frames for prediction.
+            """
+        )
+        # Placeholder for the API tab. The actual API calls target /run/predict_from_frames_api
+        gr.Interface(
+            fn=lambda frames_list: "API endpoint is active for programmatic calls. See documentation in app.py.",
+            inputs=gr.Json(label="List of Base64-encoded image strings"),
+            outputs=gr.Textbox(label="API Response"),
+            live=False,
+            allow_flagging="never"
+        )
+
+
+if __name__ == "__main__":
+    demo.launch()
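
`good copy.py` is a snapshot of the previous app, still pointing at the `owinymarvin/timesformer-crime-detection` checkpoint, with the API tab left as a placeholder; a comment notes that real calls target `/run/predict_from_frames_api`. A hypothetical client sketch under that assumption: the `/run/<api_name>` route and `{"data": [...]}` body follow Gradio's classic REST convention, while `SPACE_URL`, the JPEG encoding, and the frame count are illustrative placeholders, not confirmed by this commit.

import base64
import io

import requests
from PIL import Image

SPACE_URL = "https://<user>-<space>.hf.space"  # hypothetical; replace with the real Space URL

def encode_frame(img: Image.Image) -> str:
    """Serialize a PIL frame to a base64 JPEG string, as the API tab expects."""
    buf = io.BytesIO()
    img.save(buf, format="JPEG")
    return base64.b64encode(buf.getvalue()).decode("utf-8")

# Stand-in for real webcam frames; 8 matches MODEL_INPUT_NUM_FRAMES.
frames = [Image.new("RGB", (224, 224)) for _ in range(8)]
payload = {"data": [[encode_frame(f) for f in frames]]}

# Endpoint name taken from the comment in app.py; not verified against the live Space.
resp = requests.post(f"{SPACE_URL}/run/predict_from_frames_api", json=payload, timeout=120)
print(resp.json())

Base64 JPEG keeps each 224x224 frame to a few kilobytes, which matters if a client sends many frames per request.
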