DawnC committed on
Commit
6197eab
·
verified ·
1 Parent(s): da70c52

Upload 3 files


Update the video processor: it can now report the timeline of detected objects (when they appear and how long they remain), and fixes earlier error issues

Files changed (3)
  1. app.py +166 -63
  2. ui_manager.py +212 -141
  3. video_processor.py +500 -295
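
For orientation: this commit reworks app.py so that VideoProcessor is constructed as a standalone module (no longer wrapping an ImageProcessor) and the handler consumes the new process_video() return value of (output path, analysis results). A minimal sketch of that flow, based only on the calls visible in the diff below (the import path and the default parameter values are assumptions):

import time
from video_processor import VideoProcessor  # assumed import path

video_processor = VideoProcessor()  # standalone; no ImageProcessor argument anymore

def run_video_analysis(video_path, model_name="yolov8n.pt",
                       confidence_threshold=0.4, process_interval=10):
    start = time.time()
    # Returns the annotated video path plus a nested statistics dictionary.
    output_path, analysis_results = video_processor.process_video(
        video_path=video_path,
        model_name=model_name,
        confidence_threshold=confidence_threshold,
        process_interval=int(process_interval),
    )
    info = analysis_results.get("processing_info", {})
    print(f"Analyzed {info.get('frames_analyzed', 0)} of "
          f"{info.get('total_frames', 0)} frames in {time.time() - start:.1f}s")
    return output_path, analysis_results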
app.py CHANGED
@@ -8,6 +8,8 @@ import cv2
8
  from PIL import Image
9
  import tempfile
10
  import uuid
 
 
11
  import spaces
12
 
13
  from detection_model import DetectionModel
@@ -27,7 +29,7 @@ ui_manager = None
27
  def initialize_processors():
28
  """
29
  Initialize the image and video processors with LLM support.
30
-
31
  Returns:
32
  bool: True if initialization was successful, False otherwise
33
  """
@@ -49,8 +51,9 @@ def initialize_processors():
49
  else:
50
  print("WARNING: scene_analyzer attribute not found in image_processor")
51
 
52
- video_processor = VideoProcessor(image_processor)
53
- print("VideoProcessor initialized successfully")
 
54
  return True
55
 
56
  except Exception as e:
@@ -62,7 +65,7 @@ def initialize_processors():
62
  try:
63
  print("Attempting fallback initialization without LLM...")
64
  image_processor = ImageProcessor(use_llm=False, enable_places365=False)
65
- video_processor = VideoProcessor(image_processor)
66
  print("Fallback processors initialized successfully without LLM and Places365")
67
  return True
68
 
@@ -77,25 +80,25 @@ def initialize_processors():
77
  def initialize_ui_manager():
78
  """
79
  Initialize the UI manager and set up references to processors.
80
-
81
  Returns:
82
  UIManager: Initialized UI manager instance
83
  """
84
  global ui_manager, image_processor
85
-
86
  ui_manager = UIManager()
87
-
88
  # Set image processor reference for dynamic class retrieval
89
  if image_processor:
90
  ui_manager.set_image_processor(image_processor)
91
-
92
  return ui_manager
93
 
94
  @spaces.GPU(duration=180)
95
  def handle_image_upload(image, model_name, confidence_threshold, filter_classes=None, use_llm=True, enable_landmark=True):
96
  """
97
  Processes a single uploaded image.
98
-
99
  Args:
100
  image: PIL Image object
101
  model_name: Name of the YOLO model to use
@@ -103,10 +106,10 @@ def handle_image_upload(image, model_name, confidence_threshold, filter_classes=
103
  filter_classes: List of class names/IDs to filter
104
  use_llm: Whether to use LLM for enhanced descriptions
105
  enable_landmark: Whether to enable landmark detection
106
-
107
  Returns:
108
- Tuple: (result_image, result_text, formatted_stats, plot_figure,
109
- scene_description_html, original_desc_html, activities_list_data,
110
  safety_data, zones, lighting)
111
  """
112
  # Enhanced safety check for image_processor
@@ -140,7 +143,7 @@ def handle_image_upload(image, model_name, confidence_threshold, filter_classes=
140
 
141
  print(f"DIAGNOSTIC: Image upload handled with enable_landmark={enable_landmark}, use_llm={use_llm}")
142
  print(f"Processing image with model: {model_name}, confidence: {confidence_threshold}, use_llm: {use_llm}, enable_landmark: {enable_landmark}")
143
-
144
  try:
145
  image_processor.use_llm = use_llm
146
 
@@ -366,7 +369,7 @@ def handle_image_upload(image, model_name, confidence_threshold, filter_classes=
366
  </div>
367
  '''
368
 
369
- # Show the original description in the accordion only when the LLM is used and an enhanced description exists
370
  original_desc_visibility = "block" if use_llm and enhanced_description else "none"
371
  original_desc_html = f'''
372
  <div id="original_scene_analysis_accordion" style="display: {original_desc_visibility};">
@@ -483,95 +486,195 @@ def download_video_from_url(video_url, max_duration_minutes=10):
483
  print(f"Error downloading video: {e}\n{error_details}")
484
  return None, f"Error downloading video: {str(e)}"
485
 
 
 
 
 
 
486
 
487
  @spaces.GPU
488
- def handle_video_upload(video_input, video_url, input_type, model_name, confidence_threshold, process_interval):
 
489
  """
490
- Handles video upload or URL input and calls the VideoProcessor.
491
 
492
  Args:
493
- video_input: Uploaded video file
494
- video_url: Video URL (if using URL input)
495
- input_type: Type of input ("upload" or "url")
496
- model_name: Name of the YOLO model to use
497
- confidence_threshold: Confidence threshold for detections
498
- process_interval: Frame processing interval
499
 
500
  Returns:
501
  Tuple: (output_video_path, summary_html, formatted_stats)
502
  """
503
- print(f"Received video request: input_type={input_type}")
504
- video_path = None
 
 
505
 
506
- # Handle based on input type
 
 
507
  if input_type == "upload" and video_input:
508
- print(f"Processing uploaded video file")
509
  video_path = video_input
 
510
  elif input_type == "url" and video_url:
511
  print(f"Processing video from URL: {video_url}")
512
- # Download video from URL
513
- video_path, error_message = download_video_from_url(video_url)
514
- if error_message:
515
- error_html = f"<div class='video-summary-content-wrapper'><pre>{error_message}</pre></div>"
516
- return None, error_html, {"error": error_message}
517
- else:
518
- print("No valid video input provided.")
519
- return None, "<div class='video-summary-content-wrapper'><pre>Please upload a video file or provide a valid video URL.</pre></div>", {}
520
-
521
- print(f"Starting video processing with: model={model_name}, confidence={confidence_threshold}, interval={process_interval}")
 
 
 
 
522
  try:
523
- # Call the VideoProcessor method
524
- output_video_path, summary_text, stats_dict = video_processor.process_video_file(
525
  video_path=video_path,
526
  model_name=model_name,
527
  confidence_threshold=confidence_threshold,
528
- process_interval=int(process_interval) # Ensure interval is int
529
  )
530
- print(f"Video processing function returned: path={output_video_path}, summary length={len(summary_text)}")
531
-
532
- # Wrap processing summary in HTML tags for consistent styling with scene understanding page
533
- summary_html = f"<div class='video-summary-content-wrapper'><pre>{summary_text}</pre></div>"
534
-
535
- # Format statistics for better display
536
- formatted_stats = {}
537
- if stats_dict and isinstance(stats_dict, dict):
538
- formatted_stats = stats_dict
539
-
540
- return output_video_path, summary_html, formatted_stats
 
 
 
 
541
 
542
  except Exception as e:
543
  print(f"Error in handle_video_upload: {e}")
544
- import traceback
545
- error_msg = f"Error processing video: {str(e)}\n{traceback.format_exc()}"
546
  error_html = f"<div class='video-summary-content-wrapper'><pre>{error_msg}</pre></div>"
547
  return None, error_html, {"error": str(e)}
548
 
549
-
550
  def main():
551
- """
552
- Main function to initialize processors and launch the Gradio interface.
553
- """
554
  global ui_manager
555
 
556
- # Initialize processors
 
557
  print("Initializing processors...")
558
  initialization_success = initialize_processors()
559
  if not initialization_success:
560
- print("WARNING: Failed to initialize processors. Application may not function correctly.")
561
  return
562
-
563
- # Initialize UI manager
564
  print("Initializing UI manager...")
565
  ui_manager = initialize_ui_manager()
566
-
567
- # Create and launch the Gradio interface
568
  print("Creating Gradio interface...")
569
  demo_interface = ui_manager.create_interface(
570
  handle_image_upload_fn=handle_image_upload,
571
  handle_video_upload_fn=handle_video_upload,
572
  download_video_from_url_fn=download_video_from_url
573
  )
574
-
575
  print("Launching application...")
576
  demo_interface.launch(debug=True)
577
 
 
8
  from PIL import Image
9
  import tempfile
10
  import uuid
11
+ import time
12
+ import traceback
13
  import spaces
14
 
15
  from detection_model import DetectionModel
 
29
  def initialize_processors():
30
  """
31
  Initialize the image and video processors with LLM support.
32
+
33
  Returns:
34
  bool: True if initialization was successful, False otherwise
35
  """
 
51
  else:
52
  print("WARNING: scene_analyzer attribute not found in image_processor")
53
 
54
+ # Initialize the standalone VideoProcessor
55
+ video_processor = VideoProcessor()
56
+ print("VideoProcessor initialized successfully as independent module")
57
  return True
58
 
59
  except Exception as e:
 
65
  try:
66
  print("Attempting fallback initialization without LLM...")
67
  image_processor = ImageProcessor(use_llm=False, enable_places365=False)
68
+ video_processor = VideoProcessor()
69
  print("Fallback processors initialized successfully without LLM and Places365")
70
  return True
71
 
 
80
  def initialize_ui_manager():
81
  """
82
  Initialize the UI manager and set up references to processors.
83
+
84
  Returns:
85
  UIManager: Initialized UI manager instance
86
  """
87
  global ui_manager, image_processor
88
+
89
  ui_manager = UIManager()
90
+
91
  # Set image processor reference for dynamic class retrieval
92
  if image_processor:
93
  ui_manager.set_image_processor(image_processor)
94
+
95
  return ui_manager
96
 
97
  @spaces.GPU(duration=180)
98
  def handle_image_upload(image, model_name, confidence_threshold, filter_classes=None, use_llm=True, enable_landmark=True):
99
  """
100
  Processes a single uploaded image.
101
+
102
  Args:
103
  image: PIL Image object
104
  model_name: Name of the YOLO model to use
 
106
  filter_classes: List of class names/IDs to filter
107
  use_llm: Whether to use LLM for enhanced descriptions
108
  enable_landmark: Whether to enable landmark detection
109
+
110
  Returns:
111
+ Tuple: (result_image, result_text, formatted_stats, plot_figure,
112
+ scene_description_html, original_desc_html, activities_list_data,
113
  safety_data, zones, lighting)
114
  """
115
  # Enhanced safety check for image_processor
 
143
 
144
  print(f"DIAGNOSTIC: Image upload handled with enable_landmark={enable_landmark}, use_llm={use_llm}")
145
  print(f"Processing image with model: {model_name}, confidence: {confidence_threshold}, use_llm: {use_llm}, enable_landmark: {enable_landmark}")
146
+
147
  try:
148
  image_processor.use_llm = use_llm
149
 
 
369
  </div>
370
  '''
371
 
372
+ # Show the original description in the accordion only when the LLM is used and an enhanced description exists
373
  original_desc_visibility = "block" if use_llm and enhanced_description else "none"
374
  original_desc_html = f'''
375
  <div id="original_scene_analysis_accordion" style="display: {original_desc_visibility};">
 
486
  print(f"Error downloading video: {e}\n{error_details}")
487
  return None, f"Error downloading video: {str(e)}"
488
 
489
+ def generate_basic_video_summary(analysis_results: Dict) -> str:
490
+ """
491
+ Generate a basic statistical summary for the video.
492
+
493
+ Args:
494
+ analysis_results (Dict): The new analysis results structure
495
+
496
+ Returns:
497
+ str: A detailed statistical summary
498
+ """
499
+ summary_lines = ["=== Video Analysis Summary ===", ""]
500
+
501
+ # Processing info
502
+ processing_info = analysis_results.get("processing_info", {})
503
+ duration = processing_info.get("video_duration_seconds", 0)
504
+ total_frames = processing_info.get("total_frames", 0)
505
+ analyzed_frames = processing_info.get("frames_analyzed", 0)
506
+
507
+ summary_lines.extend([
508
+ f"Video Duration: {duration:.1f} seconds ({total_frames} total frames)",
509
+ f"Frames Analyzed: {analyzed_frames} frames (every {processing_info.get('processing_interval', 1)} frames)",
510
+ ""
511
+ ])
512
+
513
+ # Detected object summary
514
+ object_summary = analysis_results.get("object_summary", {})
515
+ total_objects = object_summary.get("total_unique_objects_detected", 0)
516
+ object_types = object_summary.get("object_types_found", 0)
517
+
518
+ summary_lines.extend([
519
+ f"Objects Detected: {total_objects} total objects across {object_types} categories",
520
+ ""
521
+ ])
522
+
523
+ # Detailed object counts
524
+ detailed_counts = object_summary.get("detailed_counts", {})
525
+ if detailed_counts:
526
+ summary_lines.extend([
527
+ "Object Breakdown:",
528
+ *[f" • {count} {name}(s)" for name, count in detailed_counts.items()],
529
+ ""
530
+ ])
531
+
532
+ # Practical analytics summary
533
+ practical_analytics = analysis_results.get("practical_analytics", {})
534
+
535
+ # Object density analysis
536
+ density_info = practical_analytics.get("object_density", {})
537
+ if density_info:
538
+ objects_per_min = density_info.get("objects_per_minute", 0)
539
+ peak_periods = density_info.get("peak_activity_periods", [])
540
+ summary_lines.extend([
541
+ f"Activity Level: {objects_per_min:.1f} objects per minute",
542
+ f"Peak Activity Periods: {len(peak_periods)} identified",
543
+ ""
544
+ ])
545
+
546
+ # Scene appropriateness
547
+ scene_info = practical_analytics.get("scene_appropriateness", {})
548
+ if scene_info.get("scene_detected", False):
549
+ scene_name = scene_info.get("scene_name", "unknown")
550
+ appropriateness = scene_info.get("appropriateness_score", 0)
551
+ summary_lines.extend([
552
+ f"Scene Type: {scene_name}",
553
+ f"Object-Scene Compatibility: {appropriateness:.1%}",
554
+ ""
555
+ ])
556
+
557
+ # Quality metrics
558
+ quality_info = practical_analytics.get("quality_metrics", {})
559
+ if quality_info:
560
+ quality_grade = quality_info.get("quality_grade", "unknown")
561
+ overall_confidence = quality_info.get("overall_confidence", 0)
562
+ summary_lines.extend([
563
+ f"Detection Quality: {quality_grade.title()} (avg confidence: {overall_confidence:.3f})",
564
+ ""
565
+ ])
566
+
567
+ summary_lines.append(f"Processing completed in {processing_info.get('processing_time_seconds', 0):.1f} seconds.")
568
+
569
+ return "\n".join(summary_lines)
570
 
571
  @spaces.GPU
572
+ def handle_video_upload(video_input, video_url, input_type, model_name,
573
+ confidence_threshold, process_interval):
574
  """
575
+ Handle video upload or URL input and run the video analysis.
576
 
577
  Args:
578
+ video_input: Uploaded video file
579
+ video_url: Video URL (if using URL input)
580
+ input_type: Type of input ("upload" or "url")
581
+ model_name: Name of the YOLO model to use
582
+ confidence_threshold: Confidence threshold for detections
583
+ process_interval: Processing interval (analyze every Nth frame)
584
 
585
  Returns:
586
  Tuple: (output_video_path, summary_html, formatted_stats)
587
  """
588
+ if video_processor is None:
589
+ error_msg = "Error: Video processor not initialized."
590
+ error_html = f"<div class='video-summary-content-wrapper'><pre>{error_msg}</pre></div>"
591
+ return None, error_html, {"error": "Video processor not available"}
592
 
593
+ video_path = None
594
+
595
+ # Handle input based on its type
596
  if input_type == "upload" and video_input:
 
597
  video_path = video_input
598
+ print(f"Processing uploaded video file: {video_path}")
599
  elif input_type == "url" and video_url:
600
  print(f"Processing video from URL: {video_url}")
601
+ video_path, error_msg = download_video_from_url(video_url)
602
+ if error_msg:
603
+ error_html = f"<div class='video-summary-content-wrapper'><pre>{error_msg}</pre></div>"
604
+ return None, error_html, {"error": error_msg}
605
+
606
+ if not video_path:
607
+ error_msg = "Please provide a video file or valid URL."
608
+ error_html = f"<div class='video-summary-content-wrapper'><pre>{error_msg}</pre></div>"
609
+ return None, error_html, {"error": "No video input provided"}
610
+
611
+ print(f"Starting practical video analysis: model={model_name}, confidence={confidence_threshold}, interval={process_interval}")
612
+
613
+ processing_start_time = time.time()
614
+
615
  try:
616
+ output_video_path, analysis_results = video_processor.process_video(
 
617
  video_path=video_path,
618
  model_name=model_name,
619
  confidence_threshold=confidence_threshold,
620
+ process_interval=int(process_interval)
621
  )
622
+
623
+ print(f"Video processing function returned: path={output_video_path}")
624
+
625
+ if output_video_path is None:
626
+ error_msg = analysis_results.get("error", "Unknown error occurred during video processing")
627
+ error_html = f"<div class='video-summary-content-wrapper'><pre>Processing failed: {error_msg}</pre></div>"
628
+ return None, error_html, analysis_results
629
+
630
+ # Generate the summary directly from the statistics
631
+ basic_summary = generate_basic_video_summary(analysis_results)
632
+
633
+ # Final Result
634
+ processing_time = time.time() - processing_start_time
635
+ processing_info = analysis_results.get("processing_info", {})
636
+
637
+ summary_lines = [
638
+ f"Video processing completed in {processing_time:.2f} seconds.",
639
+ f"Analyzed {processing_info.get('frames_analyzed', 0)} frames out of {processing_info.get('total_frames', 0)} total frames.",
640
+ f"Processing interval: every {process_interval} frames",
641
+ basic_summary
642
+ ]
643
+
644
+ summary_content = '\n'.join(summary_lines)
645
+ summary_html = f"<div class='video-summary-content-wrapper'><pre>{summary_content}</pre></div>"
646
+
647
+ return output_video_path, summary_html, analysis_results
648
 
649
  except Exception as e:
650
  print(f"Error in handle_video_upload: {e}")
651
+ traceback.print_exc()
652
+ error_msg = f"Video processing failed: {str(e)}"
653
  error_html = f"<div class='video-summary-content-wrapper'><pre>{error_msg}</pre></div>"
654
  return None, error_html, {"error": str(e)}
655
 
 
656
  def main():
657
+ """Main function: initialize processors and launch the Gradio interface."""
 
 
658
  global ui_manager
659
 
660
+ print("=== VisionScout Application Starting ===")
661
+
662
  print("Initializing processors...")
663
  initialization_success = initialize_processors()
664
  if not initialization_success:
665
+ print("ERROR: Failed to initialize processors. Application cannot start.")
666
  return
667
+
 
668
  print("Initializing UI manager...")
669
  ui_manager = initialize_ui_manager()
670
+
 
671
  print("Creating Gradio interface...")
672
  demo_interface = ui_manager.create_interface(
673
  handle_image_upload_fn=handle_image_upload,
674
  handle_video_upload_fn=handle_video_upload,
675
  download_video_from_url_fn=download_video_from_url
676
  )
677
+
678
  print("Launching application...")
679
  demo_interface.launch(debug=True)
680
 
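The generate_basic_video_summary() helper added to app.py above only reads nested keys via .get(), so it degrades gracefully when fields are missing. A hypothetical analysis_results payload showing the fields it expects (key names taken from the diff; all values are illustrative, not real output):

example_results = {
    "processing_info": {
        "video_duration_seconds": 42.0,
        "total_frames": 1260,
        "frames_analyzed": 126,
        "processing_interval": 10,
        "processing_time_seconds": 18.4,
    },
    "object_summary": {
        "total_unique_objects_detected": 9,
        "object_types_found": 3,
        "detailed_counts": {"car": 5, "person": 3, "bus": 1},
    },
    "practical_analytics": {
        "object_density": {"objects_per_minute": 12.9,
                           "peak_activity_periods": [(5.0, 9.5)]},
        "scene_appropriateness": {"scene_detected": True,
                                  "scene_name": "city street",
                                  "appropriateness_score": 0.87},
        "quality_metrics": {"quality_grade": "good",
                            "overall_confidence": 0.612},
    },
}

print(generate_basic_video_summary(example_results))  # prints the formatted report
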
ui_manager.py CHANGED
@@ -7,17 +7,17 @@ from style import Style
7
 
8
  class UIManager:
9
  """
10
- Manages all UI-related functionality for the VisionScout application.
11
  Handles Gradio interface creation, component definitions, and event binding.
12
  """
13
-
14
  def __init__(self):
15
  """Initialize the UI Manager."""
16
  self.available_models = None
17
  self.model_choices = []
18
  self.class_choices_formatted = []
19
  self._setup_model_choices()
20
-
21
  def _setup_model_choices(self):
22
  """Setup model choices for dropdowns."""
23
  try:
@@ -26,14 +26,14 @@ class UIManager:
26
  except ImportError:
27
  # Fallback model choices if DetectionModel is not available
28
  self.model_choices = ["yolov8n.pt", "yolov8s.pt", "yolov8m.pt", "yolov8l.pt", "yolov8x.pt"]
29
-
30
  # Setup class choices
31
  self.class_choices_formatted = [f"{id}: {name}" for id, name in self.get_all_classes()]
32
-
33
  def get_all_classes(self):
34
  """
35
  Gets all available COCO classes.
36
-
37
  Returns:
38
  List[Tuple[int, str]]: List of (class_id, class_name) tuples
39
  """
@@ -52,7 +52,7 @@ class UIManager:
52
  except Exception:
53
  pass
54
 
55
- # Fallback to standard COCO (ensure keys are ints)
56
  default_classes = {
57
  0: 'person', 1: 'bicycle', 2: 'car', 3: 'motorcycle', 4: 'airplane', 5: 'bus',
58
  6: 'train', 7: 'truck', 8: 'boat', 9: 'traffic light', 10: 'fire hydrant',
@@ -72,27 +72,27 @@ class UIManager:
72
  77: 'teddy bear', 78: 'hair drier', 79: 'toothbrush'
73
  }
74
  return sorted(default_classes.items())
75
-
76
  def set_image_processor(self, image_processor):
77
  """
78
  Set the image processor reference for dynamic class retrieval.
79
-
80
  Args:
81
  image_processor: The ImageProcessor instance
82
  """
83
  self._image_processor = image_processor
84
-
85
  def get_css_styles(self):
86
  """
87
  Get CSS styles for the interface.
88
-
89
  Returns:
90
  str: CSS styles
91
  """
92
  try:
93
  return Style.get_css()
94
  except ImportError:
95
- # Fallback CSS if Style module is not available
96
  return """
97
  .app-header {
98
  text-align: center;
@@ -111,15 +111,23 @@ class UIManager:
111
  border: none !important;
112
  border-radius: 8px !important;
113
  }
 
 
 
114
  """
115
-
116
  def get_model_description(self, model_name):
117
  """
118
  Get model description for the given model name.
119
-
120
  Args:
121
  model_name: Name of the model
122
-
123
  Returns:
124
  str: Model description
125
  """
@@ -127,11 +135,11 @@ class UIManager:
127
  return DetectionModel.get_model_description(model_name)
128
  except ImportError:
129
  return f"Model: {model_name}"
130
-
131
  def create_header(self):
132
  """
133
  Create the application header.
134
-
135
  Returns:
136
  gr.HTML: Header HTML component
137
  """
@@ -142,7 +150,7 @@ class UIManager:
142
  <div style="display: flex; justify-content: center; gap: 10px; margin: 0.5rem 0;"><div style="height: 3px; width: 80px; background: linear-gradient(90deg, #38b2ac, #4299e1);"></div></div>
143
  <div style="display: flex; justify-content: center; gap: 25px; margin-top: 1.5rem;">
144
  <div style="padding: 8px 15px; border-radius: 20px; background: rgba(66, 153, 225, 0.15); color: #2b6cb0; font-weight: 500; font-size: 0.9rem;"><span style="margin-right: 6px;">🖼️</span> Image Analysis</div>
145
- <div style="padding: 8px 15px; border-radius: 20px; background: rgba(56, 178, 172, 0.15); color: #2b6cb0; font-weight: 500; font-size: 0.9rem;"><span style="margin-right: 6px;">🎬</span> Video Analysis</div>
146
  </div>
147
  <div style="margin-top: 20px; padding: 10px 15px; background-color: rgba(255, 248, 230, 0.9); border-left: 3px solid #f6ad55; border-radius: 6px; max-width: 600px; margin-left: auto; margin-right: auto; text-align: left;">
148
  <p style="margin: 0; font-size: 0.9rem; color: #805ad5; font-weight: 500;">
@@ -152,18 +160,18 @@ class UIManager:
152
  </div>
153
  </div>
154
  """)
155
-
156
  def create_footer(self):
157
  """
158
  Create the application footer.
159
-
160
  Returns:
161
  gr.HTML: Footer HTML component
162
  """
163
  return gr.HTML("""
164
  <div class="footer" style="padding: 25px 0; text-align: center; background: linear-gradient(to right, #f5f9fc, #e1f5fe); border-top: 1px solid #e2e8f0; margin-top: 30px;">
165
  <div style="margin-bottom: 15px;">
166
- <p style="font-size: 14px; color: #4A5568; margin: 5px 0;">Powered by YOLOv8, CLIP, Places365, Meta Llama3.2 and Ultralytics • Created with Gradio</p>
167
  </div>
168
  <div style="display: flex; align-items: center; justify-content: center; gap: 20px; margin-top: 15px;">
169
  <p style="font-family: 'Arial', sans-serif; font-size: 14px; font-weight: 500; letter-spacing: 2px; background: linear-gradient(90deg, #38b2ac, #4299e1); -webkit-background-clip: text; -webkit-text-fill-color: transparent; margin: 0; text-transform: uppercase; display: inline-block;">EXPLORE THE CODE →</p>
@@ -173,27 +181,27 @@ class UIManager:
173
  </div>
174
  </div>
175
  """)
176
-
177
  def create_image_tab(self):
178
  """
179
  Create the image processing tab with all components.
180
-
181
  Returns:
182
  Dict: Dictionary containing all image tab components
183
  """
184
  components = {}
185
-
186
  with gr.Tab("Image Processing"):
187
  components['current_image_model'] = gr.State("yolov8m.pt")
188
-
189
  with gr.Row(equal_height=False):
190
  # Left Column: Image Input & Controls
191
  with gr.Column(scale=4, elem_classes="input-panel"):
192
  with gr.Group():
193
  gr.HTML('<div class="section-heading">Upload Image</div>')
194
  components['image_input'] = gr.Image(
195
- type="pil",
196
- label="Upload an image",
197
  elem_classes="upload-box"
198
  )
199
 
@@ -204,7 +212,7 @@ class UIManager:
204
  label="Select Model",
205
  info="Choose speed vs. accuracy (n=fast, m=balanced, x=accurate)"
206
  )
207
-
208
  components['image_model_info'] = gr.Markdown(
209
  self.get_model_description("yolov8m.pt")
210
  )
@@ -234,7 +242,7 @@ class UIManager:
234
  components['vehicles_btn'] = gr.Button("Vehicles", size="sm")
235
  components['animals_btn'] = gr.Button("Animals", size="sm")
236
  components['objects_btn'] = gr.Button("Common Objects", size="sm")
237
-
238
  components['image_class_filter'] = gr.Dropdown(
239
  choices=self.class_choices_formatted,
240
  multiselect=True,
@@ -243,8 +251,8 @@ class UIManager:
243
  )
244
 
245
  components['image_detect_btn'] = gr.Button(
246
- "Analyze Image",
247
- variant="primary",
248
  elem_classes="detect-btn"
249
  )
250
 
@@ -289,21 +297,21 @@ class UIManager:
289
  # Detection Result Tab
290
  with gr.Tab("Detection Result"):
291
  components['image_result_image'] = gr.Image(
292
- type="pil",
293
  label="Detection Result"
294
  )
295
  gr.HTML('<div class="section-heading">Detection Details</div>')
296
  components['image_result_text'] = gr.Textbox(
297
- label=None,
298
- lines=10,
299
- elem_id="detection-details",
300
  container=False
301
  )
302
 
303
  # Scene Understanding Tab
304
  with gr.Tab("Scene Understanding"):
305
  gr.HTML('<div class="section-heading">Scene Analysis</div>')
306
-
307
  # Info details
308
  gr.HTML("""
309
  <details class="info-details" style="margin: 5px 0 15px 0;">
@@ -327,16 +335,16 @@ class UIManager:
327
  </p>
328
  </div>
329
  ''')
330
-
331
  components['image_scene_description_html'] = gr.HTML(
332
- label=None,
333
  elem_id="scene_analysis_description_text"
334
  )
335
 
336
  # Original Scene Analysis accordion
337
  with gr.Accordion("Original Scene Analysis", open=False, elem_id="original_scene_analysis_accordion"):
338
  components['image_llm_description'] = gr.HTML(
339
- label=None,
340
  elem_id="original_scene_description_text"
341
  )
342
 
@@ -344,32 +352,32 @@ class UIManager:
344
  with gr.Column(scale=1):
345
  gr.HTML('<div class="section-heading" style="font-size:1rem; text-align:left;">Possible Activities</div>')
346
  components['image_activities_list'] = gr.Dataframe(
347
- headers=["Activity"],
348
- datatype=["str"],
349
- row_count=5,
350
- col_count=1,
351
  wrap=True
352
  )
353
 
354
  with gr.Column(scale=1):
355
  gr.HTML('<div class="section-heading" style="font-size:1rem; text-align:left;">Safety Concerns</div>')
356
  components['image_safety_list'] = gr.Dataframe(
357
- headers=["Concern"],
358
- datatype=["str"],
359
- row_count=5,
360
- col_count=1,
361
  wrap=True
362
  )
363
 
364
  gr.HTML('<div class="section-heading">Functional Zones</div>')
365
  components['image_zones_json'] = gr.JSON(
366
- label=None,
367
  elem_classes="json-box"
368
  )
369
 
370
  gr.HTML('<div class="section-heading">Lighting Conditions</div>')
371
  components['image_lighting_info'] = gr.JSON(
372
- label=None,
373
  elem_classes="json-box"
374
  )
375
 
@@ -379,27 +387,28 @@ class UIManager:
379
  with gr.Column(scale=3, elem_classes="plot-column"):
380
  gr.HTML('<div class="section-heading">Object Distribution</div>')
381
  components['image_plot_output'] = gr.Plot(
382
- label=None,
383
  elem_classes="large-plot-container"
384
  )
385
  with gr.Column(scale=2, elem_classes="stats-column"):
386
  gr.HTML('<div class="section-heading">Detection Statistics</div>')
387
  components['image_stats_json'] = gr.JSON(
388
- label=None,
389
  elem_classes="enhanced-json-display"
390
  )
391
-
392
  return components
393
 
394
  def create_video_tab(self):
395
  """
396
  Create the video processing tab with all components.
397
-
 
398
  Returns:
399
  Dict: Dictionary containing all video tab components
400
  """
401
  components = {}
402
-
403
  with gr.Tab("Video Processing"):
404
  with gr.Row(equal_height=False):
405
  # Left Column: Video Input & Controls
@@ -444,21 +453,35 @@ class UIManager:
444
  choices=self.model_choices,
445
  value="yolov8n.pt",
446
  label="Select Model (Video)",
447
- info="Faster models (like 'n') are recommended"
448
  )
449
  components['video_confidence'] = gr.Slider(
450
  minimum=0.1, maximum=0.9, value=0.4, step=0.05,
451
- label="Confidence Threshold (Video)"
 
452
  )
453
  components['video_process_interval'] = gr.Slider(
454
  minimum=1, maximum=60, value=10, step=1,
455
  label="Processing Interval (Frames)",
456
- info="Analyze every Nth frame (higher value = faster)"
457
  )
458
-
 
 
 
 
459
  components['video_process_btn'] = gr.Button(
460
- "Process Video",
461
- variant="primary",
462
  elem_classes="detect-btn"
463
  )
464
 
@@ -467,9 +490,17 @@ class UIManager:
467
  gr.HTML('<div class="section-heading">How to Use (Video)</div>')
468
  gr.Markdown("""
469
  1. Choose your input method: Upload a file or enter a URL.
470
- 2. Adjust settings if needed (using a faster model and larger interval is recommended for longer videos).
471
- 3. Click "Process Video". **Processing can take a significant amount of time.**
472
- 4. The annotated video and summary will appear on the right when finished.
 
 
 
473
  """)
474
 
475
  # Video examples
@@ -477,8 +508,9 @@ class UIManager:
477
  gr.HTML("""
478
  <div style="padding: 10px; background-color: #f0f7ff; border-radius: 6px; margin-bottom: 15px;">
479
  <p style="font-size: 14px; color: #4A5568; margin: 0;">
480
- Upload any video containing objects that YOLO can detect. For testing, find sample videos
481
- <a href="https://www.pexels.com/search/videos/street/" target="_blank" style="color: #3182ce; text-decoration: underline;">here</a>.
 
482
  </p>
483
  </div>
484
  """)
@@ -486,48 +518,87 @@ class UIManager:
486
  # Right Column: Video Results
487
  with gr.Column(scale=6, elem_classes="output-panel video-result-panel"):
488
  gr.HTML("""
489
- <div class="section-heading">Video Result</div>
490
  <details class="info-details" style="margin: 5px 0 15px 0;">
491
  <summary style="padding: 8px; background-color: #f0f7ff; border-radius: 6px; border-left: 3px solid #4299e1; font-weight: bold; cursor: pointer; color: #2b6cb0;">
492
- 🎬 Video Processing Notes
493
  </summary>
494
  <div style="margin-top: 8px; padding: 10px; background-color: #f8f9fa; border-radius: 6px; border: 1px solid #e2e8f0;">
495
  <p style="font-size: 13px; color: #718096; margin: 0;">
496
- The processed video includes bounding boxes around detected objects. For longer videos,
497
- consider using a faster model (like YOLOv8n) and a higher frame interval to reduce processing time.
 
 
 
 
498
  </p>
499
  </div>
500
  </details>
501
  """)
502
-
503
  components['video_output'] = gr.Video(
504
- label="Processed Video",
505
  elem_classes="video-output-container"
506
  )
507
 
508
- gr.HTML('<div class="section-heading">Processing Summary</div>')
509
- components['video_summary_text'] = gr.HTML(
510
- label=None,
511
- elem_id="video-summary-html-output"
512
- )
 
 
 
 
 
513
 
514
- gr.HTML('<div class="section-heading">Aggregated Statistics</div>')
515
- components['video_stats_json'] = gr.JSON(
516
- label=None,
517
- elem_classes="video-stats-display"
518
- )
519
-
520
  return components
521
-
522
  def get_filter_button_mappings(self):
523
  """
524
  Get the class ID mappings for filter buttons.
525
-
526
  Returns:
527
  Dict: Dictionary containing class ID lists for different categories
528
  """
529
  available_classes_list = self.get_all_classes()
530
-
531
  return {
532
  'people_classes_ids': [0],
533
  'vehicles_classes_ids': [1, 2, 3, 4, 5, 6, 7, 8],
@@ -535,36 +606,36 @@ class UIManager:
535
  'common_objects_ids': [39, 41, 42, 43, 44, 45, 56, 57, 60, 62, 63, 67, 73],
536
  'available_classes_list': available_classes_list
537
  }
538
-
539
- def create_interface(self,
540
- handle_image_upload_fn,
541
- handle_video_upload_fn,
542
  download_video_from_url_fn):
543
  """
544
  Create the complete Gradio interface.
545
-
546
  Args:
547
  handle_image_upload_fn: Function to handle image upload
548
  handle_video_upload_fn: Function to handle video upload
549
  download_video_from_url_fn: Function to download video from URL
550
-
551
  Returns:
552
  gr.Blocks: Complete Gradio interface
553
  """
554
  css = self.get_css_styles()
555
-
556
  with gr.Blocks(css=css, theme=gr.themes.Soft(primary_hue="teal", secondary_hue="blue")) as demo:
557
-
558
  # Header
559
  with gr.Group(elem_classes="app-header"):
560
  self.create_header()
561
 
562
  # Main Content with Tabs
563
  with gr.Tabs(elem_classes="tabs"):
564
-
565
  # Image Processing Tab
566
  image_components = self.create_image_tab()
567
-
568
  # Video Processing Tab
569
  video_components = self.create_video_tab()
570
 
@@ -573,22 +644,22 @@ class UIManager:
573
 
574
  # Setup Event Listeners
575
  self._setup_event_listeners(
576
- image_components,
577
- video_components,
578
- handle_image_upload_fn,
579
  handle_video_upload_fn
580
  )
581
 
582
  return demo
583
-
584
- def _setup_event_listeners(self,
585
- image_components,
586
- video_components,
587
- handle_image_upload_fn,
588
  handle_video_upload_fn):
589
  """
590
  Setup all event listeners for the interface.
591
-
592
  Args:
593
  image_components: Dictionary of image tab components
594
  video_components: Dictionary of video tab components
@@ -611,73 +682,73 @@ class UIManager:
611
  common_objects_ids = filter_mappings['common_objects_ids']
612
 
613
  image_components['people_btn'].click(
614
- lambda: [f"{id}: {name}" for id, name in available_classes_list if id in people_classes_ids],
615
  outputs=image_components['image_class_filter']
616
  )
617
  image_components['vehicles_btn'].click(
618
- lambda: [f"{id}: {name}" for id, name in available_classes_list if id in vehicles_classes_ids],
619
  outputs=image_components['image_class_filter']
620
  )
621
  image_components['animals_btn'].click(
622
- lambda: [f"{id}: {name}" for id, name in available_classes_list if id in animals_classes_ids],
623
  outputs=image_components['image_class_filter']
624
  )
625
  image_components['objects_btn'].click(
626
- lambda: [f"{id}: {name}" for id, name in available_classes_list if id in common_objects_ids],
627
  outputs=image_components['image_class_filter']
628
  )
629
 
630
  # Video Input Type Change Handler
631
  video_components['video_input_type'].change(
632
- fn=lambda input_type: [
633
- # Show/hide file upload
634
- gr.update(visible=(input_type == "upload")),
635
- # Show/hide URL input
636
- gr.update(visible=(input_type == "url"))
637
- ],
638
- inputs=[video_components['video_input_type']],
639
- outputs=[video_components['video_input'], video_components['video_url_input']]
640
  )
641
 
642
  # Image Detect Button Click Handler
643
  image_components['image_detect_btn'].click(
644
  fn=handle_image_upload_fn,
645
  inputs=[
646
- image_components['image_input'],
647
- image_components['image_model_dropdown'],
648
- image_components['image_confidence'],
649
- image_components['image_class_filter'],
650
- image_components['use_llm'],
651
  image_components['use_landmark_detection']
652
  ],
653
  outputs=[
654
- image_components['image_result_image'],
655
- image_components['image_result_text'],
656
- image_components['image_stats_json'],
657
  image_components['image_plot_output'],
658
- image_components['image_scene_description_html'],
659
- image_components['image_llm_description'],
660
- image_components['image_activities_list'],
661
- image_components['image_safety_list'],
662
  image_components['image_zones_json'],
663
  image_components['image_lighting_info']
664
  ]
665
  )
666
 
667
- # Video Process Button Click Handler
668
  video_components['video_process_btn'].click(
669
- fn=handle_video_upload_fn,
670
- inputs=[
671
- video_components['video_input'],
672
- video_components['video_url_input'],
673
- video_components['video_input_type'],
674
- video_components['video_model_dropdown'],
675
- video_components['video_confidence'],
676
- video_components['video_process_interval']
677
  ],
678
- outputs=[
679
- video_components['video_output'],
680
- video_components['video_summary_text'],
681
- video_components['video_stats_json']
682
  ]
683
  )
 
7
 
8
  class UIManager:
9
  """
10
+ Manages all UI-related functionality
11
  Handles Gradio interface creation, component definitions, and event binding.
12
  """
13
+
14
  def __init__(self):
15
  """Initialize the UI Manager."""
16
  self.available_models = None
17
  self.model_choices = []
18
  self.class_choices_formatted = []
19
  self._setup_model_choices()
20
+
21
  def _setup_model_choices(self):
22
  """Setup model choices for dropdowns."""
23
  try:
 
26
  except ImportError:
27
  # Fallback model choices if DetectionModel is not available
28
  self.model_choices = ["yolov8n.pt", "yolov8s.pt", "yolov8m.pt", "yolov8l.pt", "yolov8x.pt"]
29
+
30
  # Setup class choices
31
  self.class_choices_formatted = [f"{id}: {name}" for id, name in self.get_all_classes()]
32
+
33
  def get_all_classes(self):
34
  """
35
  Gets all available COCO classes.
36
+
37
  Returns:
38
  List[Tuple[int, str]]: List of (class_id, class_name) tuples
39
  """
 
52
  except Exception:
53
  pass
54
 
55
+ # COCO Classes
56
  default_classes = {
57
  0: 'person', 1: 'bicycle', 2: 'car', 3: 'motorcycle', 4: 'airplane', 5: 'bus',
58
  6: 'train', 7: 'truck', 8: 'boat', 9: 'traffic light', 10: 'fire hydrant',
 
72
  77: 'teddy bear', 78: 'hair drier', 79: 'toothbrush'
73
  }
74
  return sorted(default_classes.items())
75
+
76
  def set_image_processor(self, image_processor):
77
  """
78
  Set the image processor reference for dynamic class retrieval.
79
+
80
  Args:
81
  image_processor: The ImageProcessor instance
82
  """
83
  self._image_processor = image_processor
84
+
85
  def get_css_styles(self):
86
  """
87
  Get CSS styles for the interface.
88
+
89
  Returns:
90
  str: CSS styles
91
  """
92
  try:
93
  return Style.get_css()
94
  except ImportError:
95
+ # Fallback default CSS style
96
  return """
97
  .app-header {
98
  text-align: center;
 
111
  border: none !important;
112
  border-radius: 8px !important;
113
  }
114
+ .video-summary-content-wrapper {
115
+ max-height: 400px;
116
+ overflow-y: auto;
117
+ background-color: #f8f9fa;
118
+ border-radius: 8px;
119
+ padding: 15px;
120
+ border: 1px solid #e2e8f0;
121
+ }
122
  """
123
+
124
  def get_model_description(self, model_name):
125
  """
126
  Get model description for the given model name.
127
+
128
  Args:
129
  model_name: Name of the model
130
+
131
  Returns:
132
  str: Model description
133
  """
 
135
  return DetectionModel.get_model_description(model_name)
136
  except ImportError:
137
  return f"Model: {model_name}"
138
+
139
  def create_header(self):
140
  """
141
  Create the application header.
142
+
143
  Returns:
144
  gr.HTML: Header HTML component
145
  """
 
150
  <div style="display: flex; justify-content: center; gap: 10px; margin: 0.5rem 0;"><div style="height: 3px; width: 80px; background: linear-gradient(90deg, #38b2ac, #4299e1);"></div></div>
151
  <div style="display: flex; justify-content: center; gap: 25px; margin-top: 1.5rem;">
152
  <div style="padding: 8px 15px; border-radius: 20px; background: rgba(66, 153, 225, 0.15); color: #2b6cb0; font-weight: 500; font-size: 0.9rem;"><span style="margin-right: 6px;">🖼️</span> Image Analysis</div>
153
+ <div style="padding: 8px 15px; border-radius: 20px; background: rgba(56, 178, 172, 0.15); color: #2b6cb0; font-weight: 500; font-size: 0.9rem;"><span style="margin-right: 6px;">🎬</span> Video Analysis with Temporal Tracking</div>
154
  </div>
155
  <div style="margin-top: 20px; padding: 10px 15px; background-color: rgba(255, 248, 230, 0.9); border-left: 3px solid #f6ad55; border-radius: 6px; max-width: 600px; margin-left: auto; margin-right: auto; text-align: left;">
156
  <p style="margin: 0; font-size: 0.9rem; color: #805ad5; font-weight: 500;">
 
160
  </div>
161
  </div>
162
  """)
163
+
164
  def create_footer(self):
165
  """
166
  Create the application footer.
167
+
168
  Returns:
169
  gr.HTML: Footer HTML component
170
  """
171
  return gr.HTML("""
172
  <div class="footer" style="padding: 25px 0; text-align: center; background: linear-gradient(to right, #f5f9fc, #e1f5fe); border-top: 1px solid #e2e8f0; margin-top: 30px;">
173
  <div style="margin-bottom: 15px;">
174
+ <p style="font-size: 14px; color: #4A5568; margin: 5px 0;">Powered by YOLOv8, CLIP, Places365, Meta Llama3.2 and Ultralytics • Enhanced Video Processing with Temporal Analysis • Created with Gradio</p>
175
  </div>
176
  <div style="display: flex; align-items: center; justify-content: center; gap: 20px; margin-top: 15px;">
177
  <p style="font-family: 'Arial', sans-serif; font-size: 14px; font-weight: 500; letter-spacing: 2px; background: linear-gradient(90deg, #38b2ac, #4299e1); -webkit-background-clip: text; -webkit-text-fill-color: transparent; margin: 0; text-transform: uppercase; display: inline-block;">EXPLORE THE CODE →</p>
 
181
  </div>
182
  </div>
183
  """)
184
+
185
  def create_image_tab(self):
186
  """
187
  Create the image processing tab with all components.
188
+
189
  Returns:
190
  Dict: Dictionary containing all image tab components
191
  """
192
  components = {}
193
+
194
  with gr.Tab("Image Processing"):
195
  components['current_image_model'] = gr.State("yolov8m.pt")
196
+
197
  with gr.Row(equal_height=False):
198
  # Left Column: Image Input & Controls
199
  with gr.Column(scale=4, elem_classes="input-panel"):
200
  with gr.Group():
201
  gr.HTML('<div class="section-heading">Upload Image</div>')
202
  components['image_input'] = gr.Image(
203
+ type="pil",
204
+ label="Upload an image",
205
  elem_classes="upload-box"
206
  )
207
 
 
212
  label="Select Model",
213
  info="Choose speed vs. accuracy (n=fast, m=balanced, x=accurate)"
214
  )
215
+
216
  components['image_model_info'] = gr.Markdown(
217
  self.get_model_description("yolov8m.pt")
218
  )
 
242
  components['vehicles_btn'] = gr.Button("Vehicles", size="sm")
243
  components['animals_btn'] = gr.Button("Animals", size="sm")
244
  components['objects_btn'] = gr.Button("Common Objects", size="sm")
245
+
246
  components['image_class_filter'] = gr.Dropdown(
247
  choices=self.class_choices_formatted,
248
  multiselect=True,
 
251
  )
252
 
253
  components['image_detect_btn'] = gr.Button(
254
+ "Analyze Image",
255
+ variant="primary",
256
  elem_classes="detect-btn"
257
  )
258
 
 
297
  # Detection Result Tab
298
  with gr.Tab("Detection Result"):
299
  components['image_result_image'] = gr.Image(
300
+ type="pil",
301
  label="Detection Result"
302
  )
303
  gr.HTML('<div class="section-heading">Detection Details</div>')
304
  components['image_result_text'] = gr.Textbox(
305
+ label=None,
306
+ lines=10,
307
+ elem_id="detection-details",
308
  container=False
309
  )
310
 
311
  # Scene Understanding Tab
312
  with gr.Tab("Scene Understanding"):
313
  gr.HTML('<div class="section-heading">Scene Analysis</div>')
314
+
315
  # Info details
316
  gr.HTML("""
317
  <details class="info-details" style="margin: 5px 0 15px 0;">
 
335
  </p>
336
  </div>
337
  ''')
338
+
339
  components['image_scene_description_html'] = gr.HTML(
340
+ label=None,
341
  elem_id="scene_analysis_description_text"
342
  )
343
 
344
  # Original Scene Analysis accordion
345
  with gr.Accordion("Original Scene Analysis", open=False, elem_id="original_scene_analysis_accordion"):
346
  components['image_llm_description'] = gr.HTML(
347
+ label=None,
348
  elem_id="original_scene_description_text"
349
  )
350
 
 
352
  with gr.Column(scale=1):
353
  gr.HTML('<div class="section-heading" style="font-size:1rem; text-align:left;">Possible Activities</div>')
354
  components['image_activities_list'] = gr.Dataframe(
355
+ headers=["Activity"],
356
+ datatype=["str"],
357
+ row_count=5,
358
+ col_count=1,
359
  wrap=True
360
  )
361
 
362
  with gr.Column(scale=1):
363
  gr.HTML('<div class="section-heading" style="font-size:1rem; text-align:left;">Safety Concerns</div>')
364
  components['image_safety_list'] = gr.Dataframe(
365
+ headers=["Concern"],
366
+ datatype=["str"],
367
+ row_count=5,
368
+ col_count=1,
369
  wrap=True
370
  )
371
 
372
  gr.HTML('<div class="section-heading">Functional Zones</div>')
373
  components['image_zones_json'] = gr.JSON(
374
+ label=None,
375
  elem_classes="json-box"
376
  )
377
 
378
  gr.HTML('<div class="section-heading">Lighting Conditions</div>')
379
  components['image_lighting_info'] = gr.JSON(
380
+ label=None,
381
  elem_classes="json-box"
382
  )
383
 
 
387
  with gr.Column(scale=3, elem_classes="plot-column"):
388
  gr.HTML('<div class="section-heading">Object Distribution</div>')
389
  components['image_plot_output'] = gr.Plot(
390
+ label=None,
391
  elem_classes="large-plot-container"
392
  )
393
  with gr.Column(scale=2, elem_classes="stats-column"):
394
  gr.HTML('<div class="section-heading">Detection Statistics</div>')
395
  components['image_stats_json'] = gr.JSON(
396
+ label=None,
397
  elem_classes="enhanced-json-display"
398
  )
399
+
400
  return components
401
 
402
  def create_video_tab(self):
403
  """
404
  Create the video processing tab with all components.
405
+ Note: complex temporal-analysis controls were removed in favor of basic statistical analysis.
406
+
407
  Returns:
408
  Dict: Dictionary containing all video tab components
409
  """
410
  components = {}
411
+
412
  with gr.Tab("Video Processing"):
413
  with gr.Row(equal_height=False):
414
  # Left Column: Video Input & Controls
 
453
  choices=self.model_choices,
454
  value="yolov8n.pt",
455
  label="Select Model (Video)",
456
+ info="Faster models (like 'n') are recommended for video processing"
457
  )
458
  components['video_confidence'] = gr.Slider(
459
  minimum=0.1, maximum=0.9, value=0.4, step=0.05,
460
+ label="Confidence Threshold (Video)",
461
+ info="Higher threshold reduces false detections"
462
  )
463
  components['video_process_interval'] = gr.Slider(
464
  minimum=1, maximum=60, value=10, step=1,
465
  label="Processing Interval (Frames)",
466
+ info="Analyze every Nth frame (higher value = faster processing)"
467
  )
468
+
469
+ # Simplified analysis notes
470
+ gr.HTML("""
471
+ <div style="padding: 8px; margin-top: 10px; background-color: #f0f7ff; border-radius: 4px; border-left: 3px solid #4299e1; font-size: 12px;">
472
+ <p style="margin: 0; color: #4a5568;">
473
+ <b>Analysis Features:</b><br>
474
+ • Accurate object counting with duplicate detection removal<br>
475
+ • Timeline analysis showing when objects first appear<br>
476
+ • Duration tracking for object presence in video<br>
477
+ • Simple, clear statistical summaries
478
+ </p>
479
+ </div>
480
+ """)
481
+
482
  components['video_process_btn'] = gr.Button(
483
+ "Analyze Video",
484
+ variant="primary",
485
  elem_classes="detect-btn"
486
  )
487
 
 
490
  gr.HTML('<div class="section-heading">How to Use (Video)</div>')
491
  gr.Markdown("""
492
  1. Choose your input method: Upload a file or enter a URL.
493
+ 2. Adjust settings if needed:
494
+ * Use **faster models** (yolov8n) for quicker processing
495
+ * Set **larger intervals** (15+ frames) for longer videos
496
+ * Adjust **confidence threshold** to filter low-quality detections
497
+ 3. Click "Analyze Video". **Processing time varies based on video length.**
498
+ 4. Review the results: annotated video and statistical analysis.
499
+
500
+ **⚡ Performance Tips:**
501
+ * For videos longer than 2 minutes, use interval ≥ 15 frames
502
+ * YOLOv8n model provides best speed for video processing
503
+ * Higher confidence thresholds reduce processing noise
504
  """)
505
 
506
  # Video examples
 
508
  gr.HTML("""
509
  <div style="padding: 10px; background-color: #f0f7ff; border-radius: 6px; margin-bottom: 15px;">
510
  <p style="font-size: 14px; color: #4A5568; margin: 0;">
511
+ Upload any video containing objects that YOLO can detect. For testing, find sample videos from
512
+ <a href="https://www.pexels.com/search/videos/street/" target="_blank" style="color: #3182ce; text-decoration: underline;">Pexels</a> or
513
+ <a href="https://www.youtube.com/results?search_query=traffic+camera+footage" target="_blank" style="color: #3182ce; text-decoration: underline;">YouTube traffic footage</a>.
514
  </p>
515
  </div>
516
  """)
 
518
  # Right Column: Video Results
519
  with gr.Column(scale=6, elem_classes="output-panel video-result-panel"):
520
  gr.HTML("""
521
+ <div class="section-heading">Video Analysis Results</div>
522
  <details class="info-details" style="margin: 5px 0 15px 0;">
523
  <summary style="padding: 8px; background-color: #f0f7ff; border-radius: 6px; border-left: 3px solid #4299e1; font-weight: bold; cursor: pointer; color: #2b6cb0;">
524
+ 🎬 Simplified Video Analysis Features
525
  </summary>
526
  <div style="margin-top: 8px; padding: 10px; background-color: #f8f9fa; border-radius: 6px; border: 1px solid #e2e8f0;">
527
  <p style="font-size: 13px; color: #718096; margin: 0;">
528
+ <b>Focus on practical insights:</b> This analysis provides accurate object counts and timing information
529
+ without complex tracking. The system uses spatial clustering to eliminate duplicate detections and
530
+ provides clear timeline data showing when objects first appear and how long they remain visible.
531
+ <br><br>
532
+ <b>Key benefits:</b> Reliable object counting, clear timeline analysis, and easy-to-understand results
533
+ that directly answer questions like "How many cars are in this video?" and "When do they appear?"
534
  </p>
535
  </div>
536
  </details>
537
  """)
538
+
539
  components['video_output'] = gr.Video(
540
+ label="Analyzed Video with Object Detection",
541
  elem_classes="video-output-container"
542
  )
543
 
544
+ with gr.Tabs(elem_classes="video-results-tabs"):
545
+ # Analysis Summary Tab
546
+ with gr.Tab("Analysis Summary"):
547
+ gr.HTML('<div class="section-heading">Video Analysis Report</div>')
548
+ gr.HTML("""
549
+ <div style="margin-bottom: 10px; padding: 8px; background-color: #f0f9ff; border-radius: 4px; border-left: 3px solid #4299e1; font-size: 12px;">
550
+ <p style="margin: 0; color: #4a5568;">
551
+ This summary provides object counts, timeline information, and insights about what appears in your video.
552
+ Results are based on spatial clustering analysis to ensure accurate counting.
553
+ </p>
554
+ </div>
555
+ """)
556
+ components['video_summary_text'] = gr.HTML(
557
+ label=None,
558
+ elem_id="video-summary-html-output"
559
+ )
560
+
561
+ # Detailed Statistics Tab
562
+ with gr.Tab("Detailed Statistics"):
563
+ gr.HTML('<div class="section-heading">Complete Analysis Data</div>')
564
+
565
+ with gr.Accordion("Processing Information", open=True):
566
+ gr.HTML("""
567
+ <div style="padding: 6px; background-color: #f8f9fa; border-radius: 4px; margin-bottom: 10px; font-size: 12px;">
568
+ <p style="margin: 0; color: #4a5568;">
569
+ Basic information about video processing parameters and performance.
570
+ </p>
571
+ </div>
572
+ """)
573
+ components['video_stats_json'] = gr.JSON(
574
+ label=None,
575
+ elem_classes="video-stats-display"
576
+ )
577
+
578
+ with gr.Accordion("Object Details", open=False):
579
+ gr.HTML("""
580
+ <div style="padding: 6px; background-color: #f8f9fa; border-radius: 4px; margin-bottom: 10px; font-size: 12px;">
581
+ <p style="margin: 0; color: #4a5568;">
582
+ Detailed breakdown of each object type detected, including timing and confidence information.
583
+ </p>
584
+ </div>
585
+ """)
586
+ components['video_object_details'] = gr.JSON(
587
+ label="Object-by-Object Analysis",
588
+ elem_classes="object-details-display"
589
+ )
590
 
 
 
 
 
 
 
591
  return components
592
+
593
  def get_filter_button_mappings(self):
594
  """
595
  Get the class ID mappings for filter buttons.
596
+
597
  Returns:
598
  Dict: Dictionary containing class ID lists for different categories
599
  """
600
  available_classes_list = self.get_all_classes()
601
+
602
  return {
603
  'people_classes_ids': [0],
604
  'vehicles_classes_ids': [1, 2, 3, 4, 5, 6, 7, 8],
 
606
  'common_objects_ids': [39, 41, 42, 43, 44, 45, 56, 57, 60, 62, 63, 67, 73],
607
  'available_classes_list': available_classes_list
608
  }
609
+
610
+ def create_interface(self,
611
+ handle_image_upload_fn,
612
+ handle_video_upload_fn,
613
  download_video_from_url_fn):
614
  """
615
  Create the complete Gradio interface.
616
+
617
  Args:
618
  handle_image_upload_fn: Function to handle image upload
619
  handle_video_upload_fn: Function to handle video upload
620
  download_video_from_url_fn: Function to download video from URL
621
+
622
  Returns:
623
  gr.Blocks: Complete Gradio interface
624
  """
625
  css = self.get_css_styles()
626
+
627
  with gr.Blocks(css=css, theme=gr.themes.Soft(primary_hue="teal", secondary_hue="blue")) as demo:
628
+
629
  # Header
630
  with gr.Group(elem_classes="app-header"):
631
  self.create_header()
632
 
633
  # Main Content with Tabs
634
  with gr.Tabs(elem_classes="tabs"):
635
+
636
  # Image Processing Tab
637
  image_components = self.create_image_tab()
638
+
639
  # Video Processing Tab
640
  video_components = self.create_video_tab()
641
 
 
644
 
645
  # Setup Event Listeners
646
  self._setup_event_listeners(
647
+ image_components,
648
+ video_components,
649
+ handle_image_upload_fn,
650
  handle_video_upload_fn
651
  )
652
 
653
  return demo
654
+
655
+ def _setup_event_listeners(self,
656
+ image_components,
657
+ video_components,
658
+ handle_image_upload_fn,
659
  handle_video_upload_fn):
660
  """
661
  Setup all event listeners for the interface.
662
+
663
  Args:
664
  image_components: Dictionary of image tab components
665
  video_components: Dictionary of video tab components
 
682
  common_objects_ids = filter_mappings['common_objects_ids']
683
 
684
  image_components['people_btn'].click(
685
+ lambda: [f"{id}: {name}" for id, name in available_classes_list if id in people_classes_ids],
686
  outputs=image_components['image_class_filter']
687
  )
688
  image_components['vehicles_btn'].click(
689
+ lambda: [f"{id}: {name}" for id, name in available_classes_list if id in vehicles_classes_ids],
690
  outputs=image_components['image_class_filter']
691
  )
692
  image_components['animals_btn'].click(
693
+ lambda: [f"{id}: {name}" for id, name in available_classes_list if id in animals_classes_ids],
694
  outputs=image_components['image_class_filter']
695
  )
696
  image_components['objects_btn'].click(
697
+ lambda: [f"{id}: {name}" for id, name in available_classes_list if id in common_objects_ids],
698
  outputs=image_components['image_class_filter']
699
  )
700
 
701
  # Video Input Type Change Handler
702
  video_components['video_input_type'].change(
703
+ fn=lambda input_type: [
704
+ # Show/hide file upload
705
+ gr.update(visible=(input_type == "upload")),
706
+ # Show/hide URL input
707
+ gr.update(visible=(input_type == "url"))
708
+ ],
709
+ inputs=[video_components['video_input_type']],
710
+ outputs=[video_components['video_input'], video_components['video_url_input']]
711
  )
712
 
713
  # Image Detect Button Click Handler
714
  image_components['image_detect_btn'].click(
715
  fn=handle_image_upload_fn,
716
  inputs=[
717
+ image_components['image_input'],
718
+ image_components['image_model_dropdown'],
719
+ image_components['image_confidence'],
720
+ image_components['image_class_filter'],
721
+ image_components['use_llm'],
722
  image_components['use_landmark_detection']
723
  ],
724
  outputs=[
725
+ image_components['image_result_image'],
726
+ image_components['image_result_text'],
727
+ image_components['image_stats_json'],
728
  image_components['image_plot_output'],
729
+ image_components['image_scene_description_html'],
730
+ image_components['image_llm_description'],
731
+ image_components['image_activities_list'],
732
+ image_components['image_safety_list'],
733
  image_components['image_zones_json'],
734
  image_components['image_lighting_info']
735
  ]
736
  )
737
 
738
+ # Video Process Button Click Handler
739
  video_components['video_process_btn'].click(
740
+ fn=handle_video_upload_fn,
741
+ inputs=[
742
+ video_components['video_input'],
743
+ video_components['video_url_input'],
744
+ video_components['video_input_type'],
745
+ video_components['video_model_dropdown'],
746
+ video_components['video_confidence'],
747
+ video_components['video_process_interval']
748
  ],
749
+ outputs=[
750
+ video_components['video_output'],
751
+ video_components['video_summary_text'],
752
+ video_components['video_stats_json']
753
  ]
754
  )
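
The video tab's help text above mentions spatial clustering to remove duplicate detections. The actual implementation presumably lives in video_processor.py and is not fully shown in this diff; as a generic illustration of the idea (not the repository's code, and the detection dict keys are assumed), same-class detections whose box centers fall within a small radius can be merged:

from collections import defaultdict

def deduplicate_detections(detections, radius=50.0):
    """Keep one detection per spatial cluster of the same class (centers within `radius` px)."""
    centers_by_class = defaultdict(list)  # class name -> accepted box centers
    unique = []
    for det in detections:  # det assumed to look like {"class_name": str, "box": (x1, y1, x2, y2)}
        x1, y1, x2, y2 = det["box"]
        cx, cy = (x1 + x2) / 2.0, (y1 + y2) / 2.0
        centers = centers_by_class[det["class_name"]]
        if any((cx - px) ** 2 + (cy - py) ** 2 <= radius ** 2 for px, py in centers):
            continue  # too close to an already-kept detection of the same class: duplicate
        centers.append((cx, cy))
        unique.append(det)
    return unique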
video_processor.py CHANGED
@@ -2,345 +2,550 @@ import cv2
2
  import os
3
  import tempfile
4
  import uuid
5
- from PIL import Image
 
6
  import numpy as np
 
7
  from typing import Dict, List, Tuple, Any, Optional
8
- import time
9
  from collections import defaultdict
 
 
10
 
11
- from image_processor import ImageProcessor
12
- from evaluation_metrics import EvaluationMetrics
13
- from scene_analyzer import SceneAnalyzer
14
  from detection_model import DetectionModel
 
 
 
 
 
15
 
16
  class VideoProcessor:
17
  """
18
- Handles the processing of video files, including object detection
19
- and scene analysis on selected frames.
 
 
 
20
  """
21
- def __init__(self, image_processor: ImageProcessor):
22
- """
23
- Initializes the VideoProcessor.
24
-
25
- Args:
26
- image_processor (ImageProcessor): An initialized ImageProcessor instance.
27
- """
28
- self.image_processor = image_processor
29
-
30
- def process_video_file(self,
31
- video_path: str,
32
- model_name: str,
33
- confidence_threshold: float,
34
- process_interval: int = 5,
35
- scene_desc_interval_sec: int = 3) -> Tuple[Optional[str], str, Dict]:
36
  """
37
- Processes an uploaded video file, performs detection and periodic scene analysis,
38
- and returns the path to the annotated output video file along with a summary.
39
-
40
  Args:
41
- video_path (str): Path to the input video file.
42
- model_name (str): Name of the YOLO model to use.
43
- confidence_threshold (float): Confidence threshold for object detection.
44
- process_interval (int): Process every Nth frame. Defaults to 5.
45
- scene_desc_interval_sec (int): Update scene description every N seconds. Defaults to 3.
46
-
47
  Returns:
48
- Tuple[Optional[str], str, Dict]: (Path to output video or None, Summary text, Statistics dictionary)
49
  """
50
  if not video_path or not os.path.exists(video_path):
51
  print(f"Error: Video file not found at {video_path}")
52
- return None, "Error: Video file not found.", {}
53
-
54
- print(f"Starting video processing for: {video_path}")
55
  start_time = time.time()
56
-
 
 
 
 
 
 
57
  cap = cv2.VideoCapture(video_path)
58
  if not cap.isOpened():
59
- print(f"Error: Could not open video file {video_path}")
60
- return None, "Error opening video file.", {}
61
-
62
- # Get video properties
63
- fps = cap.get(cv2.CAP_PROP_FPS)
64
- if fps <= 0: # Handle case where fps is not available or invalid
65
- fps = 30 # Assume a default fps
66
- print(f"Warning: Could not get valid FPS for video. Assuming {fps} FPS.")
67
  width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
68
  height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
69
- total_frames_video = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
70
- print(f"Video properties: {width}x{height} @ {fps:.2f} FPS, Total Frames: {total_frames_video}")
71
-
72
- # Calculate description update interval in frames
73
- description_update_interval_frames = int(fps * scene_desc_interval_sec)
74
- if description_update_interval_frames < 1:
75
- description_update_interval_frames = int(fps) # Update at least once per second if interval is too short
76
-
77
- object_trackers = {} # maps object IDs to tracked objects
78
- last_detected_objects = {} # objects detected in the previous processed frame
79
- next_object_id = 0 # next available object ID
80
- tracking_threshold = 0.6 # IoU threshold for matching the same object
81
- object_colors = {} # fixed color assigned to each tracked object
82
-
83
- # Setup Output Video
84
- output_filename = f"processed_{uuid.uuid4().hex}_{os.path.basename(video_path)}"
85
- temp_dir = tempfile.gettempdir() # Use system's temp directory
86
  output_path = os.path.join(temp_dir, output_filename)
87
- # Ensure the output path has a compatible extension (like .mp4)
88
  if not output_path.lower().endswith(('.mp4', '.avi', '.mov')):
89
  output_path += ".mp4"
90
-
91
- # Use 'mp4v' for MP4, common and well-supported
92
  fourcc = cv2.VideoWriter_fourcc(*'mp4v')
93
  out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
 
94
  if not out.isOpened():
95
- print(f"Error: Could not open VideoWriter for path: {output_path}")
96
  cap.release()
97
- return None, f"Error creating output video file at {output_path}.", {}
 
98
  print(f"Output video will be saved to: {output_path}")
99
-
 
 
 
 
 
 
 
 
 
100
  frame_count = 0
101
  processed_frame_count = 0
102
- all_stats = [] # Store stats for each processed frame
103
- summary_lines = []
104
- last_description = "Analyzing scene..." # Initial description
105
- frame_since_last_desc = description_update_interval_frames # Trigger analysis on first processed frame
106
-
107
  try:
108
  while True:
109
  ret, frame = cap.read()
110
  if not ret:
111
- break # End of video
112
-
113
  frame_count += 1
114
- frame_since_last_desc += 1
115
- current_frame_annotated = False # Flag if this frame was processed and annotated
116
-
117
- # Process frame based on interval
118
  if frame_count % process_interval == 0:
119
  processed_frame_count += 1
120
- print(f"Processing frame {frame_count}...")
121
- current_frame_annotated = True
122
-
123
- # Use ImageProcessor for single-frame tasks
124
- # 1. Convert frame format BGR -> RGB -> PIL
125
  try:
 
126
  frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
127
  pil_image = Image.fromarray(frame_rgb)
 
 
 
 
 
 
 
 
 
 
128
  except Exception as e:
129
- print(f"Error converting frame {frame_count}: {e}")
130
- continue # Skip this frame
131
-
132
- # 2. Get appropriate model instance
133
- # Confidence is passed from UI, model_name too
134
- model_instance = self.image_processor.get_model_instance(model_name, confidence_threshold)
135
- if not model_instance or not model_instance.is_model_loaded:
136
- print(f"Error: Model {model_name} not loaded. Skipping frame {frame_count}.")
137
- # Draw basic frame without annotation
138
- cv2.putText(frame, f"Scene: {last_description[:80]}...", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 0), 3, cv2.LINE_AA)
139
- cv2.putText(frame, f"Scene: {last_description[:80]}...", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 2, cv2.LINE_AA)
140
- out.write(frame)
141
- continue
142
-
143
-
144
- # 3. Perform detection
145
- detection_result = model_instance.detect(pil_image) # Use PIL image
146
-
147
- current_description_for_frame = last_description # Default to last known description
148
- scene_analysis_result = None
149
- stats = {}
150
-
151
- if detection_result and hasattr(detection_result, 'boxes') and len(detection_result.boxes) > 0:
152
- # Ensure SceneAnalyzer is ready within ImageProcessor
153
- if not hasattr(self.image_processor, 'scene_analyzer') or self.image_processor.scene_analyzer is None:
154
- print("Initializing SceneAnalyzer...")
155
- # Pass class names from the current detection result
156
- self.image_processor.scene_analyzer = SceneAnalyzer(class_names=detection_result.names)
157
- elif self.image_processor.scene_analyzer.class_names is None:
158
- # Update class names if they were missing
159
- self.image_processor.scene_analyzer.class_names = detection_result.names
160
- if hasattr(self.image_processor.scene_analyzer, 'spatial_analyzer'):
161
- self.image_processor.scene_analyzer.spatial_analyzer.class_names = detection_result.names
162
-
163
-
164
- # 4. Perform Scene Analysis (periodically)
165
- if frame_since_last_desc >= description_update_interval_frames:
166
- print(f"Analyzing scene at frame {frame_count} (threshold: {description_update_interval_frames} frames)...")
167
- # Pass lighting_info=None for now, as it's disabled for performance
168
- scene_analysis_result = self.image_processor.analyze_scene(detection_result, lighting_info=None)
169
- current_description_for_frame = scene_analysis_result.get("description", last_description)
170
- last_description = current_description_for_frame # Cache the new description
171
- frame_since_last_desc = 0 # Reset counter
172
-
173
- # 5. Calculate Statistics for this frame
174
- stats = EvaluationMetrics.calculate_basic_stats(detection_result)
175
- stats['frame_number'] = frame_count # Add frame number to stats
176
- all_stats.append(stats)
177
-
178
- # 6. Draw annotations
179
- names = detection_result.names
180
- boxes = detection_result.boxes.xyxy.cpu().numpy()
181
- classes = detection_result.boxes.cls.cpu().numpy().astype(int)
182
- confs = detection_result.boxes.conf.cpu().numpy()
183
-
184
- def calculate_iou(box1, box2):
185
- """Calculate Intersection IOU value"""
186
- x1_1, y1_1, x2_1, y2_1 = box1
187
- x1_2, y1_2, x2_2, y2_2 = box2
188
-
189
- xi1 = max(x1_1, x1_2)
190
- yi1 = max(y1_1, y1_2)
191
- xi2 = min(x2_1, x2_2)
192
- yi2 = min(y2_1, y2_2)
193
-
194
- inter_area = max(0, xi2 - xi1) * max(0, yi2 - yi1)
195
- box1_area = (x2_1 - x1_1) * (y2_1 - y1_1)
196
- box2_area = (x2_2 - x1_2) * (y2_2 - y1_2)
197
-
198
- union_area = box1_area + box2_area - inter_area
199
-
200
- return inter_area / union_area if union_area > 0 else 0
201
-
202
- # Process all detections in the current frame
203
- current_detected_objects = {}
204
-
205
- for box, cls_id, conf in zip(boxes, classes, confs):
206
- x1, y1, x2, y2 = map(int, box)
207
-
208
- # Find the best-matching tracked object
209
- best_match_id = None
210
- best_match_iou = 0
211
-
212
- for obj_id, (old_box, old_cls_id, _) in last_detected_objects.items():
213
- if old_cls_id == cls_id: # only compare within the same class
214
- iou = calculate_iou(box, old_box)
215
- if iou > tracking_threshold and iou > best_match_iou:
216
- best_match_id = obj_id
217
- best_match_iou = iou
218
-
219
- # Reuse the existing ID if matched; otherwise assign a new one
220
- if best_match_id is not None:
221
- obj_id = best_match_id
222
- else:
223
- obj_id = next_object_id
224
- next_object_id += 1
225
-
226
- # Use high-visibility colors
227
- bright_colors = [
228
- (0, 0, 255), # red
229
- (0, 255, 0), # green
230
- (255, 0, 0), # blue
231
- (0, 255, 255), # yellow
232
- (255, 0, 255), # purple
233
- (255, 128, 0), # orange
234
- (128, 0, 255) # purple
235
- ]
236
- object_colors[obj_id] = bright_colors[obj_id % len(bright_colors)]
237
-
238
- # update tracking info
239
- current_detected_objects[obj_id] = (box, cls_id, conf)
240
-
241
- color = object_colors.get(obj_id, (0, 255, 0)) # default is green
242
- label = f"{names.get(cls_id, 'Unknown')}-{obj_id}: {conf:.2f}"
243
-
244
- # Smooth the bounding box: average with the previous frame's position for known objects
245
- if obj_id in last_detected_objects:
246
- old_box, _, _ = last_detected_objects[obj_id]
247
- old_x1, old_y1, old_x2, old_y2 = map(int, old_box)
248
- # smoothing weights
249
- alpha = 0.7 # current weight
250
- beta = 0.3 # history weight
251
-
252
- x1 = int(alpha * x1 + beta * old_x1)
253
- y1 = int(alpha * y1 + beta * old_y1)
254
- x2 = int(alpha * x2 + beta * old_x2)
255
- y2 = int(alpha * y2 + beta * old_y2)
256
-
257
- # draw box and label
258
- cv2.rectangle(frame, (x1, y1), (x2, y2), color, 2)
259
- # add text
260
- (w, h), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 2)
261
- cv2.rectangle(frame, (x1, y1 - h - 10), (x1 + w, y1 - 10), color, -1)
262
- cv2.putText(frame, label, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1, cv2.LINE_AA)
263
-
264
- # update tracking info
265
- last_detected_objects = current_detected_objects.copy()
266
-
267
-
268
- # Draw the current scene description on the frame
269
- cv2.putText(frame, f"Scene: {current_description_for_frame[:80]}...", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 0), 3, cv2.LINE_AA) # Black outline
270
- cv2.putText(frame, f"Scene: {current_description_for_frame[:80]}...", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 2, cv2.LINE_AA) # White text
271
-
272
- # Write the frame (annotated or original) to the output video
273
- # Draw last known description if this frame wasn't processed
274
- if not current_frame_annotated:
275
- cv2.putText(frame, f"Scene: {last_description[:80]}...", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 0), 3, cv2.LINE_AA)
276
- cv2.putText(frame, f"Scene: {last_description[:80]}...", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 2, cv2.LINE_AA)
277
-
278
- out.write(frame) # Write frame to output file
279
-
280
  except Exception as e:
281
- print(f"Error during video processing loop for {video_path}: {e}")
282
- import traceback
283
  traceback.print_exc()
284
- summary_lines.append(f"An error occurred during processing: {e}")
285
  finally:
286
- # Release resources
287
  cap.release()
288
  out.release()
289
- print(f"Video processing finished. Resources released. Output path: {output_path}")
290
- if not os.path.exists(output_path) or os.path.getsize(output_path) == 0:
291
- print(f"Error: Output video file was not created or is empty at {output_path}")
292
- summary_lines.append("Error: Failed to create output video.")
293
- output_path = None
294
-
295
- end_time = time.time()
296
- processing_time = end_time - start_time
297
- summary_lines.insert(0, f"Finished processing in {processing_time:.2f} seconds.")
298
- summary_lines.insert(1, f"Processed {processed_frame_count} frames out of {frame_count} (interval: {process_interval} frames).")
299
- summary_lines.insert(2, f"Scene description updated approximately every {scene_desc_interval_sec} seconds.")
300
-
301
- # Generate Aggregate Statistics
302
- aggregated_stats = {
303
- "total_frames_read": frame_count,
304
- "total_frames_processed": processed_frame_count,
305
- "avg_objects_per_processed_frame": 0, # Calculate below
306
- "cumulative_detections": {}, # Total times each class was detected
307
- "max_concurrent_detections": {} # Max count of each class in a single processed frame
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
308
  }
309
- object_cumulative_counts = {}
310
- object_max_concurrent_counts = {} # Store the max count found for each object type
311
- total_detected_in_processed = 0
312
-
313
- # Iterate through stats collected from each processed frame
314
- for frame_stats in all_stats:
315
- total_objects_in_frame = frame_stats.get("total_objects", 0)
316
- total_detected_in_processed += total_objects_in_frame
317
-
318
- # Iterate through object classes detected in this frame
319
- for obj_name, obj_data in frame_stats.get("class_statistics", {}).items():
320
- count_in_frame = obj_data.get("count", 0)
321
-
322
- # Cumulative count
323
- if obj_name not in object_cumulative_counts:
324
- object_cumulative_counts[obj_name] = 0
325
- object_cumulative_counts[obj_name] += count_in_frame
326
-
327
- # Max concurrent count
328
- if obj_name not in object_max_concurrent_counts:
329
- object_max_concurrent_counts[obj_name] = 0
330
- # Update the max count if the current frame's count is higher
331
- object_max_concurrent_counts[obj_name] = max(object_max_concurrent_counts[obj_name], count_in_frame)
332
-
333
- # Add sorted results to the final dictionary
334
- aggregated_stats["cumulative_detections"] = dict(sorted(object_cumulative_counts.items(), key=lambda item: item[1], reverse=True))
335
- aggregated_stats["max_concurrent_detections"] = dict(sorted(object_max_concurrent_counts.items(), key=lambda item: item[1], reverse=True))
336
-
337
- # Calculate average objects per processed frame
338
- if processed_frame_count > 0:
339
- aggregated_stats["avg_objects_per_processed_frame"] = round(total_detected_in_processed / processed_frame_count, 2)
340
-
341
- summary_text = "\n".join(summary_lines)
342
- print("Generated Summary:\n", summary_text)
343
- print("Aggregated Stats (Revised):\n", aggregated_stats) # Print the revised stats
344
-
345
- # Return the potentially updated output_path
346
- return output_path, summary_text, aggregated_stats
 
2
  import os
3
  import tempfile
4
  import uuid
5
+ import time
6
+ import traceback
7
  import numpy as np
8
+ from PIL import Image
9
  from typing import Dict, List, Tuple, Any, Optional
 
10
  from collections import defaultdict
11
+ from dataclasses import dataclass
12
+ import math
13
 
 
 
 
14
  from detection_model import DetectionModel
15
+ from evaluation_metrics import EvaluationMetrics
16
+
17
+ @dataclass
18
+ class ObjectRecord:
19
+ """物體記錄數據結構"""
20
+ class_name: str
21
+ first_seen_time: float
22
+ last_seen_time: float
23
+ total_detections: int
24
+ peak_count_in_frame: int
25
+ confidence_avg: float
26
+
27
+ def get_duration(self) -> float:
28
+ """獲取物體在影片中的持續時間"""
29
+ return self.last_seen_time - self.first_seen_time
30
+
31
+ def format_time(self, seconds: float) -> str:
32
+ """格式化時間顯示"""
33
+ minutes = int(seconds // 60)
34
+ secs = int(seconds % 60)
35
+ if minutes > 0:
36
+ return f"{minutes}m{secs:02d}s"
37
+ return f"{secs}s"
38
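A quick usage sketch of the ObjectRecord dataclass defined above (the values are made up for illustration; it assumes ObjectRecord is importable from this repo's video_processor module):

from video_processor import ObjectRecord

record = ObjectRecord(
    class_name="person",
    first_seen_time=12.4,
    last_seen_time=95.0,
    total_detections=48,
    peak_count_in_frame=3,
    confidence_avg=0.82,
)
print(record.get_duration())                        # 82.6 seconds on screen
print(record.format_time(record.first_seen_time))   # "12s"
print(record.format_time(record.last_seen_time))    # "1m35s"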
 
39
  class VideoProcessor:
40
  """
41
+ Video processor focused on practical statistical analysis:
42
+ - accurate object counting and identification
43
+ - object appearance-time analysis
44
+ - detection quality assessment
45
+ - activity density statistics
46
  """
47
+
48
+ def __init__(self):
49
+ """初始化視頻處理器"""
50
+ self.detection_models: Dict[str, DetectionModel] = {}
51
+
52
+ # 分析參數
53
+ self.spatial_cluster_threshold = 100 # 像素距離閾值,用於合併重複檢測
54
+ self.confidence_filter_threshold = 0.1 # 最低信心度過濾
55
+
56
+ # 統計數據收集
57
+ self.frame_detections = [] # 每幀檢測結果
58
+ self.object_timeline = defaultdict(list) # 物體時間線記錄
59
+ self.frame_timestamps = [] # 幀時間戳記錄
60
+
61
+ def get_or_create_model(self, model_name: str, confidence_threshold: float) -> DetectionModel:
62
+ """獲取或創建檢測模型實例"""
63
+ model_key = f"{model_name}_{confidence_threshold}"
64
+
65
+ if model_key not in self.detection_models:
66
+ try:
67
+ model = DetectionModel(model_name, confidence_threshold)
68
+ self.detection_models[model_key] = model
69
+ print(f"Loaded detection model: {model_name} with confidence {confidence_threshold}")
70
+ except Exception as e:
71
+ print(f"Error loading model {model_name}: {e}")
72
+ raise
73
+
74
+ return self.detection_models[model_key]
75
+
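Models are cached per (model name, confidence) pair, so repeated calls with the same settings reuse the already-loaded instance. A standalone sketch of that caching pattern with a dummy loader (the real method constructs DetectionModel instead; "yolov8n.pt" is a placeholder name):

_cache = {}

def get_or_create(model_name: str, confidence: float, loader):
    key = f"{model_name}_{confidence}"                 # same key scheme as above
    if key not in _cache:
        _cache[key] = loader(model_name, confidence)   # load only on first request
    return _cache[key]

make = lambda name, conf: f"<model {name} @ {conf}>"   # dummy loader for illustration
a = get_or_create("yolov8n.pt", 0.25, make)
b = get_or_create("yolov8n.pt", 0.25, make)
print(a is b)   # True - the second call hits the cache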
76
+ def cluster_detections_by_position(self, detections: List[Dict], threshold: float = 100) -> List[Dict]:
77
+ """根據位置聚類檢測結果,合併相近的重複檢測"""
78
+ if not detections:
79
+ return []
80
+
81
+ # Group detections by class before clustering
82
+ class_groups = defaultdict(list)
83
+ for det in detections:
84
+ class_groups[det['class_name']].append(det)
85
+
86
+ clustered_results = []
87
+
88
+ for class_name, class_detections in class_groups.items():
89
+ if len(class_detections) == 1:
90
+ clustered_results.extend(class_detections)
91
+ continue
92
+
93
+ # Run the spatial clustering pass
94
+ clusters = []
95
+ used = set()
96
+
97
+ for i, det1 in enumerate(class_detections):
98
+ if i in used:
99
+ continue
100
+
101
+ cluster = [det1]
102
+ used.add(i)
103
+
104
+ # Compute the center of the detection box
105
+ x1_center = (det1['bbox'][0] + det1['bbox'][2]) / 2
106
+ y1_center = (det1['bbox'][1] + det1['bbox'][3]) / 2
107
+
108
+ # Find nearby detections of the same class
109
+ for j, det2 in enumerate(class_detections):
110
+ if j in used:
111
+ continue
112
+
113
+ x2_center = (det2['bbox'][0] + det2['bbox'][2]) / 2
114
+ y2_center = (det2['bbox'][1] + det2['bbox'][3]) / 2
115
+
116
+ distance = math.sqrt((x1_center - x2_center)**2 + (y1_center - y2_center)**2)
117
+
118
+ if distance < threshold:
119
+ cluster.append(det2)
120
+ used.add(j)
121
+
122
+ clusters.append(cluster)
123
+
124
+ # Produce one representative detection per cluster
125
+ for cluster in clusters:
126
+ best_detection = max(cluster, key=lambda x: x['confidence'])
127
+ avg_confidence = sum(det['confidence'] for det in cluster) / len(cluster)
128
+ best_detection['confidence'] = avg_confidence
129
+ best_detection['cluster_size'] = len(cluster)
130
+ clustered_results.append(best_detection)
131
+
132
+ return clustered_results
133
+
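An illustrative run of the clustering above: two near-duplicate 'person' boxes (centers roughly 11 px apart) are merged into a single detection with averaged confidence and a cluster_size marker, while a distant box of the same class stays separate. This sketch assumes this repo's video_processor module is importable:

from video_processor import VideoProcessor

vp = VideoProcessor()
detections = [
    {'bbox': (100, 100, 200, 300), 'class_id': 0, 'class_name': 'person',
     'confidence': 0.90, 'timestamp': 1.0},
    {'bbox': (110, 105, 210, 305), 'class_id': 0, 'class_name': 'person',
     'confidence': 0.70, 'timestamp': 1.0},   # near-duplicate of the first box
    {'bbox': (600, 100, 700, 300), 'class_id': 0, 'class_name': 'person',
     'confidence': 0.85, 'timestamp': 1.0},   # far away, stays separate
]
merged = vp.cluster_detections_by_position(detections, threshold=100)
print(len(merged))                 # 2
print(merged[0]['cluster_size'])   # 2, with confidence averaged to ~0.80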
134
+ def analyze_frame_detections(self, detections: Any, timestamp: float, class_names: Dict[int, str]):
135
+ """分析單幀的檢測結果並更新統計記錄"""
136
+ if not hasattr(detections, 'boxes') or len(detections.boxes) == 0:
137
+ self.frame_detections.append([])
138
+ self.frame_timestamps.append(timestamp)
139
+ return
140
+
141
+ # extract detected data
142
+ boxes = detections.boxes.xyxy.cpu().numpy()
143
+ classes = detections.boxes.cls.cpu().numpy().astype(int)
144
+ confidences = detections.boxes.conf.cpu().numpy()
145
+
146
+ # Convert to a uniform detection format
147
+ frame_detections = []
148
+ for box, cls_id, conf in zip(boxes, classes, confidences):
149
+ if conf >= self.confidence_filter_threshold:
150
+ frame_detections.append({
151
+ 'bbox': tuple(box),
152
+ 'class_id': cls_id,
153
+ 'class_name': class_names.get(cls_id, f'class_{cls_id}'),
154
+ 'confidence': conf,
155
+ 'timestamp': timestamp
156
+ })
157
+
158
+ # Apply spatial clustering to avoid duplicate detections
159
+ clustered_detections = self.cluster_detections_by_position(
160
+ frame_detections, self.spatial_cluster_threshold
161
+ )
162
+
163
+ # record results
164
+ self.frame_detections.append(clustered_detections)
165
+ self.frame_timestamps.append(timestamp)
166
+
167
+ # Update the per-class object timeline
168
+ for detection in clustered_detections:
169
+ class_name = detection['class_name']
170
+ self.object_timeline[class_name].append({
171
+ 'timestamp': timestamp,
172
+ 'confidence': detection['confidence'],
173
+ 'bbox': detection['bbox']
174
+ })
175
+
176
+ def generate_object_statistics(self, fps: float) -> Dict[str, ObjectRecord]:
177
+ """生成物體統計數據"""
178
+ object_stats = {}
179
+
180
+ for class_name, timeline in self.object_timeline.items():
181
+ if not timeline:
182
+ continue
183
+
184
+ # Basic timing statistics
185
+ timestamps = [entry['timestamp'] for entry in timeline]
186
+ confidences = [entry['confidence'] for entry in timeline]
187
+
188
+ first_seen = min(timestamps)
189
+ last_seen = max(timestamps)
190
+ total_detections = len(timeline)
191
+ avg_confidence = sum(confidences) / len(confidences)
192
+
193
+ # Count objects per timestamp to find the peak concurrent count
194
+ frame_counts = defaultdict(int)
195
+ for entry in timeline:
196
+ frame_timestamp = entry['timestamp']
197
+ frame_counts[frame_timestamp] += 1
198
+
199
+ peak_count = max(frame_counts.values()) if frame_counts else 1
200
+
201
+ # Create the object record
202
+ object_stats[class_name] = ObjectRecord(
203
+ class_name=class_name,
204
+ first_seen_time=first_seen,
205
+ last_seen_time=last_seen,
206
+ total_detections=total_detections,
207
+ peak_count_in_frame=peak_count,
208
+ confidence_avg=avg_confidence
209
+ )
210
+
211
+ return object_stats
212
+
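A small sketch of how the timeline feeds these statistics: two 'car' entries sharing a timestamp yield a peak count of 2, while first/last seen come straight from the min/max timestamps. It assumes video_processor is importable; note the fps argument is part of the signature but is not used in the calculation:

from video_processor import VideoProcessor

vp = VideoProcessor()
vp.object_timeline['car'] = [
    {'timestamp': 2.0, 'confidence': 0.9, 'bbox': (0, 0, 10, 10)},
    {'timestamp': 2.0, 'confidence': 0.8, 'bbox': (50, 0, 60, 10)},   # same frame as above
    {'timestamp': 8.0, 'confidence': 0.7, 'bbox': (5, 0, 15, 10)},
]
stats = vp.generate_object_statistics(fps=30.0)
car = stats['car']
print(car.first_seen_time, car.last_seen_time)   # 2.0 8.0
print(car.peak_count_in_frame)                   # 2 (two cars visible at t=2.0)
print(round(car.confidence_avg, 2))              # 0.8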
213
+ def analyze_object_density(self, object_stats: Dict[str, ObjectRecord], video_duration: float) -> Dict[str, Any]:
214
+ """分析物體密度和活動模式"""
215
+ total_objects = sum(record.peak_count_in_frame for record in object_stats.values())
216
+ objects_per_minute = (total_objects / video_duration) * 60 if video_duration > 0 else 0
217
+
218
+ # Activity distribution over 30-second segments
219
+ time_segments = defaultdict(int)
220
+ segment_duration = 30
221
+
222
+ for detections, timestamp in zip(self.frame_detections, self.frame_timestamps):
223
+ segment = int(timestamp // segment_duration) * segment_duration
224
+ time_segments[segment] += len(detections)
225
+
226
+ # Identify peak-activity periods
227
+ peak_segments = []
228
+ if time_segments:
229
+ max_activity = max(time_segments.values())
230
+ threshold = max_activity * 0.8 # segments at >= 80% of the maximum activity count as peaks
231
+
232
+ for segment, activity in time_segments.items():
233
+ if activity >= threshold:
234
+ peak_segments.append({
235
+ 'start_time': segment,
236
+ 'end_time': min(segment + segment_duration, video_duration),
237
+ 'activity_count': activity,
238
+ 'description': f"{segment}s-{min(segment + segment_duration, video_duration):.0f}s"
239
+ })
240
+
241
+ return {
242
+ 'total_objects_detected': total_objects,
243
+ 'objects_per_minute': round(objects_per_minute, 2),
244
+ 'video_duration_seconds': video_duration,
245
+ 'peak_activity_periods': peak_segments,
246
+ 'activity_distribution': {str(k): v for k, v in time_segments.items()}
247
+ }
248
+
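The density analysis buckets detections into 30-second segments and flags any segment at or above 80% of the busiest one. A standalone sketch of that bucketing (toy numbers, no class instance needed):

from collections import defaultdict

frame_timestamps = [1.0, 5.0, 31.0, 33.0, 61.0]   # seconds of each processed frame
detections_per_frame = [2, 1, 4, 3, 1]            # len(detections) per processed frame

segment_duration = 30
time_segments = defaultdict(int)
for count, ts in zip(detections_per_frame, frame_timestamps):
    segment = int(ts // segment_duration) * segment_duration
    time_segments[segment] += count

print(dict(time_segments))   # {0: 3, 30: 7, 60: 1}
max_activity = max(time_segments.values())
peaks = [s for s, a in time_segments.items() if a >= 0.8 * max_activity]
print(peaks)                 # [30] - the busiest 30-second window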
249
+ def analyze_quality_metrics(self, object_stats: Dict[str, ObjectRecord]) -> Dict[str, Any]:
250
+ """分析檢測品質指標"""
251
+ all_confidences = []
252
+ class_confidence_stats = {}
253
+
254
+ # Collect all confidence values for analysis
255
+ for class_name, record in object_stats.items():
256
+ class_confidences = []
257
+ for detection_data in self.object_timeline[class_name]:
258
+ conf = detection_data['confidence']
259
+ all_confidences.append(conf)
260
+ class_confidences.append(conf)
261
+
262
+ # Per-class confidence statistics
263
+ if class_confidences:
264
+ class_confidence_stats[class_name] = {
265
+ 'average_confidence': round(np.mean(class_confidences), 3),
266
+ 'min_confidence': round(np.min(class_confidences), 3),
267
+ 'max_confidence': round(np.max(class_confidences), 3),
268
+ 'confidence_stability': round(1 - np.std(class_confidences), 3),
269
+ 'detection_count': len(class_confidences)
270
+ }
271
+
272
+ # Overall quality metrics
273
+ if all_confidences:
274
+ overall_confidence = np.mean(all_confidences)
275
+ confidence_std = np.std(all_confidences)
276
+
277
+ # Quality grading
278
+ if overall_confidence > 0.8 and confidence_std < 0.1:
279
+ quality_grade = "excellent"
280
+ elif overall_confidence > 0.6 and confidence_std < 0.2:
281
+ quality_grade = "good"
282
+ elif overall_confidence > 0.4:
283
+ quality_grade = "fair"
284
+ else:
285
+ quality_grade = "poor"
286
+
287
+ quality_analysis = f"Detection quality: {quality_grade} (avg confidence: {overall_confidence:.3f})"
288
+ else:
289
+ overall_confidence = 0
290
+ confidence_std = 0
291
+ quality_grade = "no_data"
292
+ quality_analysis = "No detection data available for quality analysis"
293
+
294
+ return {
295
+ 'overall_confidence': round(overall_confidence, 3),
296
+ 'confidence_stability': round(1 - confidence_std, 3),
297
+ 'quality_grade': quality_grade,
298
+ 'class_confidence_breakdown': class_confidence_stats,
299
+ 'total_detections_analyzed': len(all_confidences),
300
+ 'quality_analysis': quality_analysis
301
+ }
302
+
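The grading above combines the mean confidence with its spread. A standalone sketch of the same thresholds (the confidence lists are invented):

import numpy as np

def grade(confidences):
    mean, std = np.mean(confidences), np.std(confidences)
    if mean > 0.8 and std < 0.1:
        return "excellent"
    if mean > 0.6 and std < 0.2:
        return "good"
    if mean > 0.4:
        return "fair"
    return "poor"

print(grade([0.85, 0.90, 0.88]))   # excellent
print(grade([0.65, 0.70, 0.50]))   # good
print(grade([0.50, 0.55, 0.45]))   # fair
print(grade([0.30, 0.35, 0.20]))   # poor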
303
+ def generate_timeline_analysis(self, object_stats: Dict[str, ObjectRecord], video_duration: float) -> Dict[str, Any]:
304
+ """生成時間線分析報告"""
305
+ timeline_analysis = {
306
+ 'video_duration_seconds': video_duration,
307
+ 'object_appearances': {},
308
+ 'timeline_summary': []
309
+ }
310
+
311
+ # Appearance timing for each object class
312
+ for class_name, record in object_stats.items():
313
+ timeline_analysis['object_appearances'][class_name] = {
314
+ 'first_appearance': record.format_time(record.first_seen_time),
315
+ 'first_appearance_seconds': round(record.first_seen_time, 1),
316
+ 'last_seen': record.format_time(record.last_seen_time),
317
+ 'last_seen_seconds': round(record.last_seen_time, 1),
318
+ 'duration_in_video': record.format_time(record.get_duration()),
319
+ 'duration_seconds': round(record.get_duration(), 1),
320
+ 'estimated_count': record.peak_count_in_frame,
321
+ 'detection_confidence': round(record.confidence_avg, 3)
322
+ }
323
+
324
+ # timeline summary
325
+ if object_stats:
326
+ sorted_objects = sorted(object_stats.values(), key=lambda x: x.first_seen_time)
327
+
328
+ for i, record in enumerate(sorted_objects):
329
+ if record.first_seen_time < 2.0:
330
+ summary = f"{record.peak_count_in_frame} {record.class_name}(s) present from the beginning"
331
+ else:
332
+ summary = f"{record.peak_count_in_frame} {record.class_name}(s) first appeared at {record.format_time(record.first_seen_time)}"
333
+
334
+ timeline_analysis['timeline_summary'].append(summary)
335
+
336
+ return timeline_analysis
337
+
338
+ def draw_simple_annotations(self, frame: np.ndarray, detections: List[Dict]) -> np.ndarray:
339
+ """��視頻幀上繪製檢測標註"""
340
+ annotated_frame = frame.copy()
341
+
342
+ # 不同物體類別分配顏色
343
+ colors = {
344
+ 'person': (0, 255, 0), # green
345
+ 'car': (255, 0, 0), # blue
346
+ 'truck': (0, 0, 255), # red
347
+ 'bus': (255, 255, 0), # 青色
348
+ 'bicycle': (255, 0, 255), # purple
349
+ 'motorcycle': (0, 255, 255) # yellow
350
+ }
351
+
352
+ # Draw each detection
353
+ for detection in detections:
354
+ x1, y1, x2, y2 = map(int, detection['bbox'])
355
+ class_name = detection['class_name']
356
+ confidence = detection['confidence']
357
+
358
+ color = colors.get(class_name, (128, 128, 128)) # default to gray
359
+
360
+ # Draw the bounding box
361
+ cv2.rectangle(annotated_frame, (x1, y1), (x2, y2), color, 2)
362
+
363
+ # Prepare the label text
364
+ label = f"{class_name}: {confidence:.2f}"
365
+ if 'cluster_size' in detection and detection['cluster_size'] > 1:
366
+ label += f" (merged: {detection['cluster_size']})"
367
+
368
+ # Draw the label background and text
369
+ (w, h), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.6, 2)
370
+ cv2.rectangle(annotated_frame, (x1, y1 - h - 10), (x1 + w, y1), color, -1)
371
+ cv2.putText(annotated_frame, label, (x1, y1 - 5),
372
+ cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 2)
373
+
374
+ return annotated_frame
375
+
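A standalone sketch of the annotation style used above, drawn on a blank frame (note that OpenCV expects BGR color tuples, which is why (0, 255, 0) is green):

import cv2
import numpy as np

frame = np.zeros((240, 320, 3), dtype=np.uint8)
x1, y1, x2, y2 = 40, 60, 160, 200
color = (0, 255, 0)                  # green in BGR, as used for 'person' above
label = "person: 0.91"

cv2.rectangle(frame, (x1, y1), (x2, y2), color, 2)
(w, h), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.6, 2)
cv2.rectangle(frame, (x1, y1 - h - 10), (x1 + w, y1), color, -1)   # filled label background
cv2.putText(frame, label, (x1, y1 - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 2)
cv2.imwrite("annotated_demo.jpg", frame)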
376
+ def _ensure_string_keys(self, data):
377
+ """確保所有字典鍵值都轉換為字串格式以支援JSON序列化"""
378
+ if isinstance(data, dict):
379
+ return {str(key): self._ensure_string_keys(value) for key, value in data.items()}
380
+ elif isinstance(data, list):
381
+ return [self._ensure_string_keys(item) for item in data]
382
+ else:
383
+ return data
384
+
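A standalone sketch of the key normalization above, presumably there so that mappings keyed by numbers (e.g. the 30-second activity segments) reach the UI with plain string keys:

import json

def ensure_string_keys(data):
    # Mirrors the recursive conversion above.
    if isinstance(data, dict):
        return {str(k): ensure_string_keys(v) for k, v in data.items()}
    if isinstance(data, list):
        return [ensure_string_keys(item) for item in data]
    return data

raw = {"activity_distribution": {0: 3, 30: 7}, "peaks": [{"start_time": 30}]}
print(json.dumps(ensure_string_keys(raw)))
# {"activity_distribution": {"0": 3, "30": 7}, "peaks": [{"start_time": 30}]}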
385
+ def process_video(self,
386
+ video_path: str,
387
+ model_name: str,
388
+ confidence_threshold: float,
389
+ process_interval: int = 10) -> Tuple[Optional[str], Dict[str, Any]]:
390
  """
391
+ Process a video file, running object detection and statistical analysis.
392
+

393
  Args:
394
+ video_path: path to the video file
395
+ model_name: YOLO model name
396
+ confidence_threshold: detection confidence threshold
397
+ process_interval: processing interval (analyze every Nth frame)
398
+

399
  Returns:
400
+ Tuple[Optional[str], Dict[str, Any]]: (output video path, analysis results)
401
  """
402
  if not video_path or not os.path.exists(video_path):
403
  print(f"Error: Video file not found at {video_path}")
404
+ return None, {"error": "Video file not found"}
405
+
406
+ print(f"Starting focused video analysis: {video_path}")
407
  start_time = time.time()
408
+
409
+ # Reset processing state
410
+ self.frame_detections.clear()
411
+ self.object_timeline.clear()
412
+ self.frame_timestamps.clear()
413
+
414
+ # Open the video file
415
  cap = cv2.VideoCapture(video_path)
416
  if not cap.isOpened():
417
+ return None, {"error": "Could not open video file"}
418
+
419
+ # Read basic video properties
420
+ fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
 
 
 
 
421
  width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
422
  height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
423
+ total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
424
+ video_duration = total_frames / fps
425
+
426
+ print(f"Video properties: {width}x{height} @ {fps:.2f} FPS")
427
+ print(f"Duration: {video_duration:.1f}s, Total frames: {total_frames}")
428
+ print(f"Processing every {process_interval} frames")
429
+
430
+ # Set up the output video file
431
+ output_filename = f"analyzed_{uuid.uuid4().hex}_{os.path.basename(video_path)}"
432
+ temp_dir = tempfile.gettempdir()
 
 
 
 
 
 
 
433
  output_path = os.path.join(temp_dir, output_filename)
 
434
  if not output_path.lower().endswith(('.mp4', '.avi', '.mov')):
435
  output_path += ".mp4"
436
+
 
437
  fourcc = cv2.VideoWriter_fourcc(*'mp4v')
438
  out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
439
+
440
  if not out.isOpened():
 
441
  cap.release()
442
+ return None, {"error": "Could not create output video file"}
443
+
444
  print(f"Output video will be saved to: {output_path}")
445
+
446
+ # Load the detection model
447
+ try:
448
+ detection_model = self.get_or_create_model(model_name, confidence_threshold)
449
+ except Exception as e:
450
+ cap.release()
451
+ out.release()
452
+ return None, {"error": f"Failed to load detection model: {str(e)}"}
453
+
454
+ # Main video-processing loop
455
  frame_count = 0
456
  processed_frame_count = 0
457
+
 
 
 
 
458
  try:
459
  while True:
460
  ret, frame = cap.read()
461
  if not ret:
462
+ break
463
+
464
  frame_count += 1
465
+ timestamp = frame_count / fps
466
+
467
+ # Decide whether to analyze this frame based on the processing interval
 
468
  if frame_count % process_interval == 0:
469
  processed_frame_count += 1
470
+
471
+ if processed_frame_count % 5 == 0:
472
+ print(f"Processing frame {frame_count}/{total_frames} ({timestamp:.1f}s)")
473
+
 
474
  try:
475
+ # Run object detection
476
  frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
477
  pil_image = Image.fromarray(frame_rgb)
478
+ detections = detection_model.detect(pil_image)
479
+
480
+ # Analyze the detection results
481
+ class_names = detections.names if hasattr(detections, 'names') else {}
482
+ self.analyze_frame_detections(detections, timestamp, class_names)
483
+
484
+ # Draw detection annotations
485
+ current_detections = self.frame_detections[-1] if self.frame_detections else []
486
+ frame = self.draw_simple_annotations(frame, current_detections)
487
+
488
  except Exception as e:
489
+ print(f"Error processing frame {frame_count}: {e}")
490
+ continue
491
+
492
+ # Write the processed frame to the output video
493
+ out.write(frame)
494
+
495
  except Exception as e:
496
+ print(f"Error during video processing: {e}")
 
497
  traceback.print_exc()
 
498
  finally:
 
499
  cap.release()
500
  out.release()
501
+
502
+ # Generate the final analysis results
503
+ processing_time = time.time() - start_time
504
+
505
+ # Run the statistical analyses
506
+ object_stats = self.generate_object_statistics(fps)
507
+ object_density = self.analyze_object_density(object_stats, video_duration)
508
+ quality_metrics = self.analyze_quality_metrics(object_stats)
509
+ timeline_analysis = self.generate_timeline_analysis(object_stats, video_duration)
510
+
511
+ # Basic aggregate statistics
512
+ total_unique_objects = sum(record.peak_count_in_frame for record in object_stats.values())
513
+
514
+ # Assemble the analysis results
515
+ analysis_results = {
516
+ "processing_info": {
517
+ "processing_time_seconds": round(processing_time, 2),
518
+ "total_frames": frame_count,
519
+ "frames_analyzed": processed_frame_count,
520
+ "processing_interval": process_interval,
521
+ "video_duration_seconds": round(video_duration, 2),
522
+ "fps": fps
523
+ },
524
+ "object_summary": {
525
+ "total_unique_objects_detected": total_unique_objects,
526
+ "object_types_found": len(object_stats),
527
+ "detailed_counts": {
528
+ name: record.peak_count_in_frame
529
+ for name, record in object_stats.items()
530
+ }
531
+ },
532
+ "timeline_analysis": timeline_analysis,
533
+ "analytics": {
534
+ "object_density": object_density,
535
+ "quality_metrics": quality_metrics
536
  }
537
+ }
538
+
539
+ # Ensure all dict keys are strings
540
+ analysis_results = self._ensure_string_keys(analysis_results)
541
+
542
+ # Validate the output file
543
+ if not os.path.exists(output_path) or os.path.getsize(output_path) == 0:
544
+ print("Warning: Output video file was not created properly")
545
+ return None, analysis_results
546
+
547
+ print(f"Video processing completed in {processing_time:.2f} seconds")
548
+ print(f"Found {total_unique_objects} total objects across {len(object_stats)} categories")
549
+ print(f"Quality grade: {quality_metrics['quality_grade']}")
550
+
551
+ return output_path, analysis_results
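Illustrative usage of the reworked API: process_video now returns a (path, results) pair instead of the old (path, summary, stats) triple, and VideoProcessor no longer wraps an ImageProcessor. The file and model names below are placeholders:

from video_processor import VideoProcessor

vp = VideoProcessor()
output_path, results = vp.process_video(
    video_path="sample.mp4",      # placeholder path
    model_name="yolov8n.pt",      # placeholder model name
    confidence_threshold=0.25,
    process_interval=10,          # analyze every 10th frame
)
if output_path is None:
    print("Processing failed:", results.get("error"))
else:
    print("Annotated video:", output_path)
    print("Objects found:", results["object_summary"]["detailed_counts"])
    for line in results["timeline_analysis"]["timeline_summary"]:
        print(line)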