Spaces:
Running
on
Zero
Running
on
Zero
import os | |
import numpy as np | |
import logging | |
import traceback | |
from typing import Dict, List, Tuple, Any, Optional | |
from PIL import Image | |
from component_initializer import ComponentInitializer | |
from scene_scoring_engine import SceneScoringEngine | |
from landmark_processing_manager import LandmarkProcessingManager | |
from scene_analysis_coordinator import SceneAnalysisCoordinator | |
class SceneAnalyzer:
    """
    Core class for scene analysis and understanding based on object detection results.
    Analyzes detected objects, their relationships, and infers the scene type.

    This class is the main entry point for scene understanding.
    It is the Facade that coordinates all scene analysis components while
    maintaining the original public interface for backward compatibility.
    """

    EVERYDAY_SCENE_TYPE_KEYS = [
        "general_indoor_space", "generic_street_view",
        "desk_area_workspace", "outdoor_gathering_spot",
        "kitchen_counter_or_utility_area"
    ]

    def __init__(self, class_names: Optional[Dict[int, str]] = None, use_llm: bool = True,
                 use_clip: bool = True, enable_landmark: bool = True,
                 llm_model_path: Optional[str] = None):
        """
        Initialize the scene analyzer with optional class name mappings.

        Args:
            class_names: Dictionary mapping class IDs to class names (optional)
            use_llm: Whether to enable LLM enhancement functionality
            use_clip: Whether to enable CLIP analysis functionality
            enable_landmark: Whether to enable landmark detection functionality
            llm_model_path: Path to LLM model (optional)

        Raises:
            Exception: Any error raised while building the components is
                logged and then re-raised unchanged.
        """
        self.logger = logging.getLogger(__name__)
        try:
            # Initialize all components through the component initializer
            self.component_initializer = ComponentInitializer(
                class_names=class_names,
                use_llm=use_llm,
                use_clip=use_clip,
                enable_landmark=enable_landmark,
                llm_model_path=llm_model_path
            )

            # Get data structures for easy access
            self.SCENE_TYPES = self.component_initializer.get_data_structure('SCENE_TYPES')
            self.OBJECT_CATEGORIES = self.component_initializer.get_data_structure('OBJECT_CATEGORIES')
            self.LANDMARK_ACTIVITIES = self.component_initializer.get_data_structure('LANDMARK_ACTIVITIES')

            # Initialize specialized engines
            self.scene_scoring_engine = SceneScoringEngine(
                scene_types=self.SCENE_TYPES,
                enable_landmark=enable_landmark
            )
            self.landmark_processing_manager = LandmarkProcessingManager(
                enable_landmark=enable_landmark,
                use_clip=use_clip
            )

            # Initialize the main coordinator
            self.scene_analysis_coordinator = SceneAnalysisCoordinator(
                component_initializer=self.component_initializer,
                scene_scoring_engine=self.scene_scoring_engine,
                landmark_processing_manager=self.landmark_processing_manager
            )

            # Store configuration for backward compatibility
            self.class_names = class_names
            self.use_clip = use_clip
            self.use_llm = use_llm
            self.enable_landmark = enable_landmark
            self.use_landmark_detection = enable_landmark

            # Get component references for backward compatibility
            self.spatial_analyzer = self.component_initializer.get_component('spatial_analyzer')
            self.descriptor = self.component_initializer.get_component('descriptor')
            self.scene_describer = self.component_initializer.get_component('scene_describer')
            self.clip_analyzer = self.component_initializer.get_component('clip_analyzer')
            self.llm_enhancer = self.component_initializer.get_component('llm_enhancer')
            self.landmark_classifier = self.component_initializer.get_component('landmark_classifier')

            # Set landmark classifier in the processing manager
            if self.landmark_classifier:
                self.landmark_processing_manager.set_landmark_classifier(self.landmark_classifier)

            self.logger.info("SceneAnalyzer initialized successfully with all components")
        except Exception as e:
            # logger.exception records the traceback through the logging
            # system, so it lands wherever the application routes its logs.
            self.logger.exception(f"Critical error during SceneAnalyzer initialization: {e}")
            raise

    def analyze(self, detection_result: Any, lighting_info: Optional[Dict] = None,
                class_confidence_threshold: float = 0.25, scene_confidence_threshold: float = 0.6,
                enable_landmark: bool = True, places365_info: Optional[Dict] = None) -> Dict:
        """
        Analyze detection results to determine scene type and provide understanding.

        The work is delegated to the scene analysis coordinator. If the
        coordinator raises, the error is logged and a fixed fallback result
        is returned instead of propagating the exception.

        Args:
            detection_result: Detection result from YOLOv8 or similar
            lighting_info: Optional lighting condition analysis results
            class_confidence_threshold: Minimum confidence to consider an object
            scene_confidence_threshold: Minimum confidence to determine a scene
            enable_landmark: Whether to enable landmark detection and recognition for this run
            places365_info: Optional Places365 scene classification results

        Returns:
            Dictionary with scene analysis results
        """
        try:
            return self.scene_analysis_coordinator.analyze(
                detection_result=detection_result,
                lighting_info=lighting_info,
                class_confidence_threshold=class_confidence_threshold,
                scene_confidence_threshold=scene_confidence_threshold,
                enable_landmark=enable_landmark,
                places365_info=places365_info
            )
        except Exception as e:
            self.logger.exception(f"Error in scene analysis: {e}")
            # Return a safe fallback result
            return {
                "scene_type": "unknown",
                "confidence": 0.0,
                "description": "Scene analysis failed due to an internal error.",
                "enhanced_description": "An error occurred during scene analysis. Please check the system logs for details.",
                "objects_present": [],
                "object_count": 0,
                "regions": {},
                "possible_activities": [],
                "safety_concerns": [],
                "lighting_conditions": lighting_info or {"time_of_day": "unknown", "confidence": 0.0}
            }

    @staticmethod
    def _normalize_functional_zones(functional_zones: Any) -> List[Any]:
        """Flatten functional zone input into a list of human-readable entries."""
        if not functional_zones:
            return []

        if isinstance(functional_zones, dict):
            meaningful_keys = ('dining_zone', 'seating_zone', 'work_zone')
            descriptions = []
            for key, zone in functional_zones.items():
                # Keys ending in "_zone" are treated as technical names and
                # dropped, except for the few listed as meaningful.
                is_technical = (isinstance(key, str) and key.endswith('_zone')
                                and key not in meaningful_keys)
                if is_technical:
                    continue
                # Only zones that carry a non-empty description are kept.
                if isinstance(zone, dict) and zone.get('description'):
                    descriptions.append(zone['description'])
            return descriptions

        if isinstance(functional_zones, list):
            # Non-string entries are skipped rather than allowed to raise,
            # which would otherwise discard the entire description.
            return [zone for zone in functional_zones
                    if isinstance(zone, str)
                    and (not zone.endswith('_zone') or 'area' in zone)]

        # Any other container type is ignored.
        return []

    @staticmethod
    def _collect_object_statistics(detected_objects: List[Dict]) -> Dict[str, Dict[str, Any]]:
        """Group detections by class name with count, average and maximum confidence."""
        per_class: Dict[str, Dict[str, Any]] = {}
        for obj in detected_objects:
            stats = per_class.setdefault(obj.get("class_name", "unknown"), {
                "count": 0,
                "avg_confidence": 0.0,
                "max_confidence": 0.0,
                "instances": []
            })
            stats["count"] += 1
            stats["instances"].append(obj)
            stats["max_confidence"] = max(stats["max_confidence"], obj.get("confidence", 0.0))

        # Every entry has at least one instance, so the division is safe.
        for stats in per_class.values():
            total_conf = sum(inst.get("confidence", 0.0) for inst in stats["instances"])
            stats["avg_confidence"] = total_conf / stats["count"]
        return per_class

    def generate_scene_description(self, scene_type: str, detected_objects: List[Dict],
                                   confidence: float, lighting_info: Optional[Dict] = None,
                                   functional_zones: Optional[Dict] = None,
                                   enable_landmark: bool = True,
                                   scene_scores: Optional[Dict] = None,
                                   spatial_analysis: Optional[Dict] = None,
                                   image_dimensions: Optional[Tuple[int, int]] = None) -> str:
        """
        Generate scene description and pass all necessary context to the underlying describer.

        Args:
            scene_type: Identified scene type
            detected_objects: List of detected objects
            confidence: Scene classification confidence
            lighting_info: Lighting condition information (optional)
            functional_zones: Functional zone information (optional). A dict
                mapping zone keys to zone dicts, or a list of zone names;
                both are converted to a flat list before being passed on.
            enable_landmark: Whether to enable landmark description (optional)
            scene_scores: Scene scores (optional)
            spatial_analysis: Spatial analysis results (optional)
            image_dimensions: Image dimensions (width, height) (optional)

        Returns:
            str: Generated scene description. When no scene describer is
            available a short generic sentence is returned; when any step
            raises, the error is logged and "A <scene_type> scene." is returned.
        """
        try:
            functional_zones_list = self._normalize_functional_zones(functional_zones)
            object_statistics = self._collect_object_statistics(detected_objects)

            if self.scene_describer:
                return self.scene_describer.generate_description(
                    scene_type=scene_type,
                    detected_objects=detected_objects,
                    confidence=confidence,
                    lighting_info=lighting_info,
                    functional_zones=functional_zones_list,
                    enable_landmark=enable_landmark,
                    scene_scores=scene_scores,
                    spatial_analysis=spatial_analysis,
                    image_dimensions=image_dimensions,
                    object_statistics=object_statistics
                )
            return f"A {scene_type} scene with {len(detected_objects)} detected objects."
        except Exception as e:
            self.logger.exception(f"Error generating scene description: {e}")
            return f"A {scene_type} scene."

    def process_unknown_objects(self, detection_result, detected_objects):
        """
        Process objects that YOLO failed to identify or have low confidence for landmark detection.

        Delegates to the landmark processing manager, passing along this
        analyzer's CLIP analyzer.

        Args:
            detection_result: YOLO detection results
            detected_objects: List of identified objects

        Returns:
            tuple: (updated object list, landmark object list). On error the
            original ``detected_objects`` and an empty list are returned.
        """
        try:
            return self.landmark_processing_manager.process_unknown_objects(
                detection_result, detected_objects, self.clip_analyzer
            )
        except Exception as e:
            self.logger.exception(f"Error processing unknown objects: {e}")
            return detected_objects, []

    def _compute_scene_scores(self, detected_objects: List[Dict],
                              spatial_analysis_results: Optional[Dict] = None) -> Dict[str, float]:
        """
        Compute confidence scores for each scene type based on detected objects.

        Thin wrapper around the scene scoring engine.

        Args:
            detected_objects: List of detected objects with their details
            spatial_analysis_results: Optional output from SpatialAnalyzer

        Returns:
            Dictionary mapping scene types to confidence scores
        """
        return self.scene_scoring_engine.compute_scene_scores(
            detected_objects, spatial_analysis_results
        )

    def _determine_scene_type(self, scene_scores: Dict[str, float]) -> Tuple[str, float]:
        """
        Determine the most likely scene type based on scores.

        Thin wrapper around the scene scoring engine.

        Args:
            scene_scores: Dictionary mapping scene types to confidence scores

        Returns:
            Tuple of (best_scene_type, confidence)
        """
        return self.scene_scoring_engine.determine_scene_type(scene_scores)

    def _fuse_scene_scores(self, yolo_scene_scores: Dict[str, float],
                           clip_scene_scores: Dict[str, float],
                           num_yolo_detections: int = 0,
                           avg_yolo_confidence: float = 0.0,
                           lighting_info: Optional[Dict] = None,
                           places365_info: Optional[Dict] = None) -> Dict[str, float]:
        """
        Fuse scene scores from YOLO-based object detection, CLIP-based analysis, and Places365.

        Thin wrapper around the scene scoring engine.

        Args:
            yolo_scene_scores: Scene scores based on YOLO object detection
            clip_scene_scores: Scene scores based on CLIP analysis
            num_yolo_detections: Total number of non-landmark objects detected by YOLO
            avg_yolo_confidence: Average confidence of non-landmark objects detected by YOLO
            lighting_info: Optional lighting condition analysis results
            places365_info: Optional Places365 scene classification results

        Returns:
            Dict: Fused scene scores incorporating all analysis sources
        """
        return self.scene_scoring_engine.fuse_scene_scores(
            yolo_scene_scores, clip_scene_scores, num_yolo_detections,
            avg_yolo_confidence, lighting_info, places365_info
        )

    def _get_alternative_scene_type(self, landmark_scene_type, detected_objects, scene_scores):
        """
        Select appropriate alternative type for landmark scene types.

        Thin wrapper around the landmark processing manager.

        Args:
            landmark_scene_type: Original landmark scene type
            detected_objects: List of detected objects
            scene_scores: All scene type scores

        Returns:
            str: Appropriate alternative scene type
        """
        return self.landmark_processing_manager.get_alternative_scene_type(
            landmark_scene_type, detected_objects, scene_scores
        )

    def _remove_landmark_references(self, text):
        """
        Remove all landmark references from text.

        Thin wrapper around the landmark processing manager.

        Args:
            text: Input text

        Returns:
            str: Text with landmark references removed
        """
        return self.landmark_processing_manager.remove_landmark_references(text)

    def _define_image_regions(self):
        """
        Define regions of the image for spatial analysis (3x3 grid).

        Stores the grid on ``self.regions``. Each value is a 4-tuple of
        fractions of the image size; the ordering looks like
        (left, top, right, bottom) - TODO confirm against the consumer.
        This method is not called from within this class.
        """
        self.regions = {
            "top_left": (0, 0, 1/3, 1/3),
            "top_center": (1/3, 0, 2/3, 1/3),
            "top_right": (2/3, 0, 1, 1/3),
            "middle_left": (0, 1/3, 1/3, 2/3),
            "middle_center": (1/3, 1/3, 2/3, 2/3),
            "middle_right": (2/3, 1/3, 1, 2/3),
            "bottom_left": (0, 2/3, 1/3, 1),
            "bottom_center": (1/3, 2/3, 2/3, 1),
            "bottom_right": (2/3, 2/3, 1, 1)
        }

    def get_component_status(self) -> Dict[str, bool]:
        """
        Get the initialization status of all components.

        Returns:
            Dictionary mapping component names to their initialization status
        """
        return self.component_initializer.get_initialization_summary()

    def is_component_available(self, component_name: str) -> bool:
        """
        Check if a specific component is available and properly initialized.

        Args:
            component_name: Name of the component to check

        Returns:
            bool: Whether the component is available
        """
        return self.component_initializer.is_component_available(component_name)

    def update_landmark_enable_status(self, enable_landmark: bool) -> None:
        """
        Update the landmark detection enable status across all components.

        Args:
            enable_landmark: Whether to enable landmark detection
        """
        self.enable_landmark = enable_landmark
        self.use_landmark_detection = enable_landmark

        # Update all related components
        self.component_initializer.update_landmark_enable_status(enable_landmark)
        self.scene_scoring_engine.update_enable_landmark_status(enable_landmark)
        self.landmark_processing_manager.update_enable_landmark_status(enable_landmark)

        # Only touch the coordinator's flag if it actually exposes one
        if hasattr(self.scene_analysis_coordinator, 'enable_landmark'):
            self.scene_analysis_coordinator.enable_landmark = enable_landmark