diff --git "a/enhance_scene_describer.py" "b/enhance_scene_describer.py" deleted file mode 100644--- "a/enhance_scene_describer.py" +++ /dev/null @@ -1,2350 +0,0 @@ -import os -import re -import json -import logging -import random -import numpy as np -from typing import Dict, List, Tuple, Any, Optional - -from scene_type import SCENE_TYPES -from scene_detail_templates import SCENE_DETAIL_TEMPLATES -from object_template_fillers import OBJECT_TEMPLATE_FILLERS -from lighting_conditions import LIGHTING_CONDITIONS -from viewpoint_templates import VIEWPOINT_TEMPLATES -from cultural_templates import CULTURAL_TEMPLATES -from confidence_templates import CONFIDENCE_TEMPLATES -from landmark_data import ALL_LANDMARKS - -class EnhancedSceneDescriber: - """ - Enhanced scene description generator with improved template handling, - viewpoint awareness, and cultural context recognition. - Provides detailed natural language descriptions of scenes based on - detection results and scene classification. - """ - - def __init__(self, templates_db: Optional[Dict] = None, scene_types: Optional[Dict] = None, spatial_analyzer_instance: Optional[Any] = None): - """ - Initialize the enhanced scene describer. 
- - Args: - templates_db: Optional custom templates database - scene_types: Dictionary of scene type definitions - """ - self.logger = logging.getLogger(self.__class__.__name__) # Use class name for logger - self.logger.setLevel(logging.INFO) # Or your desired logging level - # Optional: Add a handler if not configured globally - if not self.logger.hasHandlers(): - handler = logging.StreamHandler() - formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') - handler.setFormatter(formatter) - self.logger.addHandler(handler) - - # Load or use provided scene types - self.scene_types = scene_types or self._load_default_scene_types() - - # Load templates database - self.templates = templates_db or self._load_templates() - - # Initialize viewpoint detection parameters - self._initialize_viewpoint_parameters() - - def _load_default_scene_types(self) -> Dict: - """ - Load default scene types. - - Returns: - Dict: Scene type definitions - """ - - return SCENE_TYPES - - def _load_templates(self) -> Dict: - """ - Load description templates from imported Python modules. - - Returns: - Dict: Template collections for different description components - """ - templates = {} - - # 載入事先準備的模板 - templates["scene_detail_templates"] = SCENE_DETAIL_TEMPLATES - templates["object_template_fillers"] = OBJECT_TEMPLATE_FILLERS - templates["viewpoint_templates"] = VIEWPOINT_TEMPLATES - templates["cultural_templates"] = CULTURAL_TEMPLATES - - # 從 LIGHTING_CONDITIONS 獲取照明模板 - templates["lighting_templates"] = { - key: data["general"] for key, data in LIGHTING_CONDITIONS.get("time_descriptions", {}).items() - } - - # 設置默認的置信度模板 - templates["confidence_templates"] = { - "high": "{description} {details}", - "medium": "This appears to be {description} {details}", - "low": "This might be {description}, but the confidence is low. 
{details}" - } - - # 初始化其他必要的模板(現在這個函數簡化了很多) - self._initialize_default_templates(templates) - - return templates - - def _initialize_default_templates(self, templates: Dict): - """ - 檢查模板字典並填充任何缺失的默認模板。 - - 在將模板移至專門的模組後,此方法主要作為安全機制, - 確保即使導入失敗或某些模板未在外部定義,系統仍能正常運行。 - - Args: - templates: 要檢查和更新的模板字典 - """ - # 檢查關鍵模板類型是否存在,如果不存在則添加默認值 - - # 置信度模板 - 用於控制描述的語氣 - if "confidence_templates" not in templates: - templates["confidence_templates"] = { - "high": "{description} {details}", - "medium": "This appears to be {description} {details}", - "low": "This might be {description}, but the confidence is low. {details}" - } - - # 場景細節模板 - if "scene_detail_templates" not in templates: - templates["scene_detail_templates"] = { - "default": ["A space with various objects."] - } - - # 物體填充模板,用於生成物體描述 - if "object_template_fillers" not in templates: - templates["object_template_fillers"] = { - "default": ["various items"] - } - - # 視角模板,雖然現在從專門模組導入,但可作為備份 - if "viewpoint_templates" not in templates: - # 使用簡化版的默認視角模板 - templates["viewpoint_templates"] = { - "eye_level": { - "prefix": "From eye level, ", - "observation": "the scene is viewed straight on." - }, - "aerial": { - "prefix": "From above, ", - "observation": "the scene is viewed from a bird's-eye perspective." - } - } - - # 文化模板 - if "cultural_templates" not in templates: - templates["cultural_templates"] = { - "asian": { - "elements": ["cultural elements"], - "description": "The scene has Asian characteristics." - }, - "european": { - "elements": ["architectural features"], - "description": "The scene has European characteristics." - } - } - - # 照明模板 - 用於描述光照條件 - if "lighting_templates" not in templates: - templates["lighting_templates"] = { - "day_clear": "The scene is captured during daylight.", - "night": "The scene is captured at night.", - "unknown": "The lighting conditions are not easily determined." - } - - - def _initialize_viewpoint_parameters(self): - """ - Initialize parameters used for viewpoint detection. 
- """ - self.viewpoint_params = { - # Parameters for detecting aerial views - "aerial_threshold": 0.7, # High object density viewed from top - "aerial_size_variance_threshold": 0.15, # Low size variance in aerial views - - # Parameters for detecting low angle views - "low_angle_threshold": 0.3, # Bottom-heavy object distribution - "vertical_size_ratio_threshold": 1.8, # Vertical objects appear taller - - # Parameters for detecting elevated views - "elevated_threshold": 0.6, # Objects mostly in middle/bottom - "elevated_top_threshold": 0.3 # Few objects at top of frame - } - - def _generate_landmark_description(self, - scene_type: str, - detected_objects: List[Dict], - confidence: float, - lighting_info: Optional[Dict] = None, - functional_zones: Optional[Dict] = None, - landmark_objects: Optional[List[Dict]] = None) -> str: - """ - 生成包含地標信息的場景描述 - - Args: - scene_type: 識別的場景類型 - detected_objects: 檢測到的物體列表 - confidence: 場景分類置信度 - lighting_info: 照明條件信息(可選) - functional_zones: 功能區域信息(可選) - landmark_objects: 識別為地標的物體列表(可選) - - Returns: - str: 包含地標信息的自然語言場景描述 - """ - # 如果沒有提供地標物體,則從檢測物體中篩選 - if landmark_objects is None: - landmark_objects = [obj for obj in detected_objects if obj.get("is_landmark", False)] - - # 如果沒有地標,退回到標準描述 - if not landmark_objects: - if scene_type in ["tourist_landmark", "natural_landmark", "historical_monument"]: - # 場景類型是地標但沒有具體地標物體 - base_description = "A scenic area that appears to be a tourist destination, though specific landmarks are not clearly identifiable." 
- else: - # 使用標準方法生成基本描述 - return self._format_final_description(self._generate_scene_details( - scene_type, - detected_objects, - lighting_info, - self._detect_viewpoint(detected_objects) - )) - else: - # 獲取主要地標(信心度最高的) - primary_landmark = max(landmark_objects, key=lambda x: x.get("confidence", 0)) - landmark_name = primary_landmark.get("class_name", "landmark") - landmark_location = primary_landmark.get("location", "") - - # 根據地標類型選擇適當的描述模板 - if scene_type == "natural_landmark" or primary_landmark.get("landmark_type") == "natural": - base_description = f"A natural landmark scene featuring {landmark_name} in {landmark_location}." - elif scene_type == "historical_monument" or primary_landmark.get("landmark_type") == "monument": - base_description = f"A historical monument scene showcasing {landmark_name}, a significant landmark in {landmark_location}." - else: - base_description = f"A tourist landmark scene centered around {landmark_name}, an iconic structure in {landmark_location}." - - # 加地標的額外信息 - landmark_details = [] - for landmark in landmark_objects: - details = [] - - # 加建造年份 - if "year_built" in landmark: - details.append(f"built in {landmark['year_built']}") - - # 加建築風格 - if "architectural_style" in landmark: - details.append(f"featuring {landmark['architectural_style']} architectural style") - - # 加重要性 - if "significance" in landmark: - details.append(landmark["significance"]) - - # 如果有詳細信息,加到描述中 - if details: - landmark_details.append(f"{landmark['class_name']} ({', '.join(details)})") - - # 將詳細信息添加到基本描述中 - if landmark_details: - description = base_description + " " + "The scene features " + ", ".join(landmark_details) + "." 
- else: - description = base_description - - # 獲取視角 - viewpoint = self._detect_viewpoint(detected_objects) - - # 生成人員活動描述 - people_count = len([obj for obj in detected_objects if obj["class_id"] == 0]) # 人的類別ID通常為0 - - if people_count > 0: - if people_count == 1: - people_description = "There is one person in the scene, likely a tourist or visitor." - elif people_count < 5: - people_description = f"There are {people_count} people in the scene, possibly tourists visiting the landmark." - else: - people_description = f"The scene includes a group of {people_count} people, indicating this is a popular tourist destination." - - description = self._smart_append(description, people_description) - - # 添加照明信息 - if lighting_info and "time_of_day" in lighting_info: - lighting_type = lighting_info["time_of_day"] - if lighting_type in self.templates.get("lighting_templates", {}): - lighting_description = self.templates["lighting_templates"][lighting_type] - description = self._smart_append(description, lighting_description) - - # 添加視角描述 - if viewpoint != "eye_level" and viewpoint in self.templates.get("viewpoint_templates", {}): - viewpoint_template = self.templates["viewpoint_templates"][viewpoint] - - # 添加視角前綴 - prefix = viewpoint_template.get('prefix', '') - if prefix and not description.startswith(prefix): - # 保持句子流暢性 - if description and description[0].isupper(): - description = prefix + description[0].lower() + description[1:] - else: - description = prefix + description - - # 添加視角觀察描述 - viewpoint_desc = viewpoint_template.get("observation", "").format( - scene_elements="the landmark and surrounding area" - ) - - if viewpoint_desc and viewpoint_desc not in description: - description = self._smart_append(description, viewpoint_desc) - - # 添加功能區域描述 - if functional_zones and len(functional_zones) > 0: - zones_desc = self._describe_functional_zones(functional_zones) - if zones_desc: - description = self._smart_append(description, zones_desc) - - # 描述可能的活動 - 
def filter_landmark_references(self, text, enable_landmark=True):
    """
    Remove landmark names, landmark locations and landmark-style phrasing.

    When landmark support is enabled (or there is no text) the input is
    returned untouched. Otherwise every name, alias and location found in
    ALL_LANDMARKS is replaced with a neutral phrase, and a fixed list of
    generic "landmark" wordings is rewritten to urban-scene wording.

    Args:
        text: Description text to filter.
        enable_landmark: Whether landmark references may stay in the text.

    Returns:
        str: The filtered text (the original object when nothing is filtered).
    """
    if enable_landmark or not text:
        return text

    # Collect names/aliases and locations. Sets drop the duplicates that
    # shared cities or countries would otherwise cause.
    landmark_names = set()
    locations = set()
    for info in ALL_LANDMARKS.values():
        # .get() so an entry without "name" is skipped instead of raising.
        landmark_names.add(info.get("name"))
        landmark_names.update(info.get("aliases") or [])

        location = info.get("location")
        if location:
            locations.add(location)
            # Also register the first two comma-separated parts on their
            # own (presumably city and country - verify against the data).
            locations.update(part.strip() for part in location.split(",")[:2])

    def _longest_first(values):
        # Names of one or two characters are skipped, as before. Longer
        # strings go first so a short alias cannot be substituted inside a
        # longer name before that longer name has been matched.
        usable = [value for value in values if value and len(value) > 2]
        return sorted(usable, key=lambda value: (-len(value), value))

    for name in _longest_first(landmark_names):
        text = re.sub(r'\b' + re.escape(name) + r'\b', "tall structure", text, flags=re.IGNORECASE)

    for location in _longest_first(locations):
        escaped_location = re.escape(location)
        # Common phrasing first, then any remaining bare mention.
        text = re.sub(r'in ' + escaped_location, "in the urban area", text, flags=re.IGNORECASE)
        text = re.sub(r'of ' + escaped_location, "of the urban area", text, flags=re.IGNORECASE)
        text = re.sub(r'\b' + escaped_location + r'\b', "the urban area", text, flags=re.IGNORECASE)

    # Generic landmark wording -> neutral wording.
    # NOTE(review): re.IGNORECASE also makes [A-Z] match lowercase letters,
    # so the "capitalised name" patterns below match more broadly than they
    # read. Left unchanged because later rules rely on the current output.
    landmark_patterns = [
        (r'a (tourist|popular|famous) landmark', r'an urban structure'),
        (r'an iconic structure in ([A-Z][a-zA-Z\s,]+)', r'an urban structure in the area'),
        (r'a famous (monument|tower|landmark) in ([A-Z][a-zA-Z\s,]+)', r'an urban structure in the area'),
        (r'(centered|built|located|positioned) around the ([A-Z][a-zA-Z\s]+? (Tower|Monument|Landmark))', r'located in this area'),
        (r'(sightseeing|guided tours|cultural tourism) (at|around|near) (this landmark|the [A-Z][a-zA-Z\s]+)', r'\1 in this area'),
        (r'this (famous|iconic|historic|well-known) (landmark|monument|tower|structure)', r'this urban structure'),
        (r'([A-Z][a-zA-Z\s]+) Tower', r'tall structure'),
        (r'a (tower|structure) in ([A-Z][a-zA-Z\s,]+)', r'a \1 in the area'),
        (r'landmark scene', r'urban scene'),
        (r'tourist destination', r'urban area'),
        (r'tourist attraction', r'urban area')
    ]

    for pattern, replacement in landmark_patterns:
        text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)

    return text
- This version ensures that the main scene_details (from the first call) - is properly integrated and not overwritten by a simplified second call. - """ - # Handle unknown scene type or very low confidence as an early exit - if scene_type == "unknown" or confidence < 0.4: - # _generate_generic_description should also ideally use image_dimensions if it does spatial reasoning - generic_desc = self._generate_generic_description(detected_objects, lighting_info) - return self._format_final_description(generic_desc) - - # Filter out landmark objects if landmark detection is disabled for this run - current_detected_objects = detected_objects - if not enable_landmark: - current_detected_objects = [obj for obj in detected_objects if not obj.get("is_landmark", False)] - - # Log Places365 context if available - places365_context = "" - if places365_info and places365_info.get('confidence', 0) > 0.3: - scene_label = places365_info.get('scene_label', '') - attributes = places365_info.get('attributes', []) - is_indoor = places365_info.get('is_indoor', None) - - if scene_label: - places365_context = f"Scene context: {scene_label}" - if attributes: - places365_context += f" with characteristics: {', '.join(attributes[:3])}" - if is_indoor is not None: - indoor_outdoor = "indoor" if is_indoor else "outdoor" - places365_context += f" ({indoor_outdoor} environment)" - - print(f"Enhanced description incorporating Places365 context: {places365_context}") - - landmark_objects_in_scene = [obj for obj in current_detected_objects if obj.get("is_landmark", False)] - has_landmark_in_scene = len(landmark_objects_in_scene) > 0 - - # If landmark processing is enabled and it's a landmark scene or landmarks are detected - if enable_landmark and (scene_type in ["tourist_landmark", "natural_landmark", "historical_monument"] or has_landmark_in_scene): - landmark_desc = self._generate_landmark_description( - scene_type, - current_detected_objects, # Pass potentially filtered list - confidence, - 
lighting_info, - functional_zones, - landmark_objects_in_scene # Pass the explicitly filtered landmark objects - ) - return self._format_final_description(landmark_desc) - - # **[Start of main description construction for non-landmark or landmark-disabled everyday scenes]** - - # Detect viewpoint based on current (potentially filtered) objects - viewpoint = self._detect_viewpoint(current_detected_objects) - current_scene_type = scene_type # Use a mutable variable for scene_type if it can change - - # Process aerial viewpoint scene types (may re-assign current_scene_type) - if viewpoint == "aerial": - if "intersection" in current_scene_type.lower() or self._is_intersection(current_detected_objects): # Use lower for robustness - current_scene_type = "aerial_view_intersection" - elif any(keyword in current_scene_type.lower() for keyword in ["commercial", "shopping", "retail"]): - current_scene_type = "aerial_view_commercial_area" - elif any(keyword in current_scene_type.lower() for keyword in ["plaza", "square"]): - current_scene_type = "aerial_view_plaza" - else: # Default aerial if specific not matched - current_scene_type = "aerial_view_general" # Or use a specific default like aerial_view_intersection - - # Detect cultural context (only for non-aerial viewpoints) - cultural_context = None - if viewpoint != "aerial": - cultural_context = self._detect_cultural_context(current_scene_type, current_detected_objects) - - # Get base description for the (potentially updated) scene type - base_description = "A scene" # Default initialization - if viewpoint == "aerial": - # Check if current_scene_type (which might be an aerial type) has a base description - if current_scene_type in self.scene_types: - base_description = self.scene_types[current_scene_type].get("description", "An aerial view showing the layout and movement patterns from above") - else: - base_description = "An aerial view showing the layout and movement patterns from above" - elif current_scene_type in 
self.scene_types: - base_description = self.scene_types[current_scene_type].get("description", "A scene") - - # spatial analysis, and image dimensions. This is where dynamic description or template filling happens. - core_scene_details = self._generate_scene_details( - current_scene_type, # Use the potentially updated scene_type - current_detected_objects, - lighting_info, - viewpoint, - spatial_analysis=spatial_analysis, # Pass this through - image_dimensions=image_dimensions, # Pass this through - places365_info=places365_info, # Pass Places365 info - object_statistics=object_statistics # Pass object statistics - ) - - # Start with the base description derived from SCENE_TYPES or a default. - description = base_description - if core_scene_details and core_scene_details.strip() != "": # Ensure core_scene_details is not empty - # If base_description is generic like "A scene", consider replacing it or appending smartly. - if base_description.lower() == "a scene" and len(core_scene_details) > len(base_description): - description = core_scene_details # Prioritize dynamic/template-filled details if base is too generic - else: - description = self._smart_append(description, core_scene_details) - elif not core_scene_details and not description: # If both are empty, use a generic fallback - description = self._generate_generic_description(current_detected_objects, lighting_info) - - - # Append secondary description from scene type template, if any - if current_scene_type in self.scene_types and "secondary_description" in self.scene_types[current_scene_type]: - secondary_desc = self.scene_types[current_scene_type]["secondary_description"] - if secondary_desc: - description = self._smart_append(description, secondary_desc) - - # Append people count information - people_objs = [obj for obj in current_detected_objects if obj.get("class_id") == 0] - if people_objs: - people_count = len(people_objs) - - if people_count == 1: people_phrase = "a single person" - elif people_count 
> 1 and people_count <= 3: people_phrase = f"{people_count} people" # Accurate for small counts - elif people_count > 3 and people_count <=7: people_phrase = "several people" - else: people_phrase = "multiple people" # For larger counts, or use "numerous" - - # Only add if not already well covered in core_scene_details or base_description - if "person" not in description.lower() and "people" not in description.lower() and "pedestrian" not in description.lower(): - description = self._smart_append(description, f"The scene includes {people_phrase}.") - - # Append cultural context - if cultural_context and viewpoint != "aerial": # Already checked viewpoint - cultural_elements = self._generate_cultural_elements(cultural_context) - if cultural_elements: - description = self._smart_append(description, cultural_elements) - - # Append lighting information - lighting_description_text = "" - if lighting_info and "time_of_day" in lighting_info: - lighting_type = lighting_info["time_of_day"] - lighting_desc_template = self.templates.get("lighting_templates", {}).get(lighting_type) - if lighting_desc_template: - lighting_description_text = lighting_desc_template - if lighting_description_text and lighting_description_text.lower() not in description.lower(): - description = self._smart_append(description, lighting_description_text) - - # Append viewpoint information (if not eye-level) - if viewpoint != "eye_level" and viewpoint in self.templates.get("viewpoint_templates", {}): - viewpoint_template = self.templates["viewpoint_templates"][viewpoint] - prefix = viewpoint_template.get('prefix', '') - observation_template = viewpoint_template.get("observation", "") - - # Determine scene_elements for the observation template - scene_elements_for_vp = "the overall layout and objects" # Generic default - if viewpoint == "aerial": - scene_elements_for_vp = "crossing patterns and general layout" - - viewpoint_observation_text = 
observation_template.format(scene_elements=scene_elements_for_vp) - - # Combine prefix and observation carefully - full_viewpoint_text = "" - if prefix: - full_viewpoint_text = prefix.strip() + " " - if viewpoint_observation_text and viewpoint_observation_text[0].islower(): - full_viewpoint_text += viewpoint_observation_text - elif viewpoint_observation_text: - full_viewpoint_text = prefix + viewpoint_observation_text[0].lower() + viewpoint_observation_text[1:] if description else prefix + viewpoint_observation_text - - elif viewpoint_observation_text: # No prefix, but observation exists - full_viewpoint_text = viewpoint_observation_text[0].upper() + viewpoint_observation_text[1:] - - - if full_viewpoint_text and full_viewpoint_text.lower() not in description.lower(): - description = self._smart_append(description, full_viewpoint_text) - - - # Append functional zones information - if functional_zones and len(functional_zones) > 0: - zones_desc_text = self._describe_functional_zones(functional_zones) - if zones_desc_text: - description = self._smart_append(description, zones_desc_text) - - final_formatted_description = self._format_final_description(description) - - if not enable_landmark: - final_formatted_description = self.filter_landmark_references(final_formatted_description, enable_landmark=False) - - # If after all processing, description is empty, fallback to a very generic one. - if not final_formatted_description.strip() or final_formatted_description.strip() == ".": - self.logger.warning(f"Description for scene_type '{current_scene_type}' became empty after processing. 
Falling back.") - final_formatted_description = self._format_final_description( - self._generate_generic_description(current_detected_objects, lighting_info) - ) - - return final_formatted_description - - - def _smart_append(self, current_text: str, new_fragment: str) -> str: - """ - Intelligently append a new text fragment to the current text, - handling punctuation and capitalization correctly. - - Args: - current_text: The existing text to append to - new_fragment: The new text fragment to append - - Returns: - str: The combined text with proper formatting - """ - # Handle empty cases - if not new_fragment: - return current_text - - if not current_text: - # Ensure first character is uppercase for the first fragment - return new_fragment[0].upper() + new_fragment[1:] if new_fragment else "" - - # Clean up existing text - current_text = current_text.rstrip() - - # Check for ending punctuation - ends_with_sentence = current_text.endswith(('.', '!', '?')) - ends_with_comma = current_text.endswith(',') - - # Specifically handle the "A xxx A yyy" pattern that's causing issues - if (current_text.startswith("A ") or current_text.startswith("An ")) and \ - (new_fragment.startswith("A ") or new_fragment.startswith("An ")): - return current_text + ". 
" + new_fragment - - # 檢查新片段是否包含地標名稱(通常為專有名詞) - has_landmark_name = any(word[0].isupper() for word in new_fragment.split() - if len(word) > 2 and not word.startswith(("A ", "An ", "The "))) - - # Decide how to join the texts - if ends_with_sentence: - # After a sentence, start with uppercase and add proper spacing - joined_text = current_text + " " + (new_fragment[0].upper() + new_fragment[1:]) - elif ends_with_comma: - # After a comma, maintain flow with lowercase unless it's a proper noun or special case - if new_fragment.startswith(('I ', 'I\'', 'A ', 'An ', 'The ')) or new_fragment[0].isupper() or has_landmark_name: - joined_text = current_text + " " + new_fragment - else: - joined_text = current_text + " " + new_fragment[0].lower() + new_fragment[1:] - elif "scene is" in new_fragment.lower() or "scene includes" in new_fragment.lower(): - # When adding a new sentence about the scene, use a period - joined_text = current_text + ". " + new_fragment - else: - # For other cases, decide based on the content - if self._is_related_phrases(current_text, new_fragment): - if new_fragment.startswith(('I ', 'I\'', 'A ', 'An ', 'The ')) or new_fragment[0].isupper() or has_landmark_name: - joined_text = current_text + ", " + new_fragment - else: - joined_text = current_text + ", " + new_fragment[0].lower() + new_fragment[1:] - else: - # Use period for unrelated phrases - joined_text = current_text + ". " + (new_fragment[0].upper() + new_fragment[1:]) - - return joined_text - - def _is_related_phrases(self, text1: str, text2: str) -> bool: - """ - Determine if two phrases are related and should be connected with a comma - rather than separated with a period. 
- - Args: - text1: The first text fragment - text2: The second text fragment to be appended - - Returns: - bool: Whether the phrases appear to be related - """ - # Check if either phrase starts with "A" or "An" - these are likely separate descriptions - if (text1.startswith("A ") or text1.startswith("An ")) and \ - (text2.startswith("A ") or text2.startswith("An ")): - return False # These are separate descriptions, not related phrases - - # Check if the second phrase starts with a connecting word - connecting_words = ["which", "where", "who", "whom", "whose", "with", "without", - "this", "these", "that", "those", "and", "or", "but"] - - first_word = text2.split()[0].lower() if text2 else "" - if first_word in connecting_words: - return True - - # Check if the first phrase ends with something that suggests continuity - ending_patterns = ["such as", "including", "like", "especially", "particularly", - "for example", "for instance", "namely", "specifically"] - - for pattern in ending_patterns: - if text1.lower().endswith(pattern): - return True - - # Check if both phrases are about the scene - if "scene" in text1.lower() and "scene" in text2.lower(): - return False # Separate statements about the scene should be separate sentences - - return False - - - def _format_final_description(self, text: str) -> str: - """ - Format the final description text to ensure correct punctuation, - capitalization, and spacing. - """ - if not text or not text.strip(): # Also check if text is just whitespace - return "" - - # Trim leading/trailing whitespace first - text = text.strip() - - # 1. Handle consecutive "A/An" segments (potentially split them into sentences) - text = re.sub(r'(A\s+[^.!?]+?[\w\.])\s+(A\s+)', r'\1. \2', text, flags=re.IGNORECASE) - text = re.sub(r'(An\s+[^.!?]+?[\w\.])\s+(An?\s+)', r'\1. \2', text, flags=re.IGNORECASE) - - # 2. Ensure first character of the entire text is uppercase - if text: - text = text[0].upper() + text[1:] - - # 3. 
Normalize whitespace: multiple spaces to one - text = re.sub(r'\s{2,}', ' ', text) - - # 4. Capitalize after sentence-ending punctuation (. ! ?) - def capitalize_after_punctuation(match): - return match.group(1) + match.group(2).upper() - text = re.sub(r'([.!?]\s+)([a-z])', capitalize_after_punctuation, text) - - # 5. Handle capitalization after commas (your existing robust logic is good) - def fix_capitalization_after_comma(match): - leading_comma_space = match.group(1) # (,\s+) - word_after_comma = match.group(2) # ([A-Z][a-zA-Z]*) - - proper_nouns_exceptions = ["I", "I'm", "I've", "I'd", "I'll", - "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday", - "January", "February", "March", "April", "May", "June", "July", - "August", "September", "October", "November", "December"] - - if word_after_comma in proper_nouns_exceptions: - return match.group(0) - # If the word looks like a proper noun (e.g., multi-word capitalized, or a known location/brand) - # This heuristic can be tricky. For simplicity, if it's already capitalized and not a common word, keep it. - if len(word_after_comma) > 2 and word_after_comma[0].isupper() and word_after_comma.lower() not in ["this", "that", "these", "those", "they", "their", "then", "thus"]: - return match.group(0) # Keep it if it looks like a proper noun already - - return leading_comma_space + word_after_comma[0].lower() + word_after_comma[1:] - text = re.sub(r'(,\s+)([A-Z][a-zA-Z\'\-]+)', fix_capitalization_after_comma, text) # Added hyphen and apostrophe to word - - # 6. Correct spacing around punctuation - text = re.sub(r'\s*([.,;:!?])\s*', r'\1 ', text) # Ensures one space AFTER punctuation, none before - text = text.replace(' .', '.').replace(' ,', ',') # Clean up potential space before period/comma from previous rule - - # 7. 
Consolidate multiple sentence-ending punctuations (e.g., "!!", "?.", ".?") - text = re.sub(r'[.!?]{2,}', '.', text) # Convert multiple to a single period - text = re.sub(r',+', ',', text) # Multiple commas to one - - # 8. Ensure text ends with a single sentence-ending punctuation mark - text = text.strip() # Remove trailing whitespace before checking last char - if text and not text[-1] in '.!?': - text += '.' - - # 9. Remove any leading punctuation or extra spaces that might have been introduced - text = re.sub(r'^[.,;:!?\s]+', '', text) - - # 10. Final check for first letter capitalization - if text: - text = text[0].upper() + text[1:] - - # 11. Remove space before final punctuation mark if accidentally added by rule 7 - text = re.sub(r'\s+([.!?])$', r'\1', text) - - return text.strip() # Final strip - - def _is_intersection(self, detected_objects: List[Dict]) -> bool: - """ - 通過分析物體分佈來判斷場景是否為十字路口 - """ - # 檢查行人分佈模式 - pedestrians = [obj for obj in detected_objects if obj["class_id"] == 0] - - if len(pedestrians) >= 8: # 需要足夠的行人來形成十字路口 - # 抓取行人位置 - positions = [obj.get("normalized_center", (0, 0)) for obj in pedestrians] - - # 分析 x 和 y 坐標分佈 - x_coords = [pos[0] for pos in positions] - y_coords = [pos[1] for pos in positions] - - # 計算 x 和 y 坐標的變異數 - x_variance = np.var(x_coords) if len(x_coords) > 1 else 0 - y_variance = np.var(y_coords) if len(y_coords) > 1 else 0 - - # 計算範圍 - x_range = max(x_coords) - min(x_coords) - y_range = max(y_coords) - min(y_coords) - - # 如果 x 和 y 方向都有較大範圍且範圍相似,那就有可能是十字路口 - if x_range > 0.5 and y_range > 0.5 and 0.7 < (x_range / y_range) < 1.3: - return True - - return False - - def _generate_generic_description(self, detected_objects: List[Dict], lighting_info: Optional[Dict] = None) -> str: - """ - Generate a generic description when scene type is unknown or confidence is very low. 
- - Args: - detected_objects: List of detected objects - lighting_info: Optional lighting condition information - - Returns: - str: Generic description based on detected objects - """ - # Count object occurrences - obj_counts = {} - for obj in detected_objects: - class_name = obj["class_name"] - if class_name not in obj_counts: - obj_counts[class_name] = 0 - obj_counts[class_name] += 1 - - # Get top objects by count - top_objects = sorted(obj_counts.items(), key=lambda x: x[1], reverse=True)[:5] - - if not top_objects: - base_desc = "No clearly identifiable objects are visible in this scene." - else: - # Format object list - objects_text = [] - for name, count in top_objects: - if count > 1: - objects_text.append(f"{count} {name}s") - else: - objects_text.append(name) - - if len(objects_text) == 1: - objects_list = objects_text[0] - elif len(objects_text) == 2: - objects_list = f"{objects_text[0]} and {objects_text[1]}" - else: - objects_list = ", ".join(objects_text[:-1]) + f", and {objects_text[-1]}" - - base_desc = f"This scene contains {objects_list}." - - # Add lighting information if available - if lighting_info and "time_of_day" in lighting_info: - lighting_type = lighting_info["time_of_day"] - if lighting_type in self.templates.get("lighting_templates", {}): - lighting_desc = self.templates["lighting_templates"][lighting_type] - base_desc += f" {lighting_desc}" - - return base_desc - - def _get_prominent_objects(self, detected_objects: List[Dict], min_prominence_score: float = 0.1, max_categories_to_return: int = 5, max_total_objects: int = 7) -> List[Dict]: - """ - Helper function to get the most prominent objects. - Prioritizes high-confidence, large objects, and ensures a diversity of object types. - - Args: - detected_objects: List of detected objects. - min_prominence_score: Minimum score for an object to be considered initially. - max_categories_to_return: Max number of different object categories to prioritize. 
- max_total_objects: Overall cap on the number of prominent objects returned. - - Returns: - List of prominent detected objects. - """ - if not detected_objects: - return [] - - scored_objects = [] - for obj in detected_objects: - area = obj.get("normalized_area", 0.0) + 1e-6 - confidence = obj.get("confidence", 0.0) - - # Base score: area and confidence are key - score = (area * 0.65) + (confidence * 0.35) # Slightly more weight to area - - # Bonus for generally important object classes (in a generic way) - # This is a simple heuristic. More advanced would be context-dependent. - # For example, 'person' is often more salient. - # Avoid hardcoding specific class_ids here if possible, or use broad categories if available. - # For simplicity, we'll keep the landmark bonus for now. - if obj.get("class_name") == "person": # Example: person is generally prominent - score += 0.1 - if obj.get("is_landmark"): # Landmarks are always prominent - score += 0.5 - - if score >= min_prominence_score: - scored_objects.append((obj, score)) - - if not scored_objects: - return [] - - # Sort by score in descending order - scored_objects.sort(key=lambda x: x[1], reverse=True) - - # Prioritize diversity of object categories first - prominent_by_category = {} - final_prominent_objects = [] - - for obj, score in scored_objects: - category = obj.get("class_name", "unknown") - if category not in prominent_by_category: - if len(prominent_by_category) < max_categories_to_return: - prominent_by_category[category] = obj - final_prominent_objects.append(obj) - - elif len(final_prominent_objects) < max_total_objects and obj not in final_prominent_objects: - if score > 0.3: - final_prominent_objects.append(obj) - - # If still under max_total_objects, fill with highest scored remaining objects regardless of category - if len(final_prominent_objects) < max_total_objects: - for obj, score in scored_objects: - if len(final_prominent_objects) >= max_total_objects: - break - if obj not in 
final_prominent_objects: - final_prominent_objects.append(obj) - - # Re-sort the final list by original prominence score to maintain order - final_prominent_objects_with_scores = [] - for obj in final_prominent_objects: - for original_obj, original_score in scored_objects: - if obj is original_obj: # Check for object identity - final_prominent_objects_with_scores.append((obj, original_score)) - break - - final_prominent_objects_with_scores.sort(key=lambda x: x[1], reverse=True) - - return [obj for obj, score in final_prominent_objects_with_scores[:max_total_objects]] - - - def _format_object_list_for_description(self, - objects: List[Dict], - use_indefinite_article_for_one: bool = False, - count_threshold_for_generalization: int = -1, # Default to -1 for precise counts - max_types_to_list: int = 5 - ) -> str: - """ - Formats a list of detected objects into a human-readable string with counts. - Args: - objects: List of object dictionaries, each expected to have 'class_name'. - use_indefinite_article_for_one: If True, uses "a/an" for single items. If False, uses "one". - count_threshold_for_generalization: If count exceeds this, use general terms. -1 means precise counts. - max_types_to_list: Maximum number of different object types to include in the list. 
- """ - if not objects: - return "no specific objects clearly identified" - - counts: Dict[str, int] = {} - for obj in objects: - name = obj.get("class_name", "unknown object") - if name == "unknown object" or not name: # Skip unknown or empty names - continue - counts[name] = counts.get(name, 0) + 1 - - if not counts: - return "no specific objects clearly identified" - - descriptions = [] - # Sort by count (desc) then name (asc) for consistent output order - # Limit the number of distinct object types being listed - sorted_counts = sorted(counts.items(), key=lambda item: (-item[1], item[0]))[:max_types_to_list] - - - for name, count in sorted_counts: - if count == 1: - if use_indefinite_article_for_one: - if name[0].lower() in 'aeiou': - descriptions.append(f"an {name}") - else: - descriptions.append(f"a {name}") - else: - descriptions.append(f"one {name}") # Output "one car" instead of "a car" - else: # count > 1 - plural_name = name - if name.endswith("y") and not name.lower().endswith(("ay", "ey", "iy", "oy", "uy")): - plural_name = name[:-1] + "ies" - elif name.endswith(("s", "sh", "ch", "x", "z")): - plural_name = name + "es" - elif not name.endswith("s"): # Avoid double 's' like "buss" - plural_name = name + "s" - - if count_threshold_for_generalization != -1 and count > count_threshold_for_generalization: - if count <= count_threshold_for_generalization + 3: - descriptions.append(f"several {plural_name}") - else: - descriptions.append(f"many {plural_name}") - else: # Use exact count (e.g., "6 cars") - descriptions.append(f"{count} {plural_name}") - - if not descriptions: - return "no specific objects clearly identified" - - if len(descriptions) == 1: - return descriptions[0] - elif len(descriptions) == 2: - return f"{descriptions[0]} and {descriptions[1]}" - else: - # Oxford comma for lists of 3 or more. 
- return ", ".join(descriptions[:-1]) + f", and {descriptions[-1]}" - - def _get_spatial_description(self, obj: Dict, image_width: Optional[int] = None, image_height: Optional[int] = None) -> str: - """ - Generates a brief spatial description for an object. - (This is a new helper function) - """ - region = obj.get("region") - if region: - # Convert region name to more descriptive terms - region_map = { - "top_left": "in the top-left", "top_center": "at the top-center", "top_right": "in the top-right", - "middle_left": "on the middle-left side", "middle_center": "in the center", "middle_right": "on the middle-right side", - "bottom_left": "in the bottom-left", "bottom_center": "at the bottom-center", "bottom_right": "in the bottom-right" - } - # More general terms if exact region is not critical - if "top" in region: general_v_pos = "towards the top" - elif "bottom" in region: general_v_pos = "towards the bottom" - else: general_v_pos = "in the middle vertically" - - if "left" in region: general_h_pos = "towards the left" - elif "right" in region: general_h_pos = "towards the right" - else: general_h_pos = "in the center horizontally" - - # Prioritize specific region if available, else use general - specific_desc = region_map.get(region, "") - if specific_desc: - return f"{specific_desc} of the frame" - else: - return f"{general_v_pos} and {general_h_pos} of the frame" - - # Fallback if region info is not detailed enough or missing - # We can use normalized_center if available - norm_center = obj.get("normalized_center") - if norm_center and image_width and image_height: # Check if image_width/height are provided - x_norm, y_norm = norm_center - h_pos = "left" if x_norm < 0.4 else "right" if x_norm > 0.6 else "center" - v_pos = "top" if y_norm < 0.4 else "bottom" if y_norm > 0.6 else "middle" - - if h_pos == "center" and v_pos == "middle": - return "near the center of the image" - return f"in the {v_pos}-{h_pos} area of the image" - - return "in the scene" # 
Generic fallback - - - def _generate_dynamic_everyday_description(self, - detected_objects: List[Dict], - lighting_info: Optional[Dict] = None, - viewpoint: str = "eye_level", - spatial_analysis: Optional[Dict] = None, - image_dimensions: Optional[Tuple[int, int]] = None, - places365_info: Optional[Dict] = None, - object_statistics: Optional[Dict] = None - ) -> str: - """ - Dynamically generates a description for everyday scenes based on ALL relevant detected_objects, - their counts, and context. - It aims to describe the overall scene first, then details of object groups including accurate counts. - """ - description_segments = [] - image_width, image_height = image_dimensions if image_dimensions else (None, None) - - if hasattr(self, 'logger'): - self.logger.info(f"DynamicDesc: Start. Total Raw Objects: {len(detected_objects)}, View: {viewpoint}, Light: {lighting_info is not None}") - - # 1. Overall Ambiance (Lighting and Viewpoint) - ambiance_parts = [] - if lighting_info: - time_of_day = lighting_info.get("time_of_day", "unknown lighting") - is_indoor = lighting_info.get("is_indoor") - ambiance_statement = "This is" - if is_indoor is True: ambiance_statement += " an indoor scene" - elif is_indoor is False: ambiance_statement += " an outdoor scene" - else: ambiance_statement += " a scene" - lighting_map = self.templates.get("lighting_templates", {}) - readable_lighting_base = lighting_map.get(time_of_day, f"with {time_of_day.replace('_', ' ')} lighting conditions") - readable_lighting = readable_lighting_base.lower().replace("the scene is captured", "").replace("the scene has", "").strip() - ambiance_statement += f", likely {readable_lighting}." 
- ambiance_parts.append(ambiance_statement) - - if viewpoint and viewpoint != "eye_level": - vp_templates = self.templates.get("viewpoint_templates", {}) - if viewpoint in vp_templates: - vp_prefix = vp_templates[viewpoint].get("prefix", "").strip() - if vp_prefix: - if not ambiance_parts: - ambiance_parts.append(f"{vp_prefix.capitalize()} the general layout of the scene is observed.") - else: - ambiance_parts[-1] = ambiance_parts[-1].rstrip('.') + f", viewed {vp_templates[viewpoint].get('short_desc', viewpoint)}." - - if ambiance_parts: - description_segments.append(" ".join(ambiance_parts)) - - # 2. Describe ALL detected objects, grouped by class, with accurate counts and locations - if not detected_objects: - # This part remains, but the conditions to reach here might change based on confident_objects check - if not description_segments: - description_segments.append("A general scene is visible, but no specific objects were clearly identified.") - else: - description_segments.append("Within this setting, no specific objects were clearly identified.") - else: - objects_by_class: Dict[str, List[Dict]] = {} - - # keeping 0.25 as a placeholder - confidence_filter_threshold = getattr(self, 'confidence_threshold_for_description', 0.25) - confident_objects = [obj for obj in detected_objects if obj.get("confidence", 0) >= confidence_filter_threshold] - - if not confident_objects: - # This message is more appropriate if objects existed but none met confidence - no_confident_obj_msg = "While some elements might be present, no objects were identified with sufficient confidence for a detailed description." 
- if not description_segments: description_segments.append(no_confident_obj_msg) - else: description_segments.append(no_confident_obj_msg.lower().capitalize()) # Append as a new sentence - else: - if object_statistics: - # 使用預計算的統計信息,並採用動態置信度策略 - for class_name, stats in object_statistics.items(): - count = stats.get("count", 0) - avg_confidence = stats.get("avg_confidence", 0) - - # 動態調整置信度閾值:裝飾性物品使用較低閾值 - dynamic_threshold = confidence_filter_threshold - if class_name in ["potted plant", "vase", "clock", "book"]: - dynamic_threshold = max(0.15, confidence_filter_threshold * 0.6) - elif count >= 3: # 數量多的物品降低閾值 - dynamic_threshold = max(0.2, confidence_filter_threshold * 0.8) - - if count > 0 and avg_confidence >= dynamic_threshold: - matching_objects = [obj for obj in confident_objects if obj.get("class_name") == class_name] - if not matching_objects: - # 如果高信心度的物體中沒有,從原始列表中尋找 - matching_objects = [obj for obj in detected_objects - if obj.get("class_name") == class_name and obj.get("confidence", 0) >= dynamic_threshold] - - if matching_objects: - actual_count = min(stats["count"], len(matching_objects)) - objects_by_class[class_name] = matching_objects[:actual_count] - else: - # 回退邏輯同樣使用動態閾值 - for obj in confident_objects: - name = obj.get("class_name", "unknown object") - if name == "unknown object" or not name: continue - if name not in objects_by_class: - objects_by_class[name] = [] - objects_by_class[name].append(obj) - - if not objects_by_class: # Should be rare if confident_objects was not empty and had valid names - description_segments.append("No common objects were confidently identified for detailed description.") - else: - def sort_key_object_groups(item_tuple: Tuple[str, List[Dict]]): - class_name_key, obj_group_list = item_tuple - priority = 3 # 預設優先級 - count = len(obj_group_list) - - # 動態優先級:基於場景相關性和數量 - if class_name_key == "person": - priority = 0 - elif class_name_key in ["dining table", "chair", "sofa", "bed"]: - priority = 1 # 室內主要家具 - elif 
class_name_key in ["car", "bus", "truck", "traffic light"]: - priority = 2 # 交通相關物體 - elif count >= 3: # 數量多的物體提升優先級 - priority = max(1, priority - 1) - elif class_name_key in ["potted plant", "vase", "clock", "book"] and count >= 2: - priority = 2 # 裝飾性物品有一定數量時提升優先級 - - avg_area = sum(o.get("normalized_area", 0.0) for o in obj_group_list) / len(obj_group_list) if obj_group_list else 0 - - # 增加數量權重:多個同類物體更重要 - quantity_bonus = min(count / 5.0, 1.0) # 最多1.0的加成 - - return (priority, -len(obj_group_list), -avg_area, -quantity_bonus) - - # 去除重複的邏輯 - deduplicated_objects_by_class = {} - processed_positions = [] - - for class_name, group_of_objects in objects_by_class.items(): - unique_objects = [] - - for obj in group_of_objects: - obj_position = obj.get("normalized_center", [0.5, 0.5]) - is_duplicate = False - - # 檢查是否與已處理的物體位置重疊 - for processed_pos in processed_positions: - position_distance = abs(obj_position[0] - processed_pos[0]) + abs(obj_position[1] - processed_pos[1]) - if position_distance < 0.15: # 位置重疊閾值 - is_duplicate = True - break - - if not is_duplicate: - unique_objects.append(obj) - processed_positions.append(obj_position) - - if unique_objects: - deduplicated_objects_by_class[class_name] = unique_objects - - objects_by_class = deduplicated_objects_by_class - - sorted_object_groups = sorted(objects_by_class.items(), key=sort_key_object_groups) - - object_clauses = [] # Stores individual object group descriptions - - for class_name, group_of_objects in sorted_object_groups: - count = len(group_of_objects) - if count == 0: continue - - # 使用統計信息確保準確的數量描述 - if object_statistics and class_name in object_statistics: - actual_count = object_statistics[class_name]["count"] - # 根據實際統計數量生成描述 - if actual_count == 1: - formatted_name_with_exact_count = f"one {class_name}" - else: - plural_form = f"{class_name}s" if not class_name.endswith('s') else class_name - formatted_name_with_exact_count = f"{actual_count} {plural_form}" - else: - # 回退到原有的格式化邏輯 - 
formatted_name_with_exact_count = self._format_object_list_for_description( - [group_of_objects[0]] * count, - use_indefinite_article_for_one=False, - count_threshold_for_generalization=-1 - ) - - if formatted_name_with_exact_count == "no specific objects clearly identified" or not formatted_name_with_exact_count: - continue - - # Determine collective location for the group - location_description_suffix = "" # e.g., "is in the center" or "are in the west area" - if count == 1: - location_description_suffix = f"is {self._get_spatial_description(group_of_objects[0], image_width, image_height)}" - else: - distinct_regions = sorted(list(set(obj.get("region", "unknown_region") for obj in group_of_objects))) - known_regions = [r for r in distinct_regions if r != "unknown_region"] - if not known_regions and "unknown_region" in distinct_regions: - location_description_suffix = "are visible in the scene" - elif len(known_regions) == 1: - location_description_suffix = f"are primarily in the {known_regions[0].replace('_', ' ')} area" - elif len(known_regions) == 2: - location_description_suffix = f"are mainly across the {known_regions[0].replace('_',' ')} and {known_regions[1].replace('_',' ')} areas" - elif len(known_regions) > 2: - location_description_suffix = "are distributed in various parts of the scene" - else: - location_description_suffix = "are visible in the scene" - - # Capitalize the object description (e.g., "Six cars") - formatted_name_capitalized = formatted_name_with_exact_count[0].upper() + formatted_name_with_exact_count[1:] - object_clauses.append(f"{formatted_name_capitalized} {location_description_suffix}") - - if object_clauses: - # Join object clauses into one or more sentences. - if not description_segments: # If no ambiance, start with the first object clause. 
- if object_clauses: - first_clause = object_clauses.pop(0) # Take the first one out - description_segments.append(first_clause + ".") - else: # Ambiance exists, prepend with "The scene features..." or similar - if object_clauses: - description_segments.append("The scene features:") # Or "Key elements include:" - - # Add remaining object clauses as separate points or a continuous sentence - # For now, let's join them into a single continuous sentence string to be added. - if object_clauses: # If there are more clauses after the first (or after "The scene features:") - joined_object_clauses = ". ".join(object_clauses) - if joined_object_clauses and not joined_object_clauses.endswith("."): - joined_object_clauses += "." - description_segments.append(joined_object_clauses) - - elif not description_segments : # No ambiance and no describable objects after filtering - return "The image depicts a scene, but specific objects could not be described with confidence or detail." - - # --- Final assembly and formatting --- - # Join all collected segments. _smart_append might be better if parts are not full sentences. - # Since we aim for full sentences in segments, simple join then format. - raw_description = "" - for i, segment in enumerate(filter(None, description_segments)): - segment = segment.strip() - if not segment: continue - - if not raw_description: # First non-empty segment - raw_description = segment - else: - if not raw_description.endswith(('.', '!', '?')): - raw_description += "." - raw_description += " " + (segment[0].upper() + segment[1:] if len(segment) > 1 else segment.upper()) - - if raw_description and not raw_description.endswith(('.', '!', '?')): - raw_description += "." 
- - final_description = self._format_final_description(raw_description) # Crucial for final polish - - if not final_description or len(final_description.strip()) < 20: - # Fallback if description is too short or empty after processing - # Use a more informative fallback if confident_objects existed - if 'confident_objects' in locals() and confident_objects: - return "The scene contains several detected objects, but a detailed textual description could not be fully constructed." - else: - return "A general scene is depicted with no objects identified with high confidence." - - return final_description - - - def _generate_scene_details(self, - scene_type: str, - detected_objects: List[Dict], - lighting_info: Optional[Dict] = None, - viewpoint: str = "eye_level", - spatial_analysis: Optional[Dict] = None, - image_dimensions: Optional[Tuple[int, int]] = None, - places365_info: Optional[Dict] = None, - object_statistics: Optional[Dict] = None - ) -> str: - """ - Generate detailed description based on scene type and detected objects. - Enhanced to handle everyday scenes dynamically with accurate object counting. - - Args: - scene_type: Identified scene type. - detected_objects: List of detected objects. - lighting_info: Optional lighting condition information. - viewpoint: Detected viewpoint (aerial, eye_level, etc.). - spatial_analysis: Optional results from SpatialAnalyzer. - image_dimensions: Optional tuple of (image_width, image_height). - places365_info: Optional Places365 scene classification results. - object_statistics: Optional detailed object statistics with counts and confidence. - - Returns: - str: Detailed scene description. 
- """ - scene_details = "" - scene_templates = self.templates.get("scene_detail_templates", {}) - - # List of scene types considered "everyday" or generic - everyday_scene_types = [ - "general_indoor_space", "generic_street_view", - "desk_area_workspace", "outdoor_gathering_spot", - "kitchen_counter_or_utility_area", "unknown" - ] - - # Extract Places365 attributes for enhanced description - places365_attributes = [] - scene_specific_details = "" - - if places365_info and places365_info.get('confidence', 0) > 0.4: - attributes = places365_info.get('attributes', []) - scene_label = places365_info.get('scene_label', '') - - # Filter relevant attributes for description enhancement - relevant_attributes = [attr for attr in attributes if attr in [ - 'natural_lighting', 'artificial_lighting', 'commercial', 'residential', - 'workplace', 'recreational', 'educational', 'open_space', 'enclosed_space' - ]] - places365_attributes = relevant_attributes[:2] - - # Generate scene-specific contextual details using object statistics - if object_statistics: - if 'commercial' in attributes and object_statistics.get('person', {}).get('count', 0) > 0: - person_count = object_statistics['person']['count'] - if person_count == 1: - scene_specific_details = "This appears to be an active commercial environment with a customer present." - else: - scene_specific_details = f"This appears to be an active commercial environment with {person_count} people present." - elif 'residential' in attributes and scene_type in ['living_room', 'bedroom', 'kitchen']: - scene_specific_details = "The setting suggests a comfortable residential living space." - elif 'workplace' in attributes and any(object_statistics.get(obj, {}).get('count', 0) > 0 - for obj in ['laptop', 'keyboard', 'monitor']): - scene_specific_details = "The environment indicates an active workspace or office setting." 
- else: - # Fallback to original logic if object_statistics not available - if 'commercial' in attributes and any(obj['class_name'] in ['person', 'chair', 'table'] for obj in detected_objects): - scene_specific_details = "This appears to be an active commercial environment with customer activity." - elif 'residential' in attributes and scene_type in ['living_room', 'bedroom', 'kitchen']: - scene_specific_details = "The setting suggests a comfortable residential living space." - elif 'workplace' in attributes and any(obj['class_name'] in ['laptop', 'keyboard', 'monitor'] for obj in detected_objects): - scene_specific_details = "The environment indicates an active workspace or office setting." - - # Determine scene description approach - is_confident_specific_scene = scene_type not in everyday_scene_types and scene_type in scene_templates - treat_as_everyday = scene_type in everyday_scene_types - - if hasattr(self, 'enable_landmark') and not self.enable_landmark: - if scene_type not in ["kitchen", "bedroom", "living_room", "office_workspace", "dining_area", "professional_kitchen"]: - treat_as_everyday = True - - if treat_as_everyday or not is_confident_specific_scene: - # Generate dynamic description for everyday scenes with object statistics - self.logger.info(f"Generating dynamic description for scene_type: {scene_type}") - scene_details = self._generate_dynamic_everyday_description( - detected_objects, - lighting_info, - viewpoint, - spatial_analysis, - image_dimensions, - places365_info, - object_statistics # Pass object statistics to dynamic description - ) - elif scene_type in scene_templates: - # Use template-based description with enhanced object information - self.logger.info(f"Using template for scene_type: {scene_type}") - viewpoint_key = f"{scene_type}_{viewpoint}" - templates_list = scene_templates.get(viewpoint_key, scene_templates.get(scene_type, [])) - - if templates_list: - detail_template = random.choice(templates_list) - scene_details = 
self._fill_detail_template( - detail_template, - detected_objects, - scene_type, - places365_info, - object_statistics # Pass object statistics to template filling - ) - else: - scene_details = self._generate_dynamic_everyday_description( - detected_objects, lighting_info, viewpoint, spatial_analysis, - image_dimensions, places365_info, object_statistics - ) - else: - # Fallback to dynamic description with object statistics - self.logger.info(f"No specific template for {scene_type}, generating dynamic description.") - scene_details = self._generate_dynamic_everyday_description( - detected_objects, lighting_info, viewpoint, spatial_analysis, - image_dimensions, places365_info, object_statistics - ) - - # Filter out landmark references if landmark detection is disabled - if hasattr(self, 'enable_landmark') and not self.enable_landmark: - scene_details = self.filter_landmark_references(scene_details, enable_landmark=False) - - return scene_details if scene_details else "A scene with some visual elements." - - def _fill_detail_template(self, template: str, detected_objects: List[Dict], scene_type: str, places365_info: Optional[Dict] = None, object_statistics: Optional[Dict] = None) -> str: - """ - Fill a template with specific details based on detected objects. 
- - Args: - template: Template string with placeholders - detected_objects: List of detected objects - scene_type: Identified scene type - - Returns: - str: Filled template - """ - # Find placeholders in the template using simple {placeholder} syntax - import re - placeholders = re.findall(r'\{([^}]+)\}', template) - - filled_template = template - - # Get object template fillers - fillers = self.templates.get("object_template_fillers", {}) - - # 基於物品的統計資訊形成更準確的模板填充內容 - statistics_based_replacements = {} - if object_statistics: - # 根據統計信息生成具體的物體描述 - for class_name, stats in object_statistics.items(): - count = stats.get("count", 0) - if count > 0: - # 為常見物體類別生成基於統計的描述 - if class_name == "potted plant": - if count == 1: - statistics_based_replacements["plant_elements"] = "a potted plant" - elif count <= 3: - statistics_based_replacements["plant_elements"] = f"{count} potted plants" - else: - statistics_based_replacements["plant_elements"] = f"multiple potted plants ({count} total)" - - elif class_name == "chair": - if count == 1: - statistics_based_replacements["seating"] = "a chair" - elif count <= 4: - statistics_based_replacements["seating"] = f"{count} chairs" - else: - statistics_based_replacements["seating"] = f"numerous chairs ({count} total)" - - elif class_name == "person": - if count == 1: - statistics_based_replacements["people_and_vehicles"] = "a person" - statistics_based_replacements["pedestrian_flow"] = "an individual walking" - elif count <= 5: - statistics_based_replacements["people_and_vehicles"] = f"{count} people" - statistics_based_replacements["pedestrian_flow"] = f"{count} people walking" - else: - statistics_based_replacements["people_and_vehicles"] = f"many people ({count} individuals)" - statistics_based_replacements["pedestrian_flow"] = f"a crowd of {count} people" - - # 為所有可能的變數設置默認值 - default_replacements = { - # 室內相關 - "furniture": "various furniture pieces", - "seating": "comfortable seating", - "electronics": "entertainment devices", - 
"bed_type": "a bed", - "bed_location": "room", - "bed_description": "sleeping arrangements", - "extras": "personal items", - "table_setup": "a dining table and chairs", - "table_description": "a dining surface", - "dining_items": "dining furniture and tableware", - "appliances": "kitchen appliances", - "kitchen_items": "cooking utensils and dishware", - "cooking_equipment": "cooking equipment", - "office_equipment": "work-related furniture and devices", - "desk_setup": "a desk and chair", - "computer_equipment": "electronic devices", - - # 室外/城市相關 - "traffic_description": "vehicles and pedestrians", - "people_and_vehicles": "people and various vehicles", - "street_elements": "urban infrastructure", - "park_features": "benches and greenery", - "outdoor_elements": "natural features", - "park_description": "outdoor amenities", - "store_elements": "merchandise displays", - "shopping_activity": "customers browse and shop", - "store_items": "products for sale", - - # 高級餐廳相關 - "design_elements": "elegant decor", - "lighting": "stylish lighting fixtures", - - # 亞洲商業街相關 - "storefront_features": "compact shops", - "pedestrian_flow": "people walking", - "asian_elements": "distinctive cultural elements", - "cultural_elements": "traditional design features", - "signage": "colorful signs", - "street_activities": "busy urban activity", - - # 金融區相關 - "buildings": "tall buildings", - "traffic_elements": "vehicles", - "skyscrapers": "high-rise buildings", - "road_features": "wide streets", - "architectural_elements": "modern architecture", - "city_landmarks": "prominent structures", - - # 十字路口相關 - "crossing_pattern": "marked pedestrian crossings", - "pedestrian_behavior": "careful walking", - "pedestrian_density": "groups of pedestrians", - "traffic_pattern": "regulated traffic flow", - - # 交通樞紐相關 - "transit_vehicles": "public transportation vehicles", - "passenger_activity": "commuter movement", - "transportation_modes": "various transit options", - "passenger_needs": "waiting 
areas", - "transit_infrastructure": "transit facilities", - "passenger_movement": "commuter flow", - - # 購物區相關 - "retail_elements": "shops and displays", - "store_types": "various retail establishments", - "walkway_features": "pedestrian pathways", - "commercial_signage": "store signs", - "consumer_behavior": "shopping activities", - - # 空中視角相關 - "commercial_layout": "organized retail areas", - "pedestrian_pattern": "people movement patterns", - "gathering_features": "public gathering spaces", - "movement_pattern": "crowd flow patterns", - "urban_elements": "city infrastructure", - "public_activity": "social interaction", - - # 文化特定元素 - "stall_elements": "vendor booths", - "lighting_features": "decorative lights", - "food_elements": "food offerings", - "vendor_stalls": "market stalls", - "nighttime_activity": "evening commerce", - "cultural_lighting": "traditional lighting", - "night_market_sounds": "lively market sounds", - "evening_crowd_behavior": "nighttime social activity", - "architectural_elements": "cultural buildings", - "religious_structures": "sacred buildings", - "decorative_features": "ornamental designs", - "cultural_practices": "traditional activities", - "temple_architecture": "religious structures", - "sensory_elements": "atmospheric elements", - "visitor_activities": "cultural experiences", - "ritual_activities": "ceremonial practices", - "cultural_symbols": "meaningful symbols", - "architectural_style": "historical buildings", - "historic_elements": "traditional architecture", - "urban_design": "city planning elements", - "social_behaviors": "public interactions", - "european_features": "European architectural details", - "tourist_activities": "visitor activities", - "local_customs": "regional practices", - - # 時間特定元素 - "lighting_effects": "artificial lighting", - "shadow_patterns": "light and shadow", - "urban_features": "city elements", - "illuminated_elements": "lit structures", - "evening_activities": "nighttime activities", - 
"light_sources": "lighting points", - "lit_areas": "illuminated spaces", - "shadowed_zones": "darker areas", - "illuminated_signage": "bright signs", - "colorful_lighting": "multicolored lights", - "neon_elements": "neon signs", - "night_crowd_behavior": "evening social patterns", - "light_displays": "lighting installations", - "building_features": "architectural elements", - "nightlife_activities": "evening entertainment", - "lighting_modifier": "bright", - - # 混合環境元素 - "transitional_elements": "connecting features", - "indoor_features": "interior elements", - "outdoor_setting": "exterior spaces", - "interior_amenities": "inside comforts", - "exterior_features": "outside elements", - "inside_elements": "interior design", - "outside_spaces": "outdoor areas", - "dual_environment_benefits": "combined settings", - "passenger_activities": "waiting behaviors", - "transportation_types": "transit vehicles", - "sheltered_elements": "covered areas", - "exposed_areas": "open sections", - "waiting_behaviors": "passenger activities", - "indoor_facilities": "inside services", - "platform_features": "transit platform elements", - "transit_routines": "transportation procedures", - - # 專門場所元素 - "seating_arrangement": "spectator seating", - "playing_surface": "athletic field", - "sporting_activities": "sports events", - "spectator_facilities": "viewer accommodations", - "competition_space": "sports arena", - "sports_events": "athletic competitions", - "viewing_areas": "audience sections", - "field_elements": "field markings and equipment", - "game_activities": "competitive play", - "construction_equipment": "building machinery", - "building_materials": "construction supplies", - "construction_activities": "building work", - "work_elements": "construction tools", - "structural_components": "building structures", - "site_equipment": "construction gear", - "raw_materials": "building supplies", - "construction_process": "building phases", - "medical_elements": "healthcare equipment", - 
"clinical_activities": "medical procedures", - "facility_design": "healthcare layout", - "healthcare_features": "medical facilities", - "patient_interactions": "care activities", - "equipment_types": "medical devices", - "care_procedures": "health services", - "treatment_spaces": "clinical areas", - "educational_furniture": "learning furniture", - "learning_activities": "educational practices", - "instructional_design": "teaching layout", - "classroom_elements": "school equipment", - "teaching_methods": "educational approaches", - "student_engagement": "learning participation", - "learning_spaces": "educational areas", - "educational_tools": "teaching resources", - "knowledge_transfer": "learning exchanges" - } - - # 將統計的資訊形成的替換內容合併到默認替換中 - default_replacements.update(statistics_based_replacements) - - # Add Places365-specific template variables - places365_scene_context = "" - places365_atmosphere = "" - - if places365_info and places365_info.get('confidence', 0) > 0.35: - scene_label = places365_info.get('scene_label', '').replace('_', ' ') - attributes = places365_info.get('attributes', []) - - if scene_label and scene_label != scene_type: - places365_scene_context = f"characteristic of a {scene_label}" - - if 'natural_lighting' in attributes: - places365_atmosphere = "with natural illumination" - elif 'artificial_lighting' in attributes: - places365_atmosphere = "under artificial lighting" - - # Update default_replacements with Places365 context - if places365_scene_context: - default_replacements["places365_context"] = places365_scene_context - else: - default_replacements["places365_context"] = "" - - if places365_atmosphere: - default_replacements["places365_atmosphere"] = places365_atmosphere - else: - default_replacements["places365_atmosphere"] = "" - - # For each placeholder, try to fill with appropriate content - for placeholder in placeholders: - if placeholder in fillers: - # Get random filler for this placeholder - options = fillers[placeholder] - if 
def _generate_placeholder_content(self, placeholder: str, detected_objects: List[Dict], scene_type: str) -> str:
    """
    Generate content for a template placeholder based on scene-specific logic.

    Args:
        placeholder: Template placeholder name (without braces)
        detected_objects: List of detected objects; each entry is read for
            its "class_id" and "class_name" keys
        scene_type: Identified scene type (currently not used by any rule)

    Returns:
        str: Content for the placeholder, or "" when no rule handles it so
            that the caller can fall back to its own defaults
    """
    # placeholder -> (class ids, max detections considered, fallback phrase)
    class_based_rules = {
        "furniture": ((56, 57, 58, 59, 60, 61), 3, "various furniture items"),
        "electronics": ((62, 63, 64, 65, 66, 67, 68, 69, 70), 3, "electronic devices"),
        "seating": ((56, 57), 2, "seating arrangements"),  # chair, sofa
    }

    if placeholder in class_based_rules:
        class_ids, limit, fallback = class_based_rules[placeholder]
        matches = [obj for obj in detected_objects if obj.get("class_id") in class_ids][:limit]
        if not matches:
            return fallback
        # dict.fromkeys de-duplicates while keeping detection order; a set
        # would make the wording vary from run to run.
        return ", ".join(dict.fromkeys(obj["class_name"] for obj in matches))

    if placeholder == "people_count":
        people_count = sum(1 for obj in detected_objects if obj.get("class_id") == 0)
        if people_count == 0:
            return "no people"
        if people_count == 1:
            return "one person"
        if people_count < 5:
            return f"{people_count} people"
        return "several people"

    # Default case - empty string
    return ""


def _generate_basic_details(self, scene_type: str, detected_objects: List[Dict]) -> str:
    """
    Generate basic details when templates aren't available.

    Args:
        scene_type: Identified scene type
        detected_objects: List of detected objects (dicts with "class_id" and,
            depending on the branch, "class_name", "region" and
            "normalized_center")

    Returns:
        str: Basic scene details. The dining/kitchen and city_street branches
            return a sentence tail such as " with cup, bowl." (or just "."),
            NOTE(review): presumably appended to an existing sentence by the
            caller - confirm before changing that shape.
    """
    default_details = "The scene contains various elements characteristic of this environment."

    def objects_of(*class_ids):
        return [obj for obj in detected_objects if obj.get("class_id") in class_ids]

    def counted(count, singular, plural):
        return f"{count} {plural if count > 1 else singular}"

    if scene_type == "living_room":
        tv_objs = objects_of(62)    # TV
        sofa_objs = objects_of(57)  # Sofa
        if tv_objs and sofa_objs:
            tv_region = tv_objs[0]["region"].replace("_", " ")
            sofa_region = sofa_objs[0]["region"].replace("_", " ")
            return (
                f"The TV is in the {tv_region} of the image, "
                f"while the sofa is in the {sofa_region}. "
                "This appears to be a space designed for relaxation and entertainment."
            )
        return default_details

    if scene_type == "bedroom":
        bed_objs = objects_of(59)  # Bed
        if bed_objs:
            bed_region = bed_objs[0]["region"].replace("_", " ")
            extra_labels = {74: "clock", 73: "book"}
            # Mention each kind of extra item once, in detection order, so two
            # clocks do not produce "a clock and a clock".
            extra_items = list(dict.fromkeys(
                extra_labels[obj["class_id"]]
                for obj in detected_objects
                if obj.get("class_id") in extra_labels
            ))
            extras = ""
            if extra_items:
                extras = f" There is also a {' and a '.join(extra_items)} visible."
            return f"The bed is located in the {bed_region} of the image.{extras}"
        return default_details

    if scene_type in ("dining_area", "kitchen"):
        # Kitchen items, de-duplicated in detection order for stable output
        unique_items = list(dict.fromkeys(
            obj["class_name"] for obj in objects_of(39, 41, 42, 43, 44, 45)
        ))
        food_str = ""
        if unique_items:
            if len(unique_items) <= 3:
                food_str = f" with {', '.join(unique_items)}"
            else:
                food_str = f" with {', '.join(unique_items[:3])} and other items"
        return f"{food_str}."

    if scene_type == "city_street":
        people_count = len(objects_of(0))
        vehicle_count = len(objects_of(1, 2, 3, 5, 7))  # Bicycle, car, motorbike, bus, truck

        parts = []
        if people_count > 0:
            parts.append(counted(people_count, "person", "people"))
        if vehicle_count > 0:
            parts.append(counted(vehicle_count, "vehicle", "vehicles"))
        traffic_desc = f" with {' and '.join(parts)}" if parts else ""
        return f"{traffic_desc}."

    if scene_type == "asian_commercial_street":
        people_objs = objects_of(0)
        people_count = len(people_objs)
        vehicle_count = len(objects_of(1, 2, 3))

        # Only people that carry a centre point can take part in the path
        # check; _detect_viewpoint guards the same key in the same way.
        y_coords = [obj["normalized_center"][1] for obj in people_objs if "normalized_center" in obj]

        # Simplified check: similar y-coordinates for several people suggest
        # that they are lined up along a walking path.
        structured_path = False
        if len(y_coords) >= 3:
            y_mean = sum(y_coords) / len(y_coords)
            y_variance = sum((y - y_mean) ** 2 for y in y_coords) / len(y_coords)
            structured_path = y_variance < 0.05

        street_desc = "A commercial street with "
        if people_count > 0:
            street_desc += counted(people_count, "pedestrian", "pedestrians")
            if vehicle_count > 0:
                street_desc += f" and {counted(vehicle_count, 'vehicle', 'vehicles')}"
        elif vehicle_count > 0:
            street_desc += counted(vehicle_count, "vehicle", "vehicles")
        else:
            street_desc += "various commercial elements"

        if structured_path:
            street_desc += ". The pedestrians appear to be following a defined walking path"

        # Add cultural elements
        street_desc += ". The signage and architectural elements suggest an Asian urban setting."
        return street_desc

    # Default general description
    return default_details


def _detect_viewpoint(self, detected_objects: List[Dict]) -> str:
    """
    Detect the camera viewpoint, with extra heuristics for aerial (top-down) views.

    Args:
        detected_objects: List of detected objects; "region", "class_id",
            "normalized_area", "normalized_size" and "normalized_center" are
            read when present

    Returns:
        str: One of "aerial", "low_angle", "elevated" or "eye_level"
    """
    if not detected_objects:
        return "eye_level"  # default

    total_objects = len(detected_objects)
    top_region_count = 0
    bottom_region_count = 0
    sizes = []                # normalized areas, used for the size-spread test
    height_width_ratios = []  # height / width, used for the low-angle test
    people_objs = []
    people_positions = []

    for obj in detected_objects:
        region = obj.get("region") or ""
        if "top" in region:
            top_region_count += 1
        elif "bottom" in region:
            bottom_region_count += 1

        if "normalized_area" in obj:
            sizes.append(obj["normalized_area"])

        if "normalized_size" in obj:
            width, height = obj["normalized_size"]
            if width > 0:
                height_width_ratios.append(height / width)

        if obj.get("class_id") == 0:  # person
            people_objs.append(obj)
            if "normalized_center" in obj:
                people_positions.append(obj["normalized_center"])

    # Crosswalk / intersection heuristic: enough people, spread along both a
    # horizontal and a vertical line, are treated as an aerial view.
    if len(people_objs) >= 8 and len(people_positions) >= 4:
        x_coords = [pos[0] for pos in people_positions]
        y_coords = [pos[1] for pos in people_positions]
        x_range = max(x_coords) - min(x_coords)
        y_range = max(y_coords) - min(y_coords)

        crosswalk_pattern_detected = False

        # Cross shape: both directions span a large, similar range ...
        if x_range > 0.5 and y_range > 0.5 and 0.7 < (x_range / y_range) < 1.3:
            center_x = np.mean(x_coords)
            center_y = np.mean(y_coords)
            # ... and most points lie close to one of the two axes through
            # the centroid.
            close_to_axis_count = sum(
                1
                for x, y in zip(x_coords, y_coords)
                if abs(x - center_x) < 0.1 or abs(y - center_y) < 0.1
            )
            if close_to_axis_count >= len(x_coords) * 0.6:
                crosswalk_pattern_detected = True

        # Otherwise look for several clusters along both axes, which may
        # indicate crossing crosswalk lines.
        if not crosswalk_pattern_detected:
            x_clusters = self._detect_linear_clusters(x_coords)
            y_clusters = self._detect_linear_clusters(y_coords)
            if len(x_clusters) >= 2 and len(y_clusters) >= 2:
                crosswalk_pattern_detected = True

        # The crosswalk pattern takes priority over every other rule
        if crosswalk_pattern_detected:
            return "aerial"

    # Pedestrian distribution: many people spread evenly over many regions
    if len(people_objs) >= 10:
        people_region_counts = {}
        for obj in people_objs:
            region = obj.get("region") or ""
            people_region_counts[region] = people_region_counts.get(region, 0) + 1

        populated_regions = sum(1 for count in people_region_counts.values() if count >= 2)
        if populated_regions >= 4:
            region_counts = list(people_region_counts.values())
            region_counts_variance = np.var(region_counts) if len(region_counts) > 1 else 0
            region_counts_mean = np.mean(region_counts) if region_counts else 0
            if region_counts_mean > 0:
                # Variance-to-mean ratio of the per-region head counts; a
                # small value means the crowd is spread fairly evenly.
                variation_coefficient = region_counts_variance / region_counts_mean
                if variation_coefficient < 0.5:
                    return "aerial"

    top_ratio = top_region_count / total_objects
    bottom_ratio = bottom_region_count / total_objects

    # Size variance normalized by the squared mean. A zero mean (all areas
    # are 0) is treated as "no spread" instead of dividing by zero.
    size_variance = 0
    if sizes:
        mean_size = sum(sizes) / len(sizes)
        if mean_size > 0:
            size_variance = sum((s - mean_size) ** 2 for s in sizes) / len(sizes)
            size_variance = size_variance / (mean_size ** 2)

    avg_height_width_ratio = (
        sum(height_width_ratios) / len(height_width_ratios) if height_width_ratios else 1.0
    )

    params = self.viewpoint_params

    # Aerial: little size variation, few objects at the bottom, many at the top
    if (size_variance < params["aerial_size_variance_threshold"]
            and bottom_ratio < 0.3
            and top_ratio > params["aerial_threshold"]):
        return "aerial"

    # Low angle: objects tend to be taller than wide, many objects at the top
    if (avg_height_width_ratio > params["vertical_size_ratio_threshold"]
            and top_ratio > params["low_angle_threshold"]):
        return "low_angle"

    # Elevated: many objects at the bottom, few at the top
    if (bottom_ratio > params["elevated_threshold"]
            and top_ratio < params["elevated_top_threshold"]):
        return "elevated"

    # Default: eye level
    return "eye_level"


def _detect_linear_clusters(self, coords, threshold=0.05):
    """
    Group one-dimensional coordinates into clusters of neighbouring values.

    Args:
        coords: List of one-dimensional coordinates
        threshold: Maximum gap between consecutive sorted values that still
            keeps them in the same cluster

    Returns:
        list: Clusters (lists of sorted coordinates); only clusters with at
            least two members are returned
    """
    if not coords:
        return []

    ordered = sorted(coords)
    clusters = []
    current_cluster = [ordered[0]]

    for previous, value in zip(ordered, ordered[1:]):
        if value - previous < threshold:
            current_cluster.append(value)
            continue
        # Gap too large: close the running cluster and start a new one
        if len(current_cluster) >= 2:
            clusters.append(current_cluster)
        current_cluster = [value]

    # Close the last running cluster
    if len(current_cluster) >= 2:
        clusters.append(current_cluster)

    return clusters


def _detect_cultural_context(self, scene_type: str, detected_objects: List[Dict]) -> Optional[str]:
    """
    Detect the likely cultural context of the scene.

    Args:
        scene_type: Identified scene type
        detected_objects: List of detected objects (currently not inspected)

    Returns:
        Optional[str]: Detected cultural context ("asian" or "european"), or
            None when the scene type carries no explicit cultural context
    """
    # Scene types with explicit cultural contexts
    cultural_scene_mapping = {
        "asian_commercial_street": "asian",
        "asian_night_market": "asian",
        "asian_temple_area": "asian",
        "european_plaza": "european",
    }
    return cultural_scene_mapping.get(scene_type)


def _generate_cultural_elements(self, cultural_context: str) -> str:
    """
    Generate description of cultural elements for the detected context.

    Args:
        cultural_context: Detected cultural context

    Returns:
        str: The context's "description" template filled with one or two
            randomly chosen entries of its "elements" list, or "" when the
            context has no template or no elements
    """
    cultural_templates = self.templates.get("cultural_templates", {})
    if cultural_context not in cultural_templates:
        return ""

    template = cultural_templates[cultural_context]
    elements = template.get("elements", [])
    if not elements:
        return ""

    # Select 1-2 random elements
    num_elements = min(len(elements), random.randint(1, 2))
    selected_elements = random.sample(elements, num_elements)

    # " and ".join returns the single element unchanged when only one is picked
    elements_text = " and ".join(selected_elements)
    return template.get("description", "").format(elements=elements_text)


def _optimize_object_description(self, description: str) -> str:
    """
    Tidy an object description so the same item is not listed repeatedly.

    Every "with <list>" clause (up to the end of its sentence) is split into
    items. Only when an item occurs more than once is the clause rewritten,
    merging the repeats into a counted plural ("a chair, a chair" becomes
    "2 chairs"). Clauses without duplicates are returned untouched.

    Args:
        description: Description text to tidy

    Returns:
        str: The description with duplicated list items merged
    """
    # Avoid the redundant "a bed in the room" phrasing
    if "bed in the room" in description:
        description = description.replace("a bed in the room", "a bed")

    def pluralize(noun: str) -> str:
        # Small rule set covering the usual detection labels
        if noun == "person" or noun.endswith(" person"):
            return noun[: -len("person")] + "people"
        if noun.endswith(("s", "x", "z", "ch", "sh")):
            return noun + "es"
        return noun + "s"

    def merge_duplicates(match) -> str:
        listing = match.group(1)
        core = listing.rstrip()
        trailing = listing[len(core):]  # keep whitespace that followed the list

        parts = re.split(r",\s*(?:and\s+)?|\s+and\s+", core)
        item_counts = {}
        for part in parts:
            item = part.strip()
            if item:
                item_counts[item] = item_counts.get(item, 0) + 1

        # Nothing repeated: leave the author's wording exactly as it was
        if all(count == 1 for count in item_counts.values()):
            return match.group(0)

        new_items = []
        for item, count in item_counts.items():
            if count > 1:
                noun = re.sub(r"^(?:a|an|the)\s+", "", item)
                new_items.append(f"{count} {pluralize(noun)}")
            else:
                new_items.append(item)

        if len(new_items) == 1:
            new_list = new_items[0]
        elif len(new_items) == 2:
            new_list = f"{new_items[0]} and {new_items[1]}"
        else:
            new_list = ", ".join(new_items[:-1]) + f", and {new_items[-1]}"

        return f"with {new_list}{trailing}"

    return re.sub(r"\bwith ([^.]+)", merge_duplicates, description)


def _describe_functional_zones(self, functional_zones: Dict) -> str:
    """
    Describe the functional zones of a scene.

    At most one pedestrian-related zone (the one with the most people) and
    the two other zones with the most objects are described. When more than
    five people are counted in total, a summary sentence states that total,
    and per-zone people counts smaller than it are dropped from the zone
    descriptions to keep the numbers consistent.

    Args:
        functional_zones: Zones keyed by name, or a list of zones. A zone is
            a dict read for "description", "objects" and (list form) "name";
            any other value is used as its own description.

    Returns:
        str: Description of the functional zones, or "" when there are none
    """
    if not functional_zones:
        return ""

    def as_zone(value):
        return value if isinstance(value, dict) else {"description": str(value)}

    # Normalise the input into {zone_name: zone_info_dict}
    if isinstance(functional_zones, list):
        zones = {}
        for index, zone in enumerate(functional_zones):
            if isinstance(zone, dict) and "name" in zone:
                zone_name = zone["name"]
            else:
                zone_name = f"zone_{index}"
            zones[zone_name] = as_zone(zone)
    elif isinstance(functional_zones, dict):
        zones = {name: as_zone(info) for name, info in functional_zones.items()}
    else:
        return ""

    # Count people per zone and in total
    people_by_zone = {}
    total_people_count = 0
    for zone_name, zone_info in zones.items():
        if "objects" in zone_info:
            zone_people_count = list(zone_info["objects"] or []).count("person")
            people_by_zone[zone_name] = zone_people_count
            total_people_count += zone_people_count

    # Split into pedestrian-related zones and all other zones
    pedestrian_keywords = ("pedestrian", "crossing", "people")
    pedestrian_zones = []
    other_zones = []
    for zone_name, zone_info in zones.items():
        lowered_name = str(zone_name).lower()
        if any(keyword in lowered_name for keyword in pedestrian_keywords):
            pedestrian_zones.append((zone_name, zone_info))
        else:
            other_zones.append((zone_name, zone_info))

    # At most one main pedestrian zone ...
    main_pedestrian_zones = sorted(
        pedestrian_zones,
        key=lambda z: people_by_zone.get(z[0], 0),
        reverse=True,
    )[:1]
    # ... and at most two other zones
    top_other_zones = sorted(
        other_zones,
        key=lambda z: len(z[1].get("objects") or []),
        reverse=True,
    )[:2]

    top_zones = main_pedestrian_zones + top_other_zones
    if not top_zones:
        return ""

    summary = ""
    max_mentioned_people = 0  # largest head count already stated in the text
    if total_people_count > 5:
        summary = f"The scene contains a significant number of pedestrians ({total_people_count} people). "
        max_mentioned_people = total_people_count

    zone_descriptions = []
    for zone_name, zone_info in top_zones:
        zone_desc = str(zone_info.get("description") or "a functional zone")
        zone_people_count = people_by_zone.get(zone_name, 0)

        lowered_desc = zone_desc.lower()
        contains_people_info = (
            re.search(r"\bwith\b", zone_desc) is not None
            and ("person" in lowered_desc or "people" in lowered_desc)
        )

        # Drop a per-zone head count that is smaller than the total already
        # mentioned in the summary sentence.
        if contains_people_info and zone_people_count < max_mentioned_people:
            base = re.split(r"\bwith\b", zone_desc, maxsplit=1)[0].strip()
            if base:
                # Do not produce "... area area" / "... zone area"
                if base.lower().endswith(("area", "zone")):
                    zone_desc = base
                else:
                    zone_desc = f"{base} area"

        zone_descriptions.append(zone_desc)

    if len(zone_descriptions) == 1:
        final_desc = summary + f"The scene includes {zone_descriptions[0]}."
    elif len(zone_descriptions) == 2:
        final_desc = summary + (
            f"The scene is divided into two main areas: {zone_descriptions[0]} and {zone_descriptions[1]}."
        )
    else:
        formatted_desc = ", ".join(zone_descriptions[:-1]) + f", and {zone_descriptions[-1]}"
        final_desc = summary + f"The scene contains multiple functional areas including {formatted_desc}."

    return self._optimize_object_description(final_desc)