Spaces:

DawnC
/

VisionScout

Running on Zero

File size: 51,909 Bytes

import os
import re
import json
import logging
import random
import numpy as np
from typing import Dict, List, Tuple, Any, Optional

from scene_type import SCENE_TYPES
from scene_detail_templates import SCENE_DETAIL_TEMPLATES
from object_template_fillers import OBJECT_TEMPLATE_FILLERS
from lighting_conditions import LIGHTING_CONDITIONS
from viewpoint_templates import VIEWPOINT_TEMPLATES
from cultural_templates import CULTURAL_TEMPLATES
from confidence_templates import CONFIDENCE_TEMPLATES
from landmark_data import ALL_LANDMARKS
from region_analyzer import RegionAnalyzer
from viewpoint_detector import ViewpointDetector, ViewpointDetectionError
from template_manager import TemplateManager, TemplateLoadingError, TemplateFillError
from object_description_generator import ObjectDescriptionGenerator, ObjectDescriptionError
from cultural_context_analyzer import CulturalContextAnalyzer, CulturalContextError
from text_formatter import TextFormatter, TextFormattingError

class EnhancedSceneDescriberError(Exception):
    """Custom exception raised while generating scene descriptions."""

class EnhancedSceneDescriber:
    """
    增強場景描述器 - 提供詳細自然語言場景描述的主要窗口，其他相關class匯集於此

    此class會協調多個專門組件來生成高質量的場景描述，包括視角檢測、
    模板管理、物件描述、文化語境分析和文本格式化。
    """

    def __init__(self, templates_db: Optional[Dict] = None, scene_types: Optional[Dict] = None, spatial_analyzer_instance: Optional[Any] = None):
        """
        初始化增強場景描述器

        Args:
            templates_db: 可選的自定義模板數據庫
            scene_types: 場景類型定義字典
            spatial_analyzer_instance: 空間分析器實例（保持兼容性）
        """
        self.logger = logging.getLogger(self.__class__.__name__)
        self.logger.setLevel(logging.INFO)

        # 如果沒有logger，就加一個
        if not self.logger.hasHandlers():
            handler = logging.StreamHandler()
            formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
            handler.setFormatter(formatter)
            self.logger.addHandler(handler)

        try:
            # 載入場景類型定義
            self.scene_types = scene_types or self._load_default_scene_types()

            # 初始化子組件
            self._initialize_components(templates_db)

            # 保存空間分析器實例以保持兼容性
            self.spatial_analyzer_instance = spatial_analyzer_instance

            self.logger.info("EnhancedSceneDescriber initialized successfully with %d scene types",
                           len(self.scene_types))

        except Exception as e:
            error_msg = f"Failed to initialize EnhancedSceneDescriber: {str(e)}"
            self.logger.error(f"{error_msg}\n{e.__class__.__name__}: {str(e)}")
            raise EnhancedSceneDescriberError(error_msg) from e

    def _load_default_scene_types(self) -> Dict:
        """
        Return the default scene type definitions.

        Returns:
            Dict: The module-level ``SCENE_TYPES`` mapping, or an empty dict
            when accessing it raises.
        """
        default_types: Dict = {}
        try:
            default_types = SCENE_TYPES
        except Exception as load_error:
            # Best effort: log and fall through with the empty mapping.
            self.logger.error(f"Failed to import SCENE_TYPES: {str(load_error)}")
        return default_types

    def _initialize_components(self, templates_db: Optional[Dict]):
        """
        Create every helper component used by the describer.

        Components are built in a fixed order: viewpoint detector, region
        analyzer, template manager, object description generator (which
        receives the region analyzer), cultural context analyzer and text
        formatter. If any constructor raises, the error is logged and
        ``_initialize_fallback_components`` is called instead of propagating
        the exception.

        Args:
            templates_db: Optional template database passed to the template
                manager as ``custom_templates_db``.
        """
        try:
            self.viewpoint_detector = ViewpointDetector()

            # Keep a local handle so the same analyzer instance is shared below.
            shared_region_analyzer = RegionAnalyzer()
            self.region_analyzer = shared_region_analyzer

            self.template_manager = TemplateManager(custom_templates_db=templates_db)
            self.object_description_generator = ObjectDescriptionGenerator(
                region_analyzer=shared_region_analyzer
            )
            self.cultural_context_analyzer = CulturalContextAnalyzer()
            self.text_formatter = TextFormatter()

            self.logger.debug("All components initialized successfully")
        except Exception as component_error:
            self.logger.error(f"Component initialization failed: {str(component_error)}")
            # Fall back to basic components rather than raising.
            self._initialize_fallback_components()


    def generate_description(self, scene_type: str, detected_objects: List[Dict], confidence: float,
                           lighting_info: Dict, functional_zones: List[str], enable_landmark: bool = True,
                           scene_scores: Optional[Dict] = None, spatial_analysis: Optional[Dict] = None,
                           image_dimensions: Optional[Tuple[int, int]] = None, # 改為 Tuple
                           places365_info: Optional[Dict] = None,
                           object_statistics: Optional[Dict] = None) -> str:
        try:
            traffic_list = [obj for obj in detected_objects if obj.get("class_name", "") == "traffic light"]
            # print(f"[DEBUG] generate_description 一開始接收到的 traffic light 數量: {len(traffic_list)}") # 原始的 print
            self.logger.debug(f"Initial traffic light count in generate_description: {len(traffic_list)}") # 改用 logger
            # for idx, tl in enumerate(traffic_list): # 這部分 log 可能過於詳細，先註解
            #     self.logger.debug(f"    idx={idx}, confidence={tl.get('confidence', 0):.4f}, bbox={tl.get('bbox')}, region={tl.get('region')}")

            if scene_type == "unknown" or confidence < 0.4:
                generic_desc = self._generate_generic_description(detected_objects, lighting_info)
                return self.text_formatter.format_final_description(generic_desc)

            current_detected_objects = detected_objects
            if not enable_landmark:
                current_detected_objects = [obj for obj in detected_objects if not obj.get("is_landmark", False)]

            places365_context = ""
            if places365_info and places365_info.get('confidence', 0) > 0.3:
                scene_label = places365_info.get('scene_label', '')
                attributes = places365_info.get('attributes', [])
                is_indoor = places365_info.get('is_indoor', None)
                if scene_label:
                    places365_context = f"Scene context: {scene_label}"
                    if attributes:
                        places365_context += f" with characteristics: {', '.join(attributes[:3])}"
                    if is_indoor is not None:
                        indoor_outdoor = "indoor" if is_indoor else "outdoor"
                        places365_context += f" ({indoor_outdoor} environment)"
                self.logger.debug(f"Enhanced description incorporating Places365 context: {places365_context}")

            landmark_objects_in_scene = [obj for obj in current_detected_objects if obj.get("is_landmark", False)]
            has_landmark_in_scene = len(landmark_objects_in_scene) > 0

            if enable_landmark and (scene_type in ["tourist_landmark", "natural_landmark", "historical_monument"] or has_landmark_in_scene):
                landmark_desc = self._generate_landmark_description(
                    scene_type, current_detected_objects, confidence,
                    lighting_info, functional_zones, landmark_objects_in_scene
                )
                return self.text_formatter.format_final_description(landmark_desc)

            viewpoint = self.viewpoint_detector.detect_viewpoint(current_detected_objects)
            current_scene_type = scene_type

            if viewpoint == "aerial":
                if "intersection" in current_scene_type.lower() or self._is_intersection(current_detected_objects):
                    current_scene_type = "aerial_view_intersection"
                elif any(keyword in current_scene_type.lower() for keyword in ["commercial", "shopping", "retail"]):
                    current_scene_type = "aerial_view_commercial_area"
                elif any(keyword in current_scene_type.lower() for keyword in ["plaza", "square"]):
                    current_scene_type = "aerial_view_plaza"
                else:
                    current_scene_type = "aerial_view_general"

            current_scene_type = self._sanitize_scene_type_for_description(current_scene_type)

            # 偵測文化背景資訊
            cultural_context = None
            if viewpoint != "aerial":
                cultural_context = self.cultural_context_analyzer.detect_cultural_context(current_scene_type, current_detected_objects)

             # 設定基礎描述
            base_description = "A scene"
            if viewpoint == "aerial":
                if current_scene_type in self.scene_types: # 確保 self.scene_types 已有
                    base_description = self.scene_types.get(current_scene_type, {}).get("description", "An aerial view showing the layout and movement patterns from above")
                else:
                    base_description = "An aerial view showing the layout and movement patterns from above"
            elif current_scene_type in self.scene_types: # 確保 self.scene_types 已有
                 base_description = self.scene_types.get(current_scene_type, {}).get("description", "A scene")

            # 假設 template_manager 內部可以處理 List[str] 的 functional_zones
            selected_template = self.template_manager.get_template_by_scene_type(
                scene_type=current_scene_type,
                detected_objects=current_detected_objects,
                functional_zones=functional_zones or [] # 傳入 List[str]
            )

            # 用於 fill_template 中的某些佔位符
            processed_functional_zones = {}
            if functional_zones:
                if isinstance(functional_zones, dict): # 如果外部傳入的就是dict
                     processed_functional_zones = functional_zones
                elif isinstance(functional_zones, list): # 如果是 list of strings
                     processed_functional_zones = {f"zone_{i}": {"description": zone_desc} for i, zone_desc in enumerate(functional_zones)}


            # 組織場景資料
            scene_data = {
                "detected_objects": current_detected_objects,
                "functional_zones": processed_functional_zones, # 傳入處理過的字典
                "scene_type": current_scene_type,
                "object_statistics": object_statistics or {},
                "lighting_info": lighting_info,
                "spatial_analysis": spatial_analysis,
                "places365_info": places365_info
            }

            # 應用模板產生核心場景描述
            core_scene_details = self.template_manager.apply_template(selected_template, scene_data)

            # 組合基礎描述與核心場景細節
            description = base_description
            if core_scene_details and core_scene_details.strip():
                cleaned_scene_details = self._validate_and_clean_scene_details(core_scene_details)
                if base_description.lower() == "a scene" and len(cleaned_scene_details) > len(base_description):
                    description = cleaned_scene_details
                else:
                    description = self.text_formatter.smart_append(description, cleaned_scene_details)
            elif not core_scene_details and not description: # 如果兩者都為空
                description = self._generate_generic_description(current_detected_objects, lighting_info)

            # 添加次要描述資訊
            if current_scene_type in self.scene_types and "secondary_description" in self.scene_types[current_scene_type]:
                secondary_desc = self.scene_types[current_scene_type]["secondary_description"]
                if secondary_desc:
                    description = self.text_formatter.smart_append(description, secondary_desc)

            # 處理人物相關的描述
            people_objs = [obj for obj in current_detected_objects if obj.get("class_id") == 0]
            if people_objs:
                people_count = len(people_objs)
                if people_count == 1: people_phrase = "a single person"
                elif 1 < people_count <= 3: people_phrase = f"{people_count} people"
                elif 3 < people_count <= 7: people_phrase = "several people"
                else: people_phrase = "multiple people"
                if not any(p_word in description.lower() for p_word in ["person", "people", "pedestrian"]):
                    description = self.text_formatter.smart_append(description, f"The scene includes {people_phrase}.")

            # 添加文化背景元素(非空中視角）
            if cultural_context and viewpoint != "aerial":
                cultural_elements = self.cultural_context_analyzer.generate_cultural_elements(cultural_context)
                if cultural_elements:
                    description = self.text_formatter.smart_append(description, cultural_elements)

            # 處理光照條件描述
            lighting_description_text = ""
            if lighting_info and "time_of_day" in lighting_info:
                lighting_type = lighting_info["time_of_day"]
                lighting_desc_template = self.template_manager.get_lighting_template(lighting_type)
                if lighting_desc_template: lighting_description_text = lighting_desc_template
            if lighting_description_text and lighting_description_text.lower() not in description.lower():
                description = self.text_formatter.smart_append(description, lighting_description_text)

             # 添加視角特定的觀察描述
            if viewpoint != "eye_level":
                viewpoint_template = self.template_manager.get_viewpoint_template(viewpoint)
                prefix = viewpoint_template.get('prefix', '')
                observation_template = viewpoint_template.get("observation", "")
                scene_elements_for_vp = "the overall layout and objects"
                if viewpoint == "aerial": scene_elements_for_vp = "crossing patterns and general layout"
                viewpoint_observation_text = observation_template.format(scene_elements=scene_elements_for_vp)
                full_viewpoint_text = ""
                if prefix:
                    full_viewpoint_text = prefix.strip() + " "
                    if viewpoint_observation_text and viewpoint_observation_text[0].islower():
                        full_viewpoint_text += viewpoint_observation_text
                    elif viewpoint_observation_text:
                        full_viewpoint_text = prefix + (viewpoint_observation_text[0].lower() + viewpoint_observation_text[1:] if description else viewpoint_observation_text)
                elif viewpoint_observation_text:
                    full_viewpoint_text = viewpoint_observation_text[0].upper() + viewpoint_observation_text[1:]
                if full_viewpoint_text and full_viewpoint_text.lower() not in description.lower():
                    description = self.text_formatter.smart_append(description, full_viewpoint_text)

            # 需要轉換或調整 describe_functional_zones
            if functional_zones and len(functional_zones) > 0:
                if isinstance(functional_zones, dict):
                     zones_desc_text = self.object_description_generator.describe_functional_zones(functional_zones)
                else: # 如果是 list of strings
                     temp_zones_dict = {f"area_{i}": {"description": desc} for i, desc in enumerate(functional_zones)}
                     zones_desc_text = self.object_description_generator.describe_functional_zones(temp_zones_dict)

                if zones_desc_text:
                    description = self.text_formatter.smart_append(description, zones_desc_text)

            # 避免重複提到
            if hasattr(self.text_formatter, 'deduplicate_sentences_in_description'):
                deduplicated_description = self.text_formatter.deduplicate_sentences_in_description(description)
                self.logger.info(f"Description before pre-LLM deduplication (len {len(description)}): '{description[:150]}...'")
                self.logger.info(f"Description after pre-LLM deduplication (len {len(deduplicated_description)}): '{deduplicated_description[:150]}...'")
                description = deduplicated_description # 更新 description 為去除重複後的版本
            else:
                self.logger.warning("TextFormatter does not have 'deduplicate_sentences_in_description'. Skipping pre-LLM deduplication of the internally generated description.")

            # 格式化最終描述
            final_formatted_description = self.text_formatter.format_final_description(description)

            # 如果禁用地標，過濾地標引用
            if not enable_landmark:
                final_formatted_description = self.text_formatter.filter_landmark_references(final_formatted_description, enable_landmark=False)

            # 如果描述為空，使用備用描述
            if not final_formatted_description.strip() or final_formatted_description.strip() == ".":
                self.logger.warning(f"Description for scene_type '{current_scene_type}' became empty after processing. Falling back.")
                final_formatted_description = self.text_formatter.format_final_description(
                    self._generate_generic_description(current_detected_objects, lighting_info)
                )

            return final_formatted_description

        except Exception as e:
            error_msg = f"Error generating scene description: {str(e)}"
            self.logger.error(f"{error_msg}\n{e.__class__.__name__}: {str(e)}")
            try:
                fallback_desc = self._generate_generic_description(detected_objects, lighting_info)
                return self.text_formatter.format_final_description(fallback_desc)
            except:
                return "A scene with various elements is visible."

    def _extract_placeholders(self, template: str) -> List[str]:
        """提取模板中的佔位符"""
        import re
        return re.findall(r'\{([^}]+)\}', template)

    def _generate_placeholder_content(self, placeholder: str, detected_objects: List[Dict],
                                    functional_zones: List, scene_type: str,
                                    object_statistics: Dict) -> str:
        """生成佔位符內容"""
        all_replacements = self._generate_default_replacements()
        return self._get_placeholder_replacement(
            placeholder, {}, all_replacements, detected_objects, scene_type
        )

    def _preprocess_functional_zones(self, functional_zones: List) -> Dict:
        """預處理功能區域數據"""
        if isinstance(functional_zones, list):
            # 將列表轉換為字典格式
            zones_dict = {}
            for i, zone in enumerate(functional_zones):
                if isinstance(zone, str):
                    zones_dict[f"area {i+1}"] = {"description": zone}
                elif isinstance(zone, dict):
                    zones_dict[f"area {i+1}"] = zone
            return zones_dict
        elif isinstance(functional_zones, dict):
            return functional_zones
        else:
            return {}

    def _standardize_placeholder_content(self, content: str, placeholder_type: str) -> str:
        """標準化佔位符內容"""
        if not content:
            return "various elements"
        return content.strip()

    def _finalize_description_output(self, description: str) -> str:
        """最終化描述輸出"""
        if not description:
            return "A scene featuring various elements and organized areas of activity."

        # 基本清理
        import re
        finalized = re.sub(r'\s+', ' ', description).strip()

        # 確保適當結尾
        if finalized and not finalized.endswith(('.', '!', '?')):
            finalized += '.'

        # 首字母大寫
        if finalized:
            finalized = finalized[0].upper() + finalized[1:] if len(finalized) > 1 else finalized.upper()

        return finalized

    def _sanitize_scene_type_for_description(self, scene_type: str) -> str:
        """
        清理場景類型名稱，確保不包含內部標識符格式

        Args:
            scene_type: 原始場景類型名稱

        Returns:
            str: 清理後的場景類型名稱
        """
        try:
            # 移除下劃線並轉換為空格分隔的自然語言
            cleaned_type = scene_type.replace('_', ' ')

            # 確保不直接在描述中使用技術性場景類型名稱
            return cleaned_type

        except Exception as e:
            self.logger.warning(f"Error sanitizing scene type '{scene_type}': {str(e)}")
            return "general scene"

    def _validate_and_clean_scene_details(self, scene_details: str) -> str:
        """
        驗證並清理場景詳細信息，移除可能的模板填充錯誤

        Args:
            scene_details: 原始場景詳細信息

        Returns:
            str: 清理後的場景詳細信息
        """
        try:
            if not scene_details or not scene_details.strip():
                return ""

            cleaned = scene_details.strip()

            # 移除常見的模板填充錯誤模式
            import re

            # 修復 "In ," 類型的錯誤
            cleaned = re.sub(r'\bIn\s*,\s*', 'In this scene, ', cleaned)
            cleaned = re.sub(r'\bAt\s*,\s*', 'At this location, ', cleaned)
            cleaned = re.sub(r'\bWithin\s*,\s*', 'Within this area, ', cleaned)

            # 移除內部標識符格式
            cleaned = re.sub(r'\b\w+_\w+(?:_\w+)*\b(?!\s+(area|zone|region))',
                            lambda m: m.group(0).replace('_', ' '), cleaned)

            # 確保句子完整性
            if cleaned and not cleaned.endswith(('.', '!', '?')):
                cleaned += '.'

            return cleaned

        except Exception as e:
            self.logger.warning(f"Error validating scene details: {str(e)}")
            return scene_details if scene_details else ""

    def _generate_landmark_description(self,
                                     scene_type: str,
                                     detected_objects: List[Dict],
                                     confidence: float,
                                     lighting_info: Optional[Dict] = None,
                                     functional_zones: Optional[Dict] = None,
                                     landmark_objects: Optional[List[Dict]] = None) -> str:
        """
        生成包含地標信息的場景描述

        Args:
            scene_type: 識別的場景類型
            detected_objects: 檢測到的物件列表
            confidence: 場景分類置信度
            lighting_info: 照明條件信息
            functional_zones: 功能區域信息
            landmark_objects: 識別為地標的物件列表

        Returns:
            str: 包含地標信息的自然語言場景描述
        """
        try:
            # 如果沒有提供地標物件，從檢測物件中篩選
            if landmark_objects is None:
                landmark_objects = [obj for obj in detected_objects if obj.get("is_landmark", False)]

            # 如果沒有地標，退回到標準描述
            if not landmark_objects:
                if scene_type in ["tourist_landmark", "natural_landmark", "historical_monument"]:
                    base_description = "A scenic area that appears to be a tourist destination, though specific landmarks are not clearly identifiable."
                else:
                    return self.text_formatter.format_final_description(self._generate_scene_details(
                        scene_type,
                        detected_objects,
                        lighting_info,
                        self.viewpoint_detector.detect_viewpoint(detected_objects)
                    ))
            else:
                # 獲取主要地標
                primary_landmark = max(landmark_objects, key=lambda x: x.get("confidence", 0))
                landmark_name = primary_landmark.get("class_name", "landmark")
                # 先取原生 location
                landmark_location = primary_landmark.get("location", "")
                # 如果 location 為空，就從全域 ALL_LANDMARKS 補上
                lm_id = primary_landmark.get("landmark_id")
                if not landmark_location and lm_id and lm_id in ALL_LANDMARKS:
                    landmark_location = ALL_LANDMARKS[lm_id].get("location", "")

                # 根據地標類型選擇適當的描述模板，並插入 location
                if scene_type == "natural_landmark" or primary_landmark.get("landmark_type") == "natural":
                    base_description = f"A natural landmark scene featuring {landmark_name} in {landmark_location}."
                elif scene_type == "historical_monument" or primary_landmark.get("landmark_type") == "monument":
                    base_description = f"A historical monument scene showcasing {landmark_name}, a significant landmark in {landmark_location}."
                else:
                    base_description = f"A tourist landmark scene centered around {landmark_name}, an iconic structure in {landmark_location}."

            # 添加地標的額外信息
            landmark_details = []
            for landmark in landmark_objects:
                details = []

                if "year_built" in landmark:
                    details.append(f"built in {landmark['year_built']}")

                if "architectural_style" in landmark:
                    details.append(f"featuring {landmark['architectural_style']} architectural style")

                if "significance" in landmark:
                    details.append(landmark["significance"])

                # 補 location（如果該物件沒有 location，就再從 ALL_LANDMARKS 撈一次）
                loc = landmark.get("location", "")
                lm_id_iter = landmark.get("landmark_id")
                if not loc and lm_id_iter and lm_id_iter in ALL_LANDMARKS:
                    loc = ALL_LANDMARKS[lm_id_iter].get("location", "")
                if loc:
                    details.append(f"located in {loc}")

                if details:
                    landmark_details.append(f"{landmark['class_name']} ({', '.join(details)})")

            # 將詳細信息添加到基本描述中
            if landmark_details:
                description = base_description + " The scene features " + ", ".join(landmark_details) + "."
            else:
                description = base_description

            # 獲取視角
            viewpoint = self.viewpoint_detector.detect_viewpoint(detected_objects)

            # 生成人員活動描述
            people_count = len([obj for obj in detected_objects if obj["class_id"] == 0])

            if people_count > 0:
                if people_count == 1:
                    people_description = "There is one person in the scene, likely a tourist or visitor."
                elif people_count < 5:
                    people_description = f"There are {people_count} people in the scene, possibly tourists visiting the landmark."
                else:
                    people_description = f"The scene includes a group of {people_count} people, indicating this is a popular tourist destination."

                description = self.text_formatter.smart_append(description, people_description)

            # 添加照明信息
            if lighting_info and "time_of_day" in lighting_info:
                lighting_type = lighting_info["time_of_day"]
                lighting_description = self.template_manager.get_lighting_template(lighting_type)
                description = self.text_formatter.smart_append(description, lighting_description)

            # 添加視角描述
            if viewpoint != "eye_level":
                viewpoint_template = self.template_manager.get_viewpoint_template(viewpoint)

                prefix = viewpoint_template.get('prefix', '')
                if prefix and not description.startswith(prefix):
                    if description and description[0].isupper():
                        description = prefix + description[0].lower() + description[1:]
                    else:
                        description = prefix + description

                viewpoint_desc = viewpoint_template.get("observation", "").format(
                    scene_elements="the landmark and surrounding area"
                )

                if viewpoint_desc and viewpoint_desc not in description:
                    description = self.text_formatter.smart_append(description, viewpoint_desc)

            # 添加功能區域描述
            if functional_zones and len(functional_zones) > 0:
                zones_desc = self.object_description_generator.describe_functional_zones(functional_zones)
                if zones_desc:
                    description = self.text_formatter.smart_append(description, zones_desc)

            # 描述可能的活動
            landmark_activities = []

            if scene_type == "natural_landmark" or any(obj.get("landmark_type") == "natural" for obj in landmark_objects):
                landmark_activities = [
                    "nature photography",
                    "scenic viewing",
                    "hiking or walking",
                    "guided nature tours",
                    "outdoor appreciation"
                ]
            elif scene_type == "historical_monument" or any(obj.get("landmark_type") == "monument" for obj in landmark_objects):
                landmark_activities = [
                    "historical sightseeing",
                    "educational tours",
                    "cultural appreciation",
                    "photography of historical architecture",
                    "learning about historical significance"
                ]
            else:
                landmark_activities = [
                    "sightseeing",
                    "taking photographs",
                    "guided tours",
                    "cultural tourism",
                    "souvenir shopping"
                ]

            # 添加活動描述
            if landmark_activities:
                activities_text = "Common activities at this location include " + ", ".join(landmark_activities[:3]) + "."
                description = self.text_formatter.smart_append(description, activities_text)

            return self.text_formatter.format_final_description(description)

        except Exception as e:
            self.logger.warning(f"Error generating landmark description: {str(e)}")
            # 備用處理
            return self.text_formatter.format_final_description(
                "A landmark scene with notable architectural or natural features."
            )


    def _is_intersection(self, detected_objects: List[Dict]) -> bool:
        """
        通過分析物件分布來判斷場景是否為十字路口

        Args:
            detected_objects: 檢測到的物件列表

        Returns:
            bool: 是否為十字路口
        """
        try:
            pedestrians = [obj for obj in detected_objects if obj.get("class_id") == 0]

            if len(pedestrians) >= 8:
                positions = [obj.get("normalized_center", (0, 0)) for obj in pedestrians]

                x_coords = [pos[0] for pos in positions]
                y_coords = [pos[1] for pos in positions]

                x_variance = np.var(x_coords) if len(x_coords) > 1 else 0
                y_variance = np.var(y_coords) if len(y_coords) > 1 else 0

                x_range = max(x_coords) - min(x_coords)
                y_range = max(y_coords) - min(y_coords)

                if x_range > 0.5 and y_range > 0.5 and 0.7 < (x_range / y_range) < 1.3:
                    return True

            return False

        except Exception as e:
            self.logger.warning(f"Error detecting intersection: {str(e)}")
            return False

    def _generate_generic_description(self, detected_objects: List[Dict], lighting_info: Optional[Dict] = None) -> str:
        """
        當場景類型未知或置信度極低時生成通用描述

        Args:
            detected_objects: 檢測到的物件列表
            lighting_info: 可選的照明條件信息

        Returns:
            str: 基於檢測物件的通用描述
        """
        try:
            obj_counts = {}
            for obj in detected_objects:
                class_name = obj.get("class_name", "unknown object")
                if class_name not in obj_counts:
                    obj_counts[class_name] = 0
                obj_counts[class_name] += 1

            top_objects = sorted(obj_counts.items(), key=lambda x: x[1], reverse=True)[:5]

            if not top_objects:
                base_desc = "This scene displays various elements, though specific objects are not clearly identifiable."
            else:
                objects_text = []
                for name, count in top_objects:
                    # 確保物件名稱不包含技術性格式
                    clean_name = name.replace('_', ' ') if isinstance(name, str) else str(name)
                    if count > 1:
                        objects_text.append(f"{count} {clean_name}s")
                    else:
                        objects_text.append(f"a {clean_name}" if clean_name[0].lower() not in 'aeiou' else f"an {clean_name}")

                if len(objects_text) == 1:
                    objects_list = objects_text[0]
                elif len(objects_text) == 2:
                    objects_list = f"{objects_text[0]} and {objects_text[1]}"
                else:
                    objects_list = ", ".join(objects_text[:-1]) + f", and {objects_text[-1]}"

                base_desc = f"This scene features {objects_list}."

            # 添加照明信息
            if lighting_info and "time_of_day" in lighting_info:
                lighting_type = lighting_info["time_of_day"]
                lighting_desc = self.template_manager.get_lighting_template(lighting_type)
                base_desc += f" {lighting_desc}"

            return base_desc

        except Exception as e:
            self.logger.warning(f"Error generating generic description: {str(e)}")
            return "A general scene is visible with various elements."

    def _generate_scene_details(self,
                              scene_type: str,
                              detected_objects: List[Dict],
                              lighting_info: Optional[Dict] = None,
                              viewpoint: str = "eye_level",
                              spatial_analysis: Optional[Dict] = None,
                              image_dimensions: Optional[Tuple[int, int]] = None,
                              places365_info: Optional[Dict] = None,
                              object_statistics: Optional[Dict] = None) -> str:
        """
        基於場景類型和檢測物件生成詳細描述

        Args:
            scene_type: 識別的場景類型
            detected_objects: 檢測到的物件列表
            lighting_info: 可選的照明條件信息
            viewpoint: 檢測到的視角
            spatial_analysis: 可選的空間分析結果
            image_dimensions: 可選的圖像尺寸
            places365_info: 可選的 Places365 場景分類結果
            object_statistics: 可選的詳細物件統計信息

        Returns:
            str: 詳細場景描述
        """
        try:
            scene_details = ""

            # 日常場景類型列表
            everyday_scene_types = [
                "general_indoor_space", "generic_street_view",
                "desk_area_workspace", "outdoor_gathering_spot",
                "kitchen_counter_or_utility_area", "unknown"
            ]

            # 預處理場景類型以避免內部格式洩漏
            processed_scene_type = self._sanitize_scene_type_for_description(scene_type)

            # 確定場景描述方法
            is_confident_specific_scene = scene_type not in everyday_scene_types and scene_type in self.template_manager.get_scene_detail_templates(scene_type)
            treat_as_everyday = scene_type in everyday_scene_types

            if hasattr(self, 'enable_landmark') and not self.enable_landmark:
                if scene_type not in ["kitchen", "bedroom", "living_room", "office_workspace", "dining_area", "professional_kitchen"]:
                    treat_as_everyday = True

            if treat_as_everyday or not is_confident_specific_scene:
                self.logger.debug(f"Generating dynamic description for scene_type: {scene_type}")
                scene_details = self.object_description_generator.generate_dynamic_everyday_description(
                    detected_objects,
                    lighting_info,
                    viewpoint,
                    spatial_analysis,
                    image_dimensions,
                    places365_info,
                    object_statistics
                )
            else:
                self.logger.debug(f"Using template for scene_type: {scene_type}")
                templates_list = self.template_manager.get_scene_detail_templates(scene_type, viewpoint)

                if templates_list:
                    detail_template = random.choice(templates_list)
                    scene_details = self.template_manager.fill_template(
                        detail_template,
                        detected_objects,
                        scene_type,
                        places365_info,
                        object_statistics
                    )
                else:
                    scene_details = self.object_description_generator.generate_dynamic_everyday_description(
                        detected_objects, lighting_info, viewpoint, spatial_analysis,
                        image_dimensions, places365_info, object_statistics
                    )

            # 如果禁用地標檢測，過濾地標引用
            if hasattr(self, 'enable_landmark') and not self.enable_landmark:
                scene_details = self.text_formatter.filter_landmark_references(scene_details, enable_landmark=False)

            return scene_details if scene_details else "A scene with some visual elements."

        except Exception as e:
            self.logger.warning(f"Error generating scene details: {str(e)}")
            return "A scene with various elements."

    def filter_landmark_references(self, text, enable_landmark=True):
        """
        動態過濾文本中的地標引用

        Args:
            text: 需要過濾的文本
            enable_landmark: 是否啟用地標功能

        Returns:
            str: 過濾後的文本
        """
        return self.text_formatter.filter_landmark_references(text, enable_landmark)

    def get_prominent_objects(self, detected_objects: List[Dict],
                          min_prominence_score: float = 0.5,
                          max_categories_to_return: Optional[int] = None,
                          max_total_objects: Optional[int] = None) -> List[Dict]:
        """
        獲取最重要的物件

        Args:
            detected_objects: 檢測到的物件列表
            min_prominence_score: 最小重要性分數閾值，預設為0.5
            max_categories_to_return: 可選的最大返回類別數量限制
            max_total_objects: 可選的最大返回物件總數限制

        Returns:
            List[Dict]: 重要物件列表
        """
        try:
            # 傳遞所有參數
            prominent_objects = self.object_description_generator.get_prominent_objects(
                detected_objects,
                min_prominence_score,
                max_categories_to_return
            )

            # 如果指定了最大物件總數限制，進行額外過濾
            if max_total_objects is not None and max_total_objects > 0:
                # 限制總物件數量，保持重要性排序
                prominent_objects = prominent_objects[:max_total_objects]

            # 如果指定了最大類別數量限制，則進行額外過濾
            if max_categories_to_return is not None and max_categories_to_return > 0:
                # 按類別分組物件
                categories_seen = set()
                filtered_objects = []

                for obj in prominent_objects:
                    class_name = obj.get("class_name", "unknown")
                    if class_name not in categories_seen:
                        categories_seen.add(class_name)
                        filtered_objects.append(obj)

                        # 如果已達到最大類別數量，停止添加新類別
                        if len(categories_seen) >= max_categories_to_return:
                            break
                    elif class_name in categories_seen:
                        # 如果是已見過的類別，仍然添加該物件
                        filtered_objects.append(obj)

                return filtered_objects

            return prominent_objects

        except Exception as e:
            self.logger.warning(f"Error getting prominent objects: {str(e)}")
            return []

    def detect_viewpoint(self, detected_objects: List[Dict]) -> str:
        """
        檢測圖像視角類型

        Args:
            detected_objects: 檢測到的物件列表

        Returns:
            str: 檢測到的視角類型
        """
        try:
            return self.viewpoint_detector.detect_viewpoint(detected_objects)
        except Exception as e:
            self.logger.warning(f"Error detecting viewpoint: {str(e)}")
            return "eye_level"

    def detect_cultural_context(self, scene_type: str, detected_objects: List[Dict]) -> Optional[str]:
        """
        檢測場景的文化語境

        Args:
            scene_type: 識別的場景類型
            detected_objects: 檢測到的物件列表

        Returns:
            Optional[str]: 檢測到的文化語境或None
        """
        try:
            return self.cultural_context_analyzer.detect_cultural_context(scene_type, detected_objects)
        except CulturalContextError as e:
            self.logger.warning(f"Error detecting cultural context: {str(e)}")
            return None

    def generate_cultural_elements(self, cultural_context: str) -> str:
        """
        為檢測到的文化語境生成描述元素

        Args:
            cultural_context: 檢測到的文化語境

        Returns:
            str: 文化元素描述
        """
        try:
            return self.cultural_context_analyzer.generate_cultural_elements(cultural_context)
        except CulturalContextError as e:
            self.logger.warning(f"Error generating cultural elements: {str(e)}")
            return ""

    def format_object_list_for_description(self, objects: List[Dict],
                                         use_indefinite_article_for_one: bool = False,
                                         count_threshold_for_generalization: int = -1,
                                         max_types_to_list: int = 5) -> str:
        """
        將物件列表格式化為人類可讀的字符串

        Args:
            objects: 物件字典列表
            use_indefinite_article_for_one: 單個物件是否使用 "a/an"
            count_threshold_for_generalization: 計數閾值
            max_types_to_list: 最大物件類型數量

        Returns:
            str: 格式化的物件描述字符串
        """
        try:
            return self.object_description_generator.format_object_list_for_description(
                objects, use_indefinite_article_for_one, count_threshold_for_generalization, max_types_to_list
            )
        except ObjectDescriptionError as e:
            self.logger.warning(f"Error formatting object list: {str(e)}")
            return "various objects"

    def get_spatial_description(self, obj: Dict, image_width: Optional[int] = None,
                              image_height: Optional[int] = None) -> str:
        """
        為物件生成空間位置描述

        Args:
            obj: 物件字典
            image_width: 可選的圖像寬度
            image_height: 可選的圖像高度

        Returns:
            str: 空間描述字符串
        """
        try:
            return self.object_description_generator.get_spatial_description(obj, image_width, image_height)
        except ObjectDescriptionError as e:
            self.logger.warning(f"Error generating spatial description: {str(e)}")
            return "in the scene"

    def optimize_object_description(self, description: str) -> str:
        """
        優化物件描述，避免重複列舉相同物件

        Args:
            description: 原始描述文本

        Returns:
            str: 優化後的描述文本
        """
        try:
            return self.object_description_generator.optimize_object_description(description)
        except ObjectDescriptionError as e:
            self.logger.warning(f"Error optimizing object description: {str(e)}")
            return description

    def describe_functional_zones(self, functional_zones: Dict) -> str:
        """
        生成場景功能區域的描述

        Args:
            functional_zones: 識別出的功能區域字典

        Returns:
            str: 功能區域描述
        """
        try:
            return self.object_description_generator.describe_functional_zones(functional_zones)
        except ObjectDescriptionError as e:
            self.logger.warning(f"Error describing functional zones: {str(e)}")
            return ""

    def smart_append(self, current_text: str, new_fragment: str) -> str:
        """
        智能地將新文本片段附加到現有文本

        Args:
            current_text: 要附加到的現有文本
            new_fragment: 要附加的新文本片段

        Returns:
            str: 合併後的文本
        """
        try:
            return self.text_formatter.smart_append(current_text, new_fragment)
        except TextFormattingError as e:
            self.logger.warning(f"Error in smart append: {str(e)}")
            return f"{current_text} {new_fragment}" if current_text else new_fragment

    def format_final_description(self, text: str) -> str:
        """
        格式化最終描述文本

        Args:
            text: 要格式化的文本

        Returns:
            str: 格式化後的文本
        """
        try:
            return self.text_formatter.format_final_description(text)
        except TextFormattingError as e:
            self.logger.warning(f"Error formatting final description: {str(e)}")
            return text

    def get_template(self, category: str, key: Optional[str] = None):
        """
        獲取指定類別的模板

        Args:
            category: 模板類別名稱
            key: 可選的具體模板鍵值

        Returns:
            模板內容
        """
        try:
            return self.template_manager.get_template(category, key)
        except (TemplateLoadingError, TemplateFillError) as e:
            self.logger.warning(f"Error getting template: {str(e)}")
            return None

    def get_viewpoint_confidence(self, detected_objects: List[Dict]) -> Tuple[str, float]:
        """
        獲取視角檢測結果及其信心度

        Args:
            detected_objects: 檢測到的物件列表

        Returns:
            Tuple[str, float]: (視角類型, 信心度)
        """
        try:
            return self.viewpoint_detector.get_viewpoint_confidence(detected_objects)
        except ViewpointDetectionError as e:
            self.logger.warning(f"Error getting viewpoint confidence: {str(e)}")
            return "eye_level", 0.5

    def get_supported_cultures(self) -> List[str]:
        """
        獲取所有支援的文化語境列表

        Returns:
            List[str]: 支援的文化語境名稱列表
        """
        return self.cultural_context_analyzer.get_supported_cultures()

    def has_cultural_context(self, cultural_context: str) -> bool:
        """
        檢查是否支援指定的文化語境

        Args:
            cultural_context: 文化語境名稱

        Returns:
            bool: 是否支援該文化語境
        """
        return self.cultural_context_analyzer.has_cultural_context(cultural_context)

    def validate_text_quality(self, text: str) -> Dict[str, bool]:
        """
        驗證文本質量

        Args:
            text: 要驗證的文本

        Returns:
            Dict[str, bool]: 質量檢查結果
        """
        try:
            return self.text_formatter.validate_text_quality(text)
        except TextFormattingError as e:
            self.logger.warning(f"Error validating text quality: {str(e)}")
            return {"error": True}

    def get_text_statistics(self, text: str) -> Dict[str, int]:
        """
        獲取文本統計信息

        Args:
            text: 要分析的文本

        Returns:
            Dict[str, int]: 文本統計信息
        """
        try:
            return self.text_formatter.get_text_statistics(text)
        except TextFormattingError as e:
            self.logger.warning(f"Error getting text statistics: {str(e)}")
            return {"characters": 0, "words": 0, "sentences": 0}

    def reload_templates(self):
        """
        重新載入所有模板
        """
        try:
            self.template_manager.reload_templates()
            self.logger.info("Templates reloaded successfully")
        except (TemplateLoadingError, TemplateFillError) as e:
            self.logger.error(f"Error reloading templates: {str(e)}")
            raise EnhancedSceneDescriberError(f"Failed to reload templates: {str(e)}") from e

    def get_configuration(self) -> Dict[str, Any]:
        """
        獲取當前配置信息

        Returns:
            Dict[str, Any]: 配置信息字典
        """
        try:
            return {
                "scene_types_count": len(self.scene_types),
                "viewpoint_detector_config": self.viewpoint_detector.viewpoint_params,
                "object_generator_config": self.object_description_generator.get_configuration(),
                "supported_cultures": self.cultural_context_analyzer.get_supported_cultures(),
                "template_categories": self.template_manager.get_template_categories()
            }
        except Exception as e:
            self.logger.warning(f"Error getting configuration: {str(e)}")
            return {"error": str(e)}

    def _initialize_fallback_components(self):
        """
        Initialize fallback components.

        Creates a fresh RegionAnalyzer and an ObjectDescriptionGenerator that
        uses it, storing both on the instance. Any exception raised while
        constructing them is logged at error level and not re-raised.
        """
        try:
            self.region_analyzer = RegionAnalyzer()
            # The generator is given the analyzer that was just created.
            self.object_description_generator = ObjectDescriptionGenerator(
                region_analyzer=self.region_analyzer
            )
        except Exception as e:
            # Best-effort: a failure here is only logged, so either attribute
            # may be left unset afterwards.
            self.logger.error(f"Fallback component initialization failed: {str(e)}")