diff --git "a/enhance_scene_describer.py" "b/enhance_scene_describer.py"
deleted file mode 100644
--- "a/enhance_scene_describer.py"
+++ /dev/null
@@ -1,2350 +0,0 @@
-import os
-import re
-import json
-import logging
-import random
-import numpy as np
-from typing import Dict, List, Tuple, Any, Optional
-
-from scene_type import SCENE_TYPES
-from scene_detail_templates import SCENE_DETAIL_TEMPLATES
-from object_template_fillers import OBJECT_TEMPLATE_FILLERS
-from lighting_conditions import LIGHTING_CONDITIONS
-from viewpoint_templates import VIEWPOINT_TEMPLATES
-from cultural_templates import CULTURAL_TEMPLATES
-from confidence_templates import CONFIDENCE_TEMPLATES
-from landmark_data import ALL_LANDMARKS
-
-class EnhancedSceneDescriber:
-    """
-    Enhanced scene description generator with improved template handling,
-    viewpoint awareness, and cultural context recognition.
-    Provides detailed natural language descriptions of scenes based on
-    detection results and scene classification.
-    """
-
-    def __init__(self, templates_db: Optional[Dict] = None, scene_types: Optional[Dict] = None, spatial_analyzer_instance: Optional[Any] = None):
-        """
-        Initialize the enhanced scene describer.
-
-        Args:
-            templates_db: Optional custom templates database
-            scene_types: Dictionary of scene type definitions
-        """
-        self.logger = logging.getLogger(self.__class__.__name__) # Use class name for logger
-        self.logger.setLevel(logging.INFO) # Or your desired logging level
-        # Optional: Add a handler if not configured globally
-        if not self.logger.hasHandlers():
-            handler = logging.StreamHandler()
-            formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
-            handler.setFormatter(formatter)
-            self.logger.addHandler(handler)
-
-        # Load or use provided scene types
-        self.scene_types = scene_types or self._load_default_scene_types()
-
-        # Load templates database
-        self.templates = templates_db or self._load_templates()
-
-        # Initialize viewpoint detection parameters
-        self._initialize_viewpoint_parameters()
-
-    def _load_default_scene_types(self) -> Dict:
-        """
-        Load default scene types.
-
-        Returns:
-            Dict: Scene type definitions
-        """
-
-        return SCENE_TYPES
-
-    def _load_templates(self) -> Dict:
-        """
-        Load description templates from imported Python modules.
-
-        Returns:
-            Dict: Template collections for different description components
-        """
-        templates = {}
-
-        # 載入事先準備的模板
-        templates["scene_detail_templates"] = SCENE_DETAIL_TEMPLATES
-        templates["object_template_fillers"] = OBJECT_TEMPLATE_FILLERS
-        templates["viewpoint_templates"] = VIEWPOINT_TEMPLATES
-        templates["cultural_templates"] = CULTURAL_TEMPLATES
-
-        # 從 LIGHTING_CONDITIONS 獲取照明模板
-        templates["lighting_templates"] = {
-            key: data["general"] for key, data in LIGHTING_CONDITIONS.get("time_descriptions", {}).items()
-        }
-
-        # 設置默認的置信度模板
-        templates["confidence_templates"] = {
-            "high": "{description} {details}",
-            "medium": "This appears to be {description} {details}",
-            "low": "This might be {description}, but the confidence is low. {details}"
-        }
-
-        # 初始化其他必要的模板（現在這個函數簡化了很多）
-        self._initialize_default_templates(templates)
-
-        return templates
-
-    def _initialize_default_templates(self, templates: Dict):
-        """
-        檢查模板字典並填充任何缺失的默認模板。
-
-        在將模板移至專門的模組後，此方法主要作為安全機制，
-        確保即使導入失敗或某些模板未在外部定義，系統仍能正常運行。
-
-        Args:
-            templates: 要檢查和更新的模板字典
-        """
-        # 檢查關鍵模板類型是否存在，如果不存在則添加默認值
-
-        # 置信度模板 - 用於控制描述的語氣
-        if "confidence_templates" not in templates:
-            templates["confidence_templates"] = {
-                "high": "{description} {details}",
-                "medium": "This appears to be {description} {details}",
-                "low": "This might be {description}, but the confidence is low. {details}"
-            }
-
-        # 場景細節模板
-        if "scene_detail_templates" not in templates:
-            templates["scene_detail_templates"] = {
-                "default": ["A space with various objects."]
-            }
-
-        # 物體填充模板，用於生成物體描述
-        if "object_template_fillers" not in templates:
-            templates["object_template_fillers"] = {
-                "default": ["various items"]
-            }
-
-        # 視角模板，雖然現在從專門模組導入，但可作為備份
-        if "viewpoint_templates" not in templates:
-            # 使用簡化版的默認視角模板
-            templates["viewpoint_templates"] = {
-                "eye_level": {
-                    "prefix": "From eye level, ",
-                    "observation": "the scene is viewed straight on."
-                },
-                "aerial": {
-                    "prefix": "From above, ",
-                    "observation": "the scene is viewed from a bird's-eye perspective."
-                }
-            }
-
-        # 文化模板
-        if "cultural_templates" not in templates:
-            templates["cultural_templates"] = {
-                "asian": {
-                    "elements": ["cultural elements"],
-                    "description": "The scene has Asian characteristics."
-                },
-                "european": {
-                    "elements": ["architectural features"],
-                    "description": "The scene has European characteristics."
-                }
-            }
-
-        # 照明模板 - 用於描述光照條件
-        if "lighting_templates" not in templates:
-            templates["lighting_templates"] = {
-                "day_clear": "The scene is captured during daylight.",
-                "night": "The scene is captured at night.",
-                "unknown": "The lighting conditions are not easily determined."
-            }
-
-
-    def _initialize_viewpoint_parameters(self):
-        """
-        Initialize parameters used for viewpoint detection.
-        """
-        self.viewpoint_params = {
-            # Parameters for detecting aerial views
-            "aerial_threshold": 0.7,  # High object density viewed from top
-            "aerial_size_variance_threshold": 0.15,  # Low size variance in aerial views
-
-            # Parameters for detecting low angle views
-            "low_angle_threshold": 0.3,  # Bottom-heavy object distribution
-            "vertical_size_ratio_threshold": 1.8,  # Vertical objects appear taller
-
-            # Parameters for detecting elevated views
-            "elevated_threshold": 0.6,  # Objects mostly in middle/bottom
-            "elevated_top_threshold": 0.3  # Few objects at top of frame
-        }
-
-    def _generate_landmark_description(self,
-                                 scene_type: str,
-                                 detected_objects: List[Dict],
-                                 confidence: float,
-                                 lighting_info: Optional[Dict] = None,
-                                 functional_zones: Optional[Dict] = None,
-                                 landmark_objects: Optional[List[Dict]] = None) -> str:
-        """
-        生成包含地標信息的場景描述
-
-        Args:
-            scene_type: 識別的場景類型
-            detected_objects: 檢測到的物體列表
-            confidence: 場景分類置信度
-            lighting_info: 照明條件信息（可選）
-            functional_zones: 功能區域信息（可選）
-            landmark_objects: 識別為地標的物體列表（可選）
-
-        Returns:
-            str: 包含地標信息的自然語言場景描述
-        """
-        # 如果沒有提供地標物體，則從檢測物體中篩選
-        if landmark_objects is None:
-            landmark_objects = [obj for obj in detected_objects if obj.get("is_landmark", False)]
-
-        # 如果沒有地標，退回到標準描述
-        if not landmark_objects:
-            if scene_type in ["tourist_landmark", "natural_landmark", "historical_monument"]:
-                # 場景類型是地標但沒有具體地標物體
-                base_description = "A scenic area that appears to be a tourist destination, though specific landmarks are not clearly identifiable."
-            else:
-                # 使用標準方法生成基本描述
-                return self._format_final_description(self._generate_scene_details(
-                    scene_type,
-                    detected_objects,
-                    lighting_info,
-                    self._detect_viewpoint(detected_objects)
-                ))
-        else:
-            # 獲取主要地標（信心度最高的）
-            primary_landmark = max(landmark_objects, key=lambda x: x.get("confidence", 0))
-            landmark_name = primary_landmark.get("class_name", "landmark")
-            landmark_location = primary_landmark.get("location", "")
-
-            # 根據地標類型選擇適當的描述模板
-            if scene_type == "natural_landmark" or primary_landmark.get("landmark_type") == "natural":
-                base_description = f"A natural landmark scene featuring {landmark_name} in {landmark_location}."
-            elif scene_type == "historical_monument" or primary_landmark.get("landmark_type") == "monument":
-                base_description = f"A historical monument scene showcasing {landmark_name}, a significant landmark in {landmark_location}."
-            else:
-                base_description = f"A tourist landmark scene centered around {landmark_name}, an iconic structure in {landmark_location}."
-
-        # 加地標的額外信息
-        landmark_details = []
-        for landmark in landmark_objects:
-            details = []
-
-            # 加建造年份
-            if "year_built" in landmark:
-                details.append(f"built in {landmark['year_built']}")
-
-            # 加建築風格
-            if "architectural_style" in landmark:
-                details.append(f"featuring {landmark['architectural_style']} architectural style")
-
-            # 加重要性
-            if "significance" in landmark:
-                details.append(landmark["significance"])
-
-            # 如果有詳細信息，加到描述中
-            if details:
-                landmark_details.append(f"{landmark['class_name']} ({', '.join(details)})")
-
-        # 將詳細信息添加到基本描述中
-        if landmark_details:
-            description = base_description + " " + "The scene features " + ", ".join(landmark_details) + "."
-        else:
-            description = base_description
-
-        # 獲取視角
-        viewpoint = self._detect_viewpoint(detected_objects)
-
-        # 生成人員活動描述
-        people_count = len([obj for obj in detected_objects if obj["class_id"] == 0])  # 人的類別ID通常為0
-
-        if people_count > 0:
-            if people_count == 1:
-                people_description = "There is one person in the scene, likely a tourist or visitor."
-            elif people_count < 5:
-                people_description = f"There are {people_count} people in the scene, possibly tourists visiting the landmark."
-            else:
-                people_description = f"The scene includes a group of {people_count} people, indicating this is a popular tourist destination."
-
-            description = self._smart_append(description, people_description)
-
-        # 添加照明信息
-        if lighting_info and "time_of_day" in lighting_info:
-            lighting_type = lighting_info["time_of_day"]
-            if lighting_type in self.templates.get("lighting_templates", {}):
-                lighting_description = self.templates["lighting_templates"][lighting_type]
-                description = self._smart_append(description, lighting_description)
-
-        # 添加視角描述
-        if viewpoint != "eye_level" and viewpoint in self.templates.get("viewpoint_templates", {}):
-            viewpoint_template = self.templates["viewpoint_templates"][viewpoint]
-
-            # 添加視角前綴
-            prefix = viewpoint_template.get('prefix', '')
-            if prefix and not description.startswith(prefix):
-                # 保持句子流暢性
-                if description and description[0].isupper():
-                    description = prefix + description[0].lower() + description[1:]
-                else:
-                    description = prefix + description
-
-            # 添加視角觀察描述
-            viewpoint_desc = viewpoint_template.get("observation", "").format(
-                scene_elements="the landmark and surrounding area"
-            )
-
-            if viewpoint_desc and viewpoint_desc not in description:
-                description = self._smart_append(description, viewpoint_desc)
-
-        # 添加功能區域描述
-        if functional_zones and len(functional_zones) > 0:
-            zones_desc = self._describe_functional_zones(functional_zones)
-            if zones_desc:
-                description = self._smart_append(description, zones_desc)
-
-        # 描述可能的活動
-        landmark_activities = []
-
-        # 根據地標類型生成通用活動
-        if scene_type == "natural_landmark" or any(obj.get("landmark_type") == "natural" for obj in landmark_objects):
-            landmark_activities = [
-                "nature photography",
-                "scenic viewing",
-                "hiking or walking",
-                "guided nature tours",
-                "outdoor appreciation"
-            ]
-        elif scene_type == "historical_monument" or any(obj.get("landmark_type") == "monument" for obj in landmark_objects):
-            landmark_activities = [
-                "historical sightseeing",
-                "educational tours",
-                "cultural appreciation",
-                "photography of historical architecture",
-                "learning about historical significance"
-            ]
-        else:
-            landmark_activities = [
-                "sightseeing",
-                "taking photographs",
-                "guided tours",
-                "cultural tourism",
-                "souvenir shopping"
-            ]
-
-        # 添加活動描述
-        if landmark_activities:
-            activities_text = "Common activities at this location include " + ", ".join(landmark_activities[:3]) + "."
-            description = self._smart_append(description, activities_text)
-
-        # 最後格式化描述
-        return self._format_final_description(description)
-
-    def filter_landmark_references(self, text, enable_landmark=True):
-        """
-        動態過濾文本中的地標引用
-
-        Args:
-            text: 需要過濾的文本
-            enable_landmark: 是否啟用地標功能
-
-        Returns:
-            str: 過濾後的文本
-        """
-        if enable_landmark or not text:
-            return text
-
-        try:
-            # 動態收集所有地標名稱和位置
-            landmark_names = []
-            locations = []
-
-            for landmark_id, info in ALL_LANDMARKS.items():
-                # 收集地標名稱及其別名
-                landmark_names.append(info["name"])
-                landmark_names.extend(info.get("aliases", []))
-
-                # 收集地理位置
-                if "location" in info:
-                    location = info["location"]
-                    locations.append(location)
-
-                    # 處理分離的城市和國家名稱
-                    parts = location.split(",")
-                    if len(parts) >= 1:
-                        locations.append(parts[0].strip())
-                    if len(parts) >= 2:
-                        locations.append(parts[1].strip())
-
-            # 使用正則表達式動態替換所有地標名稱
-            import re
-            for name in landmark_names:
-                if name and len(name) > 2:  # 避免過短的名稱
-                    text = re.sub(r'\b' + re.escape(name) + r'\b', "tall structure", text, flags=re.IGNORECASE)
-
-            # 動態替換所有位置引用
-            for location in locations:
-                if location and len(location) > 2:
-                    # 替換常見位置表述模式
-                    text = re.sub(r'in ' + re.escape(location), "in the urban area", text, flags=re.IGNORECASE)
-                    text = re.sub(r'of ' + re.escape(location), "of the urban area", text, flags=re.IGNORECASE)
-                    text = re.sub(r'\b' + re.escape(location) + r'\b', "the urban area", text, flags=re.IGNORECASE)
-
-        except ImportError:
-            # 如果無法導入，使用基本模式
-            pass
-
-        # 通用地標描述模式替換
-        landmark_patterns = [
-            (r'a (tourist|popular|famous) landmark', r'an urban structure'),
-            (r'an iconic structure in ([A-Z][a-zA-Z\s,]+)', r'an urban structure in the area'),
-            (r'a famous (monument|tower|landmark) in ([A-Z][a-zA-Z\s,]+)', r'an urban structure in the area'),
-            (r'(centered|built|located|positioned) around the ([A-Z][a-zA-Z\s]+? (Tower|Monument|Landmark))', r'located in this area'),
-            (r'(sightseeing|guided tours|cultural tourism) (at|around|near) (this landmark|the [A-Z][a-zA-Z\s]+)', r'\1 in this area'),
-            (r'this (famous|iconic|historic|well-known) (landmark|monument|tower|structure)', r'this urban structure'),
-            (r'([A-Z][a-zA-Z\s]+) Tower', r'tall structure'),
-            (r'a (tower|structure) in ([A-Z][a-zA-Z\s,]+)', r'a \1 in the area'),
-            (r'landmark scene', r'urban scene'),
-            (r'tourist destination', r'urban area'),
-            (r'tourist attraction', r'urban area')
-        ]
-
-        for pattern, replacement in landmark_patterns:
-            text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)
-
-        return text
-
-
-    def generate_description(self, scene_type: str, detected_objects: List[Dict], confidence: float,
-                    lighting_info: Dict, functional_zones: List[str], enable_landmark: bool = True,
-                    scene_scores: Optional[Dict] = None, spatial_analysis: Optional[Dict] = None,
-                    image_dimensions: Optional[Dict] = None, places365_info: Optional[Dict] = None,
-                    object_statistics: Optional[Dict] = None) -> str:
-        """
-        Generate enhanced scene description based on detection results, scene type,
-        and additional contextual information.
-        This version ensures that the main scene_details (from the first call)
-        is properly integrated and not overwritten by a simplified second call.
-        """
-        # Handle unknown scene type or very low confidence as an early exit
-        if scene_type == "unknown" or confidence < 0.4:
-            # _generate_generic_description should also ideally use image_dimensions if it does spatial reasoning
-            generic_desc = self._generate_generic_description(detected_objects, lighting_info)
-            return self._format_final_description(generic_desc)
-
-        # Filter out landmark objects if landmark detection is disabled for this run
-        current_detected_objects = detected_objects
-        if not enable_landmark:
-            current_detected_objects = [obj for obj in detected_objects if not obj.get("is_landmark", False)]
-
-        # Log Places365 context if available
-        places365_context = ""
-        if places365_info and places365_info.get('confidence', 0) > 0.3:
-            scene_label = places365_info.get('scene_label', '')
-            attributes = places365_info.get('attributes', [])
-            is_indoor = places365_info.get('is_indoor', None)
-
-            if scene_label:
-                places365_context = f"Scene context: {scene_label}"
-                if attributes:
-                    places365_context += f" with characteristics: {', '.join(attributes[:3])}"
-                if is_indoor is not None:
-                    indoor_outdoor = "indoor" if is_indoor else "outdoor"
-                    places365_context += f" ({indoor_outdoor} environment)"
-
-            print(f"Enhanced description incorporating Places365 context: {places365_context}")
-
-        landmark_objects_in_scene = [obj for obj in current_detected_objects if obj.get("is_landmark", False)]
-        has_landmark_in_scene = len(landmark_objects_in_scene) > 0
-
-        # If landmark processing is enabled and it's a landmark scene or landmarks are detected
-        if enable_landmark and (scene_type in ["tourist_landmark", "natural_landmark", "historical_monument"] or has_landmark_in_scene):
-            landmark_desc = self._generate_landmark_description(
-                scene_type,
-                current_detected_objects, # Pass potentially filtered list
-                confidence,
-                lighting_info,
-                functional_zones,
-                landmark_objects_in_scene # Pass the explicitly filtered landmark objects
-            )
-            return self._format_final_description(landmark_desc)
-
-        # **[Start of main description construction for non-landmark or landmark-disabled everyday scenes]**
-
-        # Detect viewpoint based on current (potentially filtered) objects
-        viewpoint = self._detect_viewpoint(current_detected_objects)
-        current_scene_type = scene_type # Use a mutable variable for scene_type if it can change
-
-        # Process aerial viewpoint scene types (may re-assign current_scene_type)
-        if viewpoint == "aerial":
-            if "intersection" in current_scene_type.lower() or self._is_intersection(current_detected_objects): # Use lower for robustness
-                current_scene_type = "aerial_view_intersection"
-            elif any(keyword in current_scene_type.lower() for keyword in ["commercial", "shopping", "retail"]):
-                current_scene_type = "aerial_view_commercial_area"
-            elif any(keyword in current_scene_type.lower() for keyword in ["plaza", "square"]):
-                current_scene_type = "aerial_view_plaza"
-            else: # Default aerial if specific not matched
-                current_scene_type = "aerial_view_general" # Or use a specific default like aerial_view_intersection
-
-        # Detect cultural context (only for non-aerial viewpoints)
-        cultural_context = None
-        if viewpoint != "aerial":
-            cultural_context = self._detect_cultural_context(current_scene_type, current_detected_objects)
-
-        # Get base description for the (potentially updated) scene type
-        base_description = "A scene" # Default initialization
-        if viewpoint == "aerial":
-            # Check if current_scene_type (which might be an aerial type) has a base description
-            if current_scene_type in self.scene_types:
-                 base_description = self.scene_types[current_scene_type].get("description", "An aerial view showing the layout and movement patterns from above")
-            else:
-                 base_description = "An aerial view showing the layout and movement patterns from above"
-        elif current_scene_type in self.scene_types:
-            base_description = self.scene_types[current_scene_type].get("description", "A scene")
-
-        # spatial analysis, and image dimensions. This is where dynamic description or template filling happens.
-        core_scene_details = self._generate_scene_details(
-            current_scene_type, # Use the potentially updated scene_type
-            current_detected_objects,
-            lighting_info,
-            viewpoint,
-            spatial_analysis=spatial_analysis,    # Pass this through
-            image_dimensions=image_dimensions,     # Pass this through
-            places365_info=places365_info,        # Pass Places365 info
-            object_statistics=object_statistics   # Pass object statistics
-        )
-
-        # Start with the base description derived from SCENE_TYPES or a default.
-        description = base_description
-        if core_scene_details and core_scene_details.strip() != "": # Ensure core_scene_details is not empty
-            # If base_description is generic like "A scene", consider replacing it or appending smartly.
-            if base_description.lower() == "a scene" and len(core_scene_details) > len(base_description):
-                description = core_scene_details # Prioritize dynamic/template-filled details if base is too generic
-            else:
-                description = self._smart_append(description, core_scene_details)
-        elif not core_scene_details and not description: # If both are empty, use a generic fallback
-            description = self._generate_generic_description(current_detected_objects, lighting_info)
-
-
-        # Append secondary description from scene type template, if any
-        if current_scene_type in self.scene_types and "secondary_description" in self.scene_types[current_scene_type]:
-            secondary_desc = self.scene_types[current_scene_type]["secondary_description"]
-            if secondary_desc:
-                description = self._smart_append(description, secondary_desc)
-
-        # Append people count information
-        people_objs = [obj for obj in current_detected_objects if obj.get("class_id") == 0]
-        if people_objs:
-            people_count = len(people_objs)
-
-            if people_count == 1: people_phrase = "a single person"
-            elif people_count > 1 and people_count <= 3: people_phrase = f"{people_count} people" # Accurate for small counts
-            elif people_count > 3 and people_count <=7: people_phrase = "several people"
-            else: people_phrase = "multiple people" # For larger counts, or use "numerous"
-
-            # Only add if not already well covered in core_scene_details or base_description
-            if "person" not in description.lower() and "people" not in description.lower() and "pedestrian" not in description.lower():
-                description = self._smart_append(description, f"The scene includes {people_phrase}.")
-
-        # Append cultural context
-        if cultural_context and viewpoint != "aerial": # Already checked viewpoint
-            cultural_elements = self._generate_cultural_elements(cultural_context)
-            if cultural_elements:
-                description = self._smart_append(description, cultural_elements)
-
-        # Append lighting information
-        lighting_description_text = ""
-        if lighting_info and "time_of_day" in lighting_info:
-            lighting_type = lighting_info["time_of_day"]
-            lighting_desc_template = self.templates.get("lighting_templates", {}).get(lighting_type)
-            if lighting_desc_template:
-                lighting_description_text = lighting_desc_template
-        if lighting_description_text and lighting_description_text.lower() not in description.lower():
-            description = self._smart_append(description, lighting_description_text)
-
-        # Append viewpoint information (if not eye-level)
-        if viewpoint != "eye_level" and viewpoint in self.templates.get("viewpoint_templates", {}):
-            viewpoint_template = self.templates["viewpoint_templates"][viewpoint]
-            prefix = viewpoint_template.get('prefix', '')
-            observation_template = viewpoint_template.get("observation", "")
-
-            # Determine scene_elements for the observation template
-            scene_elements_for_vp = "the overall layout and objects" # Generic default
-            if viewpoint == "aerial":
-                scene_elements_for_vp = "crossing patterns and general layout"
-
-            viewpoint_observation_text = observation_template.format(scene_elements=scene_elements_for_vp)
-
-            # Combine prefix and observation carefully
-            full_viewpoint_text = ""
-            if prefix:
-                full_viewpoint_text = prefix.strip() + " "
-                if viewpoint_observation_text and viewpoint_observation_text[0].islower():
-                    full_viewpoint_text += viewpoint_observation_text
-                elif viewpoint_observation_text:
-                    full_viewpoint_text = prefix + viewpoint_observation_text[0].lower() + viewpoint_observation_text[1:] if description else prefix + viewpoint_observation_text
-
-            elif viewpoint_observation_text: # No prefix, but observation exists
-                 full_viewpoint_text = viewpoint_observation_text[0].upper() + viewpoint_observation_text[1:]
-
-
-            if full_viewpoint_text and full_viewpoint_text.lower() not in description.lower():
-                description = self._smart_append(description, full_viewpoint_text)
-
-
-        # Append functional zones information
-        if functional_zones and len(functional_zones) > 0:
-            zones_desc_text = self._describe_functional_zones(functional_zones)
-            if zones_desc_text:
-                description = self._smart_append(description, zones_desc_text)
-
-        final_formatted_description = self._format_final_description(description)
-
-        if not enable_landmark:
-            final_formatted_description = self.filter_landmark_references(final_formatted_description, enable_landmark=False)
-
-        # If after all processing, description is empty, fallback to a very generic one.
-        if not final_formatted_description.strip() or final_formatted_description.strip() == ".":
-            self.logger.warning(f"Description for scene_type '{current_scene_type}' became empty after processing. Falling back.")
-            final_formatted_description = self._format_final_description(
-                self._generate_generic_description(current_detected_objects, lighting_info)
-            )
-
-        return final_formatted_description
-
-
-    def _smart_append(self, current_text: str, new_fragment: str) -> str:
-        """
-        Intelligently append a new text fragment to the current text,
-        handling punctuation and capitalization correctly.
-
-        Args:
-            current_text: The existing text to append to
-            new_fragment: The new text fragment to append
-
-        Returns:
-            str: The combined text with proper formatting
-        """
-        # Handle empty cases
-        if not new_fragment:
-            return current_text
-
-        if not current_text:
-            # Ensure first character is uppercase for the first fragment
-            return new_fragment[0].upper() + new_fragment[1:] if new_fragment else ""
-
-        # Clean up existing text
-        current_text = current_text.rstrip()
-
-        # Check for ending punctuation
-        ends_with_sentence = current_text.endswith(('.', '!', '?'))
-        ends_with_comma = current_text.endswith(',')
-
-        # Specifically handle the "A xxx A yyy" pattern that's causing issues
-        if (current_text.startswith("A ") or current_text.startswith("An ")) and \
-        (new_fragment.startswith("A ") or new_fragment.startswith("An ")):
-            return current_text + ". " + new_fragment
-
-        # 檢查新片段是否包含地標名稱（通常為專有名詞）
-        has_landmark_name = any(word[0].isupper() for word in new_fragment.split()
-                            if len(word) > 2 and not word.startswith(("A ", "An ", "The ")))
-
-        # Decide how to join the texts
-        if ends_with_sentence:
-            # After a sentence, start with uppercase and add proper spacing
-            joined_text = current_text + " " + (new_fragment[0].upper() + new_fragment[1:])
-        elif ends_with_comma:
-            # After a comma, maintain flow with lowercase unless it's a proper noun or special case
-            if new_fragment.startswith(('I ', 'I\'', 'A ', 'An ', 'The ')) or new_fragment[0].isupper() or has_landmark_name:
-                joined_text = current_text + " " + new_fragment
-            else:
-                joined_text = current_text + " " + new_fragment[0].lower() + new_fragment[1:]
-        elif "scene is" in new_fragment.lower() or "scene includes" in new_fragment.lower():
-            # When adding a new sentence about the scene, use a period
-            joined_text = current_text + ". " + new_fragment
-        else:
-            # For other cases, decide based on the content
-            if self._is_related_phrases(current_text, new_fragment):
-                if new_fragment.startswith(('I ', 'I\'', 'A ', 'An ', 'The ')) or new_fragment[0].isupper() or has_landmark_name:
-                    joined_text = current_text + ", " + new_fragment
-                else:
-                    joined_text = current_text + ", " + new_fragment[0].lower() + new_fragment[1:]
-            else:
-                # Use period for unrelated phrases
-                joined_text = current_text + ". " + (new_fragment[0].upper() + new_fragment[1:])
-
-        return joined_text
-
-    def _is_related_phrases(self, text1: str, text2: str) -> bool:
-        """
-        Determine if two phrases are related and should be connected with a comma
-        rather than separated with a period.
-
-        Args:
-            text1: The first text fragment
-            text2: The second text fragment to be appended
-
-        Returns:
-            bool: Whether the phrases appear to be related
-        """
-        # Check if either phrase starts with "A" or "An" - these are likely separate descriptions
-        if (text1.startswith("A ") or text1.startswith("An ")) and \
-        (text2.startswith("A ") or text2.startswith("An ")):
-            return False  # These are separate descriptions, not related phrases
-
-        # Check if the second phrase starts with a connecting word
-        connecting_words = ["which", "where", "who", "whom", "whose", "with", "without",
-                        "this", "these", "that", "those", "and", "or", "but"]
-
-        first_word = text2.split()[0].lower() if text2 else ""
-        if first_word in connecting_words:
-            return True
-
-        # Check if the first phrase ends with something that suggests continuity
-        ending_patterns = ["such as", "including", "like", "especially", "particularly",
-                        "for example", "for instance", "namely", "specifically"]
-
-        for pattern in ending_patterns:
-            if text1.lower().endswith(pattern):
-                return True
-
-        # Check if both phrases are about the scene
-        if "scene" in text1.lower() and "scene" in text2.lower():
-            return False  # Separate statements about the scene should be separate sentences
-
-        return False
-
-
-    def _format_final_description(self, text: str) -> str:
-        """
-        Format the final description text to ensure correct punctuation,
-        capitalization, and spacing.
-        """
-        if not text or not text.strip(): # Also check if text is just whitespace
-            return ""
-
-        # Trim leading/trailing whitespace first
-        text = text.strip()
-
-        # 1. Handle consecutive "A/An" segments (potentially split them into sentences)
-        text = re.sub(r'(A\s+[^.!?]+?[\w\.])\s+(A\s+)', r'\1. \2', text, flags=re.IGNORECASE)
-        text = re.sub(r'(An\s+[^.!?]+?[\w\.])\s+(An?\s+)', r'\1. \2', text, flags=re.IGNORECASE)
-
-        # 2. Ensure first character of the entire text is uppercase
-        if text:
-            text = text[0].upper() + text[1:]
-
-        # 3. Normalize whitespace: multiple spaces to one
-        text = re.sub(r'\s{2,}', ' ', text)
-
-        # 4. Capitalize after sentence-ending punctuation (. ! ?)
-        def capitalize_after_punctuation(match):
-            return match.group(1) + match.group(2).upper()
-        text = re.sub(r'([.!?]\s+)([a-z])', capitalize_after_punctuation, text)
-
-        # 5. Handle capitalization after commas (your existing robust logic is good)
-        def fix_capitalization_after_comma(match):
-            leading_comma_space = match.group(1) # (,\s+)
-            word_after_comma = match.group(2)    # ([A-Z][a-zA-Z]*)
-
-            proper_nouns_exceptions = ["I", "I'm", "I've", "I'd", "I'll",
-                                     "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday",
-                                     "January", "February", "March", "April", "May", "June", "July",
-                                     "August", "September", "October", "November", "December"]
-
-            if word_after_comma in proper_nouns_exceptions:
-                return match.group(0)
-            # If the word looks like a proper noun (e.g., multi-word capitalized, or a known location/brand)
-            # This heuristic can be tricky. For simplicity, if it's already capitalized and not a common word, keep it.
-            if len(word_after_comma) > 2 and word_after_comma[0].isupper() and word_after_comma.lower() not in ["this", "that", "these", "those", "they", "their", "then", "thus"]:
-                 return match.group(0) # Keep it if it looks like a proper noun already
-
-            return leading_comma_space + word_after_comma[0].lower() + word_after_comma[1:]
-        text = re.sub(r'(,\s+)([A-Z][a-zA-Z\'\-]+)', fix_capitalization_after_comma, text) # Added hyphen and apostrophe to word
-
-        # 6. Correct spacing around punctuation
-        text = re.sub(r'\s*([.,;:!?])\s*', r'\1 ', text) # Ensures one space AFTER punctuation, none before
-        text = text.replace(' .', '.').replace(' ,', ',') # Clean up potential space before period/comma from previous rule
-
-        # 7. Consolidate multiple sentence-ending punctuations (e.g., "!!", "?.", ".?")
-        text = re.sub(r'[.!?]{2,}', '.', text) # Convert multiple to a single period
-        text = re.sub(r',+', ',', text) # Multiple commas to one
-
-        # 8. Ensure text ends with a single sentence-ending punctuation mark
-        text = text.strip() # Remove trailing whitespace before checking last char
-        if text and not text[-1] in '.!?':
-            text += '.'
-
-        # 9. Remove any leading punctuation or extra spaces that might have been introduced
-        text = re.sub(r'^[.,;:!?\s]+', '', text)
-
-        # 10. Final check for first letter capitalization
-        if text:
-            text = text[0].upper() + text[1:]
-
-        # 11. Remove space before final punctuation mark if accidentally added by rule 7
-        text = re.sub(r'\s+([.!?])$', r'\1', text)
-
-        return text.strip() # Final strip
-
-    def _is_intersection(self, detected_objects: List[Dict]) -> bool:
-        """
-        通過分析物體分佈來判斷場景是否為十字路口
-        """
-        # 檢查行人分佈模式
-        pedestrians = [obj for obj in detected_objects if obj["class_id"] == 0]
-
-        if len(pedestrians) >= 8:  # 需要足夠的行人來形成十字路口
-            # 抓取行人位置
-            positions = [obj.get("normalized_center", (0, 0)) for obj in pedestrians]
-
-            # 分析 x 和 y 坐標分佈
-            x_coords = [pos[0] for pos in positions]
-            y_coords = [pos[1] for pos in positions]
-
-            # 計算 x 和 y 坐標的變異數
-            x_variance = np.var(x_coords) if len(x_coords) > 1 else 0
-            y_variance = np.var(y_coords) if len(y_coords) > 1 else 0
-
-            # 計算範圍
-            x_range = max(x_coords) - min(x_coords)
-            y_range = max(y_coords) - min(y_coords)
-
-            # 如果 x 和 y 方向都有較大範圍且範圍相似，那就有可能是十字路口
-            if x_range > 0.5 and y_range > 0.5 and 0.7 < (x_range / y_range) < 1.3:
-                return True
-
-        return False
-
-    def _generate_generic_description(self, detected_objects: List[Dict], lighting_info: Optional[Dict] = None) -> str:
-        """
-        Generate a generic description when scene type is unknown or confidence is very low.
-
-        Args:
-            detected_objects: List of detected objects
-            lighting_info: Optional lighting condition information
-
-        Returns:
-            str: Generic description based on detected objects
-        """
-        # Count object occurrences
-        obj_counts = {}
-        for obj in detected_objects:
-            class_name = obj["class_name"]
-            if class_name not in obj_counts:
-                obj_counts[class_name] = 0
-            obj_counts[class_name] += 1
-
-        # Get top objects by count
-        top_objects = sorted(obj_counts.items(), key=lambda x: x[1], reverse=True)[:5]
-
-        if not top_objects:
-            base_desc = "No clearly identifiable objects are visible in this scene."
-        else:
-            # Format object list
-            objects_text = []
-            for name, count in top_objects:
-                if count > 1:
-                    objects_text.append(f"{count} {name}s")
-                else:
-                    objects_text.append(name)
-
-            if len(objects_text) == 1:
-                objects_list = objects_text[0]
-            elif len(objects_text) == 2:
-                objects_list = f"{objects_text[0]} and {objects_text[1]}"
-            else:
-                objects_list = ", ".join(objects_text[:-1]) + f", and {objects_text[-1]}"
-
-            base_desc = f"This scene contains {objects_list}."
-
-        # Add lighting information if available
-        if lighting_info and "time_of_day" in lighting_info:
-            lighting_type = lighting_info["time_of_day"]
-            if lighting_type in self.templates.get("lighting_templates", {}):
-                lighting_desc = self.templates["lighting_templates"][lighting_type]
-                base_desc += f" {lighting_desc}"
-
-        return base_desc
-
-    def _get_prominent_objects(self, detected_objects: List[Dict], min_prominence_score: float = 0.1, max_categories_to_return: int = 5, max_total_objects: int = 7) -> List[Dict]:
-        """
-        Helper function to get the most prominent objects.
-        Prioritizes high-confidence, large objects, and ensures a diversity of object types.
-
-        Args:
-            detected_objects: List of detected objects.
-            min_prominence_score: Minimum score for an object to be considered initially.
-            max_categories_to_return: Max number of different object categories to prioritize.
-            max_total_objects: Overall cap on the number of prominent objects returned.
-
-        Returns:
-            List of prominent detected objects.
-        """
-        if not detected_objects:
-            return []
-
-        scored_objects = []
-        for obj in detected_objects:
-            area = obj.get("normalized_area", 0.0) + 1e-6
-            confidence = obj.get("confidence", 0.0)
-
-            # Base score: area and confidence are key
-            score = (area * 0.65) + (confidence * 0.35) # Slightly more weight to area
-
-            # Bonus for generally important object classes (in a generic way)
-            # This is a simple heuristic. More advanced would be context-dependent.
-            # For example, 'person' is often more salient.
-            # Avoid hardcoding specific class_ids here if possible, or use broad categories if available.
-            # For simplicity, we'll keep the landmark bonus for now.
-            if obj.get("class_name") == "person": # Example: person is generally prominent
-                 score += 0.1
-            if obj.get("is_landmark"): # Landmarks are always prominent
-                score += 0.5
-
-            if score >= min_prominence_score:
-                 scored_objects.append((obj, score))
-
-        if not scored_objects:
-            return []
-
-        # Sort by score in descending order
-        scored_objects.sort(key=lambda x: x[1], reverse=True)
-
-        # Prioritize diversity of object categories first
-        prominent_by_category = {}
-        final_prominent_objects = []
-
-        for obj, score in scored_objects:
-            category = obj.get("class_name", "unknown")
-            if category not in prominent_by_category:
-                if len(prominent_by_category) < max_categories_to_return:
-                    prominent_by_category[category] = obj
-                    final_prominent_objects.append(obj)
-
-            elif len(final_prominent_objects) < max_total_objects and obj not in final_prominent_objects:
-                 if score > 0.3:
-                    final_prominent_objects.append(obj)
-
-        # If still under max_total_objects, fill with highest scored remaining objects regardless of category
-        if len(final_prominent_objects) < max_total_objects:
-            for obj, score in scored_objects:
-                if len(final_prominent_objects) >= max_total_objects:
-                    break
-                if obj not in final_prominent_objects:
-                    final_prominent_objects.append(obj)
-
-        # Re-sort the final list by original prominence score to maintain order
-        final_prominent_objects_with_scores = []
-        for obj in final_prominent_objects:
-            for original_obj, original_score in scored_objects:
-                if obj is original_obj: # Check for object identity
-                    final_prominent_objects_with_scores.append((obj, original_score))
-                    break
-
-        final_prominent_objects_with_scores.sort(key=lambda x: x[1], reverse=True)
-
-        return [obj for obj, score in final_prominent_objects_with_scores[:max_total_objects]]
-
-
-    def _format_object_list_for_description(self,
-                                            objects: List[Dict],
-                                            use_indefinite_article_for_one: bool = False,
-                                            count_threshold_for_generalization: int = -1, # Default to -1 for precise counts
-                                            max_types_to_list: int = 5
-                                           ) -> str:
-        """
-        Formats a list of detected objects into a human-readable string with counts.
-        Args:
-            objects: List of object dictionaries, each expected to have 'class_name'.
-            use_indefinite_article_for_one: If True, uses "a/an" for single items. If False, uses "one".
-            count_threshold_for_generalization: If count exceeds this, use general terms. -1 means precise counts.
-            max_types_to_list: Maximum number of different object types to include in the list.
-        """
-        if not objects:
-            return "no specific objects clearly identified"
-
-        counts: Dict[str, int] = {}
-        for obj in objects:
-            name = obj.get("class_name", "unknown object")
-            if name == "unknown object" or not name: # Skip unknown or empty names
-                continue
-            counts[name] = counts.get(name, 0) + 1
-
-        if not counts:
-            return "no specific objects clearly identified"
-
-        descriptions = []
-        # Sort by count (desc) then name (asc) for consistent output order
-        # Limit the number of distinct object types being listed
-        sorted_counts = sorted(counts.items(), key=lambda item: (-item[1], item[0]))[:max_types_to_list]
-
-
-        for name, count in sorted_counts:
-            if count == 1:
-                if use_indefinite_article_for_one:
-                    if name[0].lower() in 'aeiou':
-                        descriptions.append(f"an {name}")
-                    else:
-                        descriptions.append(f"a {name}")
-                else:
-                    descriptions.append(f"one {name}") # Output "one car" instead of "a car"
-            else: # count > 1
-                plural_name = name
-                if name.endswith("y") and not name.lower().endswith(("ay", "ey", "iy", "oy", "uy")):
-                    plural_name = name[:-1] + "ies"
-                elif name.endswith(("s", "sh", "ch", "x", "z")):
-                    plural_name = name + "es"
-                elif not name.endswith("s"): # Avoid double 's' like "buss"
-                    plural_name = name + "s"
-
-                if count_threshold_for_generalization != -1 and count > count_threshold_for_generalization:
-                    if count <= count_threshold_for_generalization + 3:
-                        descriptions.append(f"several {plural_name}")
-                    else:
-                        descriptions.append(f"many {plural_name}")
-                else: # Use exact count (e.g., "6 cars")
-                    descriptions.append(f"{count} {plural_name}")
-
-        if not descriptions:
-            return "no specific objects clearly identified"
-
-        if len(descriptions) == 1:
-            return descriptions[0]
-        elif len(descriptions) == 2:
-            return f"{descriptions[0]} and {descriptions[1]}"
-        else:
-            # Oxford comma for lists of 3 or more.
-            return ", ".join(descriptions[:-1]) + f", and {descriptions[-1]}"
-
-    def _get_spatial_description(self, obj: Dict, image_width: Optional[int] = None, image_height: Optional[int] = None) -> str:
-        """
-        Generates a brief spatial description for an object.
-        (This is a new helper function)
-        """
-        region = obj.get("region")
-        if region:
-            # Convert region name to more descriptive terms
-            region_map = {
-                "top_left": "in the top-left", "top_center": "at the top-center", "top_right": "in the top-right",
-                "middle_left": "on the middle-left side", "middle_center": "in the center", "middle_right": "on the middle-right side",
-                "bottom_left": "in the bottom-left", "bottom_center": "at the bottom-center", "bottom_right": "in the bottom-right"
-            }
-            # More general terms if exact region is not critical
-            if "top" in region: general_v_pos = "towards the top"
-            elif "bottom" in region: general_v_pos = "towards the bottom"
-            else: general_v_pos = "in the middle vertically"
-
-            if "left" in region: general_h_pos = "towards the left"
-            elif "right" in region: general_h_pos = "towards the right"
-            else: general_h_pos = "in the center horizontally"
-
-            # Prioritize specific region if available, else use general
-            specific_desc = region_map.get(region, "")
-            if specific_desc:
-                return f"{specific_desc} of the frame"
-            else:
-                return f"{general_v_pos} and {general_h_pos} of the frame"
-
-        # Fallback if region info is not detailed enough or missing
-        # We can use normalized_center if available
-        norm_center = obj.get("normalized_center")
-        if norm_center and image_width and image_height: # Check if image_width/height are provided
-            x_norm, y_norm = norm_center
-            h_pos = "left" if x_norm < 0.4 else "right" if x_norm > 0.6 else "center"
-            v_pos = "top" if y_norm < 0.4 else "bottom" if y_norm > 0.6 else "middle"
-
-            if h_pos == "center" and v_pos == "middle":
-                return "near the center of the image"
-            return f"in the {v_pos}-{h_pos} area of the image"
-
-        return "in the scene" # Generic fallback
-
-
-    def _generate_dynamic_everyday_description(self,
-                                          detected_objects: List[Dict],
-                                          lighting_info: Optional[Dict] = None,
-                                          viewpoint: str = "eye_level",
-                                          spatial_analysis: Optional[Dict] = None,
-                                          image_dimensions: Optional[Tuple[int, int]] = None,
-                                          places365_info: Optional[Dict] = None,
-                                          object_statistics: Optional[Dict] = None
-                                          ) -> str:
-        """
-        Dynamically generates a description for everyday scenes based on ALL relevant detected_objects,
-        their counts, and context.
-        It aims to describe the overall scene first, then details of object groups including accurate counts.
-        """
-        description_segments = []
-        image_width, image_height = image_dimensions if image_dimensions else (None, None)
-
-        if hasattr(self, 'logger'):
-            self.logger.info(f"DynamicDesc: Start. Total Raw Objects: {len(detected_objects)}, View: {viewpoint}, Light: {lighting_info is not None}")
-
-        # 1. Overall Ambiance (Lighting and Viewpoint)
-        ambiance_parts = []
-        if lighting_info:
-            time_of_day = lighting_info.get("time_of_day", "unknown lighting")
-            is_indoor = lighting_info.get("is_indoor")
-            ambiance_statement = "This is"
-            if is_indoor is True: ambiance_statement += " an indoor scene"
-            elif is_indoor is False: ambiance_statement += " an outdoor scene"
-            else: ambiance_statement += " a scene"
-            lighting_map = self.templates.get("lighting_templates", {})
-            readable_lighting_base = lighting_map.get(time_of_day, f"with {time_of_day.replace('_', ' ')} lighting conditions")
-            readable_lighting = readable_lighting_base.lower().replace("the scene is captured", "").replace("the scene has", "").strip()
-            ambiance_statement += f", likely {readable_lighting}."
-            ambiance_parts.append(ambiance_statement)
-
-        if viewpoint and viewpoint != "eye_level":
-            vp_templates = self.templates.get("viewpoint_templates", {})
-            if viewpoint in vp_templates:
-                vp_prefix = vp_templates[viewpoint].get("prefix", "").strip()
-                if vp_prefix:
-                    if not ambiance_parts:
-                        ambiance_parts.append(f"{vp_prefix.capitalize()} the general layout of the scene is observed.")
-                    else:
-                        ambiance_parts[-1] = ambiance_parts[-1].rstrip('.') + f", viewed {vp_templates[viewpoint].get('short_desc', viewpoint)}."
-
-        if ambiance_parts:
-            description_segments.append(" ".join(ambiance_parts))
-
-        # 2. Describe ALL detected objects, grouped by class, with accurate counts and locations
-        if not detected_objects:
-            # This part remains, but the conditions to reach here might change based on confident_objects check
-            if not description_segments:
-                 description_segments.append("A general scene is visible, but no specific objects were clearly identified.")
-            else:
-                 description_segments.append("Within this setting, no specific objects were clearly identified.")
-        else:
-            objects_by_class: Dict[str, List[Dict]] = {}
-
-            # keeping 0.25 as a placeholder
-            confidence_filter_threshold = getattr(self, 'confidence_threshold_for_description', 0.25)
-            confident_objects = [obj for obj in detected_objects if obj.get("confidence", 0) >= confidence_filter_threshold]
-
-            if not confident_objects:
-                 # This message is more appropriate if objects existed but none met confidence
-                 no_confident_obj_msg = "While some elements might be present, no objects were identified with sufficient confidence for a detailed description."
-                 if not description_segments: description_segments.append(no_confident_obj_msg)
-                 else: description_segments.append(no_confident_obj_msg.lower().capitalize()) # Append as a new sentence
-            else:
-                if object_statistics:
-                    # 使用預計算的統計信息，並採用動態置信度策略
-                    for class_name, stats in object_statistics.items():
-                        count = stats.get("count", 0)
-                        avg_confidence = stats.get("avg_confidence", 0)
-
-                        # 動態調整置信度閾值：裝飾性物品使用較低閾值
-                        dynamic_threshold = confidence_filter_threshold
-                        if class_name in ["potted plant", "vase", "clock", "book"]:
-                            dynamic_threshold = max(0.15, confidence_filter_threshold * 0.6)
-                        elif count >= 3:  # 數量多的物品降低閾值
-                            dynamic_threshold = max(0.2, confidence_filter_threshold * 0.8)
-
-                        if count > 0 and avg_confidence >= dynamic_threshold:
-                            matching_objects = [obj for obj in confident_objects if obj.get("class_name") == class_name]
-                            if not matching_objects:
-                                # 如果高信心度的物體中沒有，從原始列表中尋找
-                                matching_objects = [obj for obj in detected_objects
-                                                if obj.get("class_name") == class_name and obj.get("confidence", 0) >= dynamic_threshold]
-
-                            if matching_objects:
-                                actual_count = min(stats["count"], len(matching_objects))
-                                objects_by_class[class_name] = matching_objects[:actual_count]
-                else:
-                    # 回退邏輯同樣使用動態閾值
-                    for obj in confident_objects:
-                        name = obj.get("class_name", "unknown object")
-                        if name == "unknown object" or not name: continue
-                        if name not in objects_by_class:
-                            objects_by_class[name] = []
-                        objects_by_class[name].append(obj)
-
-                if not objects_by_class: # Should be rare if confident_objects was not empty and had valid names
-                    description_segments.append("No common objects were confidently identified for detailed description.")
-                else:
-                    def sort_key_object_groups(item_tuple: Tuple[str, List[Dict]]):
-                        class_name_key, obj_group_list = item_tuple
-                        priority = 3  # 預設優先級
-                        count = len(obj_group_list)
-
-                        # 動態優先級：基於場景相關性和數量
-                        if class_name_key == "person":
-                            priority = 0
-                        elif class_name_key in ["dining table", "chair", "sofa", "bed"]:
-                            priority = 1  # 室內主要家具
-                        elif class_name_key in ["car", "bus", "truck", "traffic light"]:
-                            priority = 2  # 交通相關物體
-                        elif count >= 3:  # 數量多的物體提升優先級
-                            priority = max(1, priority - 1)
-                        elif class_name_key in ["potted plant", "vase", "clock", "book"] and count >= 2:
-                            priority = 2  # 裝飾性物品有一定數量時提升優先級
-
-                        avg_area = sum(o.get("normalized_area", 0.0) for o in obj_group_list) / len(obj_group_list) if obj_group_list else 0
-
-                        # 增加數量權重：多個同類物體更重要
-                        quantity_bonus = min(count / 5.0, 1.0)  # 最多1.0的加成
-
-                        return (priority, -len(obj_group_list), -avg_area, -quantity_bonus)
-
-                    # 去除重複的邏輯
-                    deduplicated_objects_by_class = {}
-                    processed_positions = []
-
-                    for class_name, group_of_objects in objects_by_class.items():
-                        unique_objects = []
-
-                        for obj in group_of_objects:
-                            obj_position = obj.get("normalized_center", [0.5, 0.5])
-                            is_duplicate = False
-
-                            # 檢查是否與已處理的物體位置重疊
-                            for processed_pos in processed_positions:
-                                position_distance = abs(obj_position[0] - processed_pos[0]) + abs(obj_position[1] - processed_pos[1])
-                                if position_distance < 0.15:  # 位置重疊閾值
-                                    is_duplicate = True
-                                    break
-
-                            if not is_duplicate:
-                                unique_objects.append(obj)
-                                processed_positions.append(obj_position)
-
-                        if unique_objects:
-                            deduplicated_objects_by_class[class_name] = unique_objects
-
-                    objects_by_class = deduplicated_objects_by_class
-
-                    sorted_object_groups = sorted(objects_by_class.items(), key=sort_key_object_groups)
-
-                    object_clauses = [] # Stores individual object group descriptions
-
-                    for class_name, group_of_objects in sorted_object_groups:
-                        count = len(group_of_objects)
-                        if count == 0: continue
-
-                        # 使用統計信息確保準確的數量描述
-                        if object_statistics and class_name in object_statistics:
-                            actual_count = object_statistics[class_name]["count"]
-                            # 根據實際統計數量生成描述
-                            if actual_count == 1:
-                                formatted_name_with_exact_count = f"one {class_name}"
-                            else:
-                                plural_form = f"{class_name}s" if not class_name.endswith('s') else class_name
-                                formatted_name_with_exact_count = f"{actual_count} {plural_form}"
-                        else:
-                            # 回退到原有的格式化邏輯
-                            formatted_name_with_exact_count = self._format_object_list_for_description(
-                                [group_of_objects[0]] * count,
-                                use_indefinite_article_for_one=False,
-                                count_threshold_for_generalization=-1
-                            )
-
-                        if formatted_name_with_exact_count == "no specific objects clearly identified" or not formatted_name_with_exact_count:
-                            continue
-
-                        # Determine collective location for the group
-                        location_description_suffix = "" # e.g., "is in the center" or "are in the west area"
-                        if count == 1:
-                            location_description_suffix = f"is {self._get_spatial_description(group_of_objects[0], image_width, image_height)}"
-                        else:
-                            distinct_regions = sorted(list(set(obj.get("region", "unknown_region") for obj in group_of_objects)))
-                            known_regions = [r for r in distinct_regions if r != "unknown_region"]
-                            if not known_regions and "unknown_region" in distinct_regions:
-                                location_description_suffix = "are visible in the scene"
-                            elif len(known_regions) == 1:
-                                location_description_suffix = f"are primarily in the {known_regions[0].replace('_', ' ')} area"
-                            elif len(known_regions) == 2:
-                                location_description_suffix = f"are mainly across the {known_regions[0].replace('_',' ')} and {known_regions[1].replace('_',' ')} areas"
-                            elif len(known_regions) > 2:
-                                location_description_suffix = "are distributed in various parts of the scene"
-                            else:
-                                location_description_suffix = "are visible in the scene"
-
-                        # Capitalize the object description (e.g., "Six cars")
-                        formatted_name_capitalized = formatted_name_with_exact_count[0].upper() + formatted_name_with_exact_count[1:]
-                        object_clauses.append(f"{formatted_name_capitalized} {location_description_suffix}")
-
-                    if object_clauses:
-                        # Join object clauses into one or more sentences.
-                        if not description_segments: # If no ambiance, start with the first object clause.
-                            if object_clauses:
-                                first_clause = object_clauses.pop(0) # Take the first one out
-                                description_segments.append(first_clause + ".")
-                        else: # Ambiance exists, prepend with "The scene features..." or similar
-                            if object_clauses:
-                                description_segments.append("The scene features:") # Or "Key elements include:"
-
-                        # Add remaining object clauses as separate points or a continuous sentence
-                        # For now, let's join them into a single continuous sentence string to be added.
-                        if object_clauses: # If there are more clauses after the first (or after "The scene features:")
-                            joined_object_clauses = ". ".join(object_clauses)
-                            if joined_object_clauses and not joined_object_clauses.endswith("."):
-                                joined_object_clauses += "."
-                            description_segments.append(joined_object_clauses)
-
-                    elif not description_segments : # No ambiance and no describable objects after filtering
-                        return "The image depicts a scene, but specific objects could not be described with confidence or detail."
-
-        # --- Final assembly and formatting ---
-        # Join all collected segments. _smart_append might be better if parts are not full sentences.
-        # Since we aim for full sentences in segments, simple join then format.
-        raw_description = ""
-        for i, segment in enumerate(filter(None, description_segments)):
-            segment = segment.strip()
-            if not segment: continue
-
-            if not raw_description: # First non-empty segment
-                raw_description = segment
-            else:
-                if not raw_description.endswith(('.', '!', '?')):
-                    raw_description += "."
-                raw_description += " " + (segment[0].upper() + segment[1:] if len(segment) > 1 else segment.upper())
-
-        if raw_description and not raw_description.endswith(('.', '!', '?')):
-            raw_description += "."
-
-        final_description = self._format_final_description(raw_description) # Crucial for final polish
-
-        if not final_description or len(final_description.strip()) < 20:
-            # Fallback if description is too short or empty after processing
-            # Use a more informative fallback if confident_objects existed
-            if 'confident_objects' in locals() and confident_objects:
-                 return "The scene contains several detected objects, but a detailed textual description could not be fully constructed."
-            else:
-                 return "A general scene is depicted with no objects identified with high confidence."
-
-        return final_description
-
-
-    def _generate_scene_details(self,
-                          scene_type: str,
-                          detected_objects: List[Dict],
-                          lighting_info: Optional[Dict] = None,
-                          viewpoint: str = "eye_level",
-                          spatial_analysis: Optional[Dict] = None,
-                          image_dimensions: Optional[Tuple[int, int]] = None,
-                          places365_info: Optional[Dict] = None,
-                          object_statistics: Optional[Dict] = None
-                          ) -> str:
-        """
-        Generate the detail portion of a scene description.
-
-        A detail template is used when the scene type is not one of the generic
-        "everyday" types, has an entry in ``scene_detail_templates`` and that entry
-        (or its viewpoint-specific variant ``"<scene_type>_<viewpoint>"``) holds at
-        least one template. In every other case the description is produced by
-        ``_generate_dynamic_everyday_description``.
-
-        When the instance has an ``enable_landmark`` attribute that is falsy, only a
-        fixed set of indoor scene types may still use templates, and the resulting
-        text is passed through ``filter_landmark_references``.
-
-        Args:
-            scene_type: Identified scene type.
-            detected_objects: List of detected objects.
-            lighting_info: Optional lighting condition information.
-            viewpoint: Detected viewpoint (aerial, eye_level, etc.).
-            spatial_analysis: Optional results from SpatialAnalyzer.
-            image_dimensions: Optional tuple of (image_width, image_height).
-            places365_info: Optional Places365 scene classification results. It is
-                only forwarded to the template/dynamic generators; this method does
-                not derive any text from it itself.
-            object_statistics: Optional detailed object statistics with counts and confidence.
-
-        Returns:
-            str: Detailed scene description, or "A scene with some visual elements."
-            when the chosen generator produced an empty result.
-        """
-        scene_templates = self.templates.get("scene_detail_templates", {})
-
-        # Scene types considered "everyday" or generic; these never use a template.
-        everyday_scene_types = (
-            "general_indoor_space", "generic_street_view",
-            "desk_area_workspace", "outdoor_gathering_spot",
-            "kitchen_counter_or_utility_area", "unknown"
-        )
-        # Scene types that may keep their template while landmark detection is disabled.
-        landmark_independent_scene_types = (
-            "kitchen", "bedroom", "living_room", "office_workspace",
-            "dining_area", "professional_kitchen"
-        )
-
-        landmark_disabled = hasattr(self, 'enable_landmark') and not self.enable_landmark
-
-        treat_as_everyday = scene_type in everyday_scene_types
-        if landmark_disabled and scene_type not in landmark_independent_scene_types:
-            treat_as_everyday = True
-
-        # Resolve the candidate templates, preferring the viewpoint-specific entry.
-        templates_list = []
-        if not treat_as_everyday and scene_type in scene_templates:
-            viewpoint_key = f"{scene_type}_{viewpoint}"
-            templates_list = scene_templates.get(viewpoint_key, scene_templates.get(scene_type, []))
-
-        if templates_list:
-            self.logger.info(f"Using template for scene_type: {scene_type}")
-            detail_template = random.choice(templates_list)
-            scene_details = self._fill_detail_template(
-                detail_template,
-                detected_objects,
-                scene_type,
-                places365_info,
-                object_statistics
-            )
-        else:
-            # Covers everyday scenes, scene types without a template entry and
-            # template entries that turned out to be empty.
-            self.logger.info(f"Generating dynamic description for scene_type: {scene_type}")
-            scene_details = self._generate_dynamic_everyday_description(
-                detected_objects,
-                lighting_info,
-                viewpoint,
-                spatial_analysis,
-                image_dimensions,
-                places365_info,
-                object_statistics
-            )
-
-        # Filter out landmark references if landmark detection is disabled
-        if landmark_disabled:
-            scene_details = self.filter_landmark_references(scene_details, enable_landmark=False)
-
-        return scene_details if scene_details else "A scene with some visual elements."
-
-    def _fill_detail_template(self, template: str, detected_objects: List[Dict], scene_type: str, places365_info: Optional[Dict] = None, object_statistics: Optional[Dict] = None) -> str:
-        """
-        Fill a template with specific details based on detected objects.
-
-        Each ``{placeholder}`` in the template is resolved with the first strategy
-        that yields text:
-
-        1. a random selection of 1-3 entries from ``object_template_fillers``
-           (skipped when the placeholder has no entry or an empty list),
-        2. ``_generate_placeholder_content`` (object-driven text),
-        3. the default table below, whose entries are overridden by values derived
-           from ``object_statistics`` and extended with the Places365 variables
-           ``places365_context`` / ``places365_atmosphere``,
-        4. the literal "various items".
-
-        Args:
-            template: Template string with placeholders
-            detected_objects: List of detected objects
-            scene_type: Identified scene type
-            places365_info: Optional Places365 classification result; read for
-                'confidence', 'scene_label' and 'attributes'
-            object_statistics: Optional mapping of class name to a statistics dict;
-                only the 'count' entry is read here
-
-        Returns:
-            str: Filled template
-        """
-        # Find placeholders in the template using simple {placeholder} syntax
-        placeholders = re.findall(r'\{([^}]+)\}', template)
-
-        filled_template = template
-
-        # Get object template fillers
-        fillers = self.templates.get("object_template_fillers", {})
-
-        # Build more accurate replacement text from the per-class object statistics.
-        # NOTE: these values sit at default priority, so they are only used when
-        # neither a filler list nor _generate_placeholder_content supplies text
-        # (the latter always answers for "seating").
-        statistics_based_replacements = {}
-        for class_name, stats in (object_statistics or {}).items():
-            # Ignore malformed entries instead of failing the whole description.
-            count = stats.get("count", 0) if isinstance(stats, dict) else 0
-            if not count or count <= 0:
-                continue
-
-            if class_name == "potted plant":
-                if count == 1:
-                    statistics_based_replacements["plant_elements"] = "a potted plant"
-                elif count <= 3:
-                    statistics_based_replacements["plant_elements"] = f"{count} potted plants"
-                else:
-                    statistics_based_replacements["plant_elements"] = f"multiple potted plants ({count} total)"
-
-            elif class_name == "chair":
-                if count == 1:
-                    statistics_based_replacements["seating"] = "a chair"
-                elif count <= 4:
-                    statistics_based_replacements["seating"] = f"{count} chairs"
-                else:
-                    statistics_based_replacements["seating"] = f"numerous chairs ({count} total)"
-
-            elif class_name == "person":
-                if count == 1:
-                    statistics_based_replacements["people_and_vehicles"] = "a person"
-                    statistics_based_replacements["pedestrian_flow"] = "an individual walking"
-                elif count <= 5:
-                    statistics_based_replacements["people_and_vehicles"] = f"{count} people"
-                    statistics_based_replacements["pedestrian_flow"] = f"{count} people walking"
-                else:
-                    statistics_based_replacements["people_and_vehicles"] = f"many people ({count} individuals)"
-                    statistics_based_replacements["pedestrian_flow"] = f"a crowd of {count} people"
-
-        # Default text for every placeholder the templates may use
-        default_replacements = {
-            # Indoor
-            "furniture": "various furniture pieces",
-            "seating": "comfortable seating",
-            "electronics": "entertainment devices",
-            "bed_type": "a bed",
-            "bed_location": "room",
-            "bed_description": "sleeping arrangements",
-            "extras": "personal items",
-            "table_setup": "a dining table and chairs",
-            "table_description": "a dining surface",
-            "dining_items": "dining furniture and tableware",
-            "appliances": "kitchen appliances",
-            "kitchen_items": "cooking utensils and dishware",
-            "cooking_equipment": "cooking equipment",
-            "office_equipment": "work-related furniture and devices",
-            "desk_setup": "a desk and chair",
-            "computer_equipment": "electronic devices",
-
-            # Outdoor / urban
-            "traffic_description": "vehicles and pedestrians",
-            "people_and_vehicles": "people and various vehicles",
-            "street_elements": "urban infrastructure",
-            "park_features": "benches and greenery",
-            "outdoor_elements": "natural features",
-            "park_description": "outdoor amenities",
-            "store_elements": "merchandise displays",
-            "shopping_activity": "customers browse and shop",
-            "store_items": "products for sale",
-
-            # Upscale dining
-            "design_elements": "elegant decor",
-            "lighting": "stylish lighting fixtures",
-
-            # Asian commercial street
-            "storefront_features": "compact shops",
-            "pedestrian_flow": "people walking",
-            "asian_elements": "distinctive cultural elements",
-            "cultural_elements": "traditional design features",
-            "signage": "colorful signs",
-            "street_activities": "busy urban activity",
-
-            # Financial district
-            "buildings": "tall buildings",
-            "traffic_elements": "vehicles",
-            "skyscrapers": "high-rise buildings",
-            "road_features": "wide streets",
-            "city_landmarks": "prominent structures",
-
-            # Intersection
-            "crossing_pattern": "marked pedestrian crossings",
-            "pedestrian_behavior": "careful walking",
-            "pedestrian_density": "groups of pedestrians",
-            "traffic_pattern": "regulated traffic flow",
-
-            # Transit hub
-            "transit_vehicles": "public transportation vehicles",
-            "passenger_activity": "commuter movement",
-            "transportation_modes": "various transit options",
-            "passenger_needs": "waiting areas",
-            "transit_infrastructure": "transit facilities",
-            "passenger_movement": "commuter flow",
-
-            # Shopping district
-            "retail_elements": "shops and displays",
-            "store_types": "various retail establishments",
-            "walkway_features": "pedestrian pathways",
-            "commercial_signage": "store signs",
-            "consumer_behavior": "shopping activities",
-
-            # Aerial viewpoint
-            "commercial_layout": "organized retail areas",
-            "pedestrian_pattern": "people movement patterns",
-            "gathering_features": "public gathering spaces",
-            "movement_pattern": "crowd flow patterns",
-            "urban_elements": "city infrastructure",
-            "public_activity": "social interaction",
-
-            # Culture-specific elements
-            "stall_elements": "vendor booths",
-            "lighting_features": "decorative lights",
-            "food_elements": "food offerings",
-            "vendor_stalls": "market stalls",
-            "nighttime_activity": "evening commerce",
-            "cultural_lighting": "traditional lighting",
-            "night_market_sounds": "lively market sounds",
-            "evening_crowd_behavior": "nighttime social activity",
-            # Single definition of this key. It was previously also listed under the
-            # financial district section as "modern architecture", but that earlier
-            # duplicate was always overwritten by this value.
-            "architectural_elements": "cultural buildings",
-            "religious_structures": "sacred buildings",
-            "decorative_features": "ornamental designs",
-            "cultural_practices": "traditional activities",
-            "temple_architecture": "religious structures",
-            "sensory_elements": "atmospheric elements",
-            "visitor_activities": "cultural experiences",
-            "ritual_activities": "ceremonial practices",
-            "cultural_symbols": "meaningful symbols",
-            "architectural_style": "historical buildings",
-            "historic_elements": "traditional architecture",
-            "urban_design": "city planning elements",
-            "social_behaviors": "public interactions",
-            "european_features": "European architectural details",
-            "tourist_activities": "visitor activities",
-            "local_customs": "regional practices",
-
-            # Time-of-day specific elements
-            "lighting_effects": "artificial lighting",
-            "shadow_patterns": "light and shadow",
-            "urban_features": "city elements",
-            "illuminated_elements": "lit structures",
-            "evening_activities": "nighttime activities",
-            "light_sources": "lighting points",
-            "lit_areas": "illuminated spaces",
-            "shadowed_zones": "darker areas",
-            "illuminated_signage": "bright signs",
-            "colorful_lighting": "multicolored lights",
-            "neon_elements": "neon signs",
-            "night_crowd_behavior": "evening social patterns",
-            "light_displays": "lighting installations",
-            "building_features": "architectural elements",
-            "nightlife_activities": "evening entertainment",
-            "lighting_modifier": "bright",
-
-            # Mixed indoor/outdoor environments
-            "transitional_elements": "connecting features",
-            "indoor_features": "interior elements",
-            "outdoor_setting": "exterior spaces",
-            "interior_amenities": "inside comforts",
-            "exterior_features": "outside elements",
-            "inside_elements": "interior design",
-            "outside_spaces": "outdoor areas",
-            "dual_environment_benefits": "combined settings",
-            "passenger_activities": "waiting behaviors",
-            "transportation_types": "transit vehicles",
-            "sheltered_elements": "covered areas",
-            "exposed_areas": "open sections",
-            "waiting_behaviors": "passenger activities",
-            "indoor_facilities": "inside services",
-            "platform_features": "transit platform elements",
-            "transit_routines": "transportation procedures",
-
-            # Specialized venues
-            "seating_arrangement": "spectator seating",
-            "playing_surface": "athletic field",
-            "sporting_activities": "sports events",
-            "spectator_facilities": "viewer accommodations",
-            "competition_space": "sports arena",
-            "sports_events": "athletic competitions",
-            "viewing_areas": "audience sections",
-            "field_elements": "field markings and equipment",
-            "game_activities": "competitive play",
-            "construction_equipment": "building machinery",
-            "building_materials": "construction supplies",
-            "construction_activities": "building work",
-            "work_elements": "construction tools",
-            "structural_components": "building structures",
-            "site_equipment": "construction gear",
-            "raw_materials": "building supplies",
-            "construction_process": "building phases",
-            "medical_elements": "healthcare equipment",
-            "clinical_activities": "medical procedures",
-            "facility_design": "healthcare layout",
-            "healthcare_features": "medical facilities",
-            "patient_interactions": "care activities",
-            "equipment_types": "medical devices",
-            "care_procedures": "health services",
-            "treatment_spaces": "clinical areas",
-            "educational_furniture": "learning furniture",
-            "learning_activities": "educational practices",
-            "instructional_design": "teaching layout",
-            "classroom_elements": "school equipment",
-            "teaching_methods": "educational approaches",
-            "student_engagement": "learning participation",
-            "learning_spaces": "educational areas",
-            "educational_tools": "teaching resources",
-            "knowledge_transfer": "learning exchanges"
-        }
-
-        # Statistics-derived text overrides the generic defaults
-        default_replacements.update(statistics_based_replacements)
-
-        # Add Places365-specific template variables
-        places365_scene_context = ""
-        places365_atmosphere = ""
-
-        if places365_info and (places365_info.get('confidence') or 0) > 0.35:
-            scene_label = (places365_info.get('scene_label') or '').replace('_', ' ')
-            attributes = places365_info.get('attributes') or []
-
-            # Compare in the same (space-separated) form so that e.g. the label
-            # "living_room" is recognised as identical to scene type "living_room".
-            if scene_label and scene_label != scene_type.replace('_', ' '):
-                places365_scene_context = f"characteristic of a {scene_label}"
-
-            if 'natural_lighting' in attributes:
-                places365_atmosphere = "with natural illumination"
-            elif 'artificial_lighting' in attributes:
-                places365_atmosphere = "under artificial lighting"
-
-        # Always define both variables (possibly empty) so templates that use them
-        # never fall back to "various items".
-        default_replacements["places365_context"] = places365_scene_context
-        default_replacements["places365_atmosphere"] = places365_atmosphere
-
-        # For each placeholder, try to fill with appropriate content
-        for placeholder in placeholders:
-            options = fillers.get(placeholder)
-            if options:
-                # Select 1-3 items from the options list
-                num_items = min(len(options), random.randint(1, 3))
-                selected_items = random.sample(options, num_items)
-
-                # Create a formatted list
-                if len(selected_items) == 1:
-                    replacement = selected_items[0]
-                elif len(selected_items) == 2:
-                    replacement = f"{selected_items[0]} and {selected_items[1]}"
-                else:
-                    replacement = ", ".join(selected_items[:-1]) + f", and {selected_items[-1]}"
-            else:
-                # No usable filler list (missing or empty): try scene-specific logic,
-                # then the defaults, then a last-resort generic phrase. An empty
-                # filler list must not leave the raw "{placeholder}" in the output.
-                replacement = self._generate_placeholder_content(placeholder, detected_objects, scene_type)
-                if not replacement:
-                    replacement = default_replacements.get(placeholder, "various items")
-
-            filled_template = filled_template.replace(f"{{{placeholder}}}", replacement)
-
-        return filled_template
-
-    def _generate_placeholder_content(self, placeholder: str, detected_objects: List[Dict], scene_type: str) -> str:
-        """
-        Generate content for a template placeholder based on scene-specific logic.
-
-        Handles the placeholders "furniture", "electronics", "seating" (a
-        comma-separated list of the distinct class names of matching objects, in
-        detection order) and "people_count" (a phrase based on how many objects
-        have class id 0). Objects that lack a "class_id" are ignored, as are
-        matching objects without a "class_name".
-
-        Args:
-            placeholder: Template placeholder
-            detected_objects: List of detected objects
-            scene_type: Identified scene type (currently not used by this method)
-
-        Returns:
-            str: Content for the placeholder, or an empty string when the
-            placeholder is not one of those handled here
-        """
-        if placeholder == "people_count":
-            people_count = sum(1 for obj in detected_objects if obj.get("class_id") == 0)
-
-            if people_count == 0:
-                return "no people"
-            if people_count == 1:
-                return "one person"
-            if people_count < 5:
-                return f"{people_count} people"
-            return "several people"
-
-        # placeholder -> (matching class ids, max names listed, text when nothing matches)
-        category_rules = {
-            "furniture": ((56, 57, 58, 59, 60, 61), 3, "various furniture items"),  # Example furniture IDs
-            "electronics": ((62, 63, 64, 65, 66, 67, 68, 69, 70), 3, "electronic devices"),  # Example electronics IDs
-            "seating": ((56, 57), 2, "seating arrangements"),  # chair, sofa
-        }
-
-        rule = category_rules.get(placeholder)
-        if rule is None:
-            # Default case - empty string
-            return ""
-
-        class_ids, max_names, fallback_text = rule
-
-        # dict.fromkeys de-duplicates while keeping detection order, so the result
-        # is stable between runs (iterating a set of strings is not), and
-        # de-duplicating before truncating keeps repeats from using up the cap.
-        distinct_names = list(dict.fromkeys(
-            obj["class_name"]
-            for obj in detected_objects
-            if obj.get("class_id") in class_ids and obj.get("class_name")
-        ))
-
-        if distinct_names:
-            return ", ".join(distinct_names[:max_names])
-        return fallback_text
-
-    def _generate_basic_details(self, scene_type: str, detected_objects: List[Dict]) -> str:
-        """
-        Generate basic details when templates aren't available.
-
-        Scene types with dedicated handling: "living_room", "bedroom",
-        "dining_area" / "kitchen", "city_street" and "asian_commercial_street".
-        Objects are matched by their numeric "class_id"; objects missing the keys a
-        branch needs ("class_id", "region", "class_name", "normalized_center") are
-        ignored by that branch.
-
-        NOTE(review): the "dining_area"/"kitchen" and "city_street" branches return
-        a fragment (" with ..." followed by ".", or just "."), which looks meant to
-        be appended to a sentence supplied by the caller - confirm against the caller.
-
-        Args:
-            scene_type: Identified scene type
-            detected_objects: List of detected objects
-
-        Returns:
-            str: Basic scene details; a generic sentence when the scene type has no
-            dedicated handling or its key objects were not detected
-        """
-        def first_region(class_id: int) -> Optional[str]:
-            # Region of the first detected object of this class that has one.
-            for obj in detected_objects:
-                if obj.get("class_id") == class_id and obj.get("region"):
-                    return obj["region"]
-            return None
-
-        def count_of(class_ids) -> int:
-            # Number of detected objects whose class id is in class_ids.
-            return sum(1 for obj in detected_objects if obj.get("class_id") in class_ids)
-
-        def pluralize(count: int, singular: str, plural: str) -> str:
-            return f"{count} {plural if count > 1 else singular}"
-
-        # Handle specific scene types with custom logic
-        if scene_type == "living_room":
-            tv_region = first_region(62)  # TV
-            sofa_region = first_region(57)  # Sofa
-
-            if tv_region and sofa_region:
-                arrangement = f"The TV is in the {tv_region.replace('_', ' ')} of the image, "
-                arrangement += f"while the sofa is in the {sofa_region.replace('_', ' ')}. "
-
-                return f"{arrangement}This appears to be a space designed for relaxation and entertainment."
-
-        elif scene_type == "bedroom":
-            bed_region = first_region(59)  # Bed
-
-            if bed_region:
-                extra_labels = {74: "clock", 73: "book"}
-                # Mention each kind of extra item once, in detection order, so
-                # several books do not read as "a book and a book".
-                extra_items = list(dict.fromkeys(
-                    extra_labels[obj["class_id"]]
-                    for obj in detected_objects
-                    if obj.get("class_id") in extra_labels
-                ))
-
-                extras = ""
-                if extra_items:
-                    extras = f" There is also a {' and a '.join(extra_items)} visible."
-
-                return f"The bed is located in the {bed_region.replace('_', ' ')} of the image.{extras}"
-
-        elif scene_type in ["dining_area", "kitchen"]:
-            # Distinct food and dining-related items, in detection order so the
-            # wording is the same on every run (set iteration order is not).
-            unique_items = list(dict.fromkeys(
-                obj["class_name"]
-                for obj in detected_objects
-                if obj.get("class_id") in (39, 41, 42, 43, 44, 45) and obj.get("class_name")  # Kitchen items
-            ))
-
-            food_str = ""
-            if unique_items:
-                if len(unique_items) <= 3:
-                    food_str = f" with {', '.join(unique_items)}"
-                else:
-                    food_str = f" with {', '.join(unique_items[:3])} and other items"
-
-            return f"{food_str}."
-
-        elif scene_type == "city_street":
-            # Count people and vehicles
-            people_count = count_of((0,))
-            vehicle_count = count_of((1, 2, 3, 5, 7))  # Bicycle, car, motorbike, bus, truck
-
-            traffic_desc = ""
-            if people_count > 0 and vehicle_count > 0:
-                traffic_desc = f" with {pluralize(people_count, 'person', 'people')} and "
-                traffic_desc += pluralize(vehicle_count, 'vehicle', 'vehicles')
-            elif people_count > 0:
-                traffic_desc = f" with {pluralize(people_count, 'person', 'people')}"
-            elif vehicle_count > 0:
-                traffic_desc = f" with {pluralize(vehicle_count, 'vehicle', 'vehicles')}"
-
-            return f"{traffic_desc}."
-
-        # Handle more specialized scenes
-        elif scene_type == "asian_commercial_street":
-            # Look for key urban elements
-            people_count = count_of((0,))
-            vehicle_count = count_of((1, 2, 3))
-
-            # Vertical positions of the people whose centre is known
-            y_coords = []
-            for obj in detected_objects:
-                center = obj.get("normalized_center")
-                if obj.get("class_id") == 0 and center is not None and len(center) >= 2:
-                    y_coords.append(center[1])
-
-            # Check if people are distributed along a line (indicating a walking path)
-            structured_path = False
-            if len(y_coords) >= 3:
-                # Simplified check - see if y-coordinates are similar for multiple people
-                y_mean = sum(y_coords) / len(y_coords)
-                y_variance = sum((y - y_mean)**2 for y in y_coords) / len(y_coords)
-                if y_variance < 0.05:  # Low variance indicates linear arrangement
-                    structured_path = True
-
-            street_desc = "A commercial street with "
-            if people_count > 0:
-                street_desc += pluralize(people_count, 'pedestrian', 'pedestrians')
-                if vehicle_count > 0:
-                    street_desc += f" and {pluralize(vehicle_count, 'vehicle', 'vehicles')}"
-            elif vehicle_count > 0:
-                street_desc += pluralize(vehicle_count, 'vehicle', 'vehicles')
-            else:
-                street_desc += "various commercial elements"
-
-            if structured_path:
-                street_desc += ". The pedestrians appear to be following a defined walking path"
-
-            # Add cultural elements
-            street_desc += ". The signage and architectural elements suggest an Asian urban setting."
-
-            return street_desc
-
-        # Default general description
-        return "The scene contains various elements characteristic of this environment."
-
-    def _detect_viewpoint(self, detected_objects: List[Dict]) -> str:
-        """
-        Estimate the camera viewpoint from the layout of the detected objects.
-
-        Checks run in priority order and the first match wins:
-          1. pedestrians arranged like a crosswalk intersection -> "aerial"
-          2. pedestrians spread evenly over many regions -> "aerial"
-          3. threshold rules from ``self.viewpoint_params`` applied to the
-             top/bottom region ratios, the normalized size variance and the
-             average height/width ratio -> "aerial", "low_angle" or "elevated"
-          4. otherwise -> "eye_level"
-
-        Args:
-            detected_objects: Detections. Each dict is indexed by "region" and
-                "class_id"; "normalized_area", "normalized_size" (width, height)
-                and "normalized_center" (x, y) are used when present.
-
-        Returns:
-            str: One of "aerial", "low_angle", "elevated" or "eye_level".
-        """
-        if not detected_objects:
-            return "eye_level"  # nothing to analyse, fall back to the default
-
-        top_region_count = 0
-        bottom_region_count = 0
-        total_objects = len(detected_objects)
-
-        sizes = []                # normalized areas, for the size-variance test
-        height_width_ratios = []  # height / width, for the low-angle test
-        people_objs = []          # every detection with class_id 0 (person)
-        people_positions = []     # normalized centers of those people
-
-        for obj in detected_objects:
-            # Count objects lying in the top or bottom part of the frame.
-            region = obj["region"]
-            if "top" in region:
-                top_region_count += 1
-            elif "bottom" in region:
-                bottom_region_count += 1
-
-            if "normalized_area" in obj:
-                sizes.append(obj["normalized_area"])
-
-            if "normalized_size" in obj:
-                width, height = obj["normalized_size"]
-                if width > 0:
-                    height_width_ratios.append(height / width)
-
-            if obj["class_id"] == 0:  # person
-                people_objs.append(obj)
-                if "normalized_center" in obj:
-                    people_positions.append(obj["normalized_center"])
-
-        # Crosswalk / intersection detection: needs enough people to form a
-        # pattern, and enough of them with a known position.
-        crosswalk_pattern_detected = False
-        if len(people_objs) >= 8 and len(people_positions) >= 4:
-            x_coords = [pos[0] for pos in people_positions]
-            y_coords = [pos[1] for pos in people_positions]
-
-            x_range = max(x_coords) - min(x_coords)
-            y_range = max(y_coords) - min(y_coords)
-
-            # Cross-shaped spread: both axes cover a wide, similar extent.
-            # The ratio is only evaluated once y_range > 0.5, so it cannot
-            # divide by zero.
-            if x_range > 0.5 and y_range > 0.5 and 0.7 < (x_range / y_range) < 1.3:
-                center_x = np.mean(x_coords)
-                center_y = np.mean(y_coords)
-
-                # A point belongs to the cross when it lies near the vertical
-                # or the horizontal axis through the centroid.
-                close_to_axis_count = sum(
-                    1
-                    for x, y in zip(x_coords, y_coords)
-                    if abs(x - center_x) < 0.1 or abs(y - center_y) < 0.1
-                )
-
-                # Enough points near the axes -> treat it as an intersection.
-                if close_to_axis_count >= len(x_coords) * 0.6:
-                    crosswalk_pattern_detected = True
-
-            # No cross found: look for several linear clusters on both axes,
-            # which may indicate crossing crosswalks.
-            if not crosswalk_pattern_detected:
-                x_clusters = self._detect_linear_clusters(x_coords)
-                y_clusters = self._detect_linear_clusters(y_coords)
-                if len(x_clusters) >= 2 and len(y_clusters) >= 2:
-                    crosswalk_pattern_detected = True
-
-        # The crosswalk pattern takes priority over every other rule.
-        if crosswalk_pattern_detected:
-            return "aerial"
-
-        # Pedestrian distribution across regions.
-        if len(people_objs) >= 10:
-            people_region_counts = {}
-            for obj in people_objs:
-                region = obj["region"]
-                people_region_counts[region] = people_region_counts.get(region, 0) + 1
-
-            # Regions holding at least two people.
-            populated_regions = sum(
-                1 for count in people_region_counts.values() if count >= 2
-            )
-
-            if populated_regions >= 4:
-                region_counts = list(people_region_counts.values())
-                region_counts_mean = np.mean(region_counts)
-
-                # Evenly spread pedestrians suggest an aerial view. The measure
-                # is variance / mean (not std / mean), kept as-is so the 0.5
-                # threshold retains its meaning.
-                if region_counts_mean > 0:
-                    variation_coefficient = np.var(region_counts) / region_counts_mean
-                    if variation_coefficient < 0.5:
-                        return "aerial"
-
-        # total_objects is > 0 here because of the early return above.
-        top_ratio = top_region_count / total_objects
-        bottom_ratio = bottom_region_count / total_objects
-
-        # Size variance normalized by the squared mean.
-        size_variance = 0
-        if sizes:
-            mean_size = sum(sizes) / len(sizes)
-            size_variance = sum((s - mean_size) ** 2 for s in sizes) / len(sizes)
-            # Guard: all-zero areas give a zero mean, which previously raised
-            # ZeroDivisionError; the variance is left unnormalized in that case.
-            if mean_size > 0:
-                size_variance = size_variance / (mean_size ** 2)
-
-        avg_height_width_ratio = (
-            sum(height_width_ratios) / len(height_width_ratios)
-            if height_width_ratios else 1.0
-        )
-
-        # Aerial: low size variation, few objects at the bottom, many at the top.
-        if (size_variance < self.viewpoint_params["aerial_size_variance_threshold"] and
-            bottom_ratio < 0.3 and top_ratio > self.viewpoint_params["aerial_threshold"]):
-            return "aerial"
-
-        # Low angle: objects tend to be taller than wide, with many at the top.
-        elif (avg_height_width_ratio > self.viewpoint_params["vertical_size_ratio_threshold"] and
-            top_ratio > self.viewpoint_params["low_angle_threshold"]):
-            return "low_angle"
-
-        # Elevated: many objects at the bottom, few at the top.
-        elif (bottom_ratio > self.viewpoint_params["elevated_threshold"] and
-            top_ratio < self.viewpoint_params["elevated_top_threshold"]):
-            return "elevated"
-
-        # Default: eye level.
-        return "eye_level"
-
-    def _detect_linear_clusters(self, coords, threshold=0.05):
-        """
-        Group one-dimensional coordinates into clusters of close neighbours.
-
-        The coordinates are sorted, then split wherever the gap between two
-        consecutive values is not smaller than ``threshold``. Groups with
-        fewer than two members are discarded.
-
-        Args:
-            coords: List of one-dimensional coordinates.
-            threshold: Gap below which two consecutive values share a cluster.
-
-        Returns:
-            list: Clusters (each a sorted list of coordinates), in ascending order.
-        """
-        if not coords:
-            return []
-
-        ordered = sorted(coords)
-
-        # Start with the smallest value and walk over consecutive pairs.
-        groups = [[ordered[0]]]
-        for previous, current in zip(ordered, ordered[1:]):
-            if current - previous < threshold:
-                groups[-1].append(current)
-            else:
-                groups.append([current])
-
-        # A single isolated point does not form a cluster.
-        return [group for group in groups if len(group) >= 2]
-
-    def _detect_cultural_context(self, scene_type: str, detected_objects: List[Dict]) -> Optional[str]:
-        """
-        Map a scene type to the cultural context it explicitly implies.
-
-        Only the scene type is consulted; ``detected_objects`` is accepted
-        but not inspected.
-
-        Args:
-            scene_type: Identified scene type
-            detected_objects: List of detected objects (unused)
-
-        Returns:
-            Optional[str]: "asian" or "european" for the known scene types,
-            otherwise None
-        """
-        context_by_scene = {
-            "asian_commercial_street": "asian",
-            "asian_night_market": "asian",
-            "asian_temple_area": "asian",
-            "european_plaza": "european",
-        }
-
-        # Unknown scene types carry no specific cultural context.
-        return context_by_scene.get(scene_type)
-
-    def _generate_cultural_elements(self, cultural_context: str) -> str:
-        """
-        Build a sentence about cultural elements for the given context.
-
-        Looks the context up under ``self.templates["cultural_templates"]``,
-        randomly picks one or two of its "elements" and substitutes them into
-        the template's "description" format string.
-
-        Args:
-            cultural_context: Detected cultural context
-
-        Returns:
-            str: The filled-in description, or "" when the context has no
-            template or the template lists no elements
-        """
-        known_contexts = self.templates.get("cultural_templates", {})
-        if cultural_context not in known_contexts:
-            return ""
-
-        context_template = known_contexts[cultural_context]
-        candidates = context_template.get("elements", [])
-        if not candidates:
-            return ""
-
-        # Pick one or two elements, never more than are available.
-        pick_count = min(len(candidates), random.randint(1, 2))
-        picked = random.sample(candidates, pick_count)
-
-        if pick_count == 2:
-            elements_text = " and ".join(picked)
-        else:
-            elements_text = picked[0]
-
-        return context_template.get("description", "").format(elements=elements_text)
-
-    def _optimize_object_description(self, description: str) -> str:
-        """
-        Merge repeated items in "with ..." object lists into counted plurals.
-
-        For example "A room with chair, chair, and table." becomes
-        "A room with 2 chairs and table.". A list without repeated items is
-        returned untouched, so existing counts ("3 people"), wording and
-        spacing survive. Pluralization is naive: an "s" is appended.
-
-        Args:
-            description: Scene description text to tidy up.
-
-        Returns:
-            str: The description with duplicated list items merged.
-        """
-        # Drop the redundant "in the room" after a bed mention.
-        if "bed in the room" in description:
-            description = description.replace("a bed in the room", "a bed")
-
-        def _collapse(match):
-            """Rewrite one 'with <list>' span, merging duplicated items."""
-            prefix, list_text = match.group(1), match.group(2)
-
-            # Split on commas and the word "and"; drop the empty fragments
-            # left behind by ", and".
-            items = [part.strip() for part in re.split(r",|\band\b", list_text)]
-            items = [item for item in items if item]
-
-            # Count occurrences while keeping first-seen order.
-            item_counts = {}
-            for item in items:
-                item_counts[item] = item_counts.get(item, 0) + 1
-
-            # Nothing repeated: keep the original text exactly as written.
-            if len(item_counts) == len(items):
-                return match.group(0)
-
-            new_items = []
-            for item, count in item_counts.items():
-                if count > 1:
-                    # "a chair" x2 -> "2 chairs", not "2 a chairs".
-                    noun = re.sub(r"^(?:a|an)\s+", "", item)
-                    new_items.append(f"{count} {noun}s")
-                else:
-                    new_items.append(item)
-
-            if len(new_items) == 1:
-                new_list = new_items[0]
-            elif len(new_items) == 2:
-                new_list = f"{new_items[0]} and {new_items[1]}"
-            else:
-                new_list = ", ".join(new_items[:-1]) + f", and {new_items[-1]}"
-
-            # Preserve whitespace that followed the list (e.g. before a newline).
-            trailing = list_text[len(list_text.rstrip()):]
-            return f"{prefix}{new_list}{trailing}"
-
-        # Each list runs from "with" to the end of its sentence; only the
-        # matched span is rewritten, never other occurrences of the same text.
-        return re.sub(r"(\bwith\s+)([^.]+)", _collapse, description)
-
-    def _describe_functional_zones(self, functional_zones: Dict) -> str:
-        """
-        Describe the scene's functional zones in one or two sentences.
-
-        At most one pedestrian-related zone (the one with the most people) and
-        the two other zones holding the most objects are mentioned. When more
-        than five people are counted in total, a summary sentence states the
-        total and per-zone people counts smaller than it are stripped from the
-        zone descriptions to avoid contradictory numbers.
-
-        Args:
-            functional_zones: Mapping of zone name to zone info. A list of
-                zones is also accepted (named by each entry's "name" or
-                "zone_<index>"). Zone info dicts are read for "description"
-                and "objects"; non-dict zone values are treated as a bare
-                description.
-
-        Returns:
-            str: The zone description, or "" when there is nothing to describe.
-        """
-        if not functional_zones:
-            return ""
-
-        # Accept a list of zones by converting it to the dict layout.
-        if isinstance(functional_zones, list):
-            zones_dict = {}
-            for i, zone in enumerate(functional_zones):
-                if isinstance(zone, dict) and 'name' in zone:
-                    zone_name = zone['name']
-                else:
-                    zone_name = f"zone_{i}"
-                zones_dict[zone_name] = zone
-            functional_zones = zones_dict
-        elif not isinstance(functional_zones, dict):
-            return ""
-
-        # Normalize every zone value to a dict so the lookups below are safe
-        # for both input layouts.
-        functional_zones = {
-            name: info if isinstance(info, dict) else {"description": str(info)}
-            for name, info in functional_zones.items()
-        }
-
-        def _zone_objects(zone_info):
-            """Return the zone's object list, treating a missing/None value as empty."""
-            return zone_info.get("objects") or []
-
-        # People per zone and in total.
-        people_by_zone = {
-            name: _zone_objects(info).count("person")
-            for name, info in functional_zones.items()
-        }
-        total_people_count = sum(people_by_zone.values())
-
-        # Split zones into pedestrian-related zones and everything else,
-        # based on keywords in the zone name.
-        pedestrian_zones = []
-        other_zones = []
-        for zone_name, zone_info in functional_zones.items():
-            lowered_name = str(zone_name).lower()
-            if any(keyword in lowered_name for keyword in ["pedestrian", "crossing", "people"]):
-                pedestrian_zones.append((zone_name, zone_info))
-            else:
-                other_zones.append((zone_name, zone_info))
-
-        # At most one main pedestrian zone (most people) ...
-        main_pedestrian_zones = sorted(pedestrian_zones,
-                                    key=lambda z: people_by_zone.get(z[0], 0),
-                                    reverse=True)[:1]
-
-        # ... and at most two other zones (most objects).
-        top_other_zones = sorted(other_zones,
-                            key=lambda z: len(_zone_objects(z[1])),
-                            reverse=True)[:2]
-
-        top_zones = main_pedestrian_zones + top_other_zones
-        if not top_zones:
-            return ""
-
-        summary = ""
-        max_mentioned_people = 0  # largest people count already stated
-
-        # A significant crowd gets its own summary sentence.
-        if total_people_count > 5:
-            summary = f"The scene contains a significant number of pedestrians ({total_people_count} people). "
-            max_mentioned_people = total_people_count
-
-        # Match "with" as a whole word so "within"/"without" are not split.
-        with_pattern = re.compile(r"\bwith\b")
-
-        zone_descriptions = []
-        for zone_name, zone_info in top_zones:
-            zone_desc = str(zone_info.get("description") or "a functional zone")
-            zone_people_count = people_by_zone.get(zone_name, 0)
-
-            lowered_desc = zone_desc.lower()
-            contains_people_info = (
-                with_pattern.search(zone_desc) is not None
-                and ("person" in lowered_desc or "people" in lowered_desc)
-            )
-
-            # A per-zone count smaller than the total already mentioned would
-            # read as a contradiction, so drop the "with ..." part.
-            if contains_people_info and zone_people_count < max_mentioned_people:
-                head = with_pattern.split(zone_desc, maxsplit=1)[0].strip()
-                if not head:
-                    zone_desc = "a functional zone"
-                elif head.lower().endswith("area"):
-                    zone_desc = head  # avoid "... area area"
-                else:
-                    zone_desc = head + " area"
-
-            zone_descriptions.append(zone_desc)
-
-        # Phrase the final sentence according to the number of zones.
-        if len(zone_descriptions) == 1:
-            zones_sentence = f"The scene includes {zone_descriptions[0]}."
-        elif len(zone_descriptions) == 2:
-            zones_sentence = (
-                f"The scene is divided into two main areas: "
-                f"{zone_descriptions[0]} and {zone_descriptions[1]}."
-            )
-        else:
-            formatted_desc = ", ".join(zone_descriptions[:-1]) + f", and {zone_descriptions[-1]}"
-            zones_sentence = f"The scene contains multiple functional areas including {formatted_desc}."
-
-        return self._optimize_object_description(summary + zones_sentence)