Spaces:
Running
on
Zero
Running
on
Zero
import os | |
import re | |
import json | |
import logging | |
import random | |
import numpy as np | |
from typing import Dict, List, Tuple, Any, Optional | |
from scene_type import SCENE_TYPES | |
from scene_detail_templates import SCENE_DETAIL_TEMPLATES | |
from object_template_fillers import OBJECT_TEMPLATE_FILLERS | |
from lighting_conditions import LIGHTING_CONDITIONS | |
from viewpoint_templates import VIEWPOINT_TEMPLATES | |
from cultural_templates import CULTURAL_TEMPLATES | |
from confidence_templates import CONFIDENCE_TEMPLATES | |
from landmark_data import ALL_LANDMARKS | |
from region_analyzer import RegionAnalyzer | |
from viewpoint_detector import ViewpointDetector, ViewpointDetectionError | |
from template_manager import TemplateManager, TemplateLoadingError, TemplateFillError | |
from object_description_generator import ObjectDescriptionGenerator, ObjectDescriptionError | |
from cultural_context_analyzer import CulturalContextAnalyzer, CulturalContextError | |
from text_formatter import TextFormatter, TextFormattingError | |
class EnhancedSceneDescriberError(Exception):
    """Custom exception for failures in the scene description generation process."""
class EnhancedSceneDescriber:
    """
    Enhanced scene describer - the main entry point for producing detailed
    natural-language scene descriptions; the related helper classes are
    brought together here.

    This class coordinates several specialised components to generate
    high-quality scene descriptions, including viewpoint detection,
    template management, object description, cultural context analysis
    and text formatting.
    """
def __init__(self, templates_db: Optional[Dict] = None, scene_types: Optional[Dict] = None, spatial_analyzer_instance: Optional[Any] = None): | |
""" | |
初始化增強場景描述器 | |
Args: | |
templates_db: 可選的自定義模板數據庫 | |
scene_types: 場景類型定義字典 | |
spatial_analyzer_instance: 空間分析器實例(保持兼容性) | |
""" | |
self.logger = logging.getLogger(self.__class__.__name__) | |
self.logger.setLevel(logging.INFO) | |
# 如果沒有logger,就加一個 | |
if not self.logger.hasHandlers(): | |
handler = logging.StreamHandler() | |
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') | |
handler.setFormatter(formatter) | |
self.logger.addHandler(handler) | |
try: | |
# 載入場景類型定義 | |
self.scene_types = scene_types or self._load_default_scene_types() | |
# 初始化子組件 | |
self._initialize_components(templates_db) | |
# 保存空間分析器實例以保持兼容性 | |
self.spatial_analyzer_instance = spatial_analyzer_instance | |
self.logger.info("EnhancedSceneDescriber initialized successfully with %d scene types", | |
len(self.scene_types)) | |
except Exception as e: | |
error_msg = f"Failed to initialize EnhancedSceneDescriber: {str(e)}" | |
self.logger.error(f"{error_msg}\n{e.__class__.__name__}: {str(e)}") | |
raise EnhancedSceneDescriberError(error_msg) from e | |
def _load_default_scene_types(self) -> Dict:
    """
    Load the default scene type definitions.

    Returns:
        Dict: The module-level SCENE_TYPES mapping, or an empty dict when
        it cannot be resolved.
    """
    try:
        default_types = SCENE_TYPES
    except Exception as exc:
        self.logger.error(f"Failed to import SCENE_TYPES: {str(exc)}")
        default_types = {}  # fall back to an empty mapping
    return default_types
def _initialize_components(self, templates_db: Optional[Dict]):
    """
    Create every sub-component used by the describer.

    Args:
        templates_db: Optional template database handed to the
            TemplateManager as `custom_templates_db`.

    If any constructor raises, the error is logged and
    `_initialize_fallback_components` is called instead of propagating
    the exception.
    """
    try:
        # Viewpoint detection
        self.viewpoint_detector = ViewpointDetector()

        # Region analysis
        region_analyzer = RegionAnalyzer()
        self.region_analyzer = region_analyzer

        # Template management
        self.template_manager = TemplateManager(custom_templates_db=templates_db)

        # Object descriptions share the region analyzer created above
        self.object_description_generator = ObjectDescriptionGenerator(
            region_analyzer=region_analyzer
        )

        # Cultural context analysis
        self.cultural_context_analyzer = CulturalContextAnalyzer()

        # Text formatting
        self.text_formatter = TextFormatter()

        self.logger.debug("All components initialized successfully")
    except Exception as exc:
        self.logger.error(f"Component initialization failed: {str(exc)}")
        # Set up basic components rather than raising.
        self._initialize_fallback_components()
def generate_description(self, scene_type: str, detected_objects: List[Dict], confidence: float, | |
lighting_info: Dict, functional_zones: List[str], enable_landmark: bool = True, | |
scene_scores: Optional[Dict] = None, spatial_analysis: Optional[Dict] = None, | |
image_dimensions: Optional[Tuple[int, int]] = None, # 改為 Tuple | |
places365_info: Optional[Dict] = None, | |
object_statistics: Optional[Dict] = None) -> str: | |
try: | |
traffic_list = [obj for obj in detected_objects if obj.get("class_name", "") == "traffic light"] | |
# print(f"[DEBUG] generate_description 一開始接收到的 traffic light 數量: {len(traffic_list)}") # 原始的 print | |
self.logger.debug(f"Initial traffic light count in generate_description: {len(traffic_list)}") # 改用 logger | |
# for idx, tl in enumerate(traffic_list): # 這部分 log 可能過於詳細,先註解 | |
# self.logger.debug(f" idx={idx}, confidence={tl.get('confidence', 0):.4f}, bbox={tl.get('bbox')}, region={tl.get('region')}") | |
if scene_type == "unknown" or confidence < 0.4: | |
generic_desc = self._generate_generic_description(detected_objects, lighting_info) | |
return self.text_formatter.format_final_description(generic_desc) | |
current_detected_objects = detected_objects | |
if not enable_landmark: | |
current_detected_objects = [obj for obj in detected_objects if not obj.get("is_landmark", False)] | |
places365_context = "" | |
if places365_info and places365_info.get('confidence', 0) > 0.3: | |
scene_label = places365_info.get('scene_label', '') | |
attributes = places365_info.get('attributes', []) | |
is_indoor = places365_info.get('is_indoor', None) | |
if scene_label: | |
places365_context = f"Scene context: {scene_label}" | |
if attributes: | |
places365_context += f" with characteristics: {', '.join(attributes[:3])}" | |
if is_indoor is not None: | |
indoor_outdoor = "indoor" if is_indoor else "outdoor" | |
places365_context += f" ({indoor_outdoor} environment)" | |
self.logger.debug(f"Enhanced description incorporating Places365 context: {places365_context}") | |
landmark_objects_in_scene = [obj for obj in current_detected_objects if obj.get("is_landmark", False)] | |
has_landmark_in_scene = len(landmark_objects_in_scene) > 0 | |
if enable_landmark and (scene_type in ["tourist_landmark", "natural_landmark", "historical_monument"] or has_landmark_in_scene): | |
landmark_desc = self._generate_landmark_description( | |
scene_type, current_detected_objects, confidence, | |
lighting_info, functional_zones, landmark_objects_in_scene | |
) | |
return self.text_formatter.format_final_description(landmark_desc) | |
viewpoint = self.viewpoint_detector.detect_viewpoint(current_detected_objects) | |
current_scene_type = scene_type | |
if viewpoint == "aerial": | |
if "intersection" in current_scene_type.lower() or self._is_intersection(current_detected_objects): | |
current_scene_type = "aerial_view_intersection" | |
elif any(keyword in current_scene_type.lower() for keyword in ["commercial", "shopping", "retail"]): | |
current_scene_type = "aerial_view_commercial_area" | |
elif any(keyword in current_scene_type.lower() for keyword in ["plaza", "square"]): | |
current_scene_type = "aerial_view_plaza" | |
else: | |
current_scene_type = "aerial_view_general" | |
current_scene_type = self._sanitize_scene_type_for_description(current_scene_type) | |
# 偵測文化背景資訊 | |
cultural_context = None | |
if viewpoint != "aerial": | |
cultural_context = self.cultural_context_analyzer.detect_cultural_context(current_scene_type, current_detected_objects) | |
# 設定基礎描述 | |
base_description = "A scene" | |
if viewpoint == "aerial": | |
if current_scene_type in self.scene_types: # 確保 self.scene_types 已有 | |
base_description = self.scene_types.get(current_scene_type, {}).get("description", "An aerial view showing the layout and movement patterns from above") | |
else: | |
base_description = "An aerial view showing the layout and movement patterns from above" | |
elif current_scene_type in self.scene_types: # 確保 self.scene_types 已有 | |
base_description = self.scene_types.get(current_scene_type, {}).get("description", "A scene") | |
# 假設 template_manager 內部可以處理 List[str] 的 functional_zones | |
selected_template = self.template_manager.get_template_by_scene_type( | |
scene_type=current_scene_type, | |
detected_objects=current_detected_objects, | |
functional_zones=functional_zones or [] # 傳入 List[str] | |
) | |
# 用於 fill_template 中的某些佔位符 | |
processed_functional_zones = {} | |
if functional_zones: | |
if isinstance(functional_zones, dict): # 如果外部傳入的就是dict | |
processed_functional_zones = functional_zones | |
elif isinstance(functional_zones, list): # 如果是 list of strings | |
processed_functional_zones = {f"zone_{i}": {"description": zone_desc} for i, zone_desc in enumerate(functional_zones)} | |
# 組織場景資料 | |
scene_data = { | |
"detected_objects": current_detected_objects, | |
"functional_zones": processed_functional_zones, # 傳入處理過的字典 | |
"scene_type": current_scene_type, | |
"object_statistics": object_statistics or {}, | |
"lighting_info": lighting_info, | |
"spatial_analysis": spatial_analysis, | |
"places365_info": places365_info | |
} | |
# 應用模板產生核心場景描述 | |
core_scene_details = self.template_manager.apply_template(selected_template, scene_data) | |
# 組合基礎描述與核心場景細節 | |
description = base_description | |
if core_scene_details and core_scene_details.strip(): | |
cleaned_scene_details = self._validate_and_clean_scene_details(core_scene_details) | |
if base_description.lower() == "a scene" and len(cleaned_scene_details) > len(base_description): | |
description = cleaned_scene_details | |
else: | |
description = self.text_formatter.smart_append(description, cleaned_scene_details) | |
elif not core_scene_details and not description: # 如果兩者都為空 | |
description = self._generate_generic_description(current_detected_objects, lighting_info) | |
# 添加次要描述資訊 | |
if current_scene_type in self.scene_types and "secondary_description" in self.scene_types[current_scene_type]: | |
secondary_desc = self.scene_types[current_scene_type]["secondary_description"] | |
if secondary_desc: | |
description = self.text_formatter.smart_append(description, secondary_desc) | |
# 處理人物相關的描述 | |
people_objs = [obj for obj in current_detected_objects if obj.get("class_id") == 0] | |
if people_objs: | |
people_count = len(people_objs) | |
if people_count == 1: people_phrase = "a single person" | |
elif 1 < people_count <= 3: people_phrase = f"{people_count} people" | |
elif 3 < people_count <= 7: people_phrase = "several people" | |
else: people_phrase = "multiple people" | |
if not any(p_word in description.lower() for p_word in ["person", "people", "pedestrian"]): | |
description = self.text_formatter.smart_append(description, f"The scene includes {people_phrase}.") | |
# 添加文化背景元素(非空中視角) | |
if cultural_context and viewpoint != "aerial": | |
cultural_elements = self.cultural_context_analyzer.generate_cultural_elements(cultural_context) | |
if cultural_elements: | |
description = self.text_formatter.smart_append(description, cultural_elements) | |
# 處理光照條件描述 | |
lighting_description_text = "" | |
if lighting_info and "time_of_day" in lighting_info: | |
lighting_type = lighting_info["time_of_day"] | |
lighting_desc_template = self.template_manager.get_lighting_template(lighting_type) | |
if lighting_desc_template: lighting_description_text = lighting_desc_template | |
if lighting_description_text and lighting_description_text.lower() not in description.lower(): | |
description = self.text_formatter.smart_append(description, lighting_description_text) | |
# 添加視角特定的觀察描述 | |
if viewpoint != "eye_level": | |
viewpoint_template = self.template_manager.get_viewpoint_template(viewpoint) | |
prefix = viewpoint_template.get('prefix', '') | |
observation_template = viewpoint_template.get("observation", "") | |
scene_elements_for_vp = "the overall layout and objects" | |
if viewpoint == "aerial": scene_elements_for_vp = "crossing patterns and general layout" | |
viewpoint_observation_text = observation_template.format(scene_elements=scene_elements_for_vp) | |
full_viewpoint_text = "" | |
if prefix: | |
full_viewpoint_text = prefix.strip() + " " | |
if viewpoint_observation_text and viewpoint_observation_text[0].islower(): | |
full_viewpoint_text += viewpoint_observation_text | |
elif viewpoint_observation_text: | |
full_viewpoint_text = prefix + (viewpoint_observation_text[0].lower() + viewpoint_observation_text[1:] if description else viewpoint_observation_text) | |
elif viewpoint_observation_text: | |
full_viewpoint_text = viewpoint_observation_text[0].upper() + viewpoint_observation_text[1:] | |
if full_viewpoint_text and full_viewpoint_text.lower() not in description.lower(): | |
description = self.text_formatter.smart_append(description, full_viewpoint_text) | |
# 需要轉換或調整 describe_functional_zones | |
if functional_zones and len(functional_zones) > 0: | |
if isinstance(functional_zones, dict): | |
zones_desc_text = self.object_description_generator.describe_functional_zones(functional_zones) | |
else: # 如果是 list of strings | |
temp_zones_dict = {f"area_{i}": {"description": desc} for i, desc in enumerate(functional_zones)} | |
zones_desc_text = self.object_description_generator.describe_functional_zones(temp_zones_dict) | |
if zones_desc_text: | |
description = self.text_formatter.smart_append(description, zones_desc_text) | |
# 避免重複提到 | |
if hasattr(self.text_formatter, 'deduplicate_sentences_in_description'): | |
deduplicated_description = self.text_formatter.deduplicate_sentences_in_description(description) | |
self.logger.info(f"Description before pre-LLM deduplication (len {len(description)}): '{description[:150]}...'") | |
self.logger.info(f"Description after pre-LLM deduplication (len {len(deduplicated_description)}): '{deduplicated_description[:150]}...'") | |
description = deduplicated_description # 更新 description 為去除重複後的版本 | |
else: | |
self.logger.warning("TextFormatter does not have 'deduplicate_sentences_in_description'. Skipping pre-LLM deduplication of the internally generated description.") | |
# 格式化最終描述 | |
final_formatted_description = self.text_formatter.format_final_description(description) | |
# 如果禁用地標,過濾地標引用 | |
if not enable_landmark: | |
final_formatted_description = self.text_formatter.filter_landmark_references(final_formatted_description, enable_landmark=False) | |
# 如果描述為空,使用備用描述 | |
if not final_formatted_description.strip() or final_formatted_description.strip() == ".": | |
self.logger.warning(f"Description for scene_type '{current_scene_type}' became empty after processing. Falling back.") | |
final_formatted_description = self.text_formatter.format_final_description( | |
self._generate_generic_description(current_detected_objects, lighting_info) | |
) | |
return final_formatted_description | |
except Exception as e: | |
error_msg = f"Error generating scene description: {str(e)}" | |
self.logger.error(f"{error_msg}\n{e.__class__.__name__}: {str(e)}") | |
try: | |
fallback_desc = self._generate_generic_description(detected_objects, lighting_info) | |
return self.text_formatter.format_final_description(fallback_desc) | |
except: | |
return "A scene with various elements is visible." | |
def _extract_placeholders(self, template: str) -> List[str]: | |
"""提取模板中的佔位符""" | |
import re | |
return re.findall(r'\{([^}]+)\}', template) | |
def _generate_placeholder_content(self, placeholder: str, detected_objects: List[Dict], | |
functional_zones: List, scene_type: str, | |
object_statistics: Dict) -> str: | |
"""生成佔位符內容""" | |
all_replacements = self._generate_default_replacements() | |
return self._get_placeholder_replacement( | |
placeholder, {}, all_replacements, detected_objects, scene_type | |
) | |
def _preprocess_functional_zones(self, functional_zones: List) -> Dict: | |
"""預處理功能區域數據""" | |
if isinstance(functional_zones, list): | |
# 將列表轉換為字典格式 | |
zones_dict = {} | |
for i, zone in enumerate(functional_zones): | |
if isinstance(zone, str): | |
zones_dict[f"area {i+1}"] = {"description": zone} | |
elif isinstance(zone, dict): | |
zones_dict[f"area {i+1}"] = zone | |
return zones_dict | |
elif isinstance(functional_zones, dict): | |
return functional_zones | |
else: | |
return {} | |
def _standardize_placeholder_content(self, content: str, placeholder_type: str) -> str: | |
"""標準化佔位符內容""" | |
if not content: | |
return "various elements" | |
return content.strip() | |
def _finalize_description_output(self, description: str) -> str: | |
"""最終化描述輸出""" | |
if not description: | |
return "A scene featuring various elements and organized areas of activity." | |
# 基本清理 | |
import re | |
finalized = re.sub(r'\s+', ' ', description).strip() | |
# 確保適當結尾 | |
if finalized and not finalized.endswith(('.', '!', '?')): | |
finalized += '.' | |
# 首字母大寫 | |
if finalized: | |
finalized = finalized[0].upper() + finalized[1:] if len(finalized) > 1 else finalized.upper() | |
return finalized | |
def _sanitize_scene_type_for_description(self, scene_type: str) -> str: | |
""" | |
清理場景類型名稱,確保不包含內部標識符格式 | |
Args: | |
scene_type: 原始場景類型名稱 | |
Returns: | |
str: 清理後的場景類型名稱 | |
""" | |
try: | |
# 移除下劃線並轉換為空格分隔的自然語言 | |
cleaned_type = scene_type.replace('_', ' ') | |
# 確保不直接在描述中使用技術性場景類型名稱 | |
return cleaned_type | |
except Exception as e: | |
self.logger.warning(f"Error sanitizing scene type '{scene_type}': {str(e)}") | |
return "general scene" | |
def _validate_and_clean_scene_details(self, scene_details: str) -> str: | |
""" | |
驗證並清理場景詳細信息,移除可能的模板填充錯誤 | |
Args: | |
scene_details: 原始場景詳細信息 | |
Returns: | |
str: 清理後的場景詳細信息 | |
""" | |
try: | |
if not scene_details or not scene_details.strip(): | |
return "" | |
cleaned = scene_details.strip() | |
# 移除常見的模板填充錯誤模式 | |
import re | |
# 修復 "In ," 類型的錯誤 | |
cleaned = re.sub(r'\bIn\s*,\s*', 'In this scene, ', cleaned) | |
cleaned = re.sub(r'\bAt\s*,\s*', 'At this location, ', cleaned) | |
cleaned = re.sub(r'\bWithin\s*,\s*', 'Within this area, ', cleaned) | |
# 移除內部標識符格式 | |
cleaned = re.sub(r'\b\w+_\w+(?:_\w+)*\b(?!\s+(area|zone|region))', | |
lambda m: m.group(0).replace('_', ' '), cleaned) | |
# 確保句子完整性 | |
if cleaned and not cleaned.endswith(('.', '!', '?')): | |
cleaned += '.' | |
return cleaned | |
except Exception as e: | |
self.logger.warning(f"Error validating scene details: {str(e)}") | |
return scene_details if scene_details else "" | |
def _generate_landmark_description(self, | |
scene_type: str, | |
detected_objects: List[Dict], | |
confidence: float, | |
lighting_info: Optional[Dict] = None, | |
functional_zones: Optional[Dict] = None, | |
landmark_objects: Optional[List[Dict]] = None) -> str: | |
""" | |
生成包含地標信息的場景描述 | |
Args: | |
scene_type: 識別的場景類型 | |
detected_objects: 檢測到的物件列表 | |
confidence: 場景分類置信度 | |
lighting_info: 照明條件信息 | |
functional_zones: 功能區域信息 | |
landmark_objects: 識別為地標的物件列表 | |
Returns: | |
str: 包含地標信息的自然語言場景描述 | |
""" | |
try: | |
# 如果沒有提供地標物件,從檢測物件中篩選 | |
if landmark_objects is None: | |
landmark_objects = [obj for obj in detected_objects if obj.get("is_landmark", False)] | |
# 如果沒有地標,退回到標準描述 | |
if not landmark_objects: | |
if scene_type in ["tourist_landmark", "natural_landmark", "historical_monument"]: | |
base_description = "A scenic area that appears to be a tourist destination, though specific landmarks are not clearly identifiable." | |
else: | |
return self.text_formatter.format_final_description(self._generate_scene_details( | |
scene_type, | |
detected_objects, | |
lighting_info, | |
self.viewpoint_detector.detect_viewpoint(detected_objects) | |
)) | |
else: | |
# 獲取主要地標 | |
primary_landmark = max(landmark_objects, key=lambda x: x.get("confidence", 0)) | |
landmark_name = primary_landmark.get("class_name", "landmark") | |
# 先取原生 location | |
landmark_location = primary_landmark.get("location", "") | |
# 如果 location 為空,就從全域 ALL_LANDMARKS 補上 | |
lm_id = primary_landmark.get("landmark_id") | |
if not landmark_location and lm_id and lm_id in ALL_LANDMARKS: | |
landmark_location = ALL_LANDMARKS[lm_id].get("location", "") | |
# 根據地標類型選擇適當的描述模板,並插入 location | |
if scene_type == "natural_landmark" or primary_landmark.get("landmark_type") == "natural": | |
base_description = f"A natural landmark scene featuring {landmark_name} in {landmark_location}." | |
elif scene_type == "historical_monument" or primary_landmark.get("landmark_type") == "monument": | |
base_description = f"A historical monument scene showcasing {landmark_name}, a significant landmark in {landmark_location}." | |
else: | |
base_description = f"A tourist landmark scene centered around {landmark_name}, an iconic structure in {landmark_location}." | |
# 添加地標的額外信息 | |
landmark_details = [] | |
for landmark in landmark_objects: | |
details = [] | |
if "year_built" in landmark: | |
details.append(f"built in {landmark['year_built']}") | |
if "architectural_style" in landmark: | |
details.append(f"featuring {landmark['architectural_style']} architectural style") | |
if "significance" in landmark: | |
details.append(landmark["significance"]) | |
# 補 location(如果該物件沒有 location,就再從 ALL_LANDMARKS 撈一次) | |
loc = landmark.get("location", "") | |
lm_id_iter = landmark.get("landmark_id") | |
if not loc and lm_id_iter and lm_id_iter in ALL_LANDMARKS: | |
loc = ALL_LANDMARKS[lm_id_iter].get("location", "") | |
if loc: | |
details.append(f"located in {loc}") | |
if details: | |
landmark_details.append(f"{landmark['class_name']} ({', '.join(details)})") | |
# 將詳細信息添加到基本描述中 | |
if landmark_details: | |
description = base_description + " The scene features " + ", ".join(landmark_details) + "." | |
else: | |
description = base_description | |
# 獲取視角 | |
viewpoint = self.viewpoint_detector.detect_viewpoint(detected_objects) | |
# 生成人員活動描述 | |
people_count = len([obj for obj in detected_objects if obj["class_id"] == 0]) | |
if people_count > 0: | |
if people_count == 1: | |
people_description = "There is one person in the scene, likely a tourist or visitor." | |
elif people_count < 5: | |
people_description = f"There are {people_count} people in the scene, possibly tourists visiting the landmark." | |
else: | |
people_description = f"The scene includes a group of {people_count} people, indicating this is a popular tourist destination." | |
description = self.text_formatter.smart_append(description, people_description) | |
# 添加照明信息 | |
if lighting_info and "time_of_day" in lighting_info: | |
lighting_type = lighting_info["time_of_day"] | |
lighting_description = self.template_manager.get_lighting_template(lighting_type) | |
description = self.text_formatter.smart_append(description, lighting_description) | |
# 添加視角描述 | |
if viewpoint != "eye_level": | |
viewpoint_template = self.template_manager.get_viewpoint_template(viewpoint) | |
prefix = viewpoint_template.get('prefix', '') | |
if prefix and not description.startswith(prefix): | |
if description and description[0].isupper(): | |
description = prefix + description[0].lower() + description[1:] | |
else: | |
description = prefix + description | |
viewpoint_desc = viewpoint_template.get("observation", "").format( | |
scene_elements="the landmark and surrounding area" | |
) | |
if viewpoint_desc and viewpoint_desc not in description: | |
description = self.text_formatter.smart_append(description, viewpoint_desc) | |
# 添加功能區域描述 | |
if functional_zones and len(functional_zones) > 0: | |
zones_desc = self.object_description_generator.describe_functional_zones(functional_zones) | |
if zones_desc: | |
description = self.text_formatter.smart_append(description, zones_desc) | |
# 描述可能的活動 | |
landmark_activities = [] | |
if scene_type == "natural_landmark" or any(obj.get("landmark_type") == "natural" for obj in landmark_objects): | |
landmark_activities = [ | |
"nature photography", | |
"scenic viewing", | |
"hiking or walking", | |
"guided nature tours", | |
"outdoor appreciation" | |
] | |
elif scene_type == "historical_monument" or any(obj.get("landmark_type") == "monument" for obj in landmark_objects): | |
landmark_activities = [ | |
"historical sightseeing", | |
"educational tours", | |
"cultural appreciation", | |
"photography of historical architecture", | |
"learning about historical significance" | |
] | |
else: | |
landmark_activities = [ | |
"sightseeing", | |
"taking photographs", | |
"guided tours", | |
"cultural tourism", | |
"souvenir shopping" | |
] | |
# 添加活動描述 | |
if landmark_activities: | |
activities_text = "Common activities at this location include " + ", ".join(landmark_activities[:3]) + "." | |
description = self.text_formatter.smart_append(description, activities_text) | |
return self.text_formatter.format_final_description(description) | |
except Exception as e: | |
self.logger.warning(f"Error generating landmark description: {str(e)}") | |
# 備用處理 | |
return self.text_formatter.format_final_description( | |
"A landmark scene with notable architectural or natural features." | |
) | |
def _is_intersection(self, detected_objects: List[Dict]) -> bool: | |
""" | |
通過分析物件分布來判斷場景是否為十字路口 | |
Args: | |
detected_objects: 檢測到的物件列表 | |
Returns: | |
bool: 是否為十字路口 | |
""" | |
try: | |
pedestrians = [obj for obj in detected_objects if obj.get("class_id") == 0] | |
if len(pedestrians) >= 8: | |
positions = [obj.get("normalized_center", (0, 0)) for obj in pedestrians] | |
x_coords = [pos[0] for pos in positions] | |
y_coords = [pos[1] for pos in positions] | |
x_variance = np.var(x_coords) if len(x_coords) > 1 else 0 | |
y_variance = np.var(y_coords) if len(y_coords) > 1 else 0 | |
x_range = max(x_coords) - min(x_coords) | |
y_range = max(y_coords) - min(y_coords) | |
if x_range > 0.5 and y_range > 0.5 and 0.7 < (x_range / y_range) < 1.3: | |
return True | |
return False | |
except Exception as e: | |
self.logger.warning(f"Error detecting intersection: {str(e)}") | |
return False | |
def _generate_generic_description(self, detected_objects: List[Dict], lighting_info: Optional[Dict] = None) -> str: | |
""" | |
當場景類型未知或置信度極低時生成通用描述 | |
Args: | |
detected_objects: 檢測到的物件列表 | |
lighting_info: 可選的照明條件信息 | |
Returns: | |
str: 基於檢測物件的通用描述 | |
""" | |
try: | |
obj_counts = {} | |
for obj in detected_objects: | |
class_name = obj.get("class_name", "unknown object") | |
if class_name not in obj_counts: | |
obj_counts[class_name] = 0 | |
obj_counts[class_name] += 1 | |
top_objects = sorted(obj_counts.items(), key=lambda x: x[1], reverse=True)[:5] | |
if not top_objects: | |
base_desc = "This scene displays various elements, though specific objects are not clearly identifiable." | |
else: | |
objects_text = [] | |
for name, count in top_objects: | |
# 確保物件名稱不包含技術性格式 | |
clean_name = name.replace('_', ' ') if isinstance(name, str) else str(name) | |
if count > 1: | |
objects_text.append(f"{count} {clean_name}s") | |
else: | |
objects_text.append(f"a {clean_name}" if clean_name[0].lower() not in 'aeiou' else f"an {clean_name}") | |
if len(objects_text) == 1: | |
objects_list = objects_text[0] | |
elif len(objects_text) == 2: | |
objects_list = f"{objects_text[0]} and {objects_text[1]}" | |
else: | |
objects_list = ", ".join(objects_text[:-1]) + f", and {objects_text[-1]}" | |
base_desc = f"This scene features {objects_list}." | |
# 添加照明信息 | |
if lighting_info and "time_of_day" in lighting_info: | |
lighting_type = lighting_info["time_of_day"] | |
lighting_desc = self.template_manager.get_lighting_template(lighting_type) | |
base_desc += f" {lighting_desc}" | |
return base_desc | |
except Exception as e: | |
self.logger.warning(f"Error generating generic description: {str(e)}") | |
return "A general scene is visible with various elements." | |
def _generate_scene_details(self, | |
scene_type: str, | |
detected_objects: List[Dict], | |
lighting_info: Optional[Dict] = None, | |
viewpoint: str = "eye_level", | |
spatial_analysis: Optional[Dict] = None, | |
image_dimensions: Optional[Tuple[int, int]] = None, | |
places365_info: Optional[Dict] = None, | |
object_statistics: Optional[Dict] = None) -> str: | |
""" | |
基於場景類型和檢測物件生成詳細描述 | |
Args: | |
scene_type: 識別的場景類型 | |
detected_objects: 檢測到的物件列表 | |
lighting_info: 可選的照明條件信息 | |
viewpoint: 檢測到的視角 | |
spatial_analysis: 可選的空間分析結果 | |
image_dimensions: 可選的圖像尺寸 | |
places365_info: 可選的 Places365 場景分類結果 | |
object_statistics: 可選的詳細物件統計信息 | |
Returns: | |
str: 詳細場景描述 | |
""" | |
try: | |
scene_details = "" | |
# 日常場景類型列表 | |
everyday_scene_types = [ | |
"general_indoor_space", "generic_street_view", | |
"desk_area_workspace", "outdoor_gathering_spot", | |
"kitchen_counter_or_utility_area", "unknown" | |
] | |
# 預處理場景類型以避免內部格式洩漏 | |
processed_scene_type = self._sanitize_scene_type_for_description(scene_type) | |
# 確定場景描述方法 | |
is_confident_specific_scene = scene_type not in everyday_scene_types and scene_type in self.template_manager.get_scene_detail_templates(scene_type) | |
treat_as_everyday = scene_type in everyday_scene_types | |
if hasattr(self, 'enable_landmark') and not self.enable_landmark: | |
if scene_type not in ["kitchen", "bedroom", "living_room", "office_workspace", "dining_area", "professional_kitchen"]: | |
treat_as_everyday = True | |
if treat_as_everyday or not is_confident_specific_scene: | |
self.logger.debug(f"Generating dynamic description for scene_type: {scene_type}") | |
scene_details = self.object_description_generator.generate_dynamic_everyday_description( | |
detected_objects, | |
lighting_info, | |
viewpoint, | |
spatial_analysis, | |
image_dimensions, | |
places365_info, | |
object_statistics | |
) | |
else: | |
self.logger.debug(f"Using template for scene_type: {scene_type}") | |
templates_list = self.template_manager.get_scene_detail_templates(scene_type, viewpoint) | |
if templates_list: | |
detail_template = random.choice(templates_list) | |
scene_details = self.template_manager.fill_template( | |
detail_template, | |
detected_objects, | |
scene_type, | |
places365_info, | |
object_statistics | |
) | |
else: | |
scene_details = self.object_description_generator.generate_dynamic_everyday_description( | |
detected_objects, lighting_info, viewpoint, spatial_analysis, | |
image_dimensions, places365_info, object_statistics | |
) | |
# 如果禁用地標檢測,過濾地標引用 | |
if hasattr(self, 'enable_landmark') and not self.enable_landmark: | |
scene_details = self.text_formatter.filter_landmark_references(scene_details, enable_landmark=False) | |
return scene_details if scene_details else "A scene with some visual elements." | |
except Exception as e: | |
self.logger.warning(f"Error generating scene details: {str(e)}") | |
return "A scene with various elements." | |
def filter_landmark_references(self, text, enable_landmark=True): | |
""" | |
動態過濾文本中的地標引用 | |
Args: | |
text: 需要過濾的文本 | |
enable_landmark: 是否啟用地標功能 | |
Returns: | |
str: 過濾後的文本 | |
""" | |
return self.text_formatter.filter_landmark_references(text, enable_landmark) | |
def get_prominent_objects(self, detected_objects: List[Dict], | |
min_prominence_score: float = 0.5, | |
max_categories_to_return: Optional[int] = None, | |
max_total_objects: Optional[int] = None) -> List[Dict]: | |
""" | |
獲取最重要的物件 | |
Args: | |
detected_objects: 檢測到的物件列表 | |
min_prominence_score: 最小重要性分數閾值,預設為0.5 | |
max_categories_to_return: 可選的最大返回類別數量限制 | |
max_total_objects: 可選的最大返回物件總數限制 | |
Returns: | |
List[Dict]: 重要物件列表 | |
""" | |
try: | |
# 傳遞所有參數 | |
prominent_objects = self.object_description_generator.get_prominent_objects( | |
detected_objects, | |
min_prominence_score, | |
max_categories_to_return | |
) | |
# 如果指定了最大物件總數限制,進行額外過濾 | |
if max_total_objects is not None and max_total_objects > 0: | |
# 限制總物件數量,保持重要性排序 | |
prominent_objects = prominent_objects[:max_total_objects] | |
# 如果指定了最大類別數量限制,則進行額外過濾 | |
if max_categories_to_return is not None and max_categories_to_return > 0: | |
# 按類別分組物件 | |
categories_seen = set() | |
filtered_objects = [] | |
for obj in prominent_objects: | |
class_name = obj.get("class_name", "unknown") | |
if class_name not in categories_seen: | |
categories_seen.add(class_name) | |
filtered_objects.append(obj) | |
# 如果已達到最大類別數量,停止添加新類別 | |
if len(categories_seen) >= max_categories_to_return: | |
break | |
elif class_name in categories_seen: | |
# 如果是已見過的類別,仍然添加該物件 | |
filtered_objects.append(obj) | |
return filtered_objects | |
return prominent_objects | |
except Exception as e: | |
self.logger.warning(f"Error getting prominent objects: {str(e)}") | |
return [] | |
def detect_viewpoint(self, detected_objects: List[Dict]) -> str:
    """
    Determine the viewpoint type of the image.

    Delegates to the viewpoint detector. Any exception is logged as a
    warning and replaced by the default viewpoint.

    Args:
        detected_objects: List of detected object dictionaries.

    Returns:
        str: The detected viewpoint type, or "eye_level" on failure.
    """
    try:
        viewpoint = self.viewpoint_detector.detect_viewpoint(detected_objects)
    except Exception as e:
        self.logger.warning(f"Error detecting viewpoint: {str(e)}")
        return "eye_level"
    return viewpoint
def detect_cultural_context(self, scene_type: str, detected_objects: List[Dict]) -> Optional[str]:
    """
    Detect the cultural context of a scene.

    Delegates to the cultural context analyzer. A CulturalContextError is
    logged as a warning and turned into None; other exceptions propagate.

    Args:
        scene_type: The identified scene type.
        detected_objects: List of detected object dictionaries.

    Returns:
        Optional[str]: The detected cultural context, or None.
    """
    analyzer = self.cultural_context_analyzer
    try:
        context = analyzer.detect_cultural_context(scene_type, detected_objects)
    except CulturalContextError as e:
        self.logger.warning(f"Error detecting cultural context: {str(e)}")
        return None
    return context
def generate_cultural_elements(self, cultural_context: str) -> str:
    """
    Produce descriptive elements for a detected cultural context.

    Delegates to the cultural context analyzer. A CulturalContextError is
    logged as a warning and replaced by an empty string.

    Args:
        cultural_context: The detected cultural context.

    Returns:
        str: The cultural element description, or "" on failure.
    """
    analyzer = self.cultural_context_analyzer
    try:
        elements = analyzer.generate_cultural_elements(cultural_context)
    except CulturalContextError as e:
        self.logger.warning(f"Error generating cultural elements: {str(e)}")
        return ""
    return elements
def format_object_list_for_description(self, objects: List[Dict],
                                       use_indefinite_article_for_one: bool = False,
                                       count_threshold_for_generalization: int = -1,
                                       max_types_to_list: int = 5) -> str:
    """
    Format a list of objects as a human-readable string.

    Forwards all arguments positionally to the object description
    generator. An ObjectDescriptionError is logged as a warning and
    replaced by a generic phrase.

    Args:
        objects: List of object dictionaries.
        use_indefinite_article_for_one: Whether a single object gets "a/an".
        count_threshold_for_generalization: Count threshold.
        max_types_to_list: Maximum number of object types to list.

    Returns:
        str: The formatted object description, or "various objects" on failure.
    """
    generator = self.object_description_generator
    try:
        formatted = generator.format_object_list_for_description(
            objects,
            use_indefinite_article_for_one,
            count_threshold_for_generalization,
            max_types_to_list,
        )
    except ObjectDescriptionError as e:
        self.logger.warning(f"Error formatting object list: {str(e)}")
        return "various objects"
    return formatted
def get_spatial_description(self, obj: Dict, image_width: Optional[int] = None,
                            image_height: Optional[int] = None) -> str:
    """
    Build a spatial position description for an object.

    Delegates to the object description generator. An
    ObjectDescriptionError is logged as a warning and replaced by a
    generic location phrase.

    Args:
        obj: Object dictionary.
        image_width: Optional image width.
        image_height: Optional image height.

    Returns:
        str: The spatial description, or "in the scene" on failure.
    """
    generator = self.object_description_generator
    try:
        location_text = generator.get_spatial_description(obj, image_width, image_height)
    except ObjectDescriptionError as e:
        self.logger.warning(f"Error generating spatial description: {str(e)}")
        return "in the scene"
    return location_text
def optimize_object_description(self, description: str) -> str:
    """
    Optimize an object description to avoid listing the same objects repeatedly.

    Delegates to the object description generator. On an
    ObjectDescriptionError the warning is logged and the input text is
    returned unchanged.

    Args:
        description: The original description text.

    Returns:
        str: The optimized description, or the original text on failure.
    """
    generator = self.object_description_generator
    try:
        optimized = generator.optimize_object_description(description)
    except ObjectDescriptionError as e:
        self.logger.warning(f"Error optimizing object description: {str(e)}")
        return description
    return optimized
def describe_functional_zones(self, functional_zones: Dict) -> str:
    """
    Generate a description of the scene's functional zones.

    Delegates to the object description generator. An
    ObjectDescriptionError is logged as a warning and replaced by an
    empty string.

    Args:
        functional_zones: Dictionary of identified functional zones.

    Returns:
        str: The functional zone description, or "" on failure.
    """
    generator = self.object_description_generator
    try:
        zones_text = generator.describe_functional_zones(functional_zones)
    except ObjectDescriptionError as e:
        self.logger.warning(f"Error describing functional zones: {str(e)}")
        return ""
    return zones_text
def smart_append(self, current_text: str, new_fragment: str) -> str:
    """
    Append a new text fragment to existing text.

    Delegates to the text formatter. On a TextFormattingError the warning
    is logged and a simple fallback is used: the two parts joined by a
    single space, or just the fragment when the existing text is empty.

    Args:
        current_text: Existing text to append to.
        new_fragment: New text fragment to append.

    Returns:
        str: The combined text.
    """
    try:
        combined = self.text_formatter.smart_append(current_text, new_fragment)
    except TextFormattingError as e:
        self.logger.warning(f"Error in smart append: {str(e)}")
        if current_text:
            return f"{current_text} {new_fragment}"
        return new_fragment
    return combined
def format_final_description(self, text: str) -> str:
    """
    Format the final description text.

    Delegates to the text formatter. On a TextFormattingError the warning
    is logged and the input text is returned unchanged.

    Args:
        text: Text to format.

    Returns:
        str: The formatted text, or the original text on failure.
    """
    try:
        formatted = self.text_formatter.format_final_description(text)
    except TextFormattingError as e:
        self.logger.warning(f"Error formatting final description: {str(e)}")
        return text
    return formatted
def get_template(self, category: str, key: Optional[str] = None):
    """
    Fetch a template from the given category.

    Delegates to the template manager. A TemplateLoadingError or
    TemplateFillError is logged as a warning and turned into None.

    Args:
        category: Template category name.
        key: Optional specific template key.

    Returns:
        The template content, or None on failure.
    """
    manager = self.template_manager
    try:
        template = manager.get_template(category, key)
    except (TemplateLoadingError, TemplateFillError) as e:
        self.logger.warning(f"Error getting template: {str(e)}")
        return None
    return template
def get_viewpoint_confidence(self, detected_objects: List[Dict]) -> Tuple[str, float]:
    """
    Get the viewpoint detection result together with its confidence.

    Delegates to the viewpoint detector. A ViewpointDetectionError is
    logged as a warning and replaced by a default pair.

    Args:
        detected_objects: List of detected object dictionaries.

    Returns:
        Tuple[str, float]: (viewpoint type, confidence); ("eye_level", 0.5)
        on failure.
    """
    detector = self.viewpoint_detector
    try:
        result = detector.get_viewpoint_confidence(detected_objects)
    except ViewpointDetectionError as e:
        self.logger.warning(f"Error getting viewpoint confidence: {str(e)}")
        return "eye_level", 0.5
    return result
def get_supported_cultures(self) -> List[str]:
    """
    List every supported cultural context.

    Plain pass-through to the cultural context analyzer; exceptions are
    not handled here.

    Returns:
        List[str]: Names of the supported cultural contexts.
    """
    analyzer = self.cultural_context_analyzer
    return analyzer.get_supported_cultures()
def has_cultural_context(self, cultural_context: str) -> bool:
    """
    Check whether a given cultural context is supported.

    Plain pass-through to the cultural context analyzer; exceptions are
    not handled here.

    Args:
        cultural_context: Name of the cultural context.

    Returns:
        bool: Whether the cultural context is supported.
    """
    analyzer = self.cultural_context_analyzer
    return analyzer.has_cultural_context(cultural_context)
def validate_text_quality(self, text: str) -> Dict[str, bool]:
    """
    Validate the quality of a piece of text.

    Delegates to the text formatter. A TextFormattingError is logged as a
    warning and reported as an error marker.

    Args:
        text: Text to validate.

    Returns:
        Dict[str, bool]: Quality check results, or {"error": True} on failure.
    """
    try:
        checks = self.text_formatter.validate_text_quality(text)
    except TextFormattingError as e:
        self.logger.warning(f"Error validating text quality: {str(e)}")
        return {"error": True}
    return checks
def get_text_statistics(self, text: str) -> Dict[str, int]:
    """
    Collect statistics about a piece of text.

    Delegates to the text formatter. A TextFormattingError is logged as a
    warning and replaced by zeroed statistics.

    Args:
        text: Text to analyze.

    Returns:
        Dict[str, int]: Text statistics; on failure a dict with
        "characters", "words" and "sentences" all set to 0.
    """
    try:
        stats = self.text_formatter.get_text_statistics(text)
    except TextFormattingError as e:
        self.logger.warning(f"Error getting text statistics: {str(e)}")
        empty_stats = {"characters": 0, "words": 0, "sentences": 0}
        return empty_stats
    return stats
def reload_templates(self):
    """
    Reload all templates through the template manager.

    Logs an info message on success.

    Raises:
        EnhancedSceneDescriberError: If the template manager raises a
            TemplateLoadingError or TemplateFillError; the original
            exception is chained as the cause.
    """
    manager = self.template_manager
    try:
        manager.reload_templates()
        self.logger.info("Templates reloaded successfully")
    except (TemplateLoadingError, TemplateFillError) as e:
        self.logger.error(f"Error reloading templates: {str(e)}")
        failure = EnhancedSceneDescriberError(f"Failed to reload templates: {str(e)}")
        raise failure from e
def get_configuration(self) -> Dict[str, Any]:
    """
    Collect the current configuration of the describer and its components.

    Any exception raised while gathering the values is logged as a
    warning and reported through an "error" entry instead.

    Returns:
        Dict[str, Any]: Configuration dictionary with the scene type count,
        viewpoint detector parameters, object generator configuration,
        supported cultures and template categories; or {"error": <message>}
        on failure.
    """
    try:
        config: Dict[str, Any] = {}
        config["scene_types_count"] = len(self.scene_types)
        config["viewpoint_detector_config"] = self.viewpoint_detector.viewpoint_params
        config["object_generator_config"] = self.object_description_generator.get_configuration()
        config["supported_cultures"] = self.cultural_context_analyzer.get_supported_cultures()
        config["template_categories"] = self.template_manager.get_template_categories()
    except Exception as e:
        self.logger.warning(f"Error getting configuration: {str(e)}")
        return {"error": str(e)}
    return config
def _initialize_fallback_components(self):
    """
    Initialize fallback components.

    Creates a RegionAnalyzer and an ObjectDescriptionGenerator wired to
    it, and stores both on the instance. Any exception is logged as an
    error and not re-raised, so the attributes may be left unset.
    """
    try:
        self.region_analyzer = RegionAnalyzer()
        generator = ObjectDescriptionGenerator(
            region_analyzer=self.region_analyzer
        )
        self.object_description_generator = generator
    except Exception as e:
        self.logger.error(f"Fallback component initialization failed: {str(e)}")