# Source: Hugging Face Space DawnC/VisionScout (running on Zero), commit e6a18b7.
# File size: 24,378 bytes.

import re
import logging
import traceback
import numpy as np
from typing import Dict, List, Tuple, Optional, Any
from PIL import Image

from clip_zero_shot_classifier import CLIPZeroShotClassifier
from landmark_activities import LANDMARK_ACTIVITIES
from landmark_data import ALL_LANDMARKS


class LandmarkProcessingManager:
    """
    Handle all landmark-related detection and processing logic: landmark
    recognition for unknown objects, creation and validation of landmark
    objects, and clean-up of landmark references.
    """

    def __init__(self, enable_landmark: bool = True, use_clip: bool = True):
        """
        初始化地標處理管理器。

        Args:
            enable_landmark: 是否啟用地標檢測功能
            use_clip: 是否啟用 CLIP 分析功能
        """
        self.logger = logging.getLogger(__name__)
        self.enable_landmark = enable_landmark
        self.use_clip = use_clip

        # 載入地標相關數據
        self.landmark_activities = {}
        self.all_landmarks = {}
        self._load_landmark_data()

        # 地標分類器將按需初始化
        self.landmark_classifier = None

    def _load_landmark_data(self):
        """載入地標相關的數據結構。"""
        try:
            self.landmark_activities = LANDMARK_ACTIVITIES
            self.logger.info("Loaded LANDMARK_ACTIVITIES successfully")
        except ImportError as e:
            self.logger.warning(f"Failed to load LANDMARK_ACTIVITIES: {e}")
            self.landmark_activities = {}

        try:
            self.all_landmarks = ALL_LANDMARKS
            self.logger.info("Loaded ALL_LANDMARKS successfully")
        except ImportError as e:
            self.logger.warning(f"Failed to load ALL_LANDMARKS: {e}")
            self.all_landmarks = {}

    def set_landmark_classifier(self, landmark_classifier):
        """
        設置地標分類器實例。

        Args:
            landmark_classifier: CLIPZeroShotClassifier 實例
        """
        self.landmark_classifier = landmark_classifier

    def process_unknown_objects(self, detection_result, detected_objects, clip_analyzer=None):
        """
        對 YOLO 未能識別或信心度低的物體進行地標檢測。

        Args:
            detection_result: YOLO 檢測結果
            detected_objects: 已識別的物體列表
            clip_analyzer: CLIP 分析器實例（用於按需初始化地標分類器）

        Returns:
            tuple: (更新後的物體列表, 地標物體列表)
        """
        if (not self.enable_landmark or not self.use_clip or
            not hasattr(self, 'use_landmark_detection') or not self.use_landmark_detection):
            # 未啟用地標識別時，確保返回的物體列表中不包含任何地標物體
            cleaned_objects = [obj for obj in detected_objects if not obj.get("is_landmark", False)]
            return cleaned_objects, []

        try:
            # 獲取原始圖像
            original_image = None
            if detection_result is not None and hasattr(detection_result, 'orig_img'):
                original_image = detection_result.orig_img

            # 檢查原始圖像是否存在
            if original_image is None:
                self.logger.warning("Original image not available for landmark detection")
                return detected_objects, []

            # 確保原始圖像為 PIL 格式或可轉換為 PIL 格式
            if not isinstance(original_image, Image.Image):
                if isinstance(original_image, np.ndarray):
                    try:
                        if original_image.ndim == 3 and original_image.shape[2] == 4:  # RGBA
                            original_image = original_image[:, :, :3]  # 轉換為 RGB
                        if original_image.ndim == 2:  # 灰度圖
                            original_image = Image.fromarray(original_image).convert("RGB")
                        else:  # 假設為 RGB 或 BGR
                            original_image = Image.fromarray(original_image)

                        if hasattr(original_image, 'mode') and original_image.mode == 'BGR':  # 從 OpenCV 明確將 BGR 轉換為 RGB
                            original_image = original_image.convert('RGB')
                    except Exception as e:
                        self.logger.warning(f"Error converting image for landmark detection: {e}")
                        return detected_objects, []
                else:
                    self.logger.warning(f"Cannot process image of type {type(original_image)}")
                    return detected_objects, []

            # 獲取圖像維度
            if isinstance(original_image, np.ndarray):
                h, w = original_image.shape[:2]
            elif isinstance(original_image, Image.Image):
                w, h = original_image.size
            else:
                self.logger.warning(f"Unable to determine image dimensions for type {type(original_image)}")
                return detected_objects, []

            # 收集可能含有地標的區域
            candidate_boxes = []
            low_conf_boxes = []

            # 即使沒有 YOLO 檢測到的物體，也嘗試進行更詳細的地標分析
            if len(detected_objects) == 0:
                # 創建一個包含整個圖像的框
                full_image_box = [0, 0, w, h]
                low_conf_boxes.append(full_image_box)
                candidate_boxes.append((full_image_box, "full_image"))

                # 加入網格分析以增加檢測成功率
                grid_size = 2  # 2x2 網格
                for i in range(grid_size):
                    for j in range(grid_size):
                        # 創建網格框
                        grid_box = [
                            j * w / grid_size,
                            i * h / grid_size,
                            (j + 1) * w / grid_size,
                            (i + 1) * h / grid_size
                        ]
                        low_conf_boxes.append(grid_box)
                        candidate_boxes.append((grid_box, "grid"))

                # 創建更大的中心框（覆蓋中心 70% 區域）
                center_box = [
                    w * 0.15, h * 0.15,
                    w * 0.85, h * 0.85
                ]
                low_conf_boxes.append(center_box)
                candidate_boxes.append((center_box, "center"))

                self.logger.info("No YOLO detections, attempting detailed landmark analysis with multiple regions")
            else:
                try:
                    # 獲取原始 YOLO 檢測結果中的低置信度物體
                    if (hasattr(detection_result, 'boxes') and
                        hasattr(detection_result.boxes, 'xyxy') and
                        hasattr(detection_result.boxes, 'conf') and
                        hasattr(detection_result.boxes, 'cls')):
                        all_boxes = (detection_result.boxes.xyxy.cpu().numpy()
                                   if hasattr(detection_result.boxes.xyxy, 'cpu')
                                   else detection_result.boxes.xyxy)
                        all_confs = (detection_result.boxes.conf.cpu().numpy()
                                   if hasattr(detection_result.boxes.conf, 'cpu')
                                   else detection_result.boxes.conf)
                        all_cls = (detection_result.boxes.cls.cpu().numpy()
                                 if hasattr(detection_result.boxes.cls, 'cpu')
                                 else detection_result.boxes.cls)

                        # 收集低置信度區域和可能含有地標的區域（如建築物）
                        for i, (box, conf, cls) in enumerate(zip(all_boxes, all_confs, all_cls)):
                            is_low_conf = conf < 0.4 and conf > 0.1

                            # 根據物體類別 ID 識別建築物 - 使用通用分類
                            common_building_classes = [11, 12, 13, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65]  # 常見建築類別 ID
                            is_building = int(cls) in common_building_classes

                            # 計算相對面積 - 大物體
                            is_large_object = (box[2] - box[0]) * (box[3] - box[1]) > (0.1 * w * h)

                            if is_low_conf or is_building:
                                # 確保 box 是一個有效的數組或列表
                                if isinstance(box, (list, tuple, np.ndarray)) and len(box) >= 4:
                                    low_conf_boxes.append(box)
                                    if is_large_object:
                                        candidate_boxes.append((box, "building" if is_building else "low_conf"))
                except Exception as e:
                    self.logger.error(f"Error processing YOLO detections: {e}")
                    traceback.print_exc()

            # 按需初始化地標分類器
            if not self.landmark_classifier:
                if clip_analyzer and hasattr(clip_analyzer, 'get_clip_instance'):
                    try:
                        self.logger.info("Initializing landmark classifier for process_unknown_objects")
                        model, preprocess, device = clip_analyzer.get_clip_instance()
                        self.landmark_classifier = CLIPZeroShotClassifier(device=device)
                    except Exception as e:
                        self.logger.error(f"Error initializing landmark classifier: {e}")
                        return detected_objects, []
                else:
                    self.logger.warning("landmark_classifier not available and cannot be initialized")
                    return detected_objects, []

            # 使用智能地標搜索
            landmark_results = None
            try:
                # 確保有有效的框
                if not low_conf_boxes:
                    # 如果沒有低置信度框，添加全圖
                    low_conf_boxes.append([0, 0, w, h])

                landmark_results = self.landmark_classifier.intelligent_landmark_search(
                    original_image,
                    yolo_boxes=low_conf_boxes,
                    base_threshold=0.25
                )
            except Exception as e:
                self.logger.error(f"Error in intelligent_landmark_search: {e}")
                traceback.print_exc()
                return detected_objects, []

            # 處理識別結果
            landmark_objects = []

            # 如果有效的地標結果
            if landmark_results and landmark_results.get("is_landmark_scene", False):
                for landmark_info in landmark_results.get("detected_landmarks", []):
                    try:
                        # 使用 landmark_classifier 的閾值判斷
                        base_threshold = 0.25  # 基礎閾值

                        # 獲取地標類型並設定閾值
                        landmark_type = "architectural"  # 預設類型
                        type_threshold = 0.5  # 預設閾值

                        # 優先使用 landmark_classifier
                        if (hasattr(self.landmark_classifier, '_determine_landmark_type') and
                            landmark_info.get("landmark_id")):
                            landmark_type = self.landmark_classifier._determine_landmark_type(landmark_info.get("landmark_id"))
                            type_threshold = getattr(self.landmark_classifier, 'landmark_type_thresholds', {}).get(landmark_type, 0.5)
                        # 否則使用本地方法
                        elif hasattr(self, '_determine_landmark_type'):
                            landmark_type = self._determine_landmark_type(landmark_info.get("landmark_id", ""))
                            # 依據地標類型調整閾值
                            if landmark_type == "skyscraper":
                                type_threshold = 0.4
                            elif landmark_type == "natural":
                                type_threshold = 0.6
                        # 或者直接從地標 ID 推斷
                        else:
                            landmark_id = landmark_info.get("landmark_id", "").lower()
                            if any(term in landmark_id for term in ["mountain", "canyon", "waterfall", "lake", "river", "natural"]):
                                landmark_type = "natural"
                                type_threshold = 0.6
                            elif any(term in landmark_id for term in ["skyscraper", "building", "tower", "tall"]):
                                landmark_type = "skyscraper"
                                type_threshold = 0.4
                            elif any(term in landmark_id for term in ["monument", "memorial", "statue", "historical"]):
                                landmark_type = "monument"
                                type_threshold = 0.5

                        effective_threshold = base_threshold * (type_threshold / 0.5)

                        # 如果置信度足夠高
                        if landmark_info.get("confidence", 0) > effective_threshold:
                            # 獲取邊界框
                            if "box" in landmark_info:
                                box = landmark_info["box"]
                            else:
                                # 如果沒有邊界框，使用整個圖像的 90% 區域
                                margin_x, margin_y = w * 0.05, h * 0.05
                                box = [margin_x, margin_y, w - margin_x, h - margin_y]

                            # 計算中心點和其他必要信息
                            center_x = (box[0] + box[2]) / 2
                            center_y = (box[1] + box[3]) / 2
                            norm_center_x = center_x / w if w > 0 else 0.5
                            norm_center_y = center_y / h if h > 0 else 0.5

                            # 獲取區域位置（需要 spatial_analyzer 的支持）
                            region = "center"  # 預設

                            # 創建地標物體
                            landmark_obj = {
                                "class_id": (landmark_info.get("landmark_id", "")[:15]
                                           if isinstance(landmark_info.get("landmark_id", ""), str)
                                           else "-100"),  # 截斷過長的 ID
                                "class_name": landmark_info.get("landmark_name", "Unknown Landmark"),
                                "confidence": landmark_info.get("confidence", 0.0),
                                "box": box,
                                "center": (center_x, center_y),
                                "normalized_center": (norm_center_x, norm_center_y),
                                "size": (box[2] - box[0], box[3] - box[1]),
                                "normalized_size": (
                                    (box[2] - box[0]) / w if w > 0 else 0,
                                    (box[3] - box[1]) / h if h > 0 else 0
                                ),
                                "area": (box[2] - box[0]) * (box[3] - box[1]),
                                "normalized_area": (
                                    (box[2] - box[0]) * (box[3] - box[1]) / (w * h) if w * h > 0 else 0
                                ),
                                "region": region,
                                "is_landmark": True,
                                "landmark_id": landmark_info.get("landmark_id", ""),
                                "location": landmark_info.get("location", "Unknown Location")
                            }

                            # 添加額外信息
                            for key in ["year_built", "architectural_style", "significance"]:
                                if key in landmark_info:
                                    landmark_obj[key] = landmark_info[key]

                            # 添加地標類型
                            landmark_obj["landmark_type"] = landmark_type

                            # 添加到檢測物體列表
                            detected_objects.append(landmark_obj)
                            landmark_objects.append(landmark_obj)
                            self.logger.info(f"Detected landmark: {landmark_info.get('landmark_name', 'Unknown')} with confidence {landmark_info.get('confidence', 0.0):.2f}")
                    except Exception as e:
                        self.logger.error(f"Error processing landmark: {e}")
                        continue

                return detected_objects, landmark_objects

            return detected_objects, []

        except Exception as e:
            self.logger.error(f"Error in landmark detection: {e}")
            traceback.print_exc()
            return detected_objects, []

    def remove_landmark_references(self, text):
        """
        從文本中移除所有地標引用。

        Args:
            text: 輸入文本

        Returns:
            str: 清除地標引用後的文本
        """
        if not text:
            return text

        try:
            # 動態收集所有地標名稱和位置
            landmark_names = []
            locations = []

            for landmark_id, info in self.all_landmarks.items():
                # 收集地標名稱及其別名
                landmark_names.append(info["name"])
                landmark_names.extend(info.get("aliases", []))

                # 收集地理位置
                if "location" in info:
                    location = info["location"]
                    locations.append(location)

                    # 處理分離的城市和國家名稱
                    parts = location.split(",")
                    if len(parts) >= 1:
                        locations.append(parts[0].strip())
                    if len(parts) >= 2:
                        locations.append(parts[1].strip())

            # 使用正則表達式動態替換所有地標名稱
            for name in landmark_names:
                if name and len(name) > 2:  # 避免過短的名稱
                    text = re.sub(r'\b' + re.escape(name) + r'\b', "tall structure", text, flags=re.IGNORECASE)

            # 動態替換所有位置引用
            for location in locations:
                if location and len(location) > 2:
                    # 替換常見位置表述模式
                    text = re.sub(r'in ' + re.escape(location), "in the urban area", text, flags=re.IGNORECASE)
                    text = re.sub(r'of ' + re.escape(location), "of the urban area", text, flags=re.IGNORECASE)
                    text = re.sub(r'\b' + re.escape(location) + r'\b', "the urban area", text, flags=re.IGNORECASE)

        except Exception as e:
            self.logger.warning(f"Error in dynamic landmark reference removal, using generic patterns: {e}")
            # 通用地標描述模式
            landmark_patterns = [
                # 地標地點模式
                (r'an iconic structure in ([A-Z][a-zA-Z\s,]+)', r'an urban structure'),
                (r'a famous (monument|tower|landmark) in ([A-Z][a-zA-Z\s,]+)', r'an urban structure'),
                (r'(the [A-Z][a-zA-Z\s]+ Tower)', r'the tower'),
                (r'(the [A-Z][a-zA-Z\s]+ Building)', r'the building'),
                (r'(the CN Tower)', r'the tower'),
                (r'([A-Z][a-zA-Z\s]+) Tower', r'tall structure'),

                # 地標位置關係模式
                (r'(centered|built|located|positioned) around the ([A-Z][a-zA-Z\s]+? (Tower|Monument|Landmark))', r'located in this area'),

                # 地標活動模式
                (r'(sightseeing|guided tours|cultural tourism) (at|around|near) (this landmark|the [A-Z][a-zA-Z\s]+)', r'\1 in this area'),

                # 一般性地標形容模式
                (r'this (famous|iconic|historic|well-known) (landmark|monument|tower|structure)', r'this urban structure'),
                (r'landmark scene', r'urban scene'),
                (r'tourist destination', r'urban area'),
                (r'tourist attraction', r'urban area')
            ]

            for pattern, replacement in landmark_patterns:
                text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)

        return text

    def get_alternative_scene_type(self, landmark_scene_type, detected_objects, scene_scores):
        """
        為地標場景類型選擇適合的替代類型。

        Args:
            landmark_scene_type: 原始地標場景類型
            detected_objects: 檢測到的物體列表
            scene_scores: 所有場景類型的分數

        Returns:
            str: 適合的替代場景類型
        """
        # 1. 嘗試從現有場景分數中找出第二高的非地標場景
        landmark_types = {"tourist_landmark", "natural_landmark", "historical_monument"}
        alternative_scores = {k: v for k, v in scene_scores.items() if k not in landmark_types and v > 0.2}

        if alternative_scores:
            # 返回分數最高的非地標場景類型
            return max(alternative_scores.items(), key=lambda x: x[1])[0]

        # 2. 基於物體組合推斷場景類型
        object_counts = {}
        for obj in detected_objects:
            class_name = obj.get("class_name", "")
            if class_name not in object_counts:
                object_counts[class_name] = 0
            object_counts[class_name] += 1

        # 根據物體組合決定場景類型
        if "car" in object_counts or "truck" in object_counts or "bus" in object_counts:
            # 有車輛，可能是街道或交叉路口
            if "traffic light" in object_counts or "stop sign" in object_counts:
                return "intersection"
            else:
                return "city_street"

        if "building" in object_counts and object_counts.get("person", 0) > 0:
            # 有建築物和人，可能是商業區
            return "commercial_district"

        if object_counts.get("person", 0) > 3:
            # 多個行人，可能是行人區
            return "pedestrian_area"

        if "bench" in object_counts or "potted plant" in object_counts:
            # 有長椅或盆栽，可能是公園區域
            return "park_area"

        # 3. 根據原始地標場景類型選擇合適的替代場景
        if landmark_scene_type == "natural_landmark":
            return "outdoor_natural_area"
        elif landmark_scene_type == "historical_monument":
            return "urban_architecture"

        # 默認回退到城市街道
        return "city_street"

    def extract_landmark_specific_activities(self, landmark_objects):
        """
        從識別的地標中提取特定活動。

        Args:
            landmark_objects: 地標物體列表

        Returns:
            List[str]: 地標特定活動列表
        """
        landmark_specific_activities = []

        # 優先收集來自識別地標的特定活動
        for lm_obj in landmark_objects:
            lm_id = lm_obj.get("landmark_id")
            if lm_id and lm_id in self.landmark_activities:
                landmark_specific_activities.extend(self.landmark_activities[lm_id])

        if landmark_specific_activities:
            landmark_names = [lm.get('landmark_name', 'unknown') for lm in landmark_objects if lm.get('is_landmark', False)]
            self.logger.info(f"Added {len(landmark_specific_activities)} landmark-specific activities for {', '.join(landmark_names)}")

        return landmark_specific_activities

    def update_enable_landmark_status(self, enable_landmark: bool):
        """
        更新地標檢測的啟用狀態。

        Args:
            enable_landmark: 是否啟用地標檢測
        """
        self.enable_landmark = enable_landmark

    def update_use_landmark_detection_status(self, use_landmark_detection: bool):
        """
        更新地標檢測使用狀態。

        Args:
            use_landmark_detection: 是否使用地標檢測
        """
        self.use_landmark_detection = use_landmark_detection