# VisionScout / response_processor.py

import re
import logging
import traceback
from typing import Dict, List, Any, Optional, Set


class ResponseProcessingError(Exception):
    """Custom exception for response-processing errors."""
    pass


class ResponseProcessor:
    """
    Handles cleaning and post-processing of raw LLM output.

    Responsibilities include format cleanup, duplicate-content detection,
    and ensuring grammatical completeness.
    """

    def __init__(self):
        """Initialize the response processor."""
        # Set up the logger
        self.logger = logging.getLogger(self.__class__.__name__)
        if not self.logger.handlers:
            handler = logging.StreamHandler()
            formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
            handler.setFormatter(formatter)
            self.logger.addHandler(handler)
            self.logger.setLevel(logging.INFO)

        # Initialize cleaning rules and replacement dictionaries
        self._initialize_cleaning_rules()
        self.logger.info("ResponseProcessor initialized successfully")

    def _initialize_cleaning_rules(self):
        """Initialize the cleaning rules and replacement dictionaries that address common output problems."""
        try:
            # Alternatives for words the model tends to repeat
            self.replacement_alternatives = {
                'visible': ['present', 'evident', 'apparent', 'observable'],
                'positioned': ['arranged', 'placed', 'set', 'organized'],
                'located': ['found', 'placed', 'situated', 'established'],
                'situated': ['placed', 'positioned', 'arranged', 'set'],
                'appears': ['seems', 'looks', 'presents', 'exhibits'],
                'features': ['includes', 'contains', 'displays', 'showcases'],
                'shows': ['reveals', 'presents', 'exhibits', 'demonstrates'],
                'displays': ['presents', 'exhibits', 'shows', 'reveals']
            }
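
            # Used later by _handle_repetitive_vocabulary. Illustrative example (assumed
            # input): in "The room shows a sofa and shows a lamp", the second "shows"
            # rotates to the first alternative: "The room shows a sofa and reveals a lamp".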

            # Introductory prefixes to strip
            self.prefixes_to_remove = [
                "Here's the enhanced description:",
                "Enhanced description:",
                "Here is the enhanced scene description:",
                "I've enhanced the description while preserving all factual details:",
                "Enhanced Description:",
                "Scene Description:",
                "Description:",
                "Here is the enhanced description:",
                "Here is a rewritten scene description that adheres to the provided critical rules:",
                "Here is the rewritten scene description:",
                "Here's a rewritten scene description:",
                "The rewritten scene description is as follows:"
            ]

            # Trailing suffix phrases to strip
            self.suffixes_to_remove = [
                "I've maintained all the key factual elements",
                "I've preserved all the factual details",
                "All factual elements have been maintained"
            ]

            # Patterns for detecting repetitive wording
            self.repetitive_patterns = [
                (r'\b(visible)\b.*?\b(visible)\b', 'Multiple uses of "visible" detected'),
                (r'\b(positioned)\b.*?\b(positioned)\b', 'Multiple uses of "positioned" detected'),
                (r'\b(located)\b.*?\b(located)\b', 'Multiple uses of "located" detected'),
                (r'\b(situated)\b.*?\b(situated)\b', 'Multiple uses of "situated" detected'),
                (r'\b(appears)\b.*?\b(appears)\b', 'Multiple uses of "appears" detected'),
                (r'\b(features)\b.*?\b(features)\b', 'Multiple uses of "features" detected'),
                (r'\bThis\s+(\w+)\s+.*?\bThis\s+\1\b', 'Repetitive sentence structure detected')
            ]

            # Adjective replacements for slash-joined word pairs (slash formatting
            # occasionally leaks into model output)
            self.slash_replacements = {
                'sunrise/sunset': 'warm lighting',
                'sunset/sunrise': 'warm lighting',
                'day/night': 'ambient lighting',
                'night/day': 'ambient lighting',
                'morning/evening': 'soft lighting',
                'evening/morning': 'soft lighting',
                'dawn/dusk': 'gentle lighting',
                'dusk/dawn': 'gentle lighting',
                'sunny/cloudy': 'natural lighting',
                'cloudy/sunny': 'natural lighting',
                'bright/dark': 'varied lighting',
                'dark/bright': 'varied lighting',
                'light/shadow': 'contrasting illumination',
                'shadow/light': 'contrasting illumination',
                'indoor/outdoor': 'mixed environment',
                'outdoor/indoor': 'mixed environment',
                'inside/outside': 'transitional space',
                'outside/inside': 'transitional space',
                'urban/rural': 'diverse landscape',
                'rural/urban': 'diverse landscape',
                'modern/traditional': 'architectural blend',
                'traditional/modern': 'architectural blend',
                'old/new': 'varied architecture',
                'new/old': 'varied architecture',
                'busy/quiet': 'dynamic atmosphere',
                'quiet/busy': 'dynamic atmosphere',
                'crowded/empty': 'varying occupancy',
                'empty/crowded': 'varying occupancy',
                'hot/cold': 'comfortable temperature',
                'cold/hot': 'comfortable temperature',
                'wet/dry': 'mixed conditions',
                'dry/wet': 'mixed conditions',
                'summer/winter': 'seasonal atmosphere',
                'winter/summer': 'seasonal atmosphere',
                'spring/autumn': 'transitional season',
                'autumn/spring': 'transitional season',
                'left/right': 'balanced composition',
                'right/left': 'balanced composition',
                'near/far': 'layered perspective',
                'far/near': 'layered perspective',
                'high/low': 'varied elevation',
                'low/high': 'varied elevation',
                'big/small': 'diverse scale',
                'small/big': 'diverse scale',
                'wide/narrow': 'varied width',
                'narrow/wide': 'varied width',
                'open/closed': 'flexible space',
                'closed/open': 'flexible space',
                'public/private': 'community space',
                'private/public': 'community space',
                'formal/informal': 'relaxed setting',
                'informal/formal': 'relaxed setting',
                'commercial/residential': 'mixed-use area',
                'residential/commercial': 'mixed-use area'
            }
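
            # Illustrative example (assumed input): "a busy/quiet market at dawn/dusk"
            # becomes "a dynamic atmosphere market at gentle lighting"; pairs not listed
            # here fall back to keeping the shorter word (e.g. "red/orange" -> "red").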

            # Extended replacements for underscore-joined phrases
            self.underscore_replacements = {
                'urban_intersection': 'urban intersection',
                'tourist_landmark': 'tourist landmark',
                'historical_site': 'historical site',
                'religious_building': 'religious building',
                'natural_landmark': 'natural landmark',
                'commercial_area': 'commercial area',
                'residential_area': 'residential area',
                'public_space': 'public space',
                'outdoor_scene': 'outdoor scene',
                'indoor_scene': 'indoor scene',
                'street_scene': 'street scene',
                'city_center': 'city center',
                'shopping_district': 'shopping district',
                'business_district': 'business district',
                'traffic_light': 'traffic light',
                'street_lamp': 'street lamp',
                'parking_meter': 'parking meter',
                'fire_hydrant': 'fire hydrant',
                'bus_stop': 'bus stop',
                'train_station': 'train station',
                'police_car': 'police car',
                'fire_truck': 'fire truck',
                'school_bus': 'school bus',
                'time_of_day': 'time of day',
                'weather_condition': 'weather condition',
                'lighting_condition': 'lighting condition',
                'atmospheric_condition': 'atmospheric condition',
                'human_activity': 'human activity',
                'pedestrian_traffic': 'pedestrian traffic',
                'vehicle_traffic': 'vehicle traffic',
                'social_gathering': 'social gathering',
                'object_detection': 'object detection',
                'scene_analysis': 'scene analysis',
                'image_classification': 'image classification',
                'computer_vision': 'computer vision'
            }
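
            # Illustrative example (assumed input): "a street_scene near a traffic_light"
            # becomes "a street scene near a traffic light"; word_word patterns not in
            # this dictionary are split by the regex fallbacks in _critical_format_preprocess.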
self.logger.info("Cleaning rules initialized successfully")
except Exception as e:
error_msg = f"Failed to initialize cleaning rules: {str(e)}"
self.logger.error(error_msg)
self.logger.error(traceback.format_exc())
raise ResponseProcessingError(error_msg) from e

    def clean_response(self, response: str, model_type: str = "general") -> str:
        """
        Clean an LLM response.

        Args:
            response: Raw LLM response.
            model_type: Model type (used to select model-specific cleaning rules).

        Returns:
            str: The cleaned response.

        Raises:
            ResponseProcessingError: If response processing fails.
        """
        if not response:
            raise ResponseProcessingError("Empty response provided for cleaning")

        try:
            self.logger.debug(f"Starting response cleaning (original length: {len(response)})")

            # Keep the original response as a fallback
            original_response = response

            # Select a cleaning strategy based on the model type
            if "llama" in model_type.lower():
                cleaned_response = self._clean_llama_response(response)
            else:
                cleaned_response = self._clean_general_response(response)

            # If the cleaned content is too short, try to recover from the original response
            if len(cleaned_response.strip()) < 40:
                self.logger.warning("Cleaned response too short, attempting recovery")
                cleaned_response = self._recover_from_overcleaning(original_response)

            # Final validation
            self._validate_cleaned_response(cleaned_response)

            self.logger.debug(f"Response cleaning completed (final length: {len(cleaned_response)})")
            return cleaned_response
        except Exception as e:
            error_msg = f"Response cleaning failed: {str(e)}"
            self.logger.error(error_msg)
            self.logger.error(traceback.format_exc())
            raise ResponseProcessingError(error_msg) from e
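
    # Example usage (illustrative; the input string is assumed and the exact output
    # depends on the full rule set):
    #   processor.clean_response(
    #       "Here's the enhanced description: a busy/quiet street_scene with several pedestrians.",
    #       "llama")
    #   -> roughly "A dynamic atmosphere street scene with several pedestrians."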

    def _clean_llama_response(self, response: str) -> str:
        """
        Cleaning specific to Llama model responses.

        Args:
            response: Raw Llama response.

        Returns:
            str: The cleaned response.
        """
        # Apply the general cleaning first
        response = self._clean_general_response(response)

        # Strip Llama-specific prefixes
        llama_prefixes = [
            "Here's the enhanced description:",
            "Enhanced description:",
            "Here is the enhanced scene description:",
            "I've enhanced the description while preserving all factual details:"
        ]
        for prefix in llama_prefixes:
            if response.lower().startswith(prefix.lower()):
                response = response[len(prefix):].strip()

        # Strip Llama-specific suffixes; the match is case-insensitive, so slice by
        # length rather than using rfind (which is case-sensitive and can miss)
        llama_suffixes = [
            "I've maintained all the key factual elements",
            "I've preserved all the factual details",
            "All factual elements have been maintained"
        ]
        for suffix in llama_suffixes:
            if response.lower().endswith(suffix.lower()):
                response = response[:len(response) - len(suffix)].strip()

        return response

    def _clean_general_response(self, response: str) -> str:
        """
        General-purpose response cleaning.

        Args:
            response: Raw response.

        Returns:
            str: The cleaned response.
        """
        response = self._critical_format_preprocess(response)

        # 1. Remove system markers
        response = self._remove_system_markers(response)

        # 2. Remove introductory prefixes
        response = self._remove_introduction_prefixes(response)

        # 3. Remove format markers and context tags
        response = self._remove_format_markers(response)

        # 4. Clean scene-type references
        response = self._clean_scene_type_references(response)

        # 5. Normalize punctuation
        response = self._normalize_punctuation(response)

        # 6. Remove duplicate sentences
        response = self._remove_duplicate_sentences(response)

        # 7. Handle repetitive vocabulary
        response = self._handle_repetitive_vocabulary(response)

        # 8. Ensure grammatical completeness
        response = self._ensure_grammatical_completeness(response)

        # 9. Control word length
        response = self._control_word_length(response)

        # 10. Final formatting
        response = self._final_formatting(response)

        return response

    def _critical_format_preprocess(self, response: str) -> str:
        """
        Critical format preprocessing that handles the most common formatting problems.

        Args:
            response: Raw response.

        Returns:
            str: The preprocessed response.
        """
        if not response:
            return response

        try:
            # First priority: slash-joined terms.
            # Handle the known slash combinations with their adjective replacements.
            for slash_combo, replacement in self.slash_replacements.items():
                if slash_combo.lower() in response.lower():
                    # Preserve the original casing
                    if slash_combo.upper() in response:
                        replacement_formatted = replacement.upper()
                    elif slash_combo.title() in response:
                        replacement_formatted = replacement.title()
                    else:
                        replacement_formatted = replacement

                    # Perform the replacement (case-insensitive)
                    response = re.sub(re.escape(slash_combo), replacement_formatted, response, flags=re.IGNORECASE)
                    self.logger.debug(f"Replaced slash pattern '{slash_combo}' with '{replacement_formatted}'")

            # Handle remaining slash patterns not covered by the dictionary.
            # Standard slash pattern: word/word
            slash_pattern = r'\b([a-zA-Z]+)/([a-zA-Z]+)\b'
            matches = list(re.finditer(slash_pattern, response))

            for match in reversed(matches):  # Process back to front to avoid position shifts
                word1, word2 = match.groups()
                # Keep the shorter (usually more common) word
                if len(word1) <= len(word2):
                    replacement = word1
                else:
                    replacement = word2

                response = response[:match.start()] + replacement + response[match.end():]
                self.logger.debug(f"Replaced general slash pattern '{match.group(0)}' with '{replacement}'")

            # Second priority: underscore formatting.
            # Handle the known underscore combinations first.
            for underscore_combo, replacement in self.underscore_replacements.items():
                if underscore_combo in response:
                    response = response.replace(underscore_combo, replacement)
                    self.logger.debug(f"Replaced underscore pattern '{underscore_combo}' with '{replacement}'")

            # Three-word underscore combinations: word_word_word -> word word word
            response = re.sub(r'\b([a-z]+)_([a-z]+)_([a-z]+)\b', r'\1 \2 \3', response)

            # Any remaining underscore patterns: word_word -> word word
            response = re.sub(r'\b([a-zA-Z]+)_([a-zA-Z]+)\b', r'\1 \2', response)

            # Third priority: repair incomplete sentences
            incomplete_sentence_fixes = [
                (r'\bIn\s*,\s*', 'Throughout the area, '),
                (r'\bOverall,\s+exudes\b', 'Overall, the scene exudes'),
                (r'\bThe overall atmosphere of\s+is\b', 'The overall atmosphere'),
                (r'\bwith its lights turned illuminating\b', 'with its lights illuminating'),
                (r'\bwhere it stands as\b', 'where it stands as'),
            ]

            for pattern, replacement in incomplete_sentence_fixes:
                response = re.sub(pattern, replacement, response, flags=re.IGNORECASE)

            # Fourth priority: grammar fixes (e.g. "persons" vs. "people")
            grammar_fixes = [
                (r'\b(\d+)\s+persons\b', r'\1 people'),
                (r'\bone\s+persons\b', 'one person'),
                (r'\btwo\s+persons\b', 'two people'),
                (r'\bthree\s+persons\b', 'three people'),
                (r'\bfour\s+persons\b', 'four people'),
                (r'\bfive\s+persons\b', 'five people'),
                (r'\bsix\s+persons\b', 'six people'),
                (r'\bseven\s+persons\b', 'seven people'),
                (r'\beight\s+persons\b', 'eight people'),
                (r'\bnine\s+persons\b', 'nine people'),
                (r'\bten\s+persons\b', 'ten people'),
                (r'\bmultiple\s+persons\b', 'multiple people'),
                (r'\bseveral\s+persons\b', 'several people'),
                (r'\bmany\s+persons\b', 'many people'),
                (r'\ba\s+few\s+persons\b', 'a few people'),
                (r'\bsome\s+persons\b', 'some people')
            ]

            for pattern, replacement in grammar_fixes:
                response = re.sub(pattern, replacement, response, flags=re.IGNORECASE)

            return response
        except Exception as e:
            self.logger.warning(f"Error in critical format preprocessing: {str(e)}")
            return response
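
    # Illustrative transformation for this method alone (assumed input):
    #   "The plaza shows a busy/quiet crowd near a traffic_light, with 2 persons nearby"
    #   -> "The plaza shows a dynamic atmosphere crowd near a traffic light, with 2 people nearby"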

    def _remove_system_markers(self, response: str) -> str:
        """Remove system-style markers."""
        # Remove dialogue markers
        response = re.sub(r'<\|.*?\|>', '', response)

        # Remove output markers
        output_start = response.find("[OUTPUT_START]")
        output_end = response.find("[OUTPUT_END]")
        if output_start != -1 and output_end != -1 and output_end > output_start:
            response = response[output_start + len("[OUTPUT_START]"):output_end].strip()

        # Remove other section markers
        section_markers = [
            r'\[.*?\]',
            r'OUTPUT_START\s*:|OUTPUT_END\s*:',
            r'ENHANCED DESCRIPTION\s*:',
            r'Scene Type\s*:.*?(?=\n|$)',
            r'Original Description\s*:.*?(?=\n|$)',
            r'GOOD\s*:|BAD\s*:',
            r'PROBLEM\s*:.*?(?=\n|$)',
            r'</?\|(?:assistant|system|user)\|>',
            r'\(Note:.*?\)',
            r'\(.*?I\'ve.*?\)',
            r'\(.*?as per your request.*?\)'
        ]

        for marker in section_markers:
            response = re.sub(marker, '', response, flags=re.IGNORECASE)

        return response

    def _remove_introduction_prefixes(self, response: str) -> str:
        """Remove introductory prefixes."""
        # Handle "Here is..." style prefixes
        intro_prefixes = [
            r'^Here\s+is\s+(?:a\s+|the\s+)?(?:rewritten\s+|enhanced\s+)?scene\s+description.*?:\s*',
            r'^The\s+(?:rewritten\s+|enhanced\s+)?(?:scene\s+)?description\s+is.*?:\s*',
            r'^Here\'s\s+(?:a\s+|the\s+)?(?:rewritten\s+|enhanced\s+)?description.*?:\s*'
        ]

        for prefix_pattern in intro_prefixes:
            response = re.sub(prefix_pattern, '', response, flags=re.IGNORECASE)

        # Handle fixed prefixes
        for prefix in self.prefixes_to_remove:
            if response.lower().startswith(prefix.lower()):
                response = response[len(prefix):].strip()

        return response

    def _remove_format_markers(self, response: str) -> str:
        """Remove format markers and context tags (parenthetical geographic and detail information is preserved)."""
        # Remove context-related markers
        response = re.sub(r'<\s*Context:.*?>', '', response)
        response = re.sub(r'Context:.*?(?=\n|$)', '', response)
        response = re.sub(r'Note:.*?(?=\n|$)', '', response, flags=re.IGNORECASE)

        # Remove Markdown formatting
        response = re.sub(r'\*\*|\*|__|\|', '', response)

        # Remove any remaining special markers, skipping parenthetical content so that
        # useful information such as geographic locations is not stripped
        response = re.sub(r'</?\|.*?\|>', '', response)

        # NOTE: removing entire parentheses and their contents is intentionally disabled
        # to preserve geographic information:
        # response = re.sub(r'\(.*?\)', '', response)

        return response

    def _clean_scene_type_references(self, response: str) -> str:
        """Clean up malformed scene-type references such as "This urban_intersection features"."""
        scene_type_pattern = r'This ([a-zA-Z_]+) (features|shows|displays|contains)'

        def _fix(match):
            # Only rewrite references whose scene type still contains underscores,
            # keeping each match's own verb
            if '_' in match.group(1):
                return f"This scene {match.group(2)}"
            return match.group(0)

        return re.sub(scene_type_pattern, _fix, response)

    def _normalize_punctuation(self, response: str) -> str:
        """Normalize punctuation."""
        # Reduce dash usage
        response = re.sub(r'—', ', ', response)
        response = re.sub(r' - ', ', ', response)

        # Collapse repeated punctuation
        response = re.sub(r'([.,;:!?])\1+', r'\1', response)

        # Fix the punctuation of incomplete trailing clauses
        response = re.sub(r',\s*$', '.', response)

        # Add a missing space after terminal punctuation
        response = re.sub(r'([.!?])([A-Z])', r'\1 \2', response)

        # Clean up extra whitespace and line breaks
        response = response.replace('\r', ' ')
        response = re.sub(r'\n+', ' ', response)
        response = re.sub(r'\s{2,}', ' ', response)

        return response
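
    # Illustrative behavior (assumed input):
    #   "A quiet street - lined with trees,, ending abruptly,"
    #   -> "A quiet street, lined with trees, ending abruptly."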

    def _remove_duplicate_sentences(self, response: str, similarity_threshold: float = 0.85) -> str:
        """
        Remove duplicate or highly similar sentences, using Jaccard similarity for comparison.

        Args:
            response: Raw response text.
            similarity_threshold: Similarity threshold (0.0 to 1.0) above which a sentence
                counts as a duplicate. Higher values require sentences to be very similar
                before they are removed.

        Returns:
            str: The text with duplicate sentences removed.
        """
        try:
            if not response or not response.strip():
                return ""

            # (?<=[.!?]) keeps the terminator at the end of each sentence, while \s+
            # consumes the trailing whitespace, so rejoining with ' ' later leaves exactly
            # one space between the punctuation and the next sentence.
            sentences = re.split(r'(?<=[.!?])\s+', response.strip())

            unique_sentences_data = []  # Tuples of (original_sentence, simplified_word_set)

            for sentence in sentences:
                sentence = sentence.strip()
                if not sentence:
                    continue

                # Build a simplified version for comparison (lowercase, punctuation removed,
                # split into a word set). Digits are kept because they may carry key information.
                simplified_text = re.sub(r'[^\w\s\d]', '', sentence.lower())
                current_sentence_words = set(simplified_text.split())

                if not current_sentence_words:  # Skip if the simplified set is empty
                    continue

                is_duplicate = False
                indices_to_replace = []

                # Compare against the unique sentences kept so far
                for i, (kept_sentence_text, kept_sentence_words) in enumerate(unique_sentences_data):
                    # Jaccard index
                    intersection_len = len(current_sentence_words.intersection(kept_sentence_words))
                    union_len = len(current_sentence_words.union(kept_sentence_words))

                    if union_len == 0:  # Both sets empty: treat as identical
                        jaccard_similarity = 1.0
                    else:
                        jaccard_similarity = intersection_len / union_len

                    if jaccard_similarity >= similarity_threshold:
                        if len(current_sentence_words) < len(kept_sentence_words):
                            # Current sentence is shorter and highly similar: drop it
                            is_duplicate = True
                            self.logger.debug(f"Sentence \"{sentence[:30]}...\" marked duplicate (shorter, similar to \"{kept_sentence_text[:30]}...\") Jaccard: {jaccard_similarity:.2f}")
                            break
                        elif len(current_sentence_words) > len(kept_sentence_words):
                            # Current sentence is longer: replace the kept, shorter one.
                            # Removal is deferred so the list is not mutated mid-iteration.
                            self.logger.debug(f"Sentence \"{kept_sentence_text[:30]}...\" replaced by longer similar sentence \"{sentence[:30]}...\" Jaccard: {jaccard_similarity:.2f}")
                            indices_to_replace.append(i)
                        else:
                            # Similar length and high similarity (including identical word
                            # sets): keep the first occurrence
                            is_duplicate = True
                            self.logger.debug(f"Sentence \"{sentence[:30]}...\" marked duplicate (similar length, similar to \"{kept_sentence_text[:30]}...\") Jaccard: {jaccard_similarity:.2f}")
                            break

                if not is_duplicate:
                    for i in reversed(indices_to_replace):
                        unique_sentences_data.pop(i)
                    unique_sentences_data.append((sentence, current_sentence_words))

            # Reassemble the unique sentences
            final_sentences = [s_data[0] for s_data in unique_sentences_data]

            # Ensure each sentence ends with punctuation (the split can leave the last
            # fragment without a terminator)
            reconstructed_response = ""
            for i, s in enumerate(final_sentences):
                s = s.strip()
                if not s:
                    continue
                if s[-1] not in ".!?":
                    s += "."
                reconstructed_response += s
                if i < len(final_sentences) - 1:
                    reconstructed_response += " "  # Space between sentences

            return reconstructed_response.strip()
        except Exception as e:
            self.logger.error(f"Error in _remove_duplicate_sentences: {str(e)}")
            self.logger.error(traceback.format_exc())
            return response  # Return the original response on error
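
    # Worked example of the Jaccard comparison above (illustrative):
    #   "A man stands by the door."   -> {'a', 'man', 'stands', 'by', 'the', 'door'}
    #   "A man stands near the door." -> {'a', 'man', 'stands', 'near', 'the', 'door'}
    #   intersection = 5, union = 7, similarity = 5/7 ≈ 0.71 < 0.85, so both sentences are kept.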

    def _handle_repetitive_vocabulary(self, response: str) -> str:
        """Handle repetitive vocabulary, using re.sub with a callable replacement for efficiency and accuracy."""
        try:
            # Detect repetitive patterns (warning only)
            if hasattr(self, 'repetitive_patterns'):
                for pattern, issue in self.repetitive_patterns:
                    if re.search(pattern, response, re.IGNORECASE | re.DOTALL):
                        self.logger.warning(f"Text quality issue detected: {issue} in response: \"{response[:100]}...\"")

            if not hasattr(self, 'replacement_alternatives') or not self.replacement_alternatives:
                return response

            processed_response = response

            for word_to_replace, alternatives in self.replacement_alternatives.items():
                if not alternatives:  # Skip if no alternatives are available
                    continue

                # Each word gets its own counter and alternative index; a small callable
                # class encapsulates this state across re.sub invocations.
                class WordReplacer:
                    def __init__(self, alternatives_list):
                        self.count = 0
                        self.alternative_idx = 0
                        self.alternatives_list = alternatives_list

                    def __call__(self, match_obj):
                        self.count += 1
                        original_word = match_obj.group(0)
                        if self.count > 1:  # Replace from the second occurrence onward
                            replacement = self.alternatives_list[self.alternative_idx % len(self.alternatives_list)]
                            self.alternative_idx += 1
                            # Preserve the original casing
                            if original_word.isupper():
                                return replacement.upper()
                            elif original_word.istitle():
                                return replacement.capitalize()
                            return replacement
                        return original_word  # First occurrence: leave unchanged

                replacer_instance = WordReplacer(alternatives)
                # \b ensures whole-word matches only
                pattern = re.compile(r'\b' + re.escape(word_to_replace) + r'\b', re.IGNORECASE)
                processed_response = pattern.sub(replacer_instance, processed_response)

            # Remove "identical" and similar repetitive descriptors
            identical_cleanup_patterns = [
                (r'\b(\d+)\s+identical\s+([a-zA-Z\s]+)', r'\1 \2'),
                (r'\b(two|three|four|five|six|seven|eight|nine|ten|eleven|twelve)\s+identical\s+([a-zA-Z\s]+)', r'\1 \2'),
                (r'\bidentical\s+([a-zA-Z\s]+)', r'\1'),
                (r'\bcomprehensive arrangement of\b', 'arrangement of'),
                (r'\bcomprehensive view featuring\b', 'scene featuring'),
                (r'\bcomprehensive display of\b', 'display of'),
            ]

            for pattern, replacement in identical_cleanup_patterns:
                processed_response = re.sub(pattern, replacement, processed_response, flags=re.IGNORECASE)

            # Digit-to-word conversion
            number_conversions = {
                '2': 'two', '3': 'three', '4': 'four', '5': 'five', '6': 'six',
                '7': 'seven', '8': 'eight', '9': 'nine', '10': 'ten',
                '11': 'eleven', '12': 'twelve'
            }

            # Convert digits appearing in various grammatical contexts
            for digit, word in number_conversions.items():
                # Pattern 1: digit + plural noun (e.g. "7 chairs")
                pattern1 = rf'\b{digit}\s+([a-zA-Z]+s)\b'
                processed_response = re.sub(pattern1, rf'{word} \1', processed_response)

                # Pattern 2: digit + modifier + plural noun (e.g. "7 more chairs")
                pattern2 = rf'\b{digit}\s+(more|additional|other|identical)\s+([a-zA-Z]+s)\b'
                processed_response = re.sub(pattern2, rf'{word} \1 \2', processed_response, flags=re.IGNORECASE)

                # Pattern 3: digit + adjective + plural noun (e.g. "2 dining tables")
                pattern3 = rf'\b{digit}\s+([a-zA-Z]+)\s+([a-zA-Z]+s)\b'
                processed_response = re.sub(pattern3, rf'{word} \1 \2', processed_response)

                # Pattern 4: digit inside a prepositional phrase (e.g. "around 2 tables")
                pattern4 = rf'\b(around|approximately|about)\s+{digit}\s+([a-zA-Z]+s)\b'
                processed_response = re.sub(pattern4, rf'\1 {word} \2', processed_response, flags=re.IGNORECASE)

            return processed_response
        except Exception as e:
            self.logger.error(f"Error in _handle_repetitive_vocabulary: {str(e)}")
            self.logger.error(traceback.format_exc())
            return response  # Return the original response on error
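
    # Illustrative behavior (assumed input): "3 identical chairs are visible, and a table is visible"
    #   -> digits become words, "identical" is dropped, and the second "visible" rotates
    #      to an alternative: "three chairs are visible, and a table is present"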

    def _ensure_grammatical_completeness(self, response: str) -> str:
        """
        Ensure grammatical completeness, repairing incomplete sentences and formatting problems.

        Args:
            response: Response text to check.

        Returns:
            str: Grammatically complete response text.
        """
        try:
            if not response or not response.strip():
                return response

            # Stage 1: detect and repair incomplete sentence patterns
            incomplete_patterns = [
                # Sentences that end directly after a preposition (e.g. "over .")
                (r'\b(over|under|through|across|along|beneath|beyond|throughout)\s*\.', 'incomplete_preposition'),
                (r'\b(with|without|against|towards|beside|between|among)\s*\.', 'incomplete_preposition'),
                (r'\b(into|onto|upon|within|behind|below|above)\s*\.', 'incomplete_preposition'),

                # Missing words, as in "In ,"
                (r'\bIn\s*,', 'incomplete_location'),
                (r'\bAt\s*,', 'incomplete_location'),
                (r'\bOn\s*,', 'incomplete_location'),
                (r'\bWith\s*,', 'incomplete_context'),

                # Incomplete description patterns
                (r'\b(fine|the)\s+(the\s+)?(?:urban|area|scene)\b(?!\s+\w)', 'incomplete_description'),

                # Punctuation directly after a conjunction or preposition
                (r'\b(and|or|but|with|from|in|at|on|by|for|to)\s*[.!?]', 'incomplete_conjunction'),

                # Repeated words
                (r'\b(\w+)\s+\1\b', 'word_repetition'),

                # Underscore-formatted scene-type references (e.g. "urban_intersection")
                (r'\b(\w+)_(\w+)\b', 'underscore_format'),

                # Landmark-scene specific issues
                (r'\btourist_landmark\b', 'underscore_format'),
                (r'\burban_intersection\b', 'underscore_format'),
                (r'\bIn\s*,\s*(?=\w)', 'incomplete_prepositional'),
                (r'\bOverall,\s+(?=exudes|shows|displays)(?!\s+(?:the|this|it))', 'missing_subject'),
                (r'\batmosphere of\s+is one of\b', 'redundant_structure'),
                (r'\bwith.*?turned\s+illuminating\b', 'redundant_participle')
            ]

            for pattern, issue_type in incomplete_patterns:
                try:
                    matches = list(re.finditer(pattern, response, re.IGNORECASE))
                    # Process matches back to front so position-based fixes stay valid
                    for match in reversed(matches):
                        if issue_type == 'incomplete_preposition':
                            # A sentence ends directly after a preposition
                            response = self._fix_incomplete_preposition(response, match)
                        elif issue_type == 'underscore_format':
                            # Convert underscores to spaces
                            original = match.group(0)
                            replacement = original.replace('_', ' ')
                            response = response.replace(original, replacement)
                        elif issue_type == 'word_repetition':
                            # Collapse the repeated word
                            repeated_word = match.group(1)
                            response = response.replace(f"{repeated_word} {repeated_word}", repeated_word)
                        elif issue_type == 'incomplete_location' or issue_type == 'incomplete_context':
                            # Drop the incomplete location or context reference
                            response = response.replace(match.group(0), '')
                        elif issue_type == 'incomplete_prepositional':
                            # Repair the incomplete prepositional phrase
                            response = re.sub(r'\bIn\s*,\s*', 'Throughout the scene, ', response)
                        elif issue_type == 'missing_subject':
                            # Add a subject to "Overall, ..." sentences
                            response = re.sub(r'\bOverall,\s+(?=exudes)', 'Overall, the scene ', response)
                        elif issue_type == 'redundant_structure':
                            # Simplify the redundant structure
                            response = re.sub(r'\batmosphere of\s+is one of\b', 'atmosphere is one of', response)
                        elif issue_type == 'redundant_participle':
                            # Clean up the redundant participle
                            response = re.sub(r'turned\s+illuminating', 'illuminating', response)
                        else:
                            # Remove any other incomplete patterns outright
                            response = response.replace(match.group(0), '')

                    # Collapse extra whitespace once per pattern (kept outside the match
                    # loop so the recorded match positions are not invalidated)
                    response = re.sub(r'\s{2,}', ' ', response).strip()
                except re.error as e:
                    self.logger.warning(f"Regular expression pattern error for {issue_type}: {pattern} - {str(e)}")
                    continue

            # Stage 2: clean up object-class formatting problems
            response = self._clean_object_class_references(response)

            # Stage 3: ensure sentences end properly
            response = self._ensure_proper_sentence_ending(response)

            # Stage 4: final grammar check
            response = self._final_grammar_check(response)

            return response.strip()
        except Exception as e:
            self.logger.error(f"Error in _ensure_grammatical_completeness: {str(e)}")
            return response

    def _fix_incomplete_preposition(self, response: str, match) -> str:
        """
        Repair an incomplete prepositional phrase.

        Args:
            response: Response text.
            match: Regex match object covering the dangling preposition.

        Returns:
            str: The repaired response.
        """
        match_start = match.start()

        # Find the start of the containing sentence
        sentence_start = response.rfind('.', 0, match_start)
        sentence_start = sentence_start + 1 if sentence_start != -1 else 0

        # Extract the sentence fragment
        sentence_fragment = response[sentence_start:match_start].strip()

        # If the fragment is meaningful, drop only the incomplete prepositional part
        if len(sentence_fragment) > 10:
            # Remove the preposition and everything after it, then close with a period
            response = response[:match_start].rstrip() + '.'
        else:
            # The fragment is too short: remove the entire incomplete sentence
            response = response[:sentence_start] + response[match.end():]

        return response

    def _clean_object_class_references(self, response: str) -> str:
        """
        Clean formatting problems in object-class references.

        Args:
            response: Response text.

        Returns:
            str: The cleaned response.
        """
        # Remove class-ID references (e.g. "unknown-class 2", "Class 0")
        class_id_patterns = [
            r'\bunknown[- ]?class\s*\d+\s*objects?',
            r'\bclass[- ]?\d+\s*objects?',
            r'\b[Cc]lass\s*\d+\s*objects?',
            r'\bunknown[- ][Cc]lass\s*\d+\s*objects?'
        ]

        for pattern in class_id_patterns:
            try:
                # Replace with a more natural description
                response = re.sub(pattern, 'objects', response, flags=re.IGNORECASE)
            except re.error as e:
                self.logger.warning(f"Error cleaning class reference pattern {pattern}: {str(e)}")
                continue

        # Clean up problems in quantity descriptions
        response = re.sub(r'\b(\w+)\s+unknown[- ]?\w*\s*objects?', r'\1 objects', response, flags=re.IGNORECASE)

        return response

    def _ensure_proper_sentence_ending(self, response: str) -> str:
        """
        Ensure the response ends as a proper sentence.

        Args:
            response: Response text.

        Returns:
            str: Response with a proper ending.
        """
        if not response or not response.strip():
            return response

        response = response.strip()

        # Check whether the text already ends with terminal punctuation
        if response and response[-1] not in ['.', '!', '?']:
            # Common prepositions and conjunctions that should not end a sentence
            problematic_endings = [
                "into", "onto", "about", "above", "across", "after", "along", "around",
                "at", "before", "behind", "below", "beneath", "beside", "between",
                "beyond", "by", "down", "during", "except", "for", "from", "in",
                "inside", "near", "of", "off", "on", "over", "through", "to",
                "toward", "under", "up", "upon", "with", "within", "and", "or", "but"
            ]

            words = response.split()
            if words:
                last_word = words[-1].lower().rstrip('.,!?')

                if last_word in problematic_endings:
                    # Fall back to the last complete sentence
                    last_period_pos = max(
                        response.rfind('.'),
                        response.rfind('!'),
                        response.rfind('?')
                    )

                    if last_period_pos > len(response) // 2:  # A complete sentence ends nearby
                        response = response[:last_period_pos + 1]
                    else:
                        # Drop the problematic word and close with a period
                        if len(words) > 1:
                            response = " ".join(words[:-1]) + "."
                        else:
                            response = "The scene displays various elements."
                else:
                    # Otherwise simply append a period
                    response += "."

        return response
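
    # Illustrative behavior (assumed input):
    #   "People gather near the fountain and" -> "People gather near the fountain."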

    def _final_grammar_check(self, response: str) -> str:
        """
        Final grammar check and cleanup.

        Args:
            response: Response text.

        Returns:
            str: The final cleaned response.
        """
        if not response:
            return response

        # Collapse repeated terminal punctuation
        response = re.sub(r'([.!?]){2,}', r'\1', response)

        # Remove spaces before terminal punctuation
        response = re.sub(r'\s+([.!?])', r'\1', response)

        # Add a missing space after terminal punctuation
        response = re.sub(r'([.!?])([A-Z])', r'\1 \2', response)

        # Capitalize the first letter
        if response and response[0].islower():
            response = response[0].upper() + response[1:]

        # Collapse extra whitespace
        response = re.sub(r'\s{2,}', ' ', response)

        # Handle empty or overly short responses
        if len(response.strip()) < 20:
            return "The scene contains various visual elements."

        return response.strip()

    def _control_word_length(self, response: str) -> str:
        """Keep the text length within a reasonable range."""
        words = response.split()
        if len(words) > 200:
            # Truncate near the word limit, at the closest sentence boundary
            truncated = ' '.join(words[:200])
            last_period = max(truncated.rfind('.'), truncated.rfind('!'), truncated.rfind('?'))

            if last_period > 0:
                response = truncated[:last_period+1]
            else:
                response = truncated + "."

        return response

    def _final_formatting(self, response: str) -> str:
        """Final formatting pass."""
        # Capitalize the first letter
        if response and response[0].islower():
            response = response[0].upper() + response[1:]

        # Collapse the text into a single paragraph
        response = re.sub(r'\s*\n\s*', ' ', response)
        response = ' '.join(response.split())

        return response.strip()

    def _recover_from_overcleaning(self, original_response: str) -> str:
        """Recover content after over-aggressive cleaning."""
        try:
            # Look for the best paragraph in the original response
            paragraphs = [p for p in original_response.split('\n\n') if p.strip()]
            if paragraphs:
                # Use the longest paragraph as the main description
                best_para = max(paragraphs, key=len)
                # Apply only basic cleaning rules
                best_para = re.sub(r'\[.*?\]', '', best_para)
                best_para = re.sub(r'\s{2,}', ' ', best_para).strip()

                if len(best_para) >= 40:
                    return best_para

            return "Unable to generate a valid enhanced description."
        except Exception as e:
            self.logger.error(f"Recovery from overcleaning failed: {str(e)}")
            return "Description generation error."

    def _validate_cleaned_response(self, response: str):
        """Validate the cleaned response."""
        if not response:
            raise ResponseProcessingError("Response is empty after cleaning")

        if len(response.strip()) < 20:
            raise ResponseProcessingError("Response is too short after cleaning")

        # Check for basic sentence structure
        if not re.search(r'[.!?]', response):
            raise ResponseProcessingError("Response lacks proper sentence structure")

    def remove_explanatory_notes(self, response: str) -> str:
        """
        Remove explanatory notes and commentary.

        Args:
            response: Response that may contain notes.

        Returns:
            str: The response with notes removed.
        """
        try:
            # Common note and explanation patterns
            note_patterns = [
                r'(?:^|\n)Note:.*?(?:\n|$)',
                r'(?:^|\n)I have (?:followed|adhered to|ensured).*?(?:\n|$)',
                r'(?:^|\n)This description (?:follows|adheres to|maintains).*?(?:\n|$)',
                r'(?:^|\n)The enhanced description (?:maintains|preserves).*?(?:\n|$)'
            ]

            # Split into paragraphs
            paragraphs = [p.strip() for p in response.split('\n\n') if p.strip()]

            # A single paragraph: clean it in place
            if len(paragraphs) == 1:
                for pattern in note_patterns:
                    paragraphs[0] = re.sub(pattern, '', paragraphs[0], flags=re.IGNORECASE)
                return paragraphs[0].strip()

            # Multiple paragraphs: drop the note paragraphs
            content_paragraphs = []
            for paragraph in paragraphs:
                is_note = False

                for pattern in note_patterns:
                    if re.search(pattern, paragraph, flags=re.IGNORECASE):
                        is_note = True
                        break

                # Also treat paragraphs that start with common note markers as notes
                if paragraph.lower().startswith(('note:', 'please note:', 'remember:')):
                    is_note = True

                if not is_note:
                    content_paragraphs.append(paragraph)

            return '\n\n'.join(content_paragraphs).strip()
        except Exception as e:
            self.logger.error(f"Failed to remove explanatory notes: {str(e)}")
            return response

    def get_processor_info(self) -> Dict[str, Any]:
        """
        Get processor information.

        Returns:
            Dict[str, Any]: Processor status and configuration details.
        """
        return {
            "replacement_alternatives_count": len(self.replacement_alternatives),
            "prefixes_to_remove_count": len(self.prefixes_to_remove),
            "suffixes_to_remove_count": len(self.suffixes_to_remove),
            "repetitive_patterns_count": len(self.repetitive_patterns),
            "initialization_status": "success"
        }
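

if __name__ == "__main__":
    # Minimal usage sketch (the input string is an assumed example, not project data)
    processor = ResponseProcessor()
    raw = ("Here's the enhanced description: This urban_intersection features a busy/quiet "
           "atmosphere. This urban_intersection features a busy/quiet atmosphere.")
    print(processor.clean_response(raw, model_type="llama"))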