import re
import logging
import traceback
from typing import Dict, List, Any, Optional, Set


class ResponseProcessingError(Exception):
    """Custom exception for errors raised during response processing."""
    pass


class ResponseProcessor:
    """
    Cleans and post-processes responses produced by LLM models.
    Covers format cleanup, duplicate-content detection, and grammatical
    completeness checks.
    """

    def __init__(self):
        """Initialize the response processor."""
        # Set up the logger
        self.logger = logging.getLogger(self.__class__.__name__)
        if not self.logger.handlers:
            handler = logging.StreamHandler()
            formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
            handler.setFormatter(formatter)
            self.logger.addHandler(handler)
            self.logger.setLevel(logging.INFO)

        # Initialize cleaning rules and replacement dictionaries
        self._initialize_cleaning_rules()

        self.logger.info("ResponseProcessor initialized successfully")

    def _initialize_cleaning_rules(self):
        """Initialize the cleaning rules and replacement dictionaries used to fix common problems in model output."""
        try:
            # Replacement alternatives for frequently repeated words
            self.replacement_alternatives = {
                'visible': ['present', 'evident', 'apparent', 'observable'],
                'positioned': ['arranged', 'placed', 'set', 'organized'],
                'located': ['found', 'placed', 'situated', 'established'],
                'situated': ['placed', 'positioned', 'arranged', 'set'],
                'appears': ['seems', 'looks', 'presents', 'exhibits'],
                'features': ['includes', 'contains', 'displays', 'showcases'],
                'shows': ['reveals', 'presents', 'exhibits', 'demonstrates'],
                'displays': ['presents', 'exhibits', 'shows', 'reveals']
            }

            # Prefix phrases to strip from the start of a response
            self.prefixes_to_remove = [
                "Here's the enhanced description:",
                "Enhanced description:",
                "Here is the enhanced scene description:",
                "I've enhanced the description while preserving all factual details:",
                "Enhanced Description:",
                "Scene Description:",
                "Description:",
                "Here is the enhanced description:",
                "Here's the enhanced description:",
                "Here is a rewritten scene description that adheres to the provided critical rules:",
                "Here is the rewritten scene description:",
                "Here's a rewritten scene description:",
                "The rewritten scene description is as follows:"
            ]

            # Suffix phrases to strip from the end of a response
            self.suffixes_to_remove = [
                "I've maintained all the key factual elements",
                "I've preserved all the factual details",
                "All factual elements have been maintained"
            ]

            # Patterns used to detect repetitive wording
            self.repetitive_patterns = [
                (r'\b(visible)\b.*?\b(visible)\b', 'Multiple uses of "visible" detected'),
                (r'\b(positioned)\b.*?\b(positioned)\b', 'Multiple uses of "positioned" detected'),
                (r'\b(located)\b.*?\b(located)\b', 'Multiple uses of "located" detected'),
                (r'\b(situated)\b.*?\b(situated)\b', 'Multiple uses of "situated" detected'),
                (r'\b(appears)\b.*?\b(appears)\b', 'Multiple uses of "appears" detected'),
                (r'\b(features)\b.*?\b(features)\b', 'Multiple uses of "features" detected'),
                (r'\bThis\s+(\w+)\s+.*?\bThis\s+\1\b', 'Repetitive sentence structure detected')
            ]

            # Replacements for slash-joined adjective pairs (the model sometimes emits slash-formatted terms)
            self.slash_replacements = {
                'sunrise/sunset': 'warm lighting',
                'sunset/sunrise': 'warm lighting',
                'day/night': 'ambient lighting',
                'night/day': 'ambient lighting',
                'morning/evening': 'soft lighting',
                'evening/morning': 'soft lighting',
                'dawn/dusk': 'gentle lighting',
                'dusk/dawn': 'gentle lighting',
                'sunny/cloudy': 'natural lighting',
                'cloudy/sunny': 'natural lighting',
                'bright/dark': 'varied lighting',
                'dark/bright': 'varied lighting',
                'light/shadow': 'contrasting illumination',
                'shadow/light': 'contrasting illumination',
                'indoor/outdoor': 'mixed environment',
                'outdoor/indoor': 'mixed environment',
                'inside/outside': 'transitional space',
                'outside/inside': 'transitional space',
                'urban/rural': 'diverse landscape',
                'rural/urban': 'diverse landscape',
                'modern/traditional': 'architectural blend',
                'traditional/modern': 'architectural blend',
                'old/new': 'varied architecture',
                'new/old': 'varied architecture',
                'busy/quiet': 'dynamic atmosphere',
                'quiet/busy': 'dynamic atmosphere',
                'crowded/empty': 'varying occupancy',
                'empty/crowded': 'varying occupancy',
                'hot/cold': 'comfortable temperature',
                'cold/hot': 'comfortable temperature',
                'wet/dry': 'mixed conditions',
                'dry/wet': 'mixed conditions',
                'summer/winter': 'seasonal atmosphere',
                'winter/summer': 'seasonal atmosphere',
                'spring/autumn': 'transitional season',
                'autumn/spring': 'transitional season',
                'left/right': 'balanced composition',
                'right/left': 'balanced composition',
                'near/far': 'layered perspective',
                'far/near': 'layered perspective',
                'high/low': 'varied elevation',
                'low/high': 'varied elevation',
                'big/small': 'diverse scale',
                'small/big': 'diverse scale',
                'wide/narrow': 'varied width',
                'narrow/wide': 'varied width',
                'open/closed': 'flexible space',
                'closed/open': 'flexible space',
                'public/private': 'community space',
                'private/public': 'community space',
                'formal/informal': 'relaxed setting',
                'informal/formal': 'relaxed setting',
                'commercial/residential': 'mixed-use area',
                'residential/commercial': 'mixed-use area'
            }

            # Extended replacements for underscore-joined terms
            self.underscore_replacements = {
                'urban_intersection': 'urban intersection',
                'tourist_landmark': 'tourist landmark',
                'historical_site': 'historical site',
                'religious_building': 'religious building',
                'natural_landmark': 'natural landmark',
                'commercial_area': 'commercial area',
                'residential_area': 'residential area',
                'public_space': 'public space',
                'outdoor_scene': 'outdoor scene',
                'indoor_scene': 'indoor scene',
                'street_scene': 'street scene',
                'city_center': 'city center',
                'shopping_district': 'shopping district',
                'business_district': 'business district',
                'traffic_light': 'traffic light',
                'street_lamp': 'street lamp',
                'parking_meter': 'parking meter',
                'fire_hydrant': 'fire hydrant',
                'bus_stop': 'bus stop',
                'train_station': 'train station',
                'police_car': 'police car',
                'fire_truck': 'fire truck',
                'school_bus': 'school bus',
                'time_of_day': 'time of day',
                'weather_condition': 'weather condition',
                'lighting_condition': 'lighting condition',
                'atmospheric_condition': 'atmospheric condition',
                'human_activity': 'human activity',
                'pedestrian_traffic': 'pedestrian traffic',
                'vehicle_traffic': 'vehicle traffic',
                'social_gathering': 'social gathering',
                'object_detection': 'object detection',
                'scene_analysis': 'scene analysis',
                'image_classification': 'image classification',
                'computer_vision': 'computer vision'
            }

            self.logger.info("Cleaning rules initialized successfully")

        except Exception as e:
            error_msg = f"Failed to initialize cleaning rules: {str(e)}"
            self.logger.error(error_msg)
            self.logger.error(traceback.format_exc())
            raise ResponseProcessingError(error_msg) from e

    def clean_response(self, response: str, model_type: str = "general") -> str:
        """
        Clean an LLM response.

        Args:
            response: Raw LLM response.
            model_type: Model type (used to select model-specific cleaning rules).

        Returns:
            str: Cleaned response.

        Raises:
            ResponseProcessingError: If response processing fails.
        """
        if not response:
            raise ResponseProcessingError("Empty response provided for cleaning")

        try:
            self.logger.debug(f"Starting response cleaning (original length: {len(response)})")

            # Keep the original response as a fallback
            original_response = response

            # Choose a cleaning strategy based on the model type
            if "llama" in model_type.lower():
                cleaned_response = self._clean_llama_response(response)
            else:
                cleaned_response = self._clean_general_response(response)

            # If cleaning removed too much, try to recover content from the original response
            if len(cleaned_response.strip()) < 40:
                self.logger.warning("Cleaned response too short, attempting recovery")
                cleaned_response = self._recover_from_overcleaning(original_response)

            # Final validation
            self._validate_cleaned_response(cleaned_response)

            self.logger.debug(f"Response cleaning completed (final length: {len(cleaned_response)})")
            return cleaned_response

        except Exception as e:
            error_msg = f"Response cleaning failed: {str(e)}"
            self.logger.error(error_msg)
            self.logger.error(traceback.format_exc())
            raise ResponseProcessingError(error_msg) from e

    def _clean_llama_response(self, response: str) -> str:
        """
        Apply cleaning specific to Llama model responses.

        Args:
            response: Raw Llama response.

        Returns:
            str: Cleaned response.
        """
        # Apply the general cleaning first
        response = self._clean_general_response(response)

        # Llama-specific prefix cleanup
        llama_prefixes = [
            "Here's the enhanced description:",
            "Enhanced description:",
            "Here is the enhanced scene description:",
            "I've enhanced the description while preserving all factual details:"
        ]

        for prefix in llama_prefixes:
            if response.lower().startswith(prefix.lower()):
                response = response[len(prefix):].strip()

        # Llama-specific suffix cleanup
        llama_suffixes = [
            "I've maintained all the key factual elements",
            "I've preserved all the factual details",
            "All factual elements have been maintained"
        ]

        for suffix in llama_suffixes:
            if response.lower().endswith(suffix.lower()):
                # Strip by length so the removal also works when the casing differs
                response = response[:-len(suffix)].strip()

        return response

    def _clean_general_response(self, response: str) -> str:
        """
        General-purpose response cleaning.

        Args:
            response: Raw response.

        Returns:
            str: Cleaned response.
        """
        # 0. Critical format preprocessing
        response = self._critical_format_preprocess(response)

        # 1. Remove system markers
        response = self._remove_system_markers(response)

        # 2. Remove introductory prefixes
        response = self._remove_introduction_prefixes(response)

        # 3. Remove format markers and context tags
        response = self._remove_format_markers(response)

        # 4. Clean scene-type references
        response = self._clean_scene_type_references(response)

        # 5. Normalize punctuation
        response = self._normalize_punctuation(response)

        # 6. Remove duplicate sentences
        response = self._remove_duplicate_sentences(response)

        # 7. Handle repetitive vocabulary
        response = self._handle_repetitive_vocabulary(response)

        # 8. Ensure grammatical completeness
        response = self._ensure_grammatical_completeness(response)

        # 9. Control the overall word count
        response = self._control_word_length(response)

        # 10. Final formatting
        response = self._final_formatting(response)

        return response

    def _critical_format_preprocess(self, response: str) -> str:
        """
        Critical format preprocessing that handles the most common formatting problems.

        Args:
            response: Raw response.

        Returns:
            str: Preprocessed response.
        """
        if not response:
            return response

        try:
            # First priority: handle slash-joined terms.
            # Start with the known slash combinations and replace them with descriptive phrases.
            for slash_combo, replacement in self.slash_replacements.items():
                if slash_combo.lower() in response.lower():
                    # Preserve the original casing
                    if slash_combo.upper() in response:
                        replacement_formatted = replacement.upper()
                    elif slash_combo.title() in response:
                        replacement_formatted = replacement.title()
                    else:
                        replacement_formatted = replacement

                    # Perform the replacement (case-insensitive)
                    response = re.sub(re.escape(slash_combo), replacement_formatted, response, flags=re.IGNORECASE)
                    self.logger.debug(f"Replaced slash pattern '{slash_combo}' with '{replacement_formatted}'")

            # Handle any remaining, unlisted slash patterns.
            # Standard slash pattern: word/word
            slash_pattern = r'\b([a-zA-Z]+)/([a-zA-Z]+)\b'
            matches = list(re.finditer(slash_pattern, response))

            for match in reversed(matches):  # Process from the end to avoid offset shifts
                word1, word2 = match.groups()
                # Keep the shorter (usually more common) word
                if len(word1) <= len(word2):
                    replacement = word1
                else:
                    replacement = word2

                response = response[:match.start()] + replacement + response[match.end():]
                self.logger.debug(f"Replaced general slash pattern '{match.group(0)}' with '{replacement}'")

            # Second priority: handle underscore formatting.
            # Start with the known underscore combinations.
            for underscore_combo, replacement in self.underscore_replacements.items():
                if underscore_combo in response:
                    response = response.replace(underscore_combo, replacement)
                    self.logger.debug(f"Replaced underscore pattern '{underscore_combo}' with '{replacement}'")

            # Handle three-word underscore combinations: word_word_word -> word word word
            response = re.sub(r'\b([a-z]+)_([a-z]+)_([a-z]+)\b', r'\1 \2 \3', response)

            # Handle any remaining underscore patterns: word_word -> word word
            response = re.sub(r'\b([a-zA-Z]+)_([a-zA-Z]+)\b', r'\1 \2', response)

            # Third priority: fix incomplete sentences
            incomplete_sentence_fixes = [
                (r'\bIn\s*,\s*', 'Throughout the area, '),
                (r'\bOverall,\s+exudes\b', 'Overall, the scene exudes'),
                (r'\bThe overall atmosphere of\s+is\b', 'The overall atmosphere'),
                (r'\bwith its lights turned illuminating\b', 'with its lights illuminating'),
                (r'\bwhere it stands as\b', 'where it stands as'),
            ]

            for pattern, replacement in incomplete_sentence_fixes:
                response = re.sub(pattern, replacement, response, flags=re.IGNORECASE)

            # Fourth priority: grammar fixes (e.g. "persons" vs "people")
            grammar_fixes = [
                (r'\b(\d+)\s+persons\b', r'\1 people'),
                (r'\bone\s+persons\b', 'one person'),
                (r'\btwo\s+persons\b', 'two people'),
                (r'\bthree\s+persons\b', 'three people'),
                (r'\bfour\s+persons\b', 'four people'),
                (r'\bfive\s+persons\b', 'five people'),
                (r'\bsix\s+persons\b', 'six people'),
                (r'\bseven\s+persons\b', 'seven people'),
                (r'\beight\s+persons\b', 'eight people'),
                (r'\bnine\s+persons\b', 'nine people'),
                (r'\bten\s+persons\b', 'ten people'),
                (r'\bmultiple\s+persons\b', 'multiple people'),
                (r'\bseveral\s+persons\b', 'several people'),
                (r'\bmany\s+persons\b', 'many people'),
                (r'\ba\s+few\s+persons\b', 'a few people'),
                (r'\bsome\s+persons\b', 'some people')
            ]

            for pattern, replacement in grammar_fixes:
                response = re.sub(pattern, replacement, response, flags=re.IGNORECASE)

            return response

        except Exception as e:
            self.logger.warning(f"Error in critical format preprocessing: {str(e)}")
            return response

    def _remove_system_markers(self, response: str) -> str:
        """Remove system-style markers."""
        # Remove chat-template markers
        response = re.sub(r'<\|.*?\|>', '', response)

        # Remove output delimiters
        output_start = response.find("[OUTPUT_START]")
        output_end = response.find("[OUTPUT_END]")
        if output_start != -1 and output_end != -1 and output_end > output_start:
            response = response[output_start + len("[OUTPUT_START]"):output_end].strip()

        # Remove other section markers
        section_markers = [
            r'\[.*?\]',
            r'OUTPUT_START\s*:|OUTPUT_END\s*:',
            r'ENHANCED DESCRIPTION\s*:',
            r'Scene Type\s*:.*?(?=\n|$)',
            r'Original Description\s*:.*?(?=\n|$)',
            r'GOOD\s*:|BAD\s*:',
            r'PROBLEM\s*:.*?(?=\n|$)',
            r'</?\|(?:assistant|system|user)\|>',
            r'\(Note:.*?\)',
            r'\(.*?I\'ve.*?\)',
            r'\(.*?as per your request.*?\)'
        ]

        for marker in section_markers:
            response = re.sub(marker, '', response, flags=re.IGNORECASE)

        return response

    def _remove_introduction_prefixes(self, response: str) -> str:
        """Remove introductory prefixes."""
        # Handle "Here is..." style prefixes
        intro_prefixes = [
            r'^Here\s+is\s+(?:a\s+|the\s+)?(?:rewritten\s+|enhanced\s+)?scene\s+description.*?:\s*',
            r'^The\s+(?:rewritten\s+|enhanced\s+)?(?:scene\s+)?description\s+is.*?:\s*',
            r'^Here\'s\s+(?:a\s+|the\s+)?(?:rewritten\s+|enhanced\s+)?description.*?:\s*'
        ]

        for prefix_pattern in intro_prefixes:
            response = re.sub(prefix_pattern, '', response, flags=re.IGNORECASE)

        # Handle fixed prefixes
        for prefix in self.prefixes_to_remove:
            if response.lower().startswith(prefix.lower()):
                response = response[len(prefix):].strip()

        return response

    def _remove_format_markers(self, response: str) -> str:
        """Remove format markers and context tags (keeping parenthesized geographic and detail information)."""
        # Remove context-related markers
        response = re.sub(r'<\s*Context:.*?>', '', response)
        response = re.sub(r'Context:.*?(?=\n|$)', '', response)
        response = re.sub(r'Note:.*?(?=\n|$)', '', response, flags=re.IGNORECASE)

        # Remove Markdown formatting
        response = re.sub(r'\*\*|\*|__|\|', '', response)

        # Remove any remaining special markers, but leave parenthesized content alone
        # so useful details such as geographic locations are preserved
        response = re.sub(r'</?\|.*?\|>', '', response)

        # NOTE: removing whole parentheses and their contents is disabled to keep location information
        # response = re.sub(r'\(.*?\)', '', response)

        return response

    def _clean_scene_type_references(self, response: str) -> str:
        """Clean up malformed scene-type references."""
        scene_type_pattern = r'This ([a-zA-Z_]+) (features|shows|displays|contains)'
        match = re.search(scene_type_pattern, response)
        if match and '_' in match.group(1):
            fixed_text = f"This scene {match.group(2)}"
            response = re.sub(scene_type_pattern, fixed_text, response)
        return response

    def _normalize_punctuation(self, response: str) -> str:
        """Normalize punctuation."""
        # Reduce dash usage
        response = re.sub(r'—', ', ', response)
        response = re.sub(r' - ', ', ', response)

        # Collapse consecutive punctuation marks
        response = re.sub(r'([.,;:!?])\1+', r'\1', response)

        # Fix the punctuation of incomplete sentences
        response = re.sub(r',\s*$', '.', response)

        # Fix missing spaces after sentence-ending punctuation
        response = re.sub(r'([.!?])([A-Z])', r'\1 \2', response)

        # Clean up extra whitespace and line breaks
        response = response.replace('\r', ' ')
        response = re.sub(r'\n+', ' ', response)
        response = re.sub(r'\s{2,}', ' ', response)

        return response

    def _remove_duplicate_sentences(self, response: str, similarity_threshold: float = 0.85) -> str:
        """
        Remove duplicate or highly similar sentences using Jaccard similarity.

        Args:
            response: Original response text.
            similarity_threshold: Similarity threshold (0.0 to 1.0) above which a sentence
                is treated as a duplicate. A higher threshold requires sentences to be
                very similar before they are removed.

        Returns:
            str: Text with duplicate sentences removed.
        """
        try:
            if not response or not response.strip():
                return ""

            # (?<=[.!?]) keeps the sentence-ending punctuation while \s+ consumes the
            # trailing whitespace, so joining with ' ' later leaves exactly one space
            # between the punctuation and the next sentence.
            sentences = re.split(r'(?<=[.!?])\s+', response.strip())

            unique_sentences_data = []  # Stores tuples of (original_sentence, simplified_word_set)
            min_sentence_len_for_check = 8  # Sentences with fewer simplified words are only treated as duplicates if identical

            for sentence in sentences:
                sentence = sentence.strip()
                if not sentence:
                    continue

                # Build a simplified version for comparison (lowercase, punctuation removed, split into a word set).
                # Digits are kept because numbers can carry key information.
                simplified_text = re.sub(r'[^\w\s\d]', '', sentence.lower())
                current_sentence_words = set(simplified_text.split())

                if not current_sentence_words:  # Skip if nothing is left after simplification
                    continue

                is_duplicate = False
                # Compare against the sentences kept so far
                for i, (kept_sentence_text, kept_sentence_words) in enumerate(unique_sentences_data):
                    # Jaccard index
                    intersection_len = len(current_sentence_words.intersection(kept_sentence_words))
                    union_len = len(current_sentence_words.union(kept_sentence_words))

                    if union_len == 0:  # Both are empty sets, i.e. the same sentence
                        jaccard_similarity = 1.0
                    else:
                        jaccard_similarity = intersection_len / union_len

                    # When the Jaccard similarity exceeds the threshold:
                    # - if the new sentence is shorter, drop it as a duplicate;
                    # - if the new sentence is longer, replace the kept one;
                    # - otherwise (similar length or identical) keep the first occurrence.
                    # Very short sentences such as "Yes." and "No." rarely reach the threshold,
                    # so they are not merged by accident.
                    if jaccard_similarity >= similarity_threshold:
                        # Current sentence is shorter and highly similar: treat it as a duplicate
                        if len(current_sentence_words) < len(kept_sentence_words):
                            is_duplicate = True
                            self.logger.debug(f"Sentence \"{sentence[:30]}...\" marked duplicate (shorter, similar to \"{kept_sentence_text[:30]}...\") Jaccard: {jaccard_similarity:.2f}")
                            break
                        # Current sentence is longer and highly similar: replace the kept one
                        elif len(current_sentence_words) > len(kept_sentence_words):
                            self.logger.debug(f"Sentence \"{kept_sentence_text[:30]}...\" replaced by longer similar sentence \"{sentence[:30]}...\" Jaccard: {jaccard_similarity:.2f}")
                            unique_sentences_data.pop(i)  # Remove the older, shorter sentence
                        # Similar length (including identical word sets): keep the earlier sentence
                        else:
                            is_duplicate = True
                            self.logger.debug(f"Sentence \"{sentence[:30]}...\" marked duplicate (similar length, similar to \"{kept_sentence_text[:30]}...\") Jaccard: {jaccard_similarity:.2f}")
                            break

                if not is_duplicate:
                    unique_sentences_data.append((sentence, current_sentence_words))

            # Reassemble the unique sentences
            final_sentences = [s_data[0] for s_data in unique_sentences_data]

            # Make sure every sentence ends with punctuation (the split may leave a final fragment without one)
            reconstructed_response = ""
            for i, s in enumerate(final_sentences):
                s = s.strip()
                if not s:
                    continue
                if s[-1] not in ".!?":
                    s += "."
                reconstructed_response += s
                if i < len(final_sentences) - 1:
                    reconstructed_response += " "  # Add a space between sentences

            return reconstructed_response.strip()

        except Exception as e:
            self.logger.error(f"Error in _remove_duplicate_sentences: {str(e)}")
            self.logger.error(traceback.format_exc())
            return response  # Return the original response on error
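
    # Illustrative walk-through of the Jaccard comparison used above (an example only,
    # not part of the runtime logic): for the two sentences
    #   "A red car is parked near the entrance."  -> {a, red, car, is, parked, near, the, entrance}
    #   "A red car parked near the entrance."     -> {a, red, car, parked, near, the, entrance}
    # the intersection contains 7 words and the union 8, so the similarity is
    # 7 / 8 = 0.875 >= 0.85, and the shorter of the two sentences is dropped as a duplicate.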

    def _handle_repetitive_vocabulary(self, response: str) -> str:
        """Handle repetitive vocabulary using re.sub with a callable replacer for efficiency and accuracy."""
        try:
            # Detect repetitive patterns (warning only)
            if hasattr(self, 'repetitive_patterns'):
                for pattern, issue in self.repetitive_patterns:
                    if re.search(pattern, response, re.IGNORECASE | re.DOTALL):
                        self.logger.warning(f"Text quality issue detected: {issue} in response: \"{response[:100]}...\"")

            if not hasattr(self, 'replacement_alternatives') or not self.replacement_alternatives:
                return response

            processed_response = response

            # Each word gets its own occurrence counter and alternative index,
            # encapsulated in a small callable class used as the re.sub replacement.
            class WordReplacer:
                def __init__(self, alternatives_list):
                    self.count = 0
                    self.alternative_idx = 0
                    self.alternatives_list = alternatives_list

                def __call__(self, match_obj):
                    self.count += 1
                    original_word = match_obj.group(0)
                    if self.count > 1:  # Start replacing from the second occurrence
                        replacement = self.alternatives_list[self.alternative_idx % len(self.alternatives_list)]
                        self.alternative_idx += 1
                        # Preserve the original casing
                        if original_word.isupper():
                            return replacement.upper()
                        elif original_word.istitle():
                            return replacement.capitalize()
                        return replacement
                    return original_word  # First occurrence is left unchanged

            for word_to_replace, alternatives in self.replacement_alternatives.items():
                if not alternatives:  # Skip words with no available alternatives
                    continue

                replacer_instance = WordReplacer(alternatives)
                # \b ensures only whole words are matched
                pattern = re.compile(r'\b' + re.escape(word_to_replace) + r'\b', re.IGNORECASE)
                processed_response = pattern.sub(replacer_instance, processed_response)

            # Remove "identical" and similar repetitive qualifiers
            identical_cleanup_patterns = [
                (r'\b(\d+)\s+identical\s+([a-zA-Z\s]+)', r'\1 \2'),
                (r'\b(two|three|four|five|six|seven|eight|nine|ten|eleven|twelve)\s+identical\s+([a-zA-Z\s]+)', r'\1 \2'),
                (r'\bidentical\s+([a-zA-Z\s]+)', r'\1'),
                (r'\bcomprehensive arrangement of\b', 'arrangement of'),
                (r'\bcomprehensive view featuring\b', 'scene featuring'),
                (r'\bcomprehensive display of\b', 'display of'),
            ]

            for pattern, replacement in identical_cleanup_patterns:
                processed_response = re.sub(pattern, replacement, processed_response, flags=re.IGNORECASE)

            # Digit-to-word conversions
            number_conversions = {
                '2': 'two', '3': 'three', '4': 'four', '5': 'five', '6': 'six',
                '7': 'seven', '8': 'eight', '9': 'nine', '10': 'ten',
                '11': 'eleven', '12': 'twelve'
            }

            # Convert digits appearing in several common grammatical structures
            for digit, word in number_conversions.items():
                # Pattern 1: digit + plural noun (e.g. "7 chairs")
                pattern1 = rf'\b{digit}\s+([a-zA-Z]+s)\b'
                processed_response = re.sub(pattern1, rf'{word} \1', processed_response)

                # Pattern 2: digit + modifier + plural noun (e.g. "7 more chairs")
                pattern2 = rf'\b{digit}\s+(more|additional|other|identical)\s+([a-zA-Z]+s)\b'
                processed_response = re.sub(pattern2, rf'{word} \1 \2', processed_response, flags=re.IGNORECASE)

                # Pattern 3: digit + adjective + plural noun (e.g. "2 dining tables")
                pattern3 = rf'\b{digit}\s+([a-zA-Z]+)\s+([a-zA-Z]+s)\b'
                processed_response = re.sub(pattern3, rf'{word} \1 \2', processed_response)

                # Pattern 4: digit inside a prepositional phrase (e.g. "around 2 tables")
                pattern4 = rf'\b(around|approximately|about)\s+{digit}\s+([a-zA-Z]+s)\b'
                processed_response = re.sub(pattern4, rf'\1 {word} \2', processed_response, flags=re.IGNORECASE)

            return processed_response

        except Exception as e:
            self.logger.error(f"Error in _handle_repetitive_vocabulary: {str(e)}")
            self.logger.error(traceback.format_exc())
            return response  # Return the original response on error

    def _ensure_grammatical_completeness(self, response: str) -> str:
        """
        Ensure grammatical completeness by fixing incomplete sentences and formatting issues.

        Args:
            response: Response text to check.

        Returns:
            str: Grammatically complete response text.
        """
        try:
            if not response or not response.strip():
                return response

            # Stage 1: detect and fix incomplete sentence patterns
            incomplete_patterns = [
                # Prepositions immediately followed by a period (e.g. "over .")
                (r'\b(over|under|through|across|along|beneath|beyond|throughout)\s*\.', 'incomplete_preposition'),
                (r'\b(with|without|against|towards|beside|between|among)\s*\.', 'incomplete_preposition'),
                (r'\b(into|onto|upon|within|behind|below|above)\s*\.', 'incomplete_preposition'),
                # Missing words after a preposition, e.g. "In ,"
                (r'\bIn\s*,', 'incomplete_location'),
                (r'\bAt\s*,', 'incomplete_location'),
                (r'\bOn\s*,', 'incomplete_location'),
                (r'\bWith\s*,', 'incomplete_context'),
                # Incomplete descriptive phrases
                (r'\b(fine|the)\s+(the\s+)?(?:urban|area|scene)\b(?!\s+\w)', 'incomplete_description'),
                # Conjunctions or prepositions immediately followed by punctuation
                (r'\b(and|or|but|with|from|in|at|on|by|for|to)\s*[.!?]', 'incomplete_conjunction'),
                # Repeated words
                (r'\b(\w+)\s+\1\b', 'word_repetition'),
                # Malformed scene-type references (e.g. "urban_intersection")
                (r'\b(\w+)_(\w+)\b', 'underscore_format'),
                # Landmark-scene specific issues
                (r'\btourist_landmark\b', 'underscore_format'),
                (r'\burban_intersection\b', 'underscore_format'),
                (r'\bIn\s*,\s*(?=\w)', 'incomplete_prepositional'),
                (r'\bOverall,\s+(?=exudes|shows|displays)(?!\s+(?:the|this|it))', 'missing_subject'),
                (r'\batmosphere of\s+is one of\b', 'redundant_structure'),
                (r'\bwith.*?turned\s+illuminating\b', 'redundant_participle')
            ]

            for pattern, issue_type in incomplete_patterns:
                try:
                    matches = list(re.finditer(pattern, response, re.IGNORECASE))
                    for match in matches:
                        if issue_type == 'incomplete_preposition':
                            # Sentence ends right after a preposition
                            response = self._fix_incomplete_preposition(response, match)
                        elif issue_type == 'underscore_format':
                            # Convert underscore-separated terms to space-separated text
                            original = match.group(0)
                            replacement = original.replace('_', ' ')
                            response = response.replace(original, replacement)
                        elif issue_type == 'word_repetition':
                            # Remove the repeated word
                            repeated_word = match.group(1)
                            response = response.replace(f"{repeated_word} {repeated_word}", repeated_word)
                        elif issue_type == 'incomplete_location' or issue_type == 'incomplete_context':
                            # Remove incomplete location or context references
                            response = response.replace(match.group(0), '')
                        elif issue_type == 'incomplete_prepositional':
                            # Fix incomplete prepositional phrases
                            response = re.sub(r'\bIn\s*,\s*', 'Throughout the scene, ', response)
                        elif issue_type == 'missing_subject':
                            # Add a subject to "Overall" sentences
                            response = re.sub(r'\bOverall,\s+(?=exudes)', 'Overall, the scene ', response)
                        elif issue_type == 'redundant_structure':
                            # Simplify redundant structure
                            response = re.sub(r'\batmosphere of\s+is one of\b', 'atmosphere is one of', response)
                        elif issue_type == 'redundant_participle':
                            # Clean up redundant participles
                            response = re.sub(r'turned\s+illuminating', 'illuminating', response)
                        else:
                            # Remove any other incomplete pattern outright
                            response = response.replace(match.group(0), '')

                        # Collapse extra whitespace
                        response = re.sub(r'\s{2,}', ' ', response).strip()

                except re.error as e:
                    self.logger.warning(f"Regular expression pattern error for {issue_type}: {pattern} - {str(e)}")
                    continue

            # Stage 2: clean up object-class formatting issues
            response = self._clean_object_class_references(response)

            # Stage 3: make sure sentences end properly
            response = self._ensure_proper_sentence_ending(response)

            # Stage 4: final grammar check
            response = self._final_grammar_check(response)

            return response.strip()

        except Exception as e:
            self.logger.error(f"Error in _ensure_grammatical_completeness: {str(e)}")
            return response

    def _fix_incomplete_preposition(self, response: str, match) -> str:
        """
        Fix an incomplete prepositional phrase.

        Args:
            response: Response text.
            match: Regex match object.

        Returns:
            str: Fixed response.
        """
        preposition = match.group(1)
        match_start = match.start()

        # Find the start of the sentence containing the match
        sentence_start = response.rfind('.', 0, match_start)
        sentence_start = sentence_start + 1 if sentence_start != -1 else 0

        # Extract the sentence fragment
        sentence_fragment = response[sentence_start:match_start].strip()

        # If the fragment is meaningful, drop only the incomplete prepositional part
        if len(sentence_fragment) > 10:
            # Remove the preposition and what follows, then close the sentence with a period
            response = response[:match_start].rstrip() + '.'
        else:
            # If the fragment is too short, remove the whole incomplete sentence
            response = response[:sentence_start] + response[match.end():]

        return response

    def _clean_object_class_references(self, response: str) -> str:
        """
        Clean formatting issues in object-class references.

        Args:
            response: Response text.

        Returns:
            str: Cleaned response.
        """
        # Remove class-ID references such as "unknown-class 2" or "Class 0"
        class_id_patterns = [
            r'\bunknown[- ]?class\s*\d+\s*objects?',
            r'\bclass[- ]?\d+\s*objects?',
            r'\b[Cc]lass\s*\d+\s*objects?',
            r'\bunknown[- ][Cc]lass\s*\d+\s*objects?'
        ]

        for pattern in class_id_patterns:
            try:
                # Replace with a more natural description
                response = re.sub(pattern, 'objects', response, flags=re.IGNORECASE)
            except re.error as e:
                self.logger.warning(f"Error cleaning class reference pattern {pattern}: {str(e)}")
                continue

        # Clean up problems in quantity descriptions
        response = re.sub(r'\b(\w+)\s+unknown[- ]?\w*\s*objects?', r'\1 objects', response, flags=re.IGNORECASE)

        return response

    def _ensure_proper_sentence_ending(self, response: str) -> str:
        """
        Make sure the response ends with a proper sentence.

        Args:
            response: Response text.

        Returns:
            str: Response with a proper ending.
        """
        if not response or not response.strip():
            return response

        response = response.strip()

        # Check whether the response already ends with punctuation
        if response and response[-1] not in ['.', '!', '?']:
            # Common prepositions and conjunctions that should not end a sentence
            problematic_endings = [
                "into", "onto", "about", "above", "across", "after", "along", "around",
                "at", "before", "behind", "below", "beneath", "beside", "between",
                "beyond", "by", "down", "during", "except", "for", "from", "in",
                "inside", "near", "of", "off", "on", "over", "through", "to",
                "toward", "under", "up", "upon", "with", "within", "and", "or", "but"
            ]

            words = response.split()
            if words:
                last_word = words[-1].lower().rstrip('.,!?')

                if last_word in problematic_endings:
                    # Fall back to the last complete sentence
                    last_period_pos = max(
                        response.rfind('.'),
                        response.rfind('!'),
                        response.rfind('?')
                    )

                    if last_period_pos > len(response) // 2:  # A complete sentence ends reasonably close to the end
                        response = response[:last_period_pos + 1]
                    else:
                        # Remove the problematic word and add a period
                        if len(words) > 1:
                            response = " ".join(words[:-1]) + "."
                        else:
                            response = "The scene displays various elements."
                else:
                    # Normal case: simply add a period
                    response += "."

        return response

    def _final_grammar_check(self, response: str) -> str:
        """
        Final grammar check and cleanup.

        Args:
            response: Response text.

        Returns:
            str: Final cleaned response.
        """
        if not response:
            return response

        # Collapse consecutive sentence-ending punctuation
        response = re.sub(r'([.!?]){2,}', r'\1', response)

        # Remove spaces before sentence-ending punctuation
        response = re.sub(r'\s+([.!?])', r'\1', response)

        # Fix missing spaces after sentence-ending punctuation
        response = re.sub(r'([.!?])([A-Z])', r'\1 \2', response)

        # Capitalize the first letter
        if response and response[0].islower():
            response = response[0].upper() + response[1:]

        # Remove extra whitespace
        response = re.sub(r'\s{2,}', ' ', response)

        # Handle empty or overly short responses
        if len(response.strip()) < 20:
            return "The scene contains various visual elements."

        return response.strip()

    def _control_word_length(self, response: str) -> str:
        """Keep the text length within a reasonable range."""
        words = response.split()
        if len(words) > 200:
            # Truncate near the word limit, ending at the last complete sentence
            truncated = ' '.join(words[:200])
            last_period = max(truncated.rfind('.'), truncated.rfind('!'), truncated.rfind('?'))
            if last_period > 0:
                response = truncated[:last_period + 1]
            else:
                response = truncated + "."
        return response

    def _final_formatting(self, response: str) -> str:
        """Final formatting pass."""
        # Capitalize the first letter
        if response and response[0].islower():
            response = response[0].upper() + response[1:]

        # Collapse the text into a single paragraph
        response = re.sub(r'\s*\n\s*', ' ', response)
        response = ' '.join(response.split())

        return response.strip()

    def _recover_from_overcleaning(self, original_response: str) -> str:
        """Recover content when cleaning has removed too much."""
        try:
            # Look for the best paragraph in the original response
            paragraphs = [p for p in original_response.split('\n\n') if p.strip()]
            if paragraphs:
                # Use the longest paragraph as the main description
                best_para = max(paragraphs, key=len)
                # Apply basic cleaning rules
                best_para = re.sub(r'\[.*?\]', '', best_para)
                best_para = re.sub(r'\s{2,}', ' ', best_para).strip()

                if len(best_para) >= 40:
                    return best_para

            return "Unable to generate a valid enhanced description."

        except Exception as e:
            self.logger.error(f"Recovery from overcleaning failed: {str(e)}")
            return "Description generation error."

    def _validate_cleaned_response(self, response: str):
        """Validate the cleaned response."""
        if not response:
            raise ResponseProcessingError("Response is empty after cleaning")

        if len(response.strip()) < 20:
            raise ResponseProcessingError("Response is too short after cleaning")

        # Check for basic sentence structure
        if not re.search(r'[.!?]', response):
            raise ResponseProcessingError("Response lacks proper sentence structure")

    def remove_explanatory_notes(self, response: str) -> str:
        """
        Remove explanatory notes and meta commentary.

        Args:
            response: Response that may contain notes.

        Returns:
            str: Response with notes removed.
        """
        try:
            # Common note and explanation patterns
            note_patterns = [
                r'(?:^|\n)Note:.*?(?:\n|$)',
                r'(?:^|\n)I have (?:followed|adhered to|ensured).*?(?:\n|$)',
                r'(?:^|\n)This description (?:follows|adheres to|maintains).*?(?:\n|$)',
                r'(?:^|\n)The enhanced description (?:maintains|preserves).*?(?:\n|$)'
            ]

            # Split into paragraphs
            paragraphs = [p.strip() for p in response.split('\n\n') if p.strip()]

            # If there is only one paragraph, clean it in place
            if len(paragraphs) == 1:
                for pattern in note_patterns:
                    paragraphs[0] = re.sub(pattern, '', paragraphs[0], flags=re.IGNORECASE)
                return paragraphs[0].strip()

            # With multiple paragraphs, drop the ones that are notes
            content_paragraphs = []
            for paragraph in paragraphs:
                is_note = False
                for pattern in note_patterns:
                    if re.search(pattern, paragraph, flags=re.IGNORECASE):
                        is_note = True
                        break

                # Also treat paragraphs that start with common note markers as notes
                if paragraph.lower().startswith(('note:', 'please note:', 'remember:')):
                    is_note = True

                if not is_note:
                    content_paragraphs.append(paragraph)

            return '\n\n'.join(content_paragraphs).strip()

        except Exception as e:
            self.logger.error(f"Failed to remove explanatory notes: {str(e)}")
            return response

    def get_processor_info(self) -> Dict[str, Any]:
        """
        Get processor information.

        Returns:
            Dict[str, Any]: Processor status and configuration details.
        """
        return {
            "replacement_alternatives_count": len(self.replacement_alternatives),
            "prefixes_to_remove_count": len(self.prefixes_to_remove),
            "suffixes_to_remove_count": len(self.suffixes_to_remove),
            "repetitive_patterns_count": len(self.repetitive_patterns),
            "initialization_status": "success"
        }
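

# Minimal usage sketch (illustrative only; the sample text below is a made-up
# stand-in for a raw model response, not output captured from any real model).
if __name__ == "__main__":
    processor = ResponseProcessor()
    raw = ("Here's the enhanced description: This urban_intersection features "
           "3 persons near the crosswalk under sunrise/sunset lighting. "
           "Note: I've preserved all the factual details")
    try:
        cleaned = processor.clean_response(raw, model_type="llama")
        print(cleaned)
    except ResponseProcessingError as exc:
        print(f"Cleaning failed: {exc}")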