Spaces:
Running
on
Zero
Running
on
Zero
File size: 18,417 Bytes
595e0a5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 |
import random
import hashlib
import numpy as np
import sqlite3
import re
import traceback
from typing import List, Dict, Tuple, Optional, Any
from dataclasses import dataclass
from sentence_transformers import SentenceTransformer
import torch
from sklearn.metrics.pairwise import cosine_similarity
from dog_database import get_dog_description
from breed_health_info import breed_health_info
from breed_noise_info import breed_noise_info
@dataclass
class BreedDescriptionVector:
    """Data structure holding the vectorised description of one breed.

    Groups the generated description text, the embedding computed from it
    and a small dictionary of raw breed characteristics under the breed name.
    """

    # Breed identifier this record belongs to.
    breed_name: str
    # Natural-language description that was encoded.
    description_text: str
    # Embedding produced for ``description_text``.
    embedding: np.ndarray
    # Free-form characteristic values keyed by name.
    characteristics: Dict[str, Any]
class SemanticVectorManager:
    """
    Semantic vector manager.

    Handles SBERT model initialisation, generation of natural-language breed
    descriptions and construction of the per-breed embedding vectors.
    """

    def __init__(self):
        """Initialise the manager; the SBERT model itself is not loaded here."""
        self.model_name = 'all-MiniLM-L6-v2'
        self.sbert_model = None
        self._sbert_loading_attempted = False
        self.breed_vectors = {}
        self.breed_list = self._get_breed_list()
        # SBERT loading is deferred until it is needed so that it happens
        # inside the GPU environment.
        print("SemanticVectorManager initialized (SBERT loading deferred)")

    def _get_breed_list(self) -> List[str]:
        """Fetch the list of breeds from the database.

        Returns:
            The distinct ``Breed`` values of the ``AnimalCatalog`` table,
            without empty/NULL rows and without the wild breed ``'Dhole'``.
            If anything goes wrong a small hard-coded list is returned.
        """
        try:
            conn = sqlite3.connect('animal_detector.db')
            try:
                cursor = conn.cursor()
                cursor.execute("SELECT DISTINCT Breed FROM AnimalCatalog")
                rows = cursor.fetchall()
                cursor.close()
            finally:
                # Release the connection even when the query itself fails.
                conn.close()
            # Filter out wild-animal breeds; NULL/empty names are dropped too
            # because every consumer calls string methods on the breed name.
            return [row[0] for row in rows if row[0] and row[0] != 'Dhole']
        except Exception as e:
            print(f"Error getting breed list: {str(e)}")
            return ['Labrador_Retriever', 'German_Shepherd', 'Golden_Retriever',
                    'Bulldog', 'Poodle', 'Beagle', 'Rottweiler', 'Yorkshire_Terrier']

    def _initialize_model(self):
        """Initialise the SBERT model with fallbacks (meant for ZeroGPU use).

        Loading is attempted at most once per instance: after the first call
        the cached result (a model or ``None``) is returned as-is.

        Returns:
            The loaded model, or ``None`` when no candidate could be loaded.
        """
        if self.sbert_model is not None or self._sbert_loading_attempted:
            return self.sbert_model
        try:
            print("Loading SBERT model in GPU context...")
            # Alternative model names are tried when the primary one fails.
            model_options = ['all-MiniLM-L6-v2', 'all-mpnet-base-v2', 'all-MiniLM-L12-v2']
            for model_name in model_options:
                try:
                    # Pick the device explicitly to cope with ZeroGPU.
                    import torch
                    device = 'cuda' if torch.cuda.is_available() else 'cpu'
                    self.sbert_model = SentenceTransformer(model_name, device=device)
                    self.model_name = model_name
                    print(f"SBERT model {model_name} loaded successfully on {device}")
                    return self.sbert_model
                except Exception as model_e:
                    print(f"Failed to load {model_name}: {str(model_e)}")
                    continue
            # Every candidate failed.
            print("All SBERT models failed to load. Using basic text matching fallback.")
            self.sbert_model = None
            return None
        except Exception as e:
            print(f"Failed to initialize any SBERT model: {str(e)}")
            print(traceback.format_exc())
            print("Will provide basic text-based recommendations without embeddings")
            self.sbert_model = None
            return None
        finally:
            # Remember the attempt so a failed load is not retried on every call.
            self._sbert_loading_attempted = True

    @staticmethod
    def _text_field(info: Dict[str, Any], key: str, default: str) -> str:
        """Return ``info[key]`` when it is a string, otherwise ``default``."""
        value = info.get(key, default)
        return value if isinstance(value, str) else default

    @staticmethod
    def _intelligence_phrases(description_text: str) -> List[str]:
        """Derive intelligence / purpose phrases from a lower-cased description."""
        if not description_text:
            return []
        phrases = []
        # Intelligence indicators.
        if any(word in description_text
               for word in ['intelligent', 'smart', 'clever', 'quick to learn']):
            phrases.extend(['highly intelligent', 'trainable', 'quick learner'])
        elif any(word in description_text
                 for word in ['stubborn', 'independent', 'difficult to train']):
            phrases.extend(['independent minded', 'requires patience', 'challenging to train'])
        else:
            phrases.extend(['moderate intelligence', 'trainable with consistency'])
        # Working / purpose indicators.
        if any(word in description_text
               for word in ['working', 'herding', 'guard', 'hunting']):
            phrases.extend(['working breed', 'purpose-driven', 'task-oriented'])
        elif any(word in description_text
                 for word in ['companion', 'lap', 'toy', 'decorative']):
            phrases.extend(['companion breed', 'affectionate', 'people-focused'])
        return phrases

    def _compose_description(self, breed: str, breed_info: Dict[str, Any],
                             noise_info: Dict[str, Any]) -> str:
        """Build the description text from already-fetched breed data.

        Args:
            breed: Breed identifier; underscores are shown as spaces.
            breed_info: General breed record (``Size``, ``Temperament``,
                ``Exercise Needs``, ``Grooming Needs``, ``Good with Children``,
                ``Description``, ``Care Level``, ``Lifespan``).
            noise_info: Noise record; only ``noise_level`` is read.

        Returns:
            Sentence fragments joined by ``'. '`` followed by
            ``" Additional context: "`` and a space-separated keyword list.
            Keywords are de-duplicated in first-seen order so the same input
            always yields the same text.
        """
        display_name = breed.replace('_', ' ')

        # Normalise every field once; missing or non-string values fall back
        # to the defaults instead of aborting the whole description.
        size = self._text_field(breed_info, 'Size', 'medium').lower()
        temperament = self._text_field(breed_info, 'Temperament', '')
        exercise_needs = self._text_field(breed_info, 'Exercise Needs', 'moderate').lower()
        grooming_needs = self._text_field(breed_info, 'Grooming Needs', 'moderate').lower()
        noise_level = self._text_field(noise_info, 'noise_level', 'moderate').lower()
        family_friendly = breed_info.get('Good with Children', 'Yes') == 'Yes'
        summary = self._text_field(breed_info, 'Description', '')
        care_level = self._text_field(breed_info, 'Care Level', 'moderate').lower()
        raw_lifespan = breed_info.get('Lifespan', '10-12 years')
        has_lifespan = isinstance(raw_lifespan, str) and bool(raw_lifespan.strip())
        lifespan_text = raw_lifespan if has_lifespan else '10-12 years'

        # 1. Basic size and physical characteristics.
        parts = [f"{display_name} is a {size} sized dog breed"]

        # 2. Temperament and personality (key matching factor).
        if temperament:
            parts.append(f"with a {temperament.lower()} temperament")

        # 3. Exercise and activity level (key factor for apartment living).
        if 'high' in exercise_needs:
            parts.append("requiring high daily exercise and mental stimulation")
        elif 'low' in exercise_needs or 'minimal' in exercise_needs:
            parts.append("with minimal exercise requirements, suitable for apartment living")
        else:
            parts.append("with moderate exercise needs")

        # 4. Noise characteristics (key factor for quiet requirements).
        if 'low' in noise_level or 'quiet' in noise_level:
            parts.append("known for being quiet and rarely barking")
        elif 'high' in noise_level or 'loud' in noise_level:
            parts.append("tends to be vocal and bark frequently")
        else:
            parts.append("with moderate barking tendencies")

        # 5. Living-space compatibility.
        if size in ('small', 'tiny'):
            parts.append("excellent for small apartments and limited spaces")
        elif size in ('large', 'giant'):
            parts.append("requiring large living spaces and preferably a yard")
        else:
            parts.append("adaptable to various living situations")

        # 6. Grooming and maintenance.
        if 'high' in grooming_needs:
            parts.append("requiring regular professional grooming")
        elif 'low' in grooming_needs:
            parts.append("with minimal grooming requirements")
        else:
            parts.append("with moderate grooming needs")

        # 7. Family compatibility.
        if family_friendly:
            parts.append("excellent with children and families")
        else:
            parts.append("better suited for adult households")

        # 8. Intelligence and trainability (mined from the description text).
        intelligence_keywords = self._intelligence_phrases(summary.lower())
        if intelligence_keywords:
            parts.append(f"characterized as {', '.join(intelligence_keywords[:2])}")

        # 9. Special traits: the first 150 characters of the description
        #    plus the behavioural traits spotted inside that excerpt.
        if summary:
            excerpt = summary[:150].lower()
            trait_markers = (
                ('friendly', ('friendly',)),
                ('gentle', ('gentle',)),
                ('energetic', ('energetic', 'active')),
                ('calm', ('calm', 'peaceful')),
                ('protective', ('protective', 'guard')),
            )
            key_traits = [trait for trait, markers in trait_markers
                          if any(marker in excerpt for marker in markers)]
            trait_text = f" and {', '.join(key_traits)}" if key_traits else ""
            parts.append(f"Known for: {excerpt}{trait_text}")

        # 10. Care level requirement.
        parts.append(f"requiring {care_level} overall care level")

        # 11. Lifespan information.
        parts.append(f"with a typical lifespan of {lifespan_text}")

        # Keywords appended after the sentences for better semantic matching.
        # Breed-name keywords.
        keywords = [word.lower() for word in display_name.split()]

        # Temperament keywords.
        if temperament:
            keywords.extend(word.lower().strip(',') for word in temperament.split())

        # Size-based keywords.
        if 'small' in size or 'tiny' in size:
            keywords.extend(['small', 'tiny', 'compact', 'little', 'apartment', 'indoor', 'lap'])
        elif 'large' in size or 'giant' in size:
            keywords.extend(['large', 'big', 'giant', 'huge', 'yard', 'space', 'outdoor'])
        else:
            keywords.extend(['medium', 'moderate', 'average', 'balanced'])

        # Activity-level keywords.
        if 'high' in exercise_needs:
            keywords.extend(['active', 'energetic', 'exercise', 'outdoor', 'hiking',
                             'running', 'athletic'])
        elif 'low' in exercise_needs:
            keywords.extend(['calm', 'low-energy', 'indoor', 'relaxed', 'couch', 'sedentary'])
        else:
            keywords.extend(['moderate', 'balanced', 'walks', 'regular'])

        # Noise-level keywords.
        if 'quiet' in noise_level or 'low' in noise_level:
            keywords.extend(['quiet', 'silent', 'calm', 'peaceful', 'low-noise'])
        elif 'high' in noise_level or 'loud' in noise_level:
            keywords.extend(['vocal', 'barking', 'loud', 'alert', 'watchdog'])

        # Living-situation keywords.
        if size in ('small', 'tiny') and 'low' in exercise_needs:
            keywords.extend(['apartment', 'city', 'urban', 'small-space'])
        if size in ('large', 'giant') or 'high' in exercise_needs:
            keywords.extend(['house', 'yard', 'suburban', 'rural', 'space'])

        # Family keywords.
        if family_friendly:
            keywords.extend(['family', 'children', 'kids', 'friendly', 'gentle'])

        # Intelligence and trainability keywords.
        keywords.extend(word.lower()
                        for phrase in intelligence_keywords
                        for word in phrase.split())

        # Grooming keywords.
        if 'high' in grooming_needs:
            keywords.extend(['high-maintenance', 'professional-grooming', 'daily-brushing',
                             'coat-care'])
        elif 'low' in grooming_needs:
            keywords.extend(['low-maintenance', 'minimal-grooming', 'easy-care', 'wash-and-go'])
        else:
            keywords.extend(['moderate-grooming', 'weekly-brushing', 'regular-care'])

        # Lifespan keywords: average every number found in the lifespan
        # string (e.g. "10-12 years" -> 11).
        if has_lifespan:
            try:
                years = re.findall(r'\d+', raw_lifespan)
                if years:
                    avg_years = sum(int(y) for y in years) / len(years)
                    if avg_years >= 14:
                        keywords.extend(['long-lived', 'longevity', 'durable',
                                         'healthy-lifespan'])
                    elif avg_years <= 8:
                        keywords.extend(['shorter-lifespan', 'health-considerations',
                                         'special-care'])
                    else:
                        keywords.extend(['average-lifespan', 'moderate-longevity'])
            except ValueError:
                keywords.extend(['average-lifespan'])

        # De-duplicate while keeping first-seen order: iterating a set of
        # strings would give a different order (and a different embedding
        # input) from one process to the next.
        unique_keywords = list(dict.fromkeys(keywords))
        return f"{'. '.join(parts)}. Additional context: {' '.join(unique_keywords)}"

    def _create_breed_description(self, breed: str) -> str:
        """Create a comprehensive natural-language description for a breed.

        Looks up the general and noise records for ``breed`` and renders them
        into one text block intended for embedding.

        Args:
            breed: Breed identifier, e.g. ``'Golden_Retriever'``.

        Returns:
            The composed description, or a generic one-sentence description
            when the lookup or the composition raises.
        """
        try:
            breed_info = get_dog_description(breed) or {}
            noise_info = (breed_noise_info.get(breed) or {}) if breed_noise_info else {}
            return self._compose_description(breed, breed_info, noise_info)
        except Exception as e:
            print(f"Error creating description for {breed}: {str(e)}")
            return f"{breed.replace('_', ' ')} is a dog breed with unique characteristics."

    def _build_breed_vectors(self):
        """Build the vector representation of every breed (called lazily).

        Fills ``self.breed_vectors``. Does nothing when no SBERT model could
        be loaded.

        Raises:
            Exception: any error raised while encoding is logged and re-raised.
        """
        try:
            print("Building breed vector database...")
            # Initialise the model if that has not happened yet.
            if self.sbert_model is None:
                self._initialize_model()
            # Skip when the model is unavailable.
            if self.sbert_model is None:
                print("SBERT model not available, skipping vector building")
                return
            for breed in self.breed_list:
                description = self._create_breed_description(breed)
                # Generate the embedding vector.
                embedding = self.sbert_model.encode(description, convert_to_tensor=False)
                # Collect the breed characteristics stored next to it.
                breed_info = get_dog_description(breed) or {}
                characteristics = {
                    'size': breed_info.get('Size', 'Medium'),
                    'exercise_needs': breed_info.get('Exercise Needs', 'Moderate'),
                    'grooming_needs': breed_info.get('Grooming Needs', 'Moderate'),
                    'good_with_children': breed_info.get('Good with Children', 'Yes'),
                    'temperament': breed_info.get('Temperament', ''),
                }
                self.breed_vectors[breed] = BreedDescriptionVector(
                    breed_name=breed,
                    description_text=description,
                    embedding=embedding,
                    characteristics=characteristics,
                )
            print(f"Successfully built {len(self.breed_vectors)} breed vectors")
        except Exception as e:
            print(f"Error building breed vectors: {str(e)}")
            print(traceback.format_exc())
            raise

    def get_breed_vectors(self) -> Dict[str, "BreedDescriptionVector"]:
        """Return all breed vectors, building them first when none exist yet."""
        if not self.breed_vectors:
            self._build_breed_vectors()
        return self.breed_vectors

    def get_sbert_model(self) -> Optional["SentenceTransformer"]:
        """Return the current SBERT model (``None`` if not loaded); never triggers loading."""
        return self.sbert_model

    def get_breed_list(self) -> List[str]:
        """Return the list of breeds."""
        return self.breed_list

    def is_model_available(self) -> bool:
        """Tell whether an SBERT model is currently loaded."""
        return self.sbert_model is not None

    def encode_text(self, text: str) -> np.ndarray:
        """Encode ``text`` with the SBERT model, loading the model if needed.

        Raises:
            RuntimeError: when no SBERT model is available.
        """
        if self.sbert_model is None:
            self._initialize_model()
        if self.sbert_model is None:
            raise RuntimeError("SBERT model not available")
        return self.sbert_model.encode(text, convert_to_tensor=False)
|