Rulga committed on
Commit
e1e3365
·
1 Parent(s): 54a2558

Add URL availability check before loading documents in knowledge base

Browse files
Files changed (1) hide show
  1. api/fastapi_server.py +28 -5
api/fastapi_server.py CHANGED
@@ -221,6 +221,15 @@ def init_models():
221
  return llm, embeddings
222
 
223
  # --------------- Knowledge Base Management ---------------
 
 
 
 
 
 
 
 
 
224
  def build_knowledge_base():
225
  """Build or update the knowledge base"""
226
  global vector_store, kb_info
@@ -235,11 +244,25 @@ def build_knowledge_base():
235
  os.makedirs(VECTOR_STORE_PATH, exist_ok=True)
236
 
237
  headers = {
238
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
 
 
 
239
  }
 
 
 
 
 
 
 
 
 
 
 
240
 
241
- # Load documents with detailed logging
242
- for url in URLS:
243
  try:
244
  print(f"Attempting to load {url}")
245
  loader = WebBaseLoader(
@@ -253,11 +276,11 @@ def build_knowledge_base():
253
  if docs:
254
  documents.extend(docs)
255
  else:
256
- # Попробуем альтернативный метод загрузки
257
  response = requests.get(url, headers=headers, timeout=30)
258
  response.raise_for_status()
259
  soup = BeautifulSoup(response.text, 'html.parser')
260
- # Получаем основной контент, исключая навигацию и футер
261
  main_content = ' '.join([p.text for p in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li'])])
262
  if main_content:
263
  from langchain_core.documents import Document
 
221
  return llm, embeddings
222
 
223
  # --------------- Knowledge Base Management ---------------
224
def check_url_availability(url: str, headers: dict) -> bool:
    """Return True if *url* responds successfully to an HTTP request.

    Probes with a lightweight HEAD request first and falls back to a
    streamed GET when the server rejects HEAD (405 Method Not Allowed),
    since many sites disallow HEAD while serving GET normally.

    Args:
        url: Absolute URL to probe.
        headers: HTTP headers to send (e.g. the browser-like User-Agent
            built by the caller).

    Returns:
        True when the final response status is a success (2xx); False on
        any non-success status, timeout, or connection error.
    """
    try:
        # allow_redirects=True: requests.head() does NOT follow redirects
        # by default, so a 301/302 would wrongly mark the URL unavailable.
        response = requests.head(url, headers=headers, timeout=10, allow_redirects=True)
        if response.status_code == 405:
            # Server rejects HEAD; retry with GET, streaming so we do not
            # download the whole body just to check availability.
            with requests.get(url, headers=headers, timeout=10, stream=True) as get_response:
                return get_response.ok
        # .ok accepts any 2xx, not just exactly 200.
        return response.ok
    except requests.RequestException as e:
        print(f"URL check failed for {url}: {str(e)}")
        return False
233
  def build_knowledge_base():
234
  """Build or update the knowledge base"""
235
  global vector_store, kb_info
 
244
  os.makedirs(VECTOR_STORE_PATH, exist_ok=True)
245
 
246
  headers = {
247
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
248
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
249
+ 'Accept-Language': 'en-US,en;q=0.5',
250
+ 'Connection': 'keep-alive',
251
  }
252
+
253
+ # First check which URLs are accessible
254
+ available_urls = [url for url in URLS if check_url_availability(url, headers)]
255
+
256
+ if not available_urls:
257
+ raise HTTPException(
258
+ status_code=500,
259
+ detail="None of the provided URLs are accessible. Please check the domain and URLs."
260
+ )
261
+
262
+ print(f"Found {len(available_urls)} accessible URLs out of {len(URLS)}")
263
 
264
+ # Load documents with detailed logging and error handling
265
+ for url in available_urls:
266
  try:
267
  print(f"Attempting to load {url}")
268
  loader = WebBaseLoader(
 
276
  if docs:
277
  documents.extend(docs)
278
  else:
279
+ # Try alternative loading method
280
  response = requests.get(url, headers=headers, timeout=30)
281
  response.raise_for_status()
282
  soup = BeautifulSoup(response.text, 'html.parser')
283
+ # Get main content, excluding navigation and footer
284
  main_content = ' '.join([p.text for p in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li'])])
285
  if main_content:
286
  from langchain_core.documents import Document