Spaces:

iamseyhmus7
/

Turkish-LLM-RAG-Chatbot

Running

Turkish-LLM-RAG-Chatbot / RAG /scraper /general_scraper.py

Upload 17 files

70d956a verified 12 days ago

1.13 kB

	import requests
	from bs4 import BeautifulSoup
	from urllib.parse import urlparse

	def scrape_webpage(url: str) -> dict:
	    """Fetch a web page and extract its title and main paragraph text.

	    Args:
	        url: Address of the page to download.

	    Returns:
	        A dict with two keys: ``"title"`` (text of the ``<title>`` tag, or
	        a Turkish placeholder string when the tag is missing) and
	        ``"content"`` (the text of every ``<p>`` element longer than 30
	        characters, joined with newlines).

	    Raises:
	        Exception: If the request fails, times out, or returns an HTTP
	            error status, or if no paragraph passes the length filter.
	    """
	    headers = {
	        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
	    }
	    # Paragraphs at or below this length are skipped by the content filter.
	    min_paragraph_chars = 30

	    try:
	        response = requests.get(url, headers=headers, timeout=10)
	        response.raise_for_status()  # Turn 4xx/5xx responses into errors.
	    except requests.RequestException as exc:
	        raise Exception(f"Web sayfası alınamadı: {exc}") from exc

	    # Requests falls back to ISO-8859-1 for text/* responses whose header
	    # carries no charset, which corrupts non-ASCII (e.g. Turkish) letters.
	    # Parse the raw bytes instead: trust the HTTP charset only when it is
	    # actually declared, otherwise let BeautifulSoup detect the encoding
	    # from the document itself (e.g. its <meta charset> tag).
	    content_type = response.headers.get("Content-Type", "")
	    declared_encoding = (
	        response.encoding if "charset" in content_type.lower() else None
	    )
	    soup = BeautifulSoup(
	        response.content, "html.parser", from_encoding=declared_encoding
	    )

	    # Page title
	    title_tag = soup.find("title")
	    title = title_tag.get_text(strip=True) if title_tag else "Başlık bulunamadı"

	    # Page content: extract each paragraph's text once. The " " separator
	    # keeps words apart across inline tags (<a>, <b>, ...); with the
	    # default empty separator, stripped fragments are glued together.
	    paragraph_texts = (p.get_text(" ", strip=True) for p in soup.find_all("p"))
	    content = "\n".join(
	        text for text in paragraph_texts if len(text) > min_paragraph_chars
	    )

	    if not content:
	        raise Exception("Sayfa içeriği bulunamadı")

	    return {
	        "title": title,
	        "content": content,
	    }