|
import requests
|
|
from bs4 import BeautifulSoup
|
|
from urllib.parse import urlparse
|
|
|
|
def scrape_webpage(url: str) -> dict:
    """Fetch a web page and extract its title and paragraph text.

    Args:
        url: Address of the page to download.

    Returns:
        A dict with two keys:
        ``"title"``: text of the page's ``<title>`` tag, or the placeholder
        ``"Başlık bulunamadı"`` when the page has no such tag.
        ``"content"``: text of every ``<p>`` element longer than 30
        characters, joined with newlines.

    Raises:
        Exception: If the HTTP request fails, times out, or returns an error
            status, or if no paragraph longer than 30 characters is found.
    """
    # Paragraphs at or below this length are dropped from the content.
    min_length = 30

    # A browser-like User-Agent is sent with the request
    # (presumably because some sites reject default client agents).
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }

    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        # Keep the generic Exception type existing callers catch, but chain
        # the original error so the real cause stays in the traceback.
        raise Exception(f"Web sayfası alınamadı: {e}") from e

    # Parse the raw bytes instead of response.text: when the server sends no
    # charset, requests falls back to ISO-8859-1 for text/* responses, which
    # garbles UTF-8 pages. BeautifulSoup can detect the encoding from the
    # document itself; a charset declared in the HTTP header is passed along
    # as the preferred encoding.
    content_type = response.headers.get("Content-Type", "")
    declared_encoding = response.encoding if "charset" in content_type.lower() else None
    soup = BeautifulSoup(
        response.content, "html.parser", from_encoding=declared_encoding
    )

    title_tag = soup.find("title")
    title = title_tag.get_text(strip=True) if title_tag else "Başlık bulunamadı"

    # Extract each paragraph's text once, then keep only the long ones.
    paragraph_texts = (p.get_text(strip=True) for p in soup.find_all("p"))
    content = "\n".join(text for text in paragraph_texts if len(text) > min_length)

    if not content:
        raise Exception("Sayfa içeriği bulunamadı")

    return {
        "title": title,
        "content": content,
    }