import hashlib
from datetime import datetime, timezone

import requests
from bs4 import BeautifulSoup


def get_article_id(url):
    """Derives a stable article ID from the URL (MD5 hex digest)."""
    return hashlib.md5(url.encode()).hexdigest()


def clean_haberler_content(content):
    """Removes boilerplate copyright and site promo lines from article text."""
    # These phrases are matched against the Turkish page text, so they stay as-is.
    blacklist_phrases = [
        "© Copyright",
        "Haberler.com:",
        "Haber:",
        "bildirimlerimize izin vererek",
        "masaüstü",
        "Tüm Hakları Gizlidir",
        "Haberler.com’da",
    ]
    lines = content.split("\n")
    cleaned_lines = []
    for line in lines:
        # Keep only lines that contain none of the blacklisted phrases.
        if not any(phrase.lower() in line.lower() for phrase in blacklist_phrases):
            cleaned_lines.append(line)
    return "\n".join(cleaned_lines).strip()


def extract_full_content(soup):
    """Robustly extracts the article body, which can live in several different page structures."""
    # Site-specific containers first, then generic semantic tags.
    candidate_containers = [
        ("div", {"class": "haber-metin"}),
        ("div", {"class": "article-content"}),
        ("div", {"class": "news-content"}),
        ("div", {"class": "detail-text"}),
        ("div", {"class": "content-text"}),
        ("div", {"id": "content"}),
        ("article", {}),
        ("section", {}),
        ("main", {}),
    ]

    for tag, attr in candidate_containers:
        container = soup.find(tag, attr)
        if container:
            paragraphs = container.find_all("p")
            content = "\n".join([p.get_text(strip=True) for p in paragraphs if p.get_text(strip=True)])
            if content and len(content.strip()) > 50:
                return content

    # Fallback 1: every <p> on the page.
    paragraphs = soup.find_all("p")
    content = "\n".join([p.get_text(strip=True) for p in paragraphs if p.get_text(strip=True)])
    if content and len(content.strip()) > 50:
        return content

    # Fallback 2: the text of every <div> on the page.
    all_divs = soup.find_all("div")
    text_blobs = [div.get_text(strip=True) for div in all_divs if div.get_text(strip=True)]
    fallback_content = "\n".join(text_blobs)
    if fallback_content and len(fallback_content.strip()) > 50:
        return fallback_content

    return ""


def scrape_haberler():
    """Scrapes full-text articles from the haberler.com breaking-news page."""
    url = "https://www.haberler.com/son-dakika/"
    headers = {"User-Agent": "Mozilla/5.0"}
    response = requests.get(url, headers=headers, timeout=10)
    response.encoding = "utf-8"
    soup = BeautifulSoup(response.text, "html.parser")

    articles = []
    seen = set()

    for a_tag in soup.select("a"):
        href = a_tag.get("href", "")
        text = a_tag.get_text(strip=True)

        # Only follow links that look like article detail pages.
        if not href or not text or "haberi" not in href:
            continue

        if not href.startswith("http"):
            href = "https://www.haberler.com" + href

        # Skip URLs that were already processed.
        if href in seen:
            continue
        seen.add(href)

        try:
            detail_resp = requests.get(href, headers=headers, timeout=10)
            detail_resp.encoding = "utf-8"
            detail_soup = BeautifulSoup(detail_resp.text, "html.parser")

            title_tag = detail_soup.select_one("h1")
            full_content = extract_full_content(detail_soup)
            full_content = clean_haberler_content(full_content)

            if title_tag and full_content and len(full_content.strip()) > 50:
                article = {
                    "id": get_article_id(href),
                    "title": title_tag.get_text(strip=True),
                    "content": full_content,
                    "url": href,
                    "source": "haberler.com",
                    "timestamp": datetime.now(timezone.utc).isoformat(),
                }
                articles.append(article)
                print(f"{article['title']} → {href}")
            else:
                print(f"No content found → {href}")

        except Exception as e:
            print(f"Error ({href}): {e}")

    print(f"\nScraped {len(articles)} articles in total.")
    return articles


if __name__ == "__main__":
    print("Fetching breaking news from Haberler.com...\n")

    articles = scrape_haberler()

    print("\nSummary of scraped articles:")
    for i, article in enumerate(articles, 1):
        print(f"\n{i}. Article")
        print(f"Title: {article['title']}")
        print(f"Link: {article['url']}")
        print(f"Content length: {len(article['content'])} characters")
        print(f"Timestamp: {article['timestamp']}")
        print(f"\nContent:\n{article['content']}")
        print("-" * 120)