import requests
from bs4 import BeautifulSoup
from datetime import datetime
import hashlib

def get_article_id(url):
    # Deterministic article ID: MD5 hex digest of the article URL.
    return hashlib.md5(url.encode()).hexdigest()

def clean_haberler_content(content):
    """Removes boilerplate copyright and site-promotion lines from article text."""
    # Phrases are kept in Turkish on purpose: they are matched against haberler.com page text.
    blacklist_phrases = [
        "© Copyright",
        "Haberler.com:",
        "Haber:",
        "bildirimlerimize izin vererek",
        "masaüstü",
        "Tüm Hakları Gizlidir",
        "Haberler.com’da"
    ]
    lines = content.split("\n")
    cleaned_lines = []
    for line in lines:
        # Drop any line that contains a blacklisted phrase (case-insensitive).
        if not any(phrase.lower() in line.lower() for phrase in blacklist_phrases):
            cleaned_lines.append(line)
    return "\n".join(cleaned_lines).strip()
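
# Minimal illustrative check (an assumption, not part of the original script): shows the
# cleaner dropping copyright lines. Wrapped in a function so nothing runs on import.
def _example_clean_haberler_content():
    sample = "Son dakika gelişmesi yaşandı.\n© Copyright 2024 Haberler.com\nTüm Hakları Gizlidir"
    # Only the first line should survive the blacklist filtering.
    return clean_haberler_content(sample)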

def extract_full_content(soup):
    """Robustly extracts the article content, which may appear in different page structures."""
    candidate_containers = [
        # Site-specific div classes
        ("div", {"class": "haber-metin"}),
        ("div", {"class": "article-content"}),
        ("div", {"class": "news-content"}),
        ("div", {"class": "detail-text"}),
        ("div", {"class": "content-text"}),
        ("div", {"id": "content"}),
        # Article
        ("article", {}),
        # Section/main
        ("section", {}),
        ("main", {}),
    ]
    for tag, attr in candidate_containers:
        container = soup.find(tag, attr)
        if container:
            paragraphs = container.find_all("p")
            content = "\n".join([p.get_text(strip=True) for p in paragraphs if p.get_text(strip=True)])
            if content and len(content.strip()) > 50:
                return content

    # Last resort: scan every <p> tag on the page.
    paragraphs = soup.find_all("p")
    content = "\n".join([p.get_text(strip=True) for p in paragraphs if p.get_text(strip=True)])
    if content and len(content.strip()) > 50:
        return content

    # Extra fallback: some articles are a single block of text inside <div> tags.
    all_divs = soup.find_all("div")
    text_blobs = [div.get_text(strip=True) for div in all_divs if div.get_text(strip=True)]
    fallback_content = "\n".join(text_blobs)
    if fallback_content and len(fallback_content.strip()) > 50:
        return fallback_content

    return ""  # No content could be found

def scrape_haberler():
    url = "https://www.haberler.com/son-dakika/"
    headers = {"User-Agent": "Mozilla/5.0"}
    response = requests.get(url, headers=headers, timeout=10)
    response.encoding = "utf-8"
    soup = BeautifulSoup(response.text, "html.parser")

    articles = []
    seen = set()

    for a_tag in soup.select("a"):
        href = a_tag.get("href", "")
        text = a_tag.get_text(strip=True)
        # Only follow links that look like article detail pages.
        if not href or not text or "haberi" not in href:
            continue
        if not href.startswith("http"):
            href = "https://www.haberler.com" + href
        if href in seen:
            continue
        seen.add(href)

        try:
            detail_resp = requests.get(href, headers=headers, timeout=10)
            detail_resp.encoding = "utf-8"
            detail_soup = BeautifulSoup(detail_resp.text, "html.parser")

            title_tag = detail_soup.select_one("h1")
            full_content = extract_full_content(detail_soup)
            full_content = clean_haberler_content(full_content)

            if title_tag and full_content and len(full_content.strip()) > 50:
                article = {
                    "id": get_article_id(href),
                    "title": title_tag.get_text(strip=True),
                    "content": full_content,
                    "url": href,
                    "source": "haberler.com",
                    "timestamp": datetime.utcnow().isoformat()
                }
                articles.append(article)
                print(f"{article['title']}{href}")
            else:
                print(f"No content found → {href}")
        except Exception as e:
            print(f"Error ({href}): {e}")

    print(f"\nFetched {len(articles)} articles in total.")
    return articles
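
# Optional persistence sketch (an assumption, not part of the original script): dump the
# scraped articles to a JSON file for downstream use. The file name is hypothetical.
def save_articles_to_json(articles, path="haberler_articles.json"):
    import json
    with open(path, "w", encoding="utf-8") as f:
        json.dump(articles, f, ensure_ascii=False, indent=2)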

# Test / terminal output
if __name__ == "__main__":
    print("Fetching breaking news from Haberler.com...\n")
    articles = scrape_haberler()

    print("\nSummary of fetched articles:")
    for i, article in enumerate(articles, 1):
        print(f"\nArticle {i}")
        print(f"Title: {article['title']}")
        print(f"Link: {article['url']}")
        print(f"Content length: {len(article['content'])} characters")
        print(f"Timestamp: {article['timestamp']}")
        print(f"\nContent:\n{article['content']}")
        print("-" * 120)