import hashlib
from datetime import datetime, timezone

import requests
from bs4 import BeautifulSoup

def get_article_id(url):
    """Returns a stable, deduplication-friendly ID derived from the article URL."""
    # MD5 is used here only as a cheap, stable hash, not for security.
    return hashlib.md5(url.encode()).hexdigest()
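
# Illustrative usage (the URL below is a made-up example, not taken from the site):
#   get_article_id("https://www.haberler.com/ornek-haberi/")  # -> 32-char hex digest
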
def clean_haberler_content(content):
    """Removes boilerplate copyright and site-promotion lines from article text."""
    # Phrases are kept verbatim (Turkish) because they are matched against site content.
    blacklist_phrases = [
        "© Copyright",
        "Haberler.com:",
        "Haber:",
        "bildirimlerimize izin vererek",
        "masaüstü",
        "Tüm Hakları Gizlidir",
        "Haberler.com’da",
    ]
    lines = content.split("\n")
    cleaned_lines = []
    for line in lines:
        # Keep only lines that contain none of the blacklisted phrases.
        if not any(phrase.lower() in line.lower() for phrase in blacklist_phrases):
            cleaned_lines.append(line)
    return "\n".join(cleaned_lines).strip()
def extract_full_content(soup):
    """Robustly extracts the article body, which can live in several different page structures."""
    candidate_containers = [
        # Site-specific div classes
        ("div", {"class": "haber-metin"}),
        ("div", {"class": "article-content"}),
        ("div", {"class": "news-content"}),
        ("div", {"class": "detail-text"}),
        ("div", {"class": "content-text"}),
        ("div", {"id": "content"}),
        # Article
        ("article", {}),
        # Section/main
        ("section", {}),
        ("main", {}),
    ]
    # Try the candidate containers in priority order.
    for tag, attr in candidate_containers:
        container = soup.find(tag, attr)
        if container:
            paragraphs = container.find_all("p")
            content = "\n".join(p.get_text(strip=True) for p in paragraphs if p.get_text(strip=True))
            if content and len(content.strip()) > 50:
                return content
    # Last resort: scan all <p> tags on the page.
    paragraphs = soup.find_all("p")
    content = "\n".join(p.get_text(strip=True) for p in paragraphs if p.get_text(strip=True))
    if content and len(content.strip()) > 50:
        return content
    # Extra fallback: some articles are a single text block inside a <div>.
    all_divs = soup.find_all("div")
    text_blobs = [div.get_text(strip=True) for div in all_divs if div.get_text(strip=True)]
    fallback_content = "\n".join(text_blobs)
    if fallback_content and len(fallback_content.strip()) > 50:
        return fallback_content
    return ""  # No content found at all
def scrape_haberler():
    url = "https://www.haberler.com/son-dakika/"
    headers = {"User-Agent": "Mozilla/5.0"}
    response = requests.get(url, headers=headers)
    response.encoding = "utf-8"
    soup = BeautifulSoup(response.text, "html.parser")
    articles = []
    seen = set()
    for a_tag in soup.select("a"):
        href = a_tag.get("href", "")
        text = a_tag.get_text(strip=True)
        # Keep only links whose URL contains "haberi" (article detail pages).
        if not href or not text or "haberi" not in href:
            continue
        if not href.startswith("http"):
            href = "https://www.haberler.com" + href
        if href in seen:
            continue
        seen.add(href)
        try:
            detail_resp = requests.get(href, headers=headers, timeout=10)
            detail_resp.encoding = "utf-8"
            detail_soup = BeautifulSoup(detail_resp.text, "html.parser")
            title_tag = detail_soup.select_one("h1")
            full_content = extract_full_content(detail_soup)
            full_content = clean_haberler_content(full_content)
            if title_tag and full_content and len(full_content.strip()) > 50:
                article = {
                    "id": get_article_id(href),
                    "title": title_tag.get_text(strip=True),
                    "content": full_content,
                    "url": href,
                    "source": "haberler.com",
                    # Timezone-aware UTC timestamp; datetime.utcnow() is deprecated.
                    "timestamp": datetime.now(timezone.utc).isoformat(),
                }
                articles.append(article)
                print(f"{article['title']} → {href}")
            else:
                print(f"No content found → {href}")
        except Exception as e:
            print(f"Error ({href}): {e}")
    print(f"\nFetched {len(articles)} articles in total.")
    return articles
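
# A minimal persistence sketch, not part of the original scraper: the
# save_articles helper and the "articles.json" default path are assumptions
# added for illustration.
def save_articles(articles, path="articles.json"):
    """Writes the scraped articles to a UTF-8 JSON file."""
    import json
    with open(path, "w", encoding="utf-8") as f:
        json.dump(articles, f, ensure_ascii=False, indent=2)
# Example: save_articles(scrape_haberler()) would persist one full run.
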
# Test / terminal output
if __name__ == "__main__":
    print("Fetching breaking news from Haberler.com...\n")
    articles = scrape_haberler()
    print("\nSummary of Fetched Articles:")
    for i, article in enumerate(articles, 1):
        print(f"\nArticle {i}")
        print(f"Title: {article['title']}")
        print(f"Link: {article['url']}")
        print(f"Content length: {len(article['content'])} characters")
        print(f"Timestamp: {article['timestamp']}")
        print(f"\nContent:\n{article['content']}")
        print("-" * 120)