import hashlib
from datetime import datetime, timezone

import requests
from bs4 import BeautifulSoup


def get_article_id(url):
    """Derives a stable article ID from the URL (MD5 hex digest)."""
    return hashlib.md5(url.encode()).hexdigest()


def clean_haberler_content(content):
    """Removes boilerplate copyright and site-promo lines from extracted text."""
    # Site-specific Turkish phrases marking boilerplate lines; kept verbatim
    # because they are matched against the (Turkish) page content.
    blacklist_phrases = [
        "© Copyright",
        "Haberler.com:",
        "Haber:",
        "bildirimlerimize izin vererek",
        "masaüstü",
        "Tüm Hakları Gizlidir",
        "Haberler.com’da",
    ]
    lines = content.split("\n")
    cleaned_lines = []
    for line in lines:
        if not any(phrase.lower() in line.lower() for phrase in blacklist_phrases):
            cleaned_lines.append(line)
    return "\n".join(cleaned_lines).strip()


def extract_full_content(soup):
    """Robustly extracts the article body, which may live in several different page layouts."""
    candidate_containers = [
        # Site-specific div classes
        ("div", {"class": "haber-metin"}),
        ("div", {"class": "article-content"}),
        ("div", {"class": "news-content"}),
        ("div", {"class": "detail-text"}),
        ("div", {"class": "content-text"}),
        ("div", {"id": "content"}),
        # Article
        ("article", {}),
        # Section/main
        ("section", {}),
        ("main", {}),
    ]
    for tag, attr in candidate_containers:
        container = soup.find(tag, attr)
        if container:
            paragraphs = container.find_all("p")
            content = "\n".join([p.get_text(strip=True) for p in paragraphs if p.get_text(strip=True)])
            if content and len(content.strip()) > 50:
                return content

    # Last resort: scan every <p> tag on the page
    paragraphs = soup.find_all("p")
    content = "\n".join([p.get_text(strip=True) for p in paragraphs if p.get_text(strip=True)])
    if content and len(content.strip()) > 50:
        return content

    # Extra fallback: some articles keep the text as a single block inside a <div>
    all_divs = soup.find_all("div")
    text_blobs = [div.get_text(strip=True) for div in all_divs if div.get_text(strip=True)]
    fallback_content = "\n".join(text_blobs)
    if fallback_content and len(fallback_content.strip()) > 50:
        return fallback_content

    return ""  # No usable content was found


def scrape_haberler():
    url = "https://www.haberler.com/son-dakika/"
    headers = {"User-Agent": "Mozilla/5.0"}
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()  # fail loudly if the listing page is unreachable
    response.encoding = "utf-8"
    soup = BeautifulSoup(response.text, "html.parser")

    articles = []
    seen = set()
    for a_tag in soup.select("a"):
        href = a_tag.get("href", "")
        text = a_tag.get_text(strip=True)
        # Article detail URLs on haberler.com contain the substring "haberi"
        if not href or not text or "haberi" not in href:
            continue
        if not href.startswith("http"):
            href = "https://www.haberler.com" + href
        if href in seen:  # skip duplicate links
            continue
        seen.add(href)
        try:
            detail_resp = requests.get(href, headers=headers, timeout=10)
            detail_resp.encoding = "utf-8"
            detail_soup = BeautifulSoup(detail_resp.text, "html.parser")
            title_tag = detail_soup.select_one("h1")
            full_content = extract_full_content(detail_soup)
            full_content = clean_haberler_content(full_content)
            if title_tag and full_content and len(full_content.strip()) > 50:
                article = {
                    "id": get_article_id(href),
                    "title": title_tag.get_text(strip=True),
                    "content": full_content,
                    "url": href,
                    "source": "haberler.com",
                    "timestamp": datetime.now(timezone.utc).isoformat(),
                }
                articles.append(article)
                print(f"{article['title']} → {href}")
            else:
                print(f"No content found → {href}")
        except Exception as e:
            print(f"Error ({href}): {e}")

    print(f"\nFetched {len(articles)} articles in total.")
    return articles


# Test / terminal output
if __name__ == "__main__":
    print("Fetching breaking news from Haberler.com...\n")
    articles = scrape_haberler()
    print("\nSummary of fetched articles:")
    for i, article in enumerate(articles, 1):
        print(f"\nArticle {i}")
        print(f"Title: {article['title']}")
        print(f"Link: {article['url']}")
        print(f"Content length: {len(article['content'])} characters")
        print(f"Timestamp: {article['timestamp']}")
        print(f"\nContent:\n{article['content']}")
        print("-" * 120)
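
# --- Optional usage sketch (an addition, not part of the original script) ---
# A minimal way to persist the scraped articles to disk; the "articles.json"
# output path is an assumption, adjust to fit your pipeline. ensure_ascii=False
# keeps Turkish characters readable in the output file.
#
#     import json
#
#     articles = scrape_haberler()
#     with open("articles.json", "w", encoding="utf-8") as f:
#         json.dump(articles, f, ensure_ascii=False, indent=2)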