import hashlib
from datetime import datetime, timezone

import requests
from bs4 import BeautifulSoup

def get_article_id(url):
    """Returns a stable, deduplication-friendly ID derived from the article URL."""
    # MD5 is used here only as a cheap, stable hash, not for security.
    return hashlib.md5(url.encode()).hexdigest()
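
# Illustrative usage (the URL below is a made-up example, not taken from the site):
#   get_article_id("https://www.haberler.com/ornek-haberi/")  # -> 32-char hex digest
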
def clean_haberler_content(content):
    """Removes boilerplate copyright and site-promotion lines from article text."""
    # Phrases are kept verbatim (Turkish) because they are matched against site content.
    blacklist_phrases = [
        "© Copyright",
        "Haberler.com:",
        "Haber:",
        "bildirimlerimize izin vererek",
        "masaüstü",
        "Tüm Hakları Gizlidir",
        "Haberler.com’da",
    ]
    lines = content.split("\n")
    cleaned_lines = []
    for line in lines:
        # Keep only lines that contain none of the blacklisted phrases.
        if not any(phrase.lower() in line.lower() for phrase in blacklist_phrases):
            cleaned_lines.append(line)
    return "\n".join(cleaned_lines).strip()
def extract_full_content(soup):
    """Robustly extracts the article body, which can live in several different page structures."""
    candidate_containers = [
        # Site-specific div classes
        ("div", {"class": "haber-metin"}),
        ("div", {"class": "article-content"}),
        ("div", {"class": "news-content"}),
        ("div", {"class": "detail-text"}),
        ("div", {"class": "content-text"}),
        ("div", {"id": "content"}),
        # Article
        ("article", {}),
        # Section/main
        ("section", {}),
        ("main", {}),
    ]
    # Try the candidate containers in priority order.
    for tag, attr in candidate_containers:
        container = soup.find(tag, attr)
        if container:
            paragraphs = container.find_all("p")
            content = "\n".join(p.get_text(strip=True) for p in paragraphs if p.get_text(strip=True))
            if content and len(content.strip()) > 50:
                return content
    # Last resort: scan all <p> tags on the page.
    paragraphs = soup.find_all("p")
    content = "\n".join(p.get_text(strip=True) for p in paragraphs if p.get_text(strip=True))
    if content and len(content.strip()) > 50:
        return content
    # Extra fallback: some articles are a single text block inside a <div>.
    all_divs = soup.find_all("div")
    text_blobs = [div.get_text(strip=True) for div in all_divs if div.get_text(strip=True)]
    fallback_content = "\n".join(text_blobs)
    if fallback_content and len(fallback_content.strip()) > 50:
        return fallback_content
    return ""  # No content found at all
def scrape_haberler():
    url = "https://www.haberler.com/son-dakika/"
    headers = {"User-Agent": "Mozilla/5.0"}
    response = requests.get(url, headers=headers)
    response.encoding = "utf-8"
    soup = BeautifulSoup(response.text, "html.parser")
    articles = []
    seen = set()
    for a_tag in soup.select("a"):
        href = a_tag.get("href", "")
        text = a_tag.get_text(strip=True)
        # Keep only links whose URL contains "haberi" (article detail pages).
        if not href or not text or "haberi" not in href:
            continue
        if not href.startswith("http"):
            href = "https://www.haberler.com" + href
        if href in seen:
            continue
        seen.add(href)
        try:
            detail_resp = requests.get(href, headers=headers, timeout=10)
            detail_resp.encoding = "utf-8"
            detail_soup = BeautifulSoup(detail_resp.text, "html.parser")
            title_tag = detail_soup.select_one("h1")
            full_content = extract_full_content(detail_soup)
            full_content = clean_haberler_content(full_content)
            if title_tag and full_content and len(full_content.strip()) > 50:
                article = {
                    "id": get_article_id(href),
                    "title": title_tag.get_text(strip=True),
                    "content": full_content,
                    "url": href,
                    "source": "haberler.com",
                    # Timezone-aware UTC timestamp; datetime.utcnow() is deprecated.
                    "timestamp": datetime.now(timezone.utc).isoformat(),
                }
                articles.append(article)
                print(f"{article['title']} → {href}")
            else:
                print(f"No content found → {href}")
        except Exception as e:
            print(f"Error ({href}): {e}")
    print(f"\nFetched {len(articles)} articles in total.")
    return articles
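
# A minimal persistence sketch, not part of the original scraper: the
# save_articles helper and the "articles.json" default path are assumptions
# added for illustration.
def save_articles(articles, path="articles.json"):
    """Writes the scraped articles to a UTF-8 JSON file."""
    import json
    with open(path, "w", encoding="utf-8") as f:
        json.dump(articles, f, ensure_ascii=False, indent=2)
# Example: save_articles(scrape_haberler()) would persist one full run.
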
# Test / terminal output
if __name__ == "__main__":
    print("Fetching breaking news from Haberler.com...\n")
    articles = scrape_haberler()
    print("\nSummary of Fetched Articles:")
    for i, article in enumerate(articles, 1):
        print(f"\nArticle {i}")
        print(f"Title: {article['title']}")
        print(f"Link: {article['url']}")
        print(f"Content length: {len(article['content'])} characters")
        print(f"Timestamp: {article['timestamp']}")
        print(f"\nContent:\n{article['content']}")
        print("-" * 120)