Spaces:

iamseyhmus7
/

Turkish-LLM-RAG-Chatbot

Running

File size: 2,151 Bytes

70d956a

# milliyet_link_scraper.py
import requests
from bs4 import BeautifulSoup

def get_sondakika_links(timeout=10):
    """Collect news article links from Milliyet's breaking-news page.

    Fetches ``https://www.milliyet.com.tr/son-dakika/``, walks every ``<a>``
    tag that carries an ``href`` and keeps the URLs that contain both
    ``-737`` and ``milliyet.com.tr``.

    Args:
        timeout: Seconds to wait for the server before giving up; forwarded
            to ``requests.get``. Without a timeout a stalled connection
            would block the caller indefinitely.

    Returns:
        A list of absolute URLs, de-duplicated, in the order in which they
        first appear on the page.

    Raises:
        requests.RequestException: On connection failures, timeouts, or an
            HTTP error status (via ``raise_for_status``).
    """
    url = "https://www.milliyet.com.tr/son-dakika/"
    headers = {"User-Agent": "Mozilla/5.0"}
    base_url = "https://www.milliyet.com.tr"

    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    seen = set()
    news_links = []

    for a in soup.find_all("a", href=True):
        href = a["href"].strip()

        if href.startswith("//"):
            # Protocol-relative URL: it already names its host, so only the
            # scheme is missing. Prefixing base_url here would produce
            # "https://www.milliyet.com.tr//host/...".
            href = "https:" + href
        elif href.startswith("/"):
            # Site-relative path: make it absolute.
            href = base_url + href
        elif not href.startswith("http"):
            # Skip anchors, mailto:, javascript: and other non-HTTP targets.
            continue

        # Per the original note, news article URLs are recognised by a
        # "-737" news-ID marker. This is a plain substring test, not an
        # ends-with check.
        if "-737" not in href or "milliyet.com.tr" not in href:
            continue
        if href in seen:
            continue

        seen.add(href)
        news_links.append(href)

    return news_links
def get_news_content(url, timeout=10):
    """Download a news page and extract its title and body text.

    Args:
        url: Address of the article page to fetch.
        timeout: Seconds to wait for the server before giving up; forwarded
            to ``requests.get``. Without a timeout a stalled connection
            would block the caller indefinitely.

    Returns:
        A dict with two keys:
        ``"title"`` - text of the first non-empty matching ``<h1>``, or the
        placeholder ``"Başlık bulunamadı"`` when none is found;
        ``"content"`` - the non-empty ``<p>`` texts joined with newlines,
        taken from the article container when one exists, otherwise from
        the whole page.

    Raises:
        requests.RequestException: On connection failures, timeouts, or an
            HTTP error status (via ``raise_for_status``).
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # Title lookup: try the more specific selectors first, then any <h1>.
    title = None
    for tag_name, attrs in (
        ("h1", {"id": "title"}),
        ("h1", {"class": "news-title"}),
        ("h1", {}),
    ):
        found = soup.find(tag_name, attrs)
        if not found:
            continue
        text = found.get_text(strip=True)
        # An empty heading must not end the search; a later selector may
        # still yield a usable title.
        if text:
            title = text
            break
    if not title:
        title = "Başlık bulunamadı"

    # Body lookup: prefer a known article container; as a fallback, scan
    # every paragraph on the page.
    article_div = (
        soup.find("div", class_="articleBox")
        or soup.find("div", class_="news-content")
    )
    scope = article_div if article_div else soup
    texts = (p.get_text(strip=True) for p in scope.find_all("p"))
    content = "\n".join(text for text in texts if text)

    return {
        "title": title,
        "content": content.strip(),
    }