# milliyet_link_scraper.py

import requests
from bs4 import BeautifulSoup


def get_sondakika_links():
    url = "https://www.milliyet.com.tr/son-dakika/"
    headers = {"User-Agent": "Mozilla/5.0"}
    base_url = "https://www.milliyet.com.tr"

    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")

    seen = set()
    news_links = []

    for a in soup.find_all("a", href=True):
        href = a["href"].strip()
        if href.startswith("/"):
            href = base_url + href
        elif not href.startswith("http"):
            continue

        # Keep only links that contain a "-737" news ID and point to milliyet.com.tr (article links)
        if "-737" in href and "milliyet.com.tr" in href:
            if href not in seen:
                seen.add(href)
                news_links.append(href)

    return news_links


def get_news_content(url):
    headers = {"User-Agent": "Mozilla/5.0"}
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")

    # Find the title (try a few different HTML structures)
    title = None
    for selector in [
        ("h1", {"id": "title"}),
        ("h1", {"class": "news-title"}),
        ("h1", {}),
    ]:
        found = soup.find(selector[0], selector[1])
        if found:
            title = found.get_text(strip=True)
            break

    if not title:
        title = "Başlık bulunamadı"  # "Title not found"

    # Find the article body
    content = ""
    article_div = soup.find("div", class_="articleBox") or soup.find("div", class_="news-content")
    if article_div:
        paragraphs = article_div.find_all("p")
        content = "\n".join(p.get_text(strip=True) for p in paragraphs if p.get_text(strip=True))
    else:
        # Fall back to all paragraphs on the page
        paragraphs = soup.find_all("p")
        content = "\n".join(p.get_text(strip=True) for p in paragraphs if p.get_text(strip=True))

    return {
        "title": title,
        "content": content.strip(),
    }
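

# --- Minimal usage sketch (an assumption, not part of the original script): run the
# scraper end to end as a standalone program. The CSS classes and the "-737" news ID
# pattern depend on the current milliyet.com.tr markup and may need adjusting if the
# site layout changes.
if __name__ == "__main__":
    links = get_sondakika_links()
    print(f"{len(links)} news links found")

    # Fetch the first few articles as a quick smoke test
    for link in links[:3]:
        article = get_news_content(link)
        print("-" * 60)
        print(article["title"])
        print(article["content"][:200])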