# iamseyhmus7's picture
# Upload 17 files
# 70d956a verified
# milliyet_link_scraper.py
import requests
from bs4 import BeautifulSoup
def get_sondakika_links(timeout=10):
    """Collect the unique article links found on Milliyet's "son dakika" page.

    Downloads ``https://www.milliyet.com.tr/son-dakika/``, walks every
    ``<a>`` element that has an ``href``, turns site-relative links into
    absolute ones and keeps the URLs that contain both ``-737`` and
    ``milliyet.com.tr``.

    Args:
        timeout: Seconds to wait for the HTTP response. ``requests`` applies
            no timeout on its own, so without one a stalled server would
            block the caller indefinitely.

    Returns:
        A list of absolute URL strings, de-duplicated, in the order they
        first appear in the page.

    Raises:
        requests.HTTPError: If the page answers with an error status
            (raised by ``raise_for_status``).
    """
    url = "https://www.milliyet.com.tr/son-dakika/"
    headers = {"User-Agent": "Mozilla/5.0"}
    base_url = "https://www.milliyet.com.tr"

    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    seen = set()
    news_links = []
    for a in soup.find_all("a", href=True):
        href = a["href"].strip()

        if href.startswith("//"):
            # Protocol-relative link: it already names its host, so only the
            # scheme is missing. Prefixing base_url here would yield a
            # malformed "https://www.milliyet.com.tr//host/..." URL.
            href = "https:" + href
        elif href.startswith("/"):
            href = base_url + href
        elif not href.startswith("http"):
            # Skips anchors, "javascript:", "mailto:" and other non-HTTP targets.
            continue

        # Keep only links that carry the "-737" marker on the Milliyet domain.
        # NOTE(review): the original comment describes "-737" as part of the
        # news ID; the check itself is a plain substring match - confirm the
        # ID format still holds.
        if "-737" not in href or "milliyet.com.tr" not in href:
            continue

        if href not in seen:
            seen.add(href)
            news_links.append(href)

    return news_links
def get_news_content(url, timeout=10):
    """Download a news page and extract its headline and body text.

    Args:
        url: Address of the article page to fetch.
        timeout: Seconds to wait for the HTTP response. ``requests`` applies
            no timeout on its own, so without one a stalled server would
            block the caller indefinitely.

    Returns:
        A dict with two string entries:
        ``"title"`` - text of the first matching ``<h1>``, or the placeholder
        ``"Başlık bulunamadı"`` when no non-empty headline is found;
        ``"content"`` - the non-empty ``<p>`` texts joined with newlines.

    Raises:
        requests.HTTPError: If the page answers with an error status
            (raised by ``raise_for_status``).
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # Headline lookup: try the most specific <h1> variants first, then fall
    # back to any <h1>, to cope with different page layouts.
    title = None
    for tag, attrs in (
        ("h1", {"id": "title"}),
        ("h1", {"class": "news-title"}),
        ("h1", {}),
    ):
        found = soup.find(tag, attrs)
        if found:
            title = found.get_text(strip=True)
            break
    if not title:
        title = "Başlık bulunamadı"

    # Body lookup: prefer a known article container; if neither exists, fall
    # back to every paragraph in the document.
    article_div = (
        soup.find("div", class_="articleBox")
        or soup.find("div", class_="news-content")
    )
    container = article_div if article_div else soup

    # Extract each paragraph's text once and drop the empty ones.
    texts = (p.get_text(strip=True) for p in container.find_all("p"))
    content = "\n".join(text for text in texts if text)

    return {
        "title": title,
        "content": content.strip(),
    }