|
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
|
|
def get_sondakika_links():
    """Scrape Milliyet's 'son dakika' (breaking news) page for article links.

    Returns:
        list[str]: de-duplicated absolute article URLs, in the order they
        first appear on the listing page.

    Raises:
        requests.HTTPError: if the listing page responds with an error status.
        requests.RequestException: on connection failure or timeout.
    """
    url = "https://www.milliyet.com.tr/son-dakika/"
    headers = {
        "User-Agent": "Mozilla/5.0"
    }
    base_url = "https://www.milliyet.com.tr"

    # timeout added: requests has NO default timeout, so without it a
    # stalled connection would hang this call forever.
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    seen = set()
    news_links = []

    for a in soup.find_all("a", href=True):
        href = a["href"].strip()

        # Absolutize site-relative links; skip schemes other than http(s)
        # (e.g. "javascript:", "mailto:", "#anchors").
        if href.startswith("/"):
            href = base_url + href
        elif not href.startswith("http"):
            continue

        # "-737" matches the current numeric article-id prefix used by
        # milliyet.com.tr article URLs.
        # NOTE(review): this is brittle — it will silently stop matching
        # once article ids roll past the 737xxxxx range; confirm and
        # consider a regex on the full id pattern instead.
        if "-737" in href and "milliyet.com.tr" in href:
            if href not in seen:
                seen.add(href)
                news_links.append(href)

    return news_links
|
|
def get_news_content(url, timeout=10):
    """Fetch a Milliyet article page and extract its title and body text.

    Args:
        url: absolute URL of the article page.
        timeout: seconds to wait for the HTTP response (backward-compatible
            addition; requests has no default timeout and would otherwise
            hang indefinitely on a stalled connection).

    Returns:
        dict: {"title": str, "content": str}. The title falls back to
        "Başlık bulunamadı" when no matching <h1> is found; the content
        falls back to every <p> tag on the page when no known article
        container is present.

    Raises:
        requests.HTTPError: if the page responds with an error status.
        requests.RequestException: on connection failure or timeout.
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # Try the most specific known title markup first, then any <h1>.
    title = None
    for selector in [
        ("h1", {"id": "title"}),
        ("h1", {"class": "news-title"}),
        ("h1", {})
    ]:
        found = soup.find(selector[0], selector[1])
        if found:
            title = found.get_text(strip=True)
            break
    if not title:
        title = "Başlık bulunamadı"

    # Prefer a dedicated article container; otherwise fall back to all
    # paragraphs on the page (noisier, but better than nothing).
    content = ""
    article_div = soup.find("div", class_="articleBox") or soup.find("div", class_="news-content")
    if article_div:
        paragraphs = article_div.find_all("p")
    else:
        paragraphs = soup.find_all("p")
    # Keep only non-empty paragraphs, one per line.
    content = "\n".join([p.get_text(strip=True) for p in paragraphs if p.get_text(strip=True)])

    return {
        "title": title,
        "content": content.strip()
    }
|
|
|