File size: 2,151 Bytes
70d956a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 |
# milliyet_link_scraper.py
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
def get_sondakika_links(timeout: float = 10.0) -> list:
    """Collect news article links from Milliyet's "son dakika" listing page.

    Downloads the listing page, resolves every ``<a href>`` to an absolute
    URL and keeps those that point at a milliyet.com.tr host and contain
    the ``-737`` marker.

    Args:
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``; without it a stalled connection
            would block the caller indefinitely.

    Returns:
        Absolute article URLs in page order, without duplicates.

    Raises:
        requests.exceptions.RequestException: On connection failure,
            timeout, or a non-2xx response (via ``raise_for_status``).
    """
    url = "https://www.milliyet.com.tr/son-dakika/"
    headers = {"User-Agent": "Mozilla/5.0"}

    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    seen = set()
    news_links = []
    for anchor in soup.find_all("a", href=True):
        href = anchor["href"].strip()
        try:
            # urljoin handles root-relative ("/x"), protocol-relative
            # ("//host/x") and page-relative ("x") hrefs uniformly; plain
            # string concatenation mangles the protocol-relative form.
            absolute = urljoin(url, href)
            parsed = urlparse(absolute)
            host = parsed.hostname or ""
        except ValueError:
            # Malformed href (e.g. a broken IPv6 literal): not a usable link.
            continue

        # Drop javascript:, mailto:, tel: and similar pseudo-links.
        if parsed.scheme not in ("http", "https"):
            continue
        # Compare the actual host instead of searching the whole URL, so a
        # foreign URL that merely mentions the domain in its path or query
        # is not accepted.
        if host != "milliyet.com.tr" and not host.endswith(".milliyet.com.tr"):
            continue
        # Heuristic kept from the original author: article URLs carry a
        # news ID marked by "-737" (assumption about the site's URL scheme;
        # verify if the site changes its IDs).
        if "-737" not in absolute or absolute in seen:
            continue

        seen.add(absolute)
        news_links.append(absolute)
    return news_links
def get_news_content(url, timeout: float = 10.0) -> dict:
    """Download a news page and extract its headline and body text.

    Args:
        url: Address of the article page to fetch.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``; without it a stalled connection
            would block the caller indefinitely.

    Returns:
        A dict with two keys: ``"title"`` (the headline, or the placeholder
        ``"Başlık bulunamadı"`` when none is found) and ``"content"`` (the
        non-empty paragraph texts joined by newlines, possibly empty).

    Raises:
        requests.exceptions.RequestException: On connection failure,
            timeout, or a non-2xx response (via ``raise_for_status``).
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # Headline lookup: try the most specific markup first, then any <h1>,
    # to cope with different page layouts.
    title = None
    for tag, attrs in (
        ("h1", {"id": "title"}),
        ("h1", {"class": "news-title"}),
        ("h1", {}),
    ):
        found = soup.find(tag, attrs)
        text = found.get_text(strip=True) if found else ""
        # An element that exists but is empty must not end the search;
        # a later, less specific selector may still hold the headline.
        if text:
            title = text
            break
    if not title:
        title = "Başlık bulunamadı"

    # Body lookup: prefer a known article container; when neither exists,
    # fall back to every paragraph in the document.
    container = (
        soup.find("div", class_="articleBox")
        or soup.find("div", class_="news-content")
        or soup
    )
    paragraph_texts = (p.get_text(strip=True) for p in container.find_all("p"))
    content = "\n".join(text for text in paragraph_texts if text)

    return {
        "title": title,
        "content": content.strip(),
    }
|