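"""Scrape breaking-news articles from haberler.com.

Fetches the /son-dakika/ listing page, follows each article link, extracts
the article body from a set of candidate HTML containers, strips site
boilerplate phrases, and returns the articles as a list of dicts (stable
MD5 id, title, content, URL, source, UTC timestamp).
"""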
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timezone
import hashlib

def get_article_id(url):
    """Derive a stable article ID from the URL (MD5 hex digest)."""
    return hashlib.md5(url.encode()).hexdigest()

def clean_haberler_content(content):
    """Gereksiz telif ve site reklam metinlerini siler"""
    blacklist_phrases = [
        "© Copyright",
        "Haberler.com:",
        "Haber:",
        "bildirimlerimize izin vererek",
        "masaüstü",
        "Tüm Hakları Gizlidir",
        "Haberler.com’da"
    ]
    lines = content.split("\n")
    cleaned_lines = []
    for line in lines:
        if not any(phrase.lower() in line.lower() for phrase in blacklist_phrases):
            cleaned_lines.append(line)
    return "\n".join(cleaned_lines).strip()

def extract_full_content(soup):
    """Haberin farklı yapılarda olabilecek içeriğini sağlam şekilde çıkarır"""

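    # Candidates are tried in order, most specific first; the first container
    # that yields enough paragraph text wins.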
    candidate_containers = [
        # Site-specific div classes
        ("div", {"class": "haber-metin"}),
        ("div", {"class": "article-content"}),
        ("div", {"class": "news-content"}),
        ("div", {"class": "detail-text"}),
        ("div", {"class": "content-text"}),
        ("div", {"id": "content"}),
        # Article
        ("article", {}),
        # Section/main
        ("section", {}),
        ("main", {}),
    ]

    for tag, attr in candidate_containers:
        container = soup.find(tag, attr)
        if container:
            paragraphs = container.find_all("p")
            content = "\n".join([p.get_text(strip=True) for p in paragraphs if p.get_text(strip=True)])
            if content and len(content.strip()) > 50:
                return content

    # Last resort: scan every <p> tag on the page
    paragraphs = soup.find_all("p")
    content = "\n".join([p.get_text(strip=True) for p in paragraphs if p.get_text(strip=True)])
    if content and len(content.strip()) > 50:
        return content

    # Extra fallback: some articles are a single block of text inside a <div>.
    # Note that nested <div>s repeat their parents' text, so this can be noisy.
    all_divs = soup.find_all("div")
    text_blobs = [div.get_text(strip=True) for div in all_divs if div.get_text(strip=True)]
    fallback_content = "\n".join(text_blobs)
    if fallback_content and len(fallback_content.strip()) > 50:
        return fallback_content

    return ""  # Hiçbir içerik bulunamadıysa

def scrape_haberler():
    url = "https://www.haberler.com/son-dakika/"
    headers = {"User-Agent": "Mozilla/5.0"}
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()  # fail fast if the listing page is unavailable
    response.encoding = "utf-8"
    soup = BeautifulSoup(response.text, "html.parser")

    articles = []
    seen = set()  # detail URLs already fetched, to skip duplicates

    for a_tag in soup.select("a"):
        href = a_tag.get("href", "")
        text = a_tag.get_text(strip=True)

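        # The scraper assumes article detail URLs contain the word "haberi"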
        if not href or not text or "haberi" not in href:
            continue

        if not href.startswith("http"):
            href = "https://www.haberler.com" + href

        if href in seen:
            continue
        seen.add(href)

        try:
            detail_resp = requests.get(href, headers=headers, timeout=10)
            detail_resp.encoding = "utf-8"
            detail_soup = BeautifulSoup(detail_resp.text, "html.parser")

            title_tag = detail_soup.select_one("h1")
            full_content = extract_full_content(detail_soup)
            full_content = clean_haberler_content(full_content)

            if title_tag and full_content and len(full_content.strip()) > 50:
                article = {
                    "id": get_article_id(href),
                    "title": title_tag.get_text(strip=True),
                    "content": full_content,
                    "url": href,
                    "source": "haberler.com",
                    "timestamp": datetime.utcnow().isoformat()
                }
                articles.append(article)
                print(f"{article['title']}{href}")
            else:
                print(f"İçerik bulunamadı → {href}")

        except Exception as e:
            print(f"Hata ({href}): {e}")

    print(f"\nToplam {len(articles)} haber çekildi.")
    return articles

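# A minimal sketch (an assumption, not part of the original script) showing
# how the scraped articles could be persisted as JSON for downstream use;
# the helper name and the "articles.json" default are illustrative only.
def save_articles_json(articles, path="articles.json"):
    """Write the article dicts to `path` as pretty-printed UTF-8 JSON."""
    import json  # local import keeps the sketch self-contained
    with open(path, "w", encoding="utf-8") as f:
        json.dump(articles, f, ensure_ascii=False, indent=2)
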
# Test / terminal output
if __name__ == "__main__":
    print("Haberler.com sitesinden son dakika haberleri çekiliyor...\n")
    
    articles = scrape_haberler()

    print("\nÇekilen Haber Özeti:")
    for i, article in enumerate(articles, 1):
        print(f"\n{i}. Haber")
        print(f"Başlık: {article['title']}")
        print(f"Link: {article['url']}")
        print(f"İçerik Uzunluğu: {len(article['content'])} karakter")
        print(f"Zaman Damgası: {article['timestamp']}")
        print(f"\nİçerik:\n{article['content']}")
        print("-" * 120)