Spaces:

iamseyhmus7
/

Turkish-LLM-RAG-Chatbot

Running

File size: 2,767 Bytes

70d956a

from scraper.milliyet import get_sondakika_links as milliyet_links, get_news_content as milliyet_parse
from scraper.haberler import scrape_haberler 
from VektorDataBase.pinecone_client import upsert_article_chunks , get_id_from_url
from VektorDataBase.embedder import get_embedding
from datetime import datetime

# All content processed so far is kept here (could hold hashes instead of the raw text).
# This is in-memory, module-level state, so it only covers the current process.
processed_contents = set()

def is_duplicate_content(content: str) -> bool:
    """Report whether this content has already been processed.

    Used to keep the same article text from being added to the database
    again. The comparison is made on the whitespace-stripped text against
    the module-level ``processed_contents`` set.
    """
    normalized = content.strip()
    already_seen = normalized in processed_contents
    return already_seen

def process_news_item(source: str, url: str, parse_func) -> None:
    """Parse one news URL and upsert its content into the vector database.

    Args:
        source: Label of the news site; used only in log messages.
        url: Address of the article. It is passed to ``parse_func`` and
            stored alongside the upserted chunks.
        parse_func: Callable that takes the URL and returns a mapping with
            ``"title"`` and ``"content"`` entries.

    Returns:
        None. Items with a missing or empty title/content, and items whose
        content was already processed, are skipped with a log line.

    Any exception raised while parsing or upserting is caught and printed,
    so one bad item does not stop the caller's loop.
    """
    try:
        # A parser may return None, or a mapping whose fields are None.
        # Normalise both to empty strings so they take the "skipped" branch
        # below instead of raising AttributeError on .strip().
        news = parse_func(url) or {}
        title = (news.get("title") or "").strip()
        content = (news.get("content") or "").strip()

        if not title or not content:
            print(f"{source} boş içerik veya başlık atlandı → {url}")
            return

        if is_duplicate_content(content):
            print(f"{source} aynı içerik atlandı (dupe) → {url}")
            return

        upsert_article_chunks(
            article_id_base=get_id_from_url(url),
            url=url,
            title=title,
            text=content,
            timestamp=datetime.now().isoformat(),
            embed_func=get_embedding,
        )
        # Recorded only after the upsert returns; if the upsert raises,
        # the content stays unmarked.
        processed_contents.add(content)

    except Exception as e:
        # Per-item boundary: log and move on rather than abort the run.
        print(f"Error processing {source} news item: {e}")

def _process_haberler_article(article) -> None:
    """Validate, de-duplicate and upsert a single haberler.com article.

    ``article`` is expected to be a mapping with ``"id"``, ``"url"``,
    ``"title"``, ``"content"`` and ``"timestamp"`` entries. Any exception is
    caught and printed so that one bad article does not abort the run,
    mirroring how Milliyet items are handled by ``process_news_item``.
    """
    try:
        url = article.get("url")
        # Missing or None fields count as empty and are skipped, not crashed on.
        title = (article.get("title") or "").strip()
        content = (article.get("content") or "").strip()

        if not title or not content:
            print(f"haberler.com boş içerik atlandı → {url}")
            return

        if is_duplicate_content(content):
            print(f"haberler.com aynı içerik atlandı (dupe) → {url}")
            return

        upsert_article_chunks(
            article_id_base=article["id"],
            url=article["url"],
            title=title,
            text=content,
            timestamp=article["timestamp"],
            embed_func=get_embedding,
        )
        # Recorded only after the upsert returns without raising.
        processed_contents.add(content)
    except Exception as e:
        print(f"Error processing haberler.com news item: {e}")


def run_pipeline():
    """Run one full ingestion pass over all news sources.

    Milliyet links are fetched and handed one by one to
    ``process_news_item``. Articles from ``scrape_haberler()`` arrive
    already parsed and are validated, de-duplicated and upserted
    individually. Progress is reported with ``print``.
    """
    print(f"\nPipeline çalışıyor... {datetime.now().isoformat()}")

    # --- Milliyet ---
    print("\nMilliyet haberleri çekiliyor...")
    for link in milliyet_links():
        process_news_item("milliyet.com.tr", link, milliyet_parse)

    # --- Haberler.com ---
    print("\nHaberler.com içerikleri işleniyor...")
    # `or []` tolerates a scraper that returns None when it finds nothing.
    for article in scrape_haberler() or []:
        _process_haberler_article(article)

    # Same clock as the start banner: the original mixed local now() with
    # the deprecated utcnow(), so the two log timestamps were not comparable.
    print(f"\nPipeline tamamlandı: {datetime.now().isoformat()}")
# Run the pipeline once when this file is executed as a script (not on import).
if __name__ == "__main__":
    run_pipeline()