Spaces:

iamseyhmus7
/

Turkish-LLM-RAG-Chatbot

Running

App Files Files Community

Turkish-LLM-RAG-Chatbot / RAG /pipeline.py

iamseyhmus7

Upload 17 files

70d956a verified 11 days ago

raw

history blame contribute delete

2.77 kB

	from scraper.milliyet import get_sondakika_links as milliyet_links, get_news_content as milliyet_parse
	from scraper.haberler import scrape_haberler
	from VektorDataBase.pinecone_client import upsert_article_chunks , get_id_from_url
	from VektorDataBase.embedder import get_embedding
	from datetime import datetime

	# All article contents processed so far are kept here
	# (this could hold hashes instead of the raw text).
	processed_contents = set()

	def is_duplicate_content(content: str) -> bool:
		"""Tell whether this article text has already been processed.

		The text is stripped of surrounding whitespace and looked up in the
		module-level ``processed_contents`` set, which lets callers avoid
		adding the same content to the database a second time.

		Args:
			content: Article body text to check.

		Returns:
			True if the stripped text is already in ``processed_contents``,
			otherwise False.
		"""
		normalized = content.strip()
		already_seen = normalized in processed_contents
		return already_seen

	def process_news_item(source: str, url: str, parse_func):
		"""Parse one article, validate it, and upsert it into the vector store.

		Any failure is reported on stdout and never propagates, so a single
		bad article cannot stop the caller's loop.

		Args:
			source: Label of the news site; used only in log messages.
			url: Article URL. Passed to ``parse_func`` and to
				``get_id_from_url`` to derive the record id.
			parse_func: Callable that takes the URL and returns a mapping with
				``"title"`` and ``"content"`` entries. A ``None`` result, or
				``None`` values for those entries, is treated as an empty
				article and skipped.

		Returns:
			None.
		"""
		try:
			# A parser may hand back None (or a dict whose fields are None)
			# when it could not extract the page. Normalise both to empty
			# strings so they take the "skipped" path below instead of raising
			# AttributeError and being reported as a processing error.
			news = parse_func(url) or {}
			title = (news.get("title") or "").strip()
			content = (news.get("content") or "").strip()

			if not title or not content:
				print(f"{source} boş içerik veya başlık atlandı → {url}")
				return

			if is_duplicate_content(content):
				print(f"{source} aynı içerik atlandı (dupe) → {url}")
				return

			upsert_article_chunks(
				article_id_base=get_id_from_url(url),
				url=url,
				title=title,
				text=content,
				timestamp=datetime.now().isoformat(),
				embed_func=get_embedding,
			)
			# Record the content only after the upsert succeeded, so a failed
			# write is not mistaken for a duplicate on a later attempt.
			processed_contents.add(content)

		except Exception as e:
			# Deliberate best-effort boundary: log and move on to the next item.
			print(f"Error processing {source} news item: {e}")

	def run_pipeline():
		"""Run one ingestion pass over the Milliyet and haberler.com sources.

		Milliyet links are handled one by one through ``process_news_item``.
		haberler.com articles come back already parsed from
		``scrape_haberler`` and are validated, de-duplicated and upserted
		here. Progress and per-article problems are printed to stdout.

		Returns:
			None.
		"""
		print(f"\nPipeline çalışıyor... {datetime.now().isoformat()}")

		# --- Milliyet ---
		print("\nMilliyet haberleri çekiliyor...")
		for link in milliyet_links():
			process_news_item("milliyet.com.tr", link, milliyet_parse)

		# --- Haberler.com ---
		print("\nHaberler.com içerikleri işleniyor...")
		haberler_articles = scrape_haberler()
		for article in haberler_articles:
			# Guard each article on its own, the same way process_news_item
			# does for Milliyet: a missing key or a failed upsert on one
			# article must not abort the remaining ones.
			try:
				title = article["title"].strip()
				content = article["content"].strip()

				if not title or not content:
					print(f"haberler.com boş içerik atlandı → {article['url']}")
					continue

				if is_duplicate_content(content):
					print(f"haberler.com aynı içerik atlandı (dupe) → {article['url']}")
					continue

				upsert_article_chunks(
					article_id_base=article["id"],
					url=article["url"],
					title=title,
					text=content,
					timestamp=article["timestamp"],
					embed_func=get_embedding,
				)
				# Record the content only after the upsert succeeded.
				processed_contents.add(content)
			except Exception as e:
				print(f"Error processing haberler.com news item: {e}")

		# Use the same clock as the start banner (datetime.now()) so the two
		# timestamps are directly comparable; the original mixed local time
		# with the deprecated datetime.utcnow().
		print(f"\nPipeline tamamlandı: {datetime.now().isoformat()}")
	# Script entry point: run a single pipeline pass when this file is executed
	# directly (not when it is imported as a module).
	if __name__ == "__main__":
		run_pipeline()