Spaces:

iamseyhmus7
/

Turkish-LLM-RAG-Chatbot

Running

App Files Files Community

Turkish-LLM-RAG-Chatbot / RAG /VektorDataBase /pinecone_client.py

iamseyhmus7

Upload 17 files

70d956a verified 12 days ago

raw

history blame contribute delete

3.15 kB

	import os
	import hashlib
	from dotenv import load_dotenv
	from pinecone import Pinecone, ServerlessSpec
	from chunkingg import chunk_text

	# Load variables from a .env file into the process environment.
	load_dotenv()

	# Connection settings, read from the environment.
	PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
	INDEX_NAME = os.environ.get("PINECONE_INDEX_NAME")
	DIMENSION = int(os.environ.get("PINECONE_DIMENSION", "1024"))
	# Used as the serverless region when the index has to be created below.
	ENVIRONMENT = os.environ.get("PINECONE_ENVIRONMENT", "us-west-2")

	# Pinecone client.
	pc = Pinecone(api_key=PINECONE_API_KEY)

	# Create the index only when no index with this name is listed yet.
	if INDEX_NAME not in pc.list_indexes().names():
	    pc.create_index(
	        name=INDEX_NAME,
	        dimension=DIMENSION,
	        metric="cosine",
	        spec=ServerlessSpec(cloud="aws", region=ENVIRONMENT),
	    )

	# Index handle used by the upsert and search functions of this module.
	index = pc.Index(INDEX_NAME)

	def get_id_from_url(url: str) -> str:
	    """Return the hexadecimal MD5 digest of ``url`` for use as an identifier.

	    The URL is encoded as UTF-8 before hashing, so equal URLs always yield
	    the same 32-character hex string.
	    """
	    hasher = hashlib.md5()
	    hasher.update(url.encode("utf-8"))
	    return hasher.hexdigest()

	def _embedding_to_list(embedding):
	    """Return ``embedding`` as a non-empty plain list, or ``None`` if unusable."""
	    if embedding is None:
	        return None
	    # Array-like results (anything exposing ``tolist``) are converted first:
	    # testing their truthiness directly can raise instead of answering.
	    if hasattr(embedding, "tolist"):
	        embedding = embedding.tolist()
	    elif isinstance(embedding, tuple):
	        embedding = list(embedding)
	    if not isinstance(embedding, list) or not embedding:
	        return None
	    return embedding


	def upsert_article_chunks(
	    article_id_base: str,
	    url: str,
	    title: str,
	    text: str,
	    timestamp: str,
	    embed_func,
	    *,
	    batch_size: int = 100,
	):
	    """Split an article into chunks, embed each chunk and upsert them to Pinecone.

	    A chunk is skipped when, after stripping whitespace, it is empty, shorter
	    than 30 characters, or identical to a chunk already accepted for this
	    article. It is also skipped when ``embed_func`` yields no usable vector.

	    Args:
	        article_id_base: Prefix of the vector IDs. Each vector is stored under
	            ``"{article_id_base}-chunk-{i}"``, where ``i`` is the position of
	            the chunk in the output of ``chunk_text``.
	        url: Stored in each vector's metadata.
	        title: Stored in each vector's metadata; its first 50 characters are
	            shown in the printed status line.
	        text: Article text, split with ``chunk_text``.
	        timestamp: Stored in each vector's metadata.
	        embed_func: Callable that receives one chunk string and returns its
	            embedding as a list, a tuple, or an object with ``tolist()``.
	        batch_size: Maximum number of vectors sent per upsert request.
	            Values below 1 are treated as 1.

	    Returns:
	        None. Errors raised while upserting are caught and printed; errors
	        raised by ``chunk_text`` or ``embed_func`` propagate to the caller.
	    """
	    vectors = []
	    seen_chunks = set()  # content of accepted chunks, to drop repeats

	    for i, raw_chunk in enumerate(chunk_text(text)):
	        chunk = raw_chunk.strip()

	        # The length check also rejects empty chunks.
	        if len(chunk) < 30 or chunk in seen_chunks:
	            continue

	        embedding = _embedding_to_list(embed_func(chunk))
	        if embedding is None:
	            continue

	        vectors.append({
	            "id": f"{article_id_base}-chunk-{i}",
	            "values": embedding,
	            "metadata": {
	                "url": url,
	                "title": title,
	                "text": chunk,
	                "timestamp": timestamp,
	            },
	        })
	        # Recorded only after a successful embedding, so a repeat of a chunk
	        # whose embedding was unusable gets another attempt.
	        seen_chunks.add(chunk)

	    try:
	        if vectors:
	            # Send in slices so one long article does not become a single
	            # oversized request.
	            step = max(1, batch_size)
	            for start in range(0, len(vectors), step):
	                index.upsert(vectors[start:start + step])
	            print(f"{len(vectors)} chunk '{title[:50]}...' için Pinecone'a yüklendi.")
	        else:
	            print(f"[UYARI] '{title[:50]}...' için geçerli chunk bulunamadı.")
	    except Exception as e:
	        print(f"[HATA] Pinecone upsert hatası: {e}")

	def search_pinecone(query_embedding, top_k=3):
	    """Query the Pinecone index with an embedding and return the matches.

	    Args:
	        query_embedding: Query vector. Objects exposing ``tolist()`` are
	            converted through it; anything else is passed through ``list()``.
	        top_k: Number of matches requested from the index.

	    Returns:
	        The ``'matches'`` entry of the query response (per the original
	        author's note, items shaped like
	        ``{'id': ..., 'score': ..., 'metadata': {...}}``), or an empty list
	        if anything in the conversion or the query raises.
	    """
	    try:
	        if hasattr(query_embedding, 'tolist'):
	            query_vector = query_embedding.tolist()
	        else:
	            query_vector = list(query_embedding)

	        response = index.query(vector=query_vector, top_k=top_k, include_metadata=True)
	        return response['matches']
	    except Exception as e:
	        # Best-effort search: report the failure and hand back no results.
	        print(f"[HATA] Pinecone arama hatası: {e}")
	        return []