|
import os
|
|
import hashlib
|
|
from dotenv import load_dotenv
|
|
from pinecone import Pinecone, ServerlessSpec
|
|
from chunkingg import chunk_text
|
|
|
|
|
|
# Load variables from a local .env file into the process environment so the
# lookups below can see them.
load_dotenv()

# Connection settings, all read from the environment.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
INDEX_NAME = os.environ.get("PINECONE_INDEX_NAME")
# Vector dimension used when the index has to be created (defaults to 1024).
DIMENSION = int(os.environ.get("PINECONE_DIMENSION", "1024"))
# Passed as the serverless *region* below, despite the variable's name.
ENVIRONMENT = os.environ.get("PINECONE_ENVIRONMENT", "us-west-2")

pc = Pinecone(api_key=PINECONE_API_KEY)

# Create the index on first use; an existing index is left untouched.
if INDEX_NAME not in pc.list_indexes().names():
    pc.create_index(
        name=INDEX_NAME,
        dimension=DIMENSION,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region=ENVIRONMENT),
    )

# Module-wide handle used by the upsert and search helpers in this file.
index = pc.Index(INDEX_NAME)
|
|
|
|
def get_id_from_url(url: str) -> str:
    """Return a deterministic identifier derived from a URL.

    The identifier is the hexadecimal MD5 digest of the URL encoded with the
    default (UTF-8) encoding, so the same URL always yields the same ID.
    """
    hasher = hashlib.md5()
    hasher.update(url.encode())
    return hasher.hexdigest()
|
|
|
|
def upsert_article_chunks(article_id_base: str, url: str, title: str, text: str, timestamp: str, embed_func):
    """Split an article into chunks, embed each chunk and upsert it into Pinecone.

    Chunks that are empty, shorter than 30 characters after stripping, or an
    exact repeat of an earlier accepted chunk are skipped. A chunk is also
    skipped when ``embed_func`` returns nothing usable for it.

    Args:
        article_id_base: Prefix for the vector IDs. Each vector is stored as
            ``"{article_id_base}-chunk-{i}"`` where ``i`` is the chunk's
            position in the output of ``chunk_text``; skipped chunks therefore
            leave gaps in the numbering.
        url: Stored in each vector's metadata under ``"url"``.
        title: Stored in metadata under ``"title"``; its first 50 characters
            are also used in the progress messages.
        text: Full article text handed to ``chunk_text``.
        timestamp: Stored in metadata under ``"timestamp"``.
        embed_func: Callable taking the chunk text and returning its
            embedding as a list, a tuple, or any object exposing ``tolist()``
            (for example an array).

    Returns:
        None. Errors raised while upserting are caught and printed.
        Exceptions raised by ``chunk_text`` or ``embed_func`` propagate to
        the caller.
    """
    # Pinecone caps the size of a single upsert request, so long articles are
    # sent in bounded batches instead of one large call.
    batch_size = 100

    vectors = []
    seen_chunks = set()

    for i, chunk in enumerate(chunk_text(text)):
        chunk = chunk.strip()

        # The length check also covers the empty-string case.
        if len(chunk) < 30 or chunk in seen_chunks:
            continue

        embedding = embed_func(chunk)
        # Accept array-like embeddings the same way search_pinecone does,
        # instead of silently dropping everything that is not a plain list.
        if hasattr(embedding, "tolist"):
            embedding = embedding.tolist()
        elif isinstance(embedding, tuple):
            embedding = list(embedding)
        if not isinstance(embedding, list) or not embedding:
            continue

        vectors.append({
            "id": f"{article_id_base}-chunk-{i}",
            "values": embedding,
            "metadata": {
                "url": url,
                "title": title,
                "text": chunk,
                "timestamp": timestamp,
            },
        })

        # Only remember chunks that were actually queued, so a repeat of a
        # chunk whose embedding failed gets another attempt.
        seen_chunks.add(chunk)

    try:
        if vectors:
            # If a batch fails, earlier batches have already been written;
            # the error is reported below and the success message is skipped.
            for start in range(0, len(vectors), batch_size):
                index.upsert(vectors[start:start + batch_size])
            print(f"{len(vectors)} chunk '{title[:50]}...' için Pinecone'a yüklendi.")
        else:
            print(f"[UYARI] '{title[:50]}...' için geçerli chunk bulunamadı.")
    except Exception as e:
        print(f"[HATA] Pinecone upsert hatası: {e}")
|
|
|
|
def search_pinecone(query_embedding, top_k=3):
    """Query the Pinecone index with an embedding vector.

    Args:
        query_embedding: The query vector. Objects exposing ``tolist()`` are
            converted through it; anything else is passed through ``list()``.
        top_k: Number of matches requested from the index.

    Returns:
        The ``'matches'`` entry of the query response, described by the
        original author as ``[{'id': ..., 'score': ..., 'metadata': {...}}, ...]``.
        If anything fails, the error is printed and an empty list is returned.
    """
    try:
        if hasattr(query_embedding, 'tolist'):
            query_vector = query_embedding.tolist()
        else:
            query_vector = list(query_embedding)

        response = index.query(
            vector=query_vector,
            top_k=top_k,
            include_metadata=True,
        )
        matches = response['matches']
        return matches
    except Exception as err:
        print(f"[HATA] Pinecone arama hatası: {err}")
        return []