Spaces:

iamseyhmus7
/

Turkish-LLM-RAG-Chatbot

Running

File size: 3,154 Bytes

70d956a

import os
import hashlib
from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec
from chunkingg import chunk_text

# Load variables from a .env file into the process environment.
load_dotenv()

# Pinecone settings, read from the environment.
# PINECONE_DIMENSION defaults to 1024; PINECONE_ENVIRONMENT defaults to
# "us-west-2" and is used below as the serverless region.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
INDEX_NAME = os.environ.get("PINECONE_INDEX_NAME")
DIMENSION = int(os.environ.get("PINECONE_DIMENSION", "1024"))
ENVIRONMENT = os.environ.get("PINECONE_ENVIRONMENT", "us-west-2")

# Pinecone client.
pc = Pinecone(api_key=PINECONE_API_KEY)

# Create the index (cosine metric, AWS serverless) when it is not listed yet.
if INDEX_NAME not in pc.list_indexes().names():
    pc.create_index(
        name=INDEX_NAME,
        dimension=DIMENSION,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region=ENVIRONMENT),
    )

# Handle to the index used by the functions in this module.
index = pc.Index(INDEX_NAME)

def get_id_from_url(url: str) -> str:
    """Return the hexadecimal MD5 digest of ``url`` as a deterministic ID."""
    hasher = hashlib.md5()
    hasher.update(url.encode())
    return hasher.hexdigest()

def upsert_article_chunks(article_id_base: str, url: str, title: str, text: str, timestamp: str, embed_func):
    """
    Split an article into chunks, embed each chunk and upsert them into Pinecone.

    Chunks that are empty, shorter than 30 characters after stripping, or whose
    text was already accepted for this article are skipped.

    Args:
        article_id_base: Prefix for vector IDs. Each ID has the form
            ``{article_id_base}-chunk-{i}``, where ``i`` is the chunk's position
            in the output of ``chunk_text`` (skipped chunks leave gaps).
        url: Stored unchanged in each vector's metadata.
        title: Stored in metadata; its first 50 characters appear in the
            status messages.
        text: Article text handed to ``chunk_text``.
        timestamp: Stored unchanged in each vector's metadata.
        embed_func: Callable that takes one chunk string and returns its
            embedding, either as a non-empty ``list`` or as an object exposing
            ``tolist()`` (for example a NumPy array). Any other result causes
            the chunk to be skipped.

    Returns:
        None. The outcome is reported with ``print``; exceptions raised while
        upserting are caught and printed instead of propagated.
    """
    vectors = []
    seen_chunks = set()  # chunk texts already accepted, to drop duplicates

    for i, raw_chunk in enumerate(chunk_text(text)):
        chunk = raw_chunk.strip()

        # The length check also rejects empty chunks.
        if len(chunk) < 30 or chunk in seen_chunks:
            continue

        embedding = embed_func(chunk)
        # Accept array-like embeddings the same way search_pinecone does.
        # Converting first also avoids evaluating the truth value of an array,
        # which raises ValueError for multi-element NumPy arrays.
        if hasattr(embedding, "tolist"):
            embedding = embedding.tolist()
        if not isinstance(embedding, list) or not embedding:
            continue

        vectors.append({
            "id": f"{article_id_base}-chunk-{i}",
            "values": embedding,
            "metadata": {
                "url": url,
                "title": title,
                "text": chunk,
                "timestamp": timestamp,
            },
        })
        # Recorded only after a usable embedding, so a later duplicate of a
        # chunk whose embedding failed is still attempted.
        seen_chunks.add(chunk)

    try:
        if vectors:
            index.upsert(vectors)
            print(f"{len(vectors)} chunk '{title[:50]}...' için Pinecone'a yüklendi.")
        else:
            print(f"[UYARI] '{title[:50]}...' için geçerli chunk bulunamadı.")
    except Exception as e:
        # Best-effort: a failed upsert is reported, not raised.
        print(f"[HATA] Pinecone upsert hatası: {e}")
        
def search_pinecone(query_embedding, top_k=3):
    """
    Query the Pinecone index with an embedding vector.

    Args:
        query_embedding: The query vector. Objects exposing ``tolist()``
            (e.g. a NumPy array) are converted with it; anything else is
            passed through ``list()``.
        top_k: Number of matches requested from the index (default 3).

    Returns:
        The ``'matches'`` entry of the query response, requested with metadata
        included, or an empty list when any step raises (the error is printed).
    """
    try:
        if hasattr(query_embedding, 'tolist'):
            query_vector = query_embedding.tolist()
        else:
            query_vector = list(query_embedding)

        response = index.query(vector=query_vector, top_k=top_k, include_metadata=True)
        # Kept inside the try so a response without 'matches' also yields [].
        return response['matches']
    except Exception as err:
        print(f"[HATA] Pinecone arama hatası: {err}")
        return []