import os
import tempfile

import PyPDF2
from langchain.chains import RetrievalQA
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFaceEndpoint

# Use /tmp for cache
CACHE_DIR = tempfile.gettempdir()
os.environ["TRANSFORMERS_CACHE"] = CACHE_DIR
os.environ["HF_HOME"] = CACHE_DIR
DATA_PATH = "/app/data"
VECTORSTORE_PATH = "/app/vectorstore"
EMBEDDING_MODEL_NAME = "sentence-transformers/paraphrase-MiniLM-L6-v2"


def load_embedding_model():
    """Load sentence-transformer embeddings."""
    return HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)


def load_documents(pdf_path):
    """Extract text from a PDF and split it into overlapping chunks."""
    try:
        with open(pdf_path, "rb") as f:
            pdf = PyPDF2.PdfReader(f)
            # extract_text() can return None for image-only pages, hence the "or ''"
            text = "".join(page.extract_text() or "" for page in pdf.pages)
        if not text.strip():
            raise ValueError("No text extracted from PDF")
        splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
        return splitter.create_documents([text])
    except Exception as e:
        raise ValueError(f"Failed to process PDF: {e}") from e


def load_vectorstore(pdf_path):
    """Load the cached FAISS index if present, otherwise build it from the PDF."""
    vectorstore_file = os.path.join(VECTORSTORE_PATH, "faiss_index")
    embedding_model = load_embedding_model()
    if os.path.exists(vectorstore_file):
        try:
            # FAISS indexes are pickled on disk, so loading requires explicitly
            # opting in to deserialization; only do this for files you wrote yourself.
            return FAISS.load_local(
                vectorstore_file,
                embedding_model,
                allow_dangerous_deserialization=True,
            )
        except Exception:
            pass  # Rebuild from the PDF if the cached index is missing pieces or corrupt
    docs = load_documents(pdf_path)
    vectorstore = FAISS.from_documents(docs, embedding_model)
    vectorstore.save_local(vectorstore_file)
    return vectorstore


def ask_question(query, pdf_path):
    """Run a RAG query and return the answer plus the retrieved contexts."""
    api_key = os.getenv("HUGGINGFACEHUB_API_TOKEN")
    if not api_key:
        raise ValueError("HUGGINGFACEHUB_API_TOKEN not set")
    vectorstore = load_vectorstore(pdf_path)
    llm = HuggingFaceEndpoint(
        repo_id="mistralai/Mistral-7B-Instruct-v0.2",
        huggingfacehub_api_token=api_key,
        temperature=0.5,
        max_new_tokens=256,
    )
    qa = RetrievalQA.from_chain_type(
        llm=llm,
        retriever=vectorstore.as_retriever(search_kwargs={"k": 3}),
        return_source_documents=True,
    )
    result = qa.invoke({"query": query})
    return {
        "answer": result["result"],
        "contexts": [doc.page_content for doc in result["source_documents"]],
    }
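

# --- Usage sketch (assumed entry point, not part of the original module) ---
# A minimal smoke test, assuming a PDF exists at DATA_PATH/sample.pdf (a
# hypothetical file name) and HUGGINGFACEHUB_API_TOKEN is set in the environment.
if __name__ == "__main__":
    sample_pdf = os.path.join(DATA_PATH, "sample.pdf")  # hypothetical PDF
    response = ask_question("What is this document about?", sample_pdf)
    print("Answer:", response["answer"])
    # Print the retrieved chunks so the grounding of the answer can be inspected
    for i, context in enumerate(response["contexts"], 1):
        print(f"--- Context {i} ---\n{context}")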