# model.py — index a PDF into a FAISS vector store and answer questions
# against it with a Hugging Face hosted LLM (RAG pipeline).
import os
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.llms import HuggingFaceHub
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
import uuid
import faiss
# Module-level FAISS store: populated by load_vectorstore(), read by ask_question().
vectorstore = None
def load_vectorstore(pdf_path):
    """Read a PDF, chunk its text, embed the chunks, and rebuild the global store.

    Replaces the module-level ``vectorstore`` with a fresh FAISS index built
    from *pdf_path*. Pages whose text extraction yields nothing are skipped.
    """
    global vectorstore

    # Concatenate every page's text; extract_text() may return None for a page.
    pages = PdfReader(pdf_path).pages
    full_text = "".join(page.extract_text() or "" for page in pages)

    # Overlapping chunks keep sentence context across chunk boundaries.
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    chunks = splitter.split_text(full_text)

    embedder = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-mpnet-base-v2"
    )
    # Probe the model once to learn the embedding dimensionality.
    embedding_dim = len(embedder.embed_query("test"))

    vectorstore = FAISS(
        embedding_function=embedder,
        index=faiss.IndexFlatL2(embedding_dim),
        docstore=InMemoryDocstore({}),
        index_to_docstore_id={},
    )
    # Stable random IDs so chunks can be addressed in the docstore.
    vectorstore.add_texts(chunks, ids=[str(uuid.uuid4()) for _ in chunks])
def ask_question(query):
    """Answer *query* via retrieval-augmented generation over the indexed PDF.

    Returns the LLM's answer string, or a reminder message when no document
    has been indexed yet with load_vectorstore().
    """
    global vectorstore
    # Identity check: the store is either None or a populated FAISS object.
    if vectorstore is None:
        return "Please upload and index a document first."

    llm = HuggingFaceHub(
        repo_id="mistralai/Mistral-7B-Instruct-v0.1",
        huggingfacehub_api_token=os.getenv("HUGGINGFACEHUB_API_TOKEN"),
        model_kwargs={"temperature": 0.7, "max_length": 512},
    )
    retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

    # BUG FIX: the original template was a single-quoted string literal broken
    # across physical lines — a SyntaxError in Python. Rebuild it with adjacent
    # string literals and explicit newlines, preserving the intended text.
    prompt = PromptTemplate(
        template=(
            "Use the context to answer the question:\n"
            "Context: {context}\n"
            "Question: {question}\n"
            "Answer:"
        ),
        input_variables=["context", "question"],
    )
    chain = RetrievalQA.from_chain_type(
        llm=llm,
        retriever=retriever,
        return_source_documents=False,
        chain_type_kwargs={"prompt": prompt},
    )
    return chain({"query": query})["result"]