|
import os |
|
from PyPDF2 import PdfReader |
|
from langchain.text_splitter import RecursiveCharacterTextSplitter |
|
from langchain_community.embeddings import HuggingFaceEmbeddings |
|
from langchain_community.vectorstores import FAISS |
|
from langchain_community.docstore.in_memory import InMemoryDocstore |
|
from langchain_community.llms import HuggingFaceHub |
|
from langchain.chains import RetrievalQA |
|
from langchain.prompts import PromptTemplate |
|
import uuid |
|
import faiss |
|
|
|
# Module-level FAISS vector store shared across calls: load_vectorstore()
# assigns it and ask_question() reads it. Stays None until a PDF is indexed.
vectorstore = None
|
|
|
def load_vectorstore(pdf_path) -> None:
    """Extract the text of a PDF, embed it, and publish a FAISS index.

    The text of every page is split into overlapping chunks
    (1000 characters with 100 characters of overlap), embedded with the
    ``sentence-transformers/all-mpnet-base-v2`` model, and stored in a new
    FAISS store that replaces the module-level ``vectorstore``.

    Args:
        pdf_path: Location of the PDF, passed straight to ``PdfReader``.

    Raises:
        ValueError: If no text could be extracted from the PDF, so there
            is nothing to index.
    """
    global vectorstore

    reader = PdfReader(pdf_path)
    # Separate pages with a newline so the last word of one page is not
    # fused with the first word of the next; pages without text yield "".
    text = "\n".join(page.extract_text() or "" for page in reader.pages)

    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    chunks = splitter.split_text(text)
    if not chunks:
        # Fail early with a clear message (e.g. scanned/image-only PDFs)
        # before loading the embedding model or touching the global store.
        raise ValueError(f"No extractable text found in PDF: {pdf_path!r}")

    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-mpnet-base-v2"
    )
    # Probe the model once to learn the embedding dimension FAISS needs.
    dim = len(embeddings.embed_query("test"))
    index = faiss.IndexFlatL2(dim)

    # Build into a local first: the global is only replaced once indexing
    # has succeeded, so a failure never leaves an empty store published.
    store = FAISS(
        embedding_function=embeddings,
        index=index,
        docstore=InMemoryDocstore({}),
        index_to_docstore_id={},
    )
    chunk_ids = [str(uuid.uuid4()) for _ in chunks]
    store.add_texts(chunks, ids=chunk_ids)

    vectorstore = store
|
|
|
|
|
def ask_question(query):
    """Answer ``query`` from the document currently held in ``vectorstore``.

    Retrieves the 3 most similar chunks from the module-level vector store
    and passes them, together with the question, to the
    ``mistralai/Mistral-7B-Instruct-v0.1`` model on the Hugging Face Hub
    through a ``RetrievalQA`` chain. The API token is read from the
    ``HUGGINGFACEHUB_API_TOKEN`` environment variable.

    Args:
        query: The user's question.

    Returns:
        The chain's ``"result"`` value, or a fixed hint string when no
        document has been indexed yet.
    """
    # Read-only access to the module global, so no ``global`` statement.
    if vectorstore is None:
        return "Please upload and index a document first."

    llm = HuggingFaceHub(
        repo_id="mistralai/Mistral-7B-Instruct-v0.1",
        huggingfacehub_api_token=os.getenv("HUGGINGFACEHUB_API_TOKEN"),
        model_kwargs={"temperature": 0.7, "max_length": 512},
    )

    retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

    # Line breaks are written as explicit "\n" escapes: a single-quoted
    # string literal cannot span multiple physical lines.
    prompt = PromptTemplate(
        template=(
            "Use the context to answer the question:\n"
            "Context: {context}\n"
            "Question: {question}\n"
            "Answer:"
        ),
        input_variables=["context", "question"],
    )

    chain = RetrievalQA.from_chain_type(
        llm=llm,
        retriever=retriever,
        return_source_documents=False,
        chain_type_kwargs={"prompt": prompt},
    )

    # ``invoke`` is the supported entry point; calling the chain object
    # directly is deprecated in LangChain.
    return chain.invoke({"query": query})["result"]
|
|