Update model.py
Browse files
model.py
CHANGED
@@ -1,48 +1,79 @@
|
|
1 |
import os
|
2 |
import tempfile
|
|
|
3 |
from langchain.vectorstores import FAISS
|
4 |
from langchain_huggingface import HuggingFaceEmbeddings
|
5 |
-
from langchain_community.document_loaders import TextLoader
|
6 |
from langchain.text_splitter import CharacterTextSplitter
|
7 |
from langchain.docstore.document import Document
|
8 |
from langchain.chains import RetrievalQA
|
9 |
-
from
|
10 |
-
from langchain.embeddings.base import Embeddings
|
11 |
|
12 |
-
# Use /tmp for
|
13 |
CACHE_DIR = tempfile.gettempdir()
|
14 |
os.environ["TRANSFORMERS_CACHE"] = CACHE_DIR
|
15 |
os.environ["HF_HOME"] = CACHE_DIR
|
16 |
|
17 |
DATA_PATH = "/app/data"
|
18 |
VECTORSTORE_PATH = "/app/vectorstore"
|
19 |
-
DOCS_FILENAME = "context.txt"
|
20 |
EMBEDDING_MODEL_NAME = "sentence-transformers/paraphrase-MiniLM-L6-v2"
|
21 |
|
22 |
-
def load_embedding_model()
|
|
|
23 |
return HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
|
24 |
|
25 |
-
def load_documents()
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
|
31 |
-
def load_vectorstore()
|
|
|
32 |
vectorstore_file = os.path.join(VECTORSTORE_PATH, "faiss_index")
|
33 |
embedding_model = load_embedding_model()
|
|
|
34 |
if os.path.exists(vectorstore_file):
|
35 |
-
|
36 |
-
|
|
|
|
|
|
|
|
|
37 |
vectorstore = FAISS.from_documents(docs, embedding_model)
|
38 |
vectorstore.save_local(vectorstore_file)
|
39 |
return vectorstore
|
40 |
|
41 |
-
def ask_question(query
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
46 |
)
|
47 |
-
|
48 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import os
|
2 |
import tempfile
|
3 |
+
import PyPDF2
|
4 |
from langchain.vectorstores import FAISS
|
5 |
from langchain_huggingface import HuggingFaceEmbeddings
|
|
|
6 |
from langchain.text_splitter import CharacterTextSplitter
|
7 |
from langchain.docstore.document import Document
|
8 |
from langchain.chains import RetrievalQA
|
9 |
+
from langchain_huggingface import HuggingFaceEndpoint
|
|
|
10 |
|
11 |
+
# Use /tmp for cache
|
12 |
CACHE_DIR = tempfile.gettempdir()
|
13 |
os.environ["TRANSFORMERS_CACHE"] = CACHE_DIR
|
14 |
os.environ["HF_HOME"] = CACHE_DIR
|
15 |
|
16 |
DATA_PATH = "/app/data"
|
17 |
VECTORSTORE_PATH = "/app/vectorstore"
|
|
|
18 |
EMBEDDING_MODEL_NAME = "sentence-transformers/paraphrase-MiniLM-L6-v2"
|
19 |
|
20 |
+
def load_embedding_model():
|
21 |
+
"""Load sentence transformer embeddings."""
|
22 |
return HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
|
23 |
|
24 |
+
def load_documents(pdf_path):
|
25 |
+
"""Extract text from PDF and split into documents."""
|
26 |
+
try:
|
27 |
+
with open(pdf_path, "rb") as f:
|
28 |
+
pdf = PyPDF2.PdfReader(f)
|
29 |
+
text = "".join(page.extract_text() or "" for page in pdf.pages)
|
30 |
+
if not text.strip():
|
31 |
+
raise ValueError("No text extracted from PDF")
|
32 |
+
|
33 |
+
splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
|
34 |
+
docs = splitter.create_documents([text])
|
35 |
+
return docs
|
36 |
+
except Exception as e:
|
37 |
+
raise ValueError(f"Failed to process PDF: {str(e)}")
|
38 |
|
39 |
+
def load_vectorstore(pdf_path):
|
40 |
+
"""Load or create FAISS vector store from PDF."""
|
41 |
vectorstore_file = os.path.join(VECTORSTORE_PATH, "faiss_index")
|
42 |
embedding_model = load_embedding_model()
|
43 |
+
|
44 |
if os.path.exists(vectorstore_file):
|
45 |
+
try:
|
46 |
+
return FAISS.load_local(vectorstore_file, embedding_model, allow_dangerous_deserialization=True)
|
47 |
+
except:
|
48 |
+
pass # Rebuild if loading fails
|
49 |
+
|
50 |
+
docs = load_documents(pdf_path)
|
51 |
vectorstore = FAISS.from_documents(docs, embedding_model)
|
52 |
vectorstore.save_local(vectorstore_file)
|
53 |
return vectorstore
|
54 |
|
55 |
+
def ask_question(query, pdf_path):
|
56 |
+
"""Run RAG query and return answer with contexts."""
|
57 |
+
api_key = os.getenv("HUGGINGFACEHUB_API_TOKEN")
|
58 |
+
if not api_key:
|
59 |
+
raise ValueError("HUGGINGFACEHUB_API_TOKEN not set")
|
60 |
+
|
61 |
+
vectorstore = load_vectorstore(pdf_path)
|
62 |
+
llm = HuggingFaceEndpoint(
|
63 |
+
repo_id="mistralai/Mistral-7B-Instruct-v0.2",
|
64 |
+
huggingfacehub_api_token=api_key,
|
65 |
+
temperature=0.5,
|
66 |
+
max_new_tokens=256
|
67 |
)
|
68 |
+
|
69 |
+
qa = RetrievalQA.from_chain_type(
|
70 |
+
llm=llm,
|
71 |
+
retriever=vectorstore.as_retriever(search_kwargs={"k": 3}),
|
72 |
+
return_source_documents=True
|
73 |
+
)
|
74 |
+
|
75 |
+
result = qa({"query": query})
|
76 |
+
return {
|
77 |
+
"answer": result["result"],
|
78 |
+
"contexts": [doc.page_content for doc in result["source_documents"]]
|
79 |
+
}
|