samim2024 committed on
Commit
d665d4a
·
verified ·
1 Parent(s): afbb5c0

Update model.py

Browse files
Files changed (1) hide show
  1. model.py +46 -59
model.py CHANGED
@@ -1,75 +1,62 @@
1
  import os
2
- from langchain_community.vectorstores import FAISS
3
- from langchain_community.embeddings import HuggingFaceEmbeddings
4
- from langchain_community.llms import HuggingFaceHub
5
- from langchain.prompts import PromptTemplate
6
- from langchain.chains import RetrievalQA
7
- from langchain.text_splitter import RecursiveCharacterTextSplitter
8
  from langchain_community.document_loaders import TextLoader
 
9
  from langchain.docstore.document import Document
 
 
 
10
 
11
- # Load Hugging Face API token from environment
12
- HUGGINGFACEHUB_API_TOKEN = os.environ.get("HUGGINGFACEHUB_API_TOKEN")
 
 
13
 
14
- # Embedding model (can be changed to any sentence transformer model)
15
- embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
 
 
 
16
 
17
- # Prompt template for Mistral
18
- prompt_template = PromptTemplate(
19
- input_variables=["context", "question"],
20
- template="""You are an intelligent assistant. Use the context below to answer the question.
21
- If the answer is not contained in the context, say "I don't know."
22
 
23
- Context: {context}
24
- Question: {question}
25
- Answer:"""
26
- )
27
 
28
- def create_vectorstore(doc_path: str = "data/docs.txt"):
29
- """Create or load FAISS vectorstore from the given document."""
30
- loader = TextLoader(doc_path)
31
- documents = loader.load()
32
 
33
- # Split into smaller chunks
34
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
35
- docs = text_splitter.split_documents(documents)
 
 
 
 
36
 
37
- # Create FAISS vectorstore
38
- vectordb = FAISS.from_documents(docs, embedding_model)
39
- vectordb.save_local("vectorstore")
40
- return vectordb
41
 
42
- def load_vectorstore():
43
- """Load existing FAISS vectorstore from disk."""
44
- return FAISS.load_local("vectorstore", embedding_model, allow_dangerous_deserialization=True)
45
 
46
- def get_llm():
47
- """Load the HuggingFace Mistral LLM."""
48
- return HuggingFaceHub(
49
- repo_id="mistralai/Mistral-7B-Instruct-v0.1",
50
- model_kwargs={"temperature": 0.5, "max_new_tokens": 512},
51
- huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN
52
- )
 
 
53
 
54
- def build_qa_chain():
55
- """Build the full RAG QA chain."""
56
- vectordb = load_vectorstore()
57
- retriever = vectordb.as_retriever()
58
- llm = get_llm()
59
 
60
- qa_chain = RetrievalQA.from_chain_type(
61
- llm=llm,
62
- retriever=retriever,
63
- return_source_documents=True,
64
- chain_type_kwargs={"prompt": prompt_template}
 
 
65
  )
66
- return qa_chain
67
 
68
- def ask_question(query: str) -> dict:
69
- """Handle a single user query."""
70
- chain = build_qa_chain()
71
- result = chain({"query": query})
72
- return {
73
- "answer": result["result"],
74
- "sources": [doc.metadata.get("source", "unknown") for doc in result["source_documents"]]
75
- }
 
1
  import os
2
+ from langchain.vectorstores import FAISS
3
+ from langchain_huggingface import HuggingFaceEmbeddings
 
 
 
 
4
  from langchain_community.document_loaders import TextLoader
5
+ from langchain.text_splitter import CharacterTextSplitter
6
  from langchain.docstore.document import Document
7
+ from langchain.chains import RetrievalQA
8
+ from langchain_community.llms import HuggingFaceHub
9
+ from langchain.embeddings.base import Embeddings
10
 
11
# Route the Hugging Face / Transformers caches to a directory this module
# creates itself, to avoid permission-denied errors on the default locations.
os.environ.update(
    {
        "TRANSFORMERS_CACHE": "/app/cache",
        "HF_HOME": "/app/cache",
    }
)
os.makedirs("/app/cache", exist_ok=True)

# Constants
DATA_PATH = "/app/data"  # directory holding the source document
VECTORSTORE_PATH = "/app/vectorstore"  # parent directory of the saved FAISS index
DOCS_FILENAME = "context.txt"  # file inside DATA_PATH that gets indexed
EMBEDDING_MODEL_NAME = "sentence-transformers/paraphrase-MiniLM-L6-v2"
21
 
 
 
 
 
 
22
 
23
# Per-process cache of embedding models, keyed by model name. Every query goes
# through load_vectorstore() -> load_embedding_model(), so without this the
# model would be constructed again for each question.
_EMBEDDING_MODELS: dict = {}


def load_embedding_model() -> Embeddings:
    """Return the HuggingFace embedding model, creating it on first use.

    The model named by ``EMBEDDING_MODEL_NAME`` is constructed once and the
    same instance is returned on later calls.

    Returns:
        The ``HuggingFaceEmbeddings`` instance for ``EMBEDDING_MODEL_NAME``.
    """
    model = _EMBEDDING_MODELS.get(EMBEDDING_MODEL_NAME)
    if model is None:
        model = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
        _EMBEDDING_MODELS[EMBEDDING_MODEL_NAME] = model
    return model
 
26
 
 
 
 
 
27
 
28
def load_documents() -> list[Document]:
    """Load the source document and split it into chunks.

    Reads ``DOCS_FILENAME`` from ``DATA_PATH`` and splits the loaded text with
    a ``CharacterTextSplitter`` (``chunk_size=1000``, ``chunk_overlap=100``).

    Returns:
        The list of chunked documents produced by the splitter.
    """
    source_path = os.path.join(DATA_PATH, DOCS_FILENAME)
    # Decode explicitly as UTF-8 so the result does not depend on the
    # container's locale (TextLoader otherwise uses the platform default).
    loader = TextLoader(source_path, encoding="utf-8")
    splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    return splitter.split_documents(loader.load())
35
 
 
 
 
 
36
 
37
def load_vectorstore() -> FAISS:
    """Load the FAISS vectorstore from disk, building and saving it if absent.

    The index lives in the ``faiss_index`` folder under ``VECTORSTORE_PATH``.
    When no complete index is found there, the documents returned by
    ``load_documents()`` are embedded, stored in a new index, and the index is
    saved to that folder before being returned.

    Returns:
        The loaded or newly built ``FAISS`` vectorstore.

    Raises:
        ValueError: If the index has to be built but ``load_documents()``
            returned no chunks.
    """
    index_dir = os.path.join(VECTORSTORE_PATH, "faiss_index")
    embedding_model = load_embedding_model()

    # save_local() writes "index.faiss" and "index.pkl" (default index name)
    # into the folder. Check for those files rather than the bare directory so
    # an empty or half-written folder triggers a rebuild instead of a load error.
    index_files = ("index.faiss", "index.pkl")
    if all(os.path.isfile(os.path.join(index_dir, name)) for name in index_files):
        # NOTE(security): load_local() unpickles "index.pkl". This is only safe
        # while the folder contains files written by this application.
        return FAISS.load_local(
            index_dir,
            embedding_model,
            allow_dangerous_deserialization=True,
        )

    docs = load_documents()
    if not docs:
        # Fail with a clear message instead of an opaque error from inside FAISS.
        raise ValueError(
            f"No document chunks were loaded from {os.path.join(DATA_PATH, DOCS_FILENAME)}; "
            "cannot build the vectorstore."
        )

    vectorstore = FAISS.from_documents(docs, embedding_model)
    vectorstore.save_local(index_dir)
    return vectorstore
50
 
 
 
 
 
 
51
 
52
def ask_question(query: str) -> str:
    """Answer ``query`` using retrieval over the vectorstore plus the LLM.

    Loads the vectorstore, builds a ``RetrievalQA`` chain around the hosted
    ``mistralai/Mistral-7B-Instruct-v0.1`` model, and runs the query through it.

    Args:
        query: The user's question.

    Returns:
        The answer text produced by the chain.
    """
    vectorstore = load_vectorstore()

    # No API token is passed here, so the client has to pick it up on its own
    # (presumably from the HUGGINGFACEHUB_API_TOKEN environment variable —
    # confirm for the deployed environment).
    llm = HuggingFaceHub(
        repo_id="mistralai/Mistral-7B-Instruct-v0.1",
        # The text-generation length limit is "max_new_tokens"; "max_tokens"
        # is not a recognised generation parameter for this endpoint.
        model_kwargs={"temperature": 0.5, "max_new_tokens": 256},
    )

    qa = RetrievalQA.from_chain_type(llm=llm, retriever=vectorstore.as_retriever())
    # invoke() replaces the deprecated Chain.run(); RetrievalQA takes its input
    # under "query" and returns the answer under "result".
    return qa.invoke({"query": query})["result"]