remove old files
Browse files- rag_app/embeddings.py +0 -46
- rag_app/guardrail.gbnf +0 -13
- rag_app/rag.py +0 -272
rag_app/embeddings.py
DELETED
@@ -1,46 +0,0 @@
|
|
1 |
-
from llama_cpp import Llama
|
2 |
-
from typing import Any, List
|
3 |
-
from llama_index.core.embeddings import BaseEmbedding
|
4 |
-
from llama_index.core.bridge.pydantic import PrivateAttr
|
5 |
-
|
6 |
-
|
7 |
-
class LlamaCppIndexEmbedding(BaseEmbedding):
    """LlamaIndex embedding adapter backed by a local llama-cpp-python model.

    The model is loaded once in ``__init__`` with ``embedding=True`` and every
    embedding request is served through ``Llama.create_embedding``.
    """

    # Private (non-pydantic-field) handle to the loaded llama.cpp model.
    _model: Llama = PrivateAttr()

    def __init__(
        self,
        model_path: str = "models/bge-m3-Q4_K_M.gguf",
        **kwargs: Any,
    ) -> None:
        """Load the GGUF model at *model_path* in embedding mode.

        Args:
            model_path: Path to the GGUF model file.
            **kwargs: Forwarded unchanged to ``BaseEmbedding.__init__``.
        """
        super().__init__(**kwargs)
        self._model = Llama(model_path=model_path, embedding=True)

    @classmethod
    def class_name(cls) -> str:
        """Return the identifier string of this embedding class."""
        return "llama-cpp-bge-m3-embeddings"

    def _embed(self, text: str) -> List[float]:
        """Return the embedding vector of a single string."""
        response = self._model.create_embedding(input=text)
        # Only one input is sent, so only the first result entry is relevant.
        return response["data"][0]["embedding"]

    async def _aget_query_embedding(self, query: str) -> List[float]:
        """Async variant of ``_get_query_embedding``.

        Delegates directly to the synchronous implementation, so the call is
        not awaited on anything and runs inline.
        """
        return self._get_query_embedding(query)

    async def _aget_text_embedding(self, text: str) -> List[float]:
        """Async variant of ``_get_text_embedding`` (runs the sync code inline)."""
        return self._get_text_embedding(text)

    def _get_query_embedding(self, query: str) -> List[float]:
        """Return the embedding of a query string."""
        return self._embed(query)

    def _get_text_embedding(self, text: str) -> List[float]:
        """Return the embedding of a single text."""
        return self._embed(text)

    def _get_text_embeddings(self, texts: List[str]) -> List[List[float]]:
        """Return one embedding per text, in input order.

        Texts are embedded one at a time through ``_get_text_embedding``.
        """
        return [self._get_text_embedding(text) for text in texts]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
rag_app/guardrail.gbnf
DELETED
@@ -1,13 +0,0 @@
|
|
1 |
-
# Guardrail verdict grammar.
# Constrains generation to one leading space/newline followed by a JSON object
# of the form {"flag": "safe"} or {"flag": "unsafe"}.
root ::= (" " | "\n") grammar-models
grammar-models ::= category
category ::= "{" "\n" ws "\"flag\"" ":" ws category-flag "\n" ws "}"
category-flag ::= "\"safe\"" | "\"unsafe\""
# Whitespace is capped at four characters. The previous recursive definition
# (ws ::= ([ \t\n] ws)?) allowed unbounded whitespace, so sampling could run to
# the token limit without ever closing the object.
ws ::= ([ \t\n] ([ \t\n] ([ \t\n] [ \t\n]?)?)?)?
# The rules boolean, null, string, float and integer were not reachable from
# root and have been removed.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
rag_app/rag.py
DELETED
@@ -1,272 +0,0 @@
|
|
1 |
-
# !pip install pdfplumber
|
2 |
-
# !pip install rank_bm25
|
3 |
-
# !pip install langchain
|
4 |
-
# pip install sentence_transformers
|
5 |
-
# conda install -c conda-forge faiss-cpu
|
6 |
-
|
7 |
-
import pdfplumber
|
8 |
-
import pandas as pd
|
9 |
-
import numpy as np
|
10 |
-
import re
|
11 |
-
import os
|
12 |
-
from ast import literal_eval
|
13 |
-
import faiss
|
14 |
-
from llama_cpp import Llama, LlamaGrammar
|
15 |
-
from rank_bm25 import BM25Okapi
|
16 |
-
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
17 |
-
from sentence_transformers import SentenceTransformer, util
|
18 |
-
from sklearn.metrics.pairwise import cosine_similarity
|
19 |
-
import PyPDF2
|
20 |
-
|
21 |
-
# Models shared by every function in this module. Both are loaded at import
# time from paths relative to the current working directory.
embedding_model = SentenceTransformer("models/all-MiniLM-L6-v2/")
llm = Llama(
    model_path="models/Llama-3.2-1B-Instruct-Q4_K_M.gguf",
    n_gpu_layers=-1,
    n_ctx=8000,
)
|
24 |
-
|
25 |
-
|
26 |
-
def extract_info_from_pdf(pdf_path):
    """Extract paragraphs and tables from every page of a PDF with pdfplumber.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A list with one dict per page, each holding:
        ``"page_number"`` (1-based), ``"paragraphs"`` (non-empty, stripped text
        blocks split on blank lines) and ``"tables"`` (a list of DataFrames).
    """
    document_data = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            text = page.extract_text()
            paragraphs = []
            if text:
                paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()]

            tables = []
            for table in page.extract_tables():
                if not table:
                    # A table without rows would become an empty DataFrame,
                    # which later renders as "Empty DataFrame ..." text.
                    continue
                if len(table) > 1:
                    # First row is treated as the header.
                    tables.append(pd.DataFrame(table[1:], columns=table[0]))
                else:
                    tables.append(pd.DataFrame(table))

            document_data.append(
                {
                    "page_number": page_number,
                    "paragraphs": paragraphs,
                    "tables": tables,
                }
            )
    return document_data
|
50 |
-
|
51 |
-
|
52 |
-
def extract_financial_tables_regex(text):
|
53 |
-
"""
|
54 |
-
Extracts financial table information using a regex pattern (basic extraction).
|
55 |
-
"""
|
56 |
-
pattern = re.compile(r"(Revenue from Operations.*?)\n\n", re.DOTALL)
|
57 |
-
matches = pattern.findall(text)
|
58 |
-
if matches:
|
59 |
-
data_lines = matches[0].split("\n")
|
60 |
-
structured_data = [line.split() for line in data_lines if line.strip()]
|
61 |
-
if len(structured_data) > 1:
|
62 |
-
df = pd.DataFrame(structured_data[1:], columns=structured_data[0])
|
63 |
-
return df
|
64 |
-
return pd.DataFrame()
|
65 |
-
|
66 |
-
|
67 |
-
def clean_financial_data(df):
    """Render a financial table as text with its value columns made numeric.

    Every column except the first has thousands separators (``,``) removed and
    is converted with ``pd.to_numeric``; values that cannot be parsed become
    NaN. The conversion is done on a copy, so the caller's DataFrame is left
    untouched.

    Args:
        df: Table whose first column holds labels and whose remaining columns
            hold numeric strings.

    Returns:
        ``df.to_string()`` of the cleaned copy, or ``""`` for an empty table.
    """
    if df.empty:
        return ""

    cleaned = df.copy()
    for col in cleaned.columns[1:]:
        without_commas = cleaned[col].replace({',': ''}, regex=True)
        cleaned[col] = pd.to_numeric(without_commas, errors='coerce')
    return cleaned.to_string()
|
77 |
-
|
78 |
-
|
79 |
-
def combine_extracted_info(document_data, financial_text_regex=""):
    """Flatten extracted page data into one newline-separated text.

    For each page, paragraphs come first, followed by that page's tables
    rendered with ``to_string(index=False)``. Empty tables are skipped so
    their "Empty DataFrame" placeholder text does not end up in the output.

    Args:
        document_data: List of page dicts with ``"paragraphs"`` (list of str)
            and ``"tables"`` (list of DataFrames) keys.
        financial_text_regex: Optional extra text appended last when non-empty.

    Returns:
        All segments joined with ``"\\n"``.
    """
    text_segments = []
    for page in document_data:
        text_segments.extend(page["paragraphs"])
        text_segments.extend(
            table.to_string(index=False)
            for table in page["tables"]
            if not table.empty
        )
    if financial_text_regex:
        text_segments.append(financial_text_regex)
    return "\n".join(text_segments)
|
93 |
-
|
94 |
-
|
95 |
-
def extract_text_from_pdf_pypdf2(pdf_path):
    """Return the text of every page of a PDF, read with PyPDF2.

    Each page's text is followed by a newline. A page for which
    ``extract_text()`` yields nothing contributes just that newline instead of
    raising on ``None + "\\n"``.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The concatenated page texts.
    """
    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        # Join once instead of growing a string page by page; this must happen
        # while the file is still open.
        return "".join((page.extract_text() or "") + "\n" for page in reader.pages)
|
102 |
-
|
103 |
-
|
104 |
-
def chunk_text(text, chunk_size=500, chunk_overlap=50):
    """Split *text* into chunks with ``RecursiveCharacterTextSplitter``.

    Args:
        text: The text to split.
        chunk_size: Passed to the splitter as ``chunk_size``.
        chunk_overlap: Passed to the splitter as ``chunk_overlap``.

    Returns:
        The list produced by the splitter's ``split_text``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_text(text)
|
111 |
-
|
112 |
-
|
113 |
-
def build_faiss_index(chunks, embedding_model):
    """Embed *chunks* and load them into a flat L2 FAISS index.

    Args:
        chunks: Non-empty sequence of text chunks.
        embedding_model: Object whose ``encode(chunks)`` returns a 2-D array of
            embeddings (one row per chunk).

    Returns:
        ``(index, chunk_embeddings)``: the populated ``faiss.IndexFlatL2`` and
        the embedding matrix that was added to it.

    Raises:
        ValueError: If *chunks* is empty (previously this failed with an
            opaque error when reading the embedding dimension).
    """
    if len(chunks) == 0:
        raise ValueError("Cannot build a FAISS index from an empty list of chunks.")

    # FAISS expects a contiguous float32 matrix; enforce it here rather than
    # relying on the embedding model's output dtype.
    chunk_embeddings = np.ascontiguousarray(
        embedding_model.encode(chunks), dtype=np.float32
    )
    dimension = chunk_embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(chunk_embeddings)
    return index, chunk_embeddings
|
119 |
-
|
120 |
-
|
121 |
-
def retrieve_basic(query, index, chunks, embedding_model, k=5):
    """Return the chunks nearest to *query* according to a FAISS index.

    Args:
        query: Query string.
        index: Index exposing ``search(vectors, k) -> (distances, indices)``.
        chunks: Chunks in the same order they were added to *index*.
        embedding_model: Object whose ``encode([query])`` returns the query
            embedding as a single-row matrix.
        k: Number of neighbours to request.

    Returns:
        ``(matching_chunks, distances)`` for the valid hits, nearest first as
        reported by the index.
    """
    query_embedding = np.asarray(embedding_model.encode([query]), dtype=np.float32)
    distances, indices = index.search(query_embedding, k)

    # When the index holds fewer than k vectors FAISS pads the result with -1.
    # Indexing chunks[-1] would silently return the last chunk, so drop those.
    hit_ids = np.asarray(indices[0])
    valid = hit_ids >= 0
    return [chunks[i] for i in hit_ids[valid]], np.asarray(distances[0])[valid]
|
125 |
-
|
126 |
-
|
127 |
-
def retrieve_bm25(query, chunks, k=5):
    """Return the *k* chunks with the highest BM25 score for *query*.

    Chunks and query are lower-cased and split on whitespace before scoring.

    Args:
        query: Query string.
        chunks: Text chunks forming the corpus.
        k: Maximum number of chunks to return.

    Returns:
        ``(top_chunks, top_scores)`` ordered by descending score. Both are
        empty when *chunks* is empty.
    """
    if len(chunks) == 0:
        # An empty corpus cannot be scored; return an empty result instead of
        # failing inside the BM25 model.
        return [], np.array([])

    tokenized_corpus = [chunk.lower().split() for chunk in chunks]
    bm25_model = BM25Okapi(tokenized_corpus)
    scores = bm25_model.get_scores(query.lower().split())
    top_indices = np.argsort(scores)[::-1][:k]
    return [chunks[i] for i in top_indices], scores[top_indices]
|
134 |
-
|
135 |
-
|
136 |
-
def retrieve_advanced_embedding(query, chunks, embedding_model, k=5):
    """Return the *k* chunks most cosine-similar to *query*.

    Args:
        query: Query string.
        chunks: Text chunks to rank.
        embedding_model: Object whose ``encode`` maps a list of strings to a
            2-D array of embeddings.
        k: Maximum number of chunks to return.

    Returns:
        ``(top_chunks, top_scores)`` ordered by descending similarity. Both are
        empty when *chunks* is empty.
    """
    if len(chunks) == 0:
        # Nothing to rank; avoid calling cosine_similarity on an empty matrix.
        return [], np.array([])

    chunk_embeddings = embedding_model.encode(chunks)
    query_embedding = embedding_model.encode([query])
    # One query row, so only the first row of the similarity matrix is needed.
    scores = cosine_similarity(np.array(query_embedding), np.array(chunk_embeddings))[0]
    top_indices = np.argsort(scores)[::-1][:k]
    return [chunks[i] for i in top_indices], scores[top_indices]
|
142 |
-
|
143 |
-
|
144 |
-
def rerank_candidates(query, candidate_chunks, embedding_model):
    """Re-rank candidate chunks by cosine similarity with the query.

    Args:
        query: Query string.
        candidate_chunks: Chunks to re-order.
        embedding_model: Object whose ``encode`` maps a list of strings to a
            2-D array of embeddings.

    Returns:
        ``(reranked_chunks, reranked_scores)`` ordered by descending
        similarity. Both are empty when there are no candidates.
    """
    if len(candidate_chunks) == 0:
        # Nothing to rank; avoid calling cosine_similarity on an empty matrix.
        return [], np.array([])

    candidate_embeddings = embedding_model.encode(candidate_chunks)
    query_embedding = embedding_model.encode([query])
    scores = cosine_similarity(np.array(query_embedding), np.array(candidate_embeddings))[0]
    ranked_indices = np.argsort(scores)[::-1]
    return [candidate_chunks[i] for i in ranked_indices], scores[ranked_indices]
|
155 |
-
|
156 |
-
|
157 |
-
def get_grammar(file_path: str = "rag_app/guardrail.gbnf") -> LlamaGrammar:
    """Load the guardrail grammar used to constrain the safety verdict.

    Args:
        file_path: Location of the GBNF grammar file. The default is relative
            to the current working directory.

    Returns:
        The grammar built by ``LlamaGrammar.from_string`` from the file text.
    """
    # Explicit encoding so the result does not depend on the platform default.
    with open(file_path, 'r', encoding='utf-8') as handler:
        content = handler.read()
    return LlamaGrammar.from_string(content)
|
166 |
-
|
167 |
-
|
168 |
-
def answer_question(query, context=None, max_length=5000):
    """Answer *query* with the module-level LLM, behind a safety guardrail.

    Flow:
      1. Ask the LLM whether the query is harmful (free-form answer).
      2. Ask again about that answer, constrained by the guardrail grammar, and
         read the ``"flag"`` field of the resulting JSON object.
      3. If the flag is ``"unsafe"`` return a refusal message.
      4. With no context (``None`` or ``""``) answer as a general assistant.
      5. With whitespace-only context return an "insufficient context" message.
      6. Otherwise answer from the supplied context.

    Args:
        query: The user's question.
        context: Retrieved text the answer should be grounded in, if any.
        max_length: ``max_tokens`` for the context-grounded answer.

    Returns:
        The generated answer text, or one of the fixed messages above.
    """
    import json  # local import: only the guardrail verdict parsing needs it

    # guardrails logic
    output = llm(f"""Is this a harmful query: \n Query: {query}. \n\n Answer in 'SAFE'/'UNSAFE'""",
                 max_tokens=1000, stop=[], echo=False)
    tag = llm(f"Is this a harmful query. Content:\n {output['choices'][0]['text']} \n\n Answer in 'SAFE'/'UNSAFE'",
              max_tokens=1000, stop=[], echo=False, grammar=get_grammar())

    # The grammar emits JSON, so parse it as JSON (this also tolerates the
    # leading space/newline the grammar requires). If the verdict is truncated
    # or malformed, fail closed and treat the query as unsafe rather than
    # crashing.
    try:
        flag = json.loads(tag['choices'][0]['text'])['flag']
    except (ValueError, KeyError, TypeError):
        flag = 'unsafe'

    if flag == 'unsafe':
        return "This question has been categorized as harmful. I can't help with these types of queries."

    if not context:
        output = llm(
            f"""You're a helpful assistant. Answer the user query's in a professional tone.
Query: \n {query}""",
            max_tokens=200,
            stop=[],
            echo=False
        )
        return output['choices'][0]['text']

    if not context.strip():
        return "Insufficient context to generate an answer."

    # NOTE(review): any leading whitespace the original prompt literals had on
    # their continuation lines is not recoverable from this view - confirm.
    prompt = f"""Your tone should be of a finance new reporter who comes at 7 PM Prime time. Questions would be
regarding a company's financials. Under context you have the relevant snapshot of that query from the
annual report. All you need to do is synthesize your response to the question based on the content of
these document snapshots.

# Context:
{context}\n\n
# Question: {query}
\nAnswer:
"""
    output = llm(
        prompt,
        max_tokens=max_length,
        stop=[],
        echo=False
    )
    return output['choices'][0]['text']
|
209 |
-
|
210 |
-
|
211 |
-
def extract_final_answer(pdf_files, query):
    """Run the full retrieval pipeline over *pdf_files* and answer *query*.

    Steps: extract text and tables from each PDF, chunk the combined text,
    retrieve candidates with FAISS, BM25 and cosine similarity, de-duplicate
    and re-rank them, then answer from the top ``k`` chunks. Progress and
    intermediate results are printed.

    Reads the module-level settings ``chunk_size``, ``chunk_overlap`` and
    ``k``, and the module-level ``embedding_model``.

    Args:
        pdf_files: Iterable of PDF paths.
        query: The user's question.

    Returns:
        The answer produced by ``answer_question``.
    """
    per_file_text = []
    for pdf_path in pdf_files:
        print("reading:", pdf_path)
        document_data = extract_info_from_pdf(pdf_path)
        print("document_data:", len(document_data))

        basic_text = extract_text_from_pdf_pypdf2(pdf_path)
        financial_df = extract_financial_tables_regex(basic_text)
        cleaned_financial_text = clean_financial_data(financial_df)

        per_file_text.append(combine_extracted_info(document_data, cleaned_financial_text))

    # Same result as prefixing every file's text with "\n", without growing a
    # string inside the loop.
    combined_text = "\n".join(["", *per_file_text])
    print("Combined text length:", len(combined_text))

    chunks = chunk_text(combined_text, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    print(f"Total chunks created: {len(chunks)}")

    faiss_index, _ = build_faiss_index(chunks, embedding_model)
    basic_results, basic_distances = retrieve_basic(query, faiss_index, chunks, embedding_model, k=k)
    print("\n--- Basic RAG Results (FAISS) ---\n\n\n")
    for chunk, dist in zip(basic_results, basic_distances):
        print(f"Distance: {dist:.4f}\n")
        print(f"Chunk: {chunk}\n{'-' * 40}")

    bm25_results, bm25_scores = retrieve_bm25(query, chunks, k=k)
    adv_emb_results, adv_emb_scores = retrieve_advanced_embedding(query, chunks, embedding_model, k=k)

    print("\n--- Advanced RAG BM25 Results ---")
    for chunk, score in zip(bm25_results, bm25_scores):
        print(f"BM25 Score: {score:.4f}\nChunk: {chunk}\n{'-' * 40}")

    print("\n--- Advanced RAG Embedding Results ---")
    for chunk, score in zip(adv_emb_results, adv_emb_scores):
        print(f"Embedding Similarity: {score:.4f}\nChunk: {chunk}\n{'-' * 40}")

    # De-duplicate while keeping first-seen order. A plain set() would make the
    # candidate order (and so tie-breaking in the re-rank) depend on the
    # per-process string hash seed.
    candidate_set = list(dict.fromkeys(basic_results + bm25_results + adv_emb_results))
    print(f"\nTotal unique candidate chunks: {len(candidate_set)}")

    reranked_chunks, reranked_scores = rerank_candidates(query, candidate_set, embedding_model)

    print("\n--- Re-ranked Candidate Chunks ---")
    for chunk, score in zip(reranked_chunks, reranked_scores):
        print(f"Re-ranked Score: {score:.4f}\nChunk: {chunk}\n{'-' * 40}")

    top_context = "\n".join(reranked_chunks[:k])
    final_answer = answer_question(query, top_context)

    print("\n--- Final Answer ---")
    print(final_answer)
    return final_answer
|
261 |
-
|
262 |
-
|
263 |
-
|
264 |
-
# Define paths, query, and parameters
|
265 |
-
# pdf_path = "reliance-jio-infocomm-limited-annual-report-fy-2023-24.pdf" # Update with your file path
|
266 |
-
# query = "What is the company's net revenue last year?" # Example query
|
267 |
-
# Pipeline settings read by extract_final_answer.
chunk_size = 500  # passed to chunk_text as chunk_size
chunk_overlap = 50  # passed to chunk_text as chunk_overlap
candidate_to_retrieve = 10  # Number of candidates to retrieve
# Misspelled original name, kept as an alias so existing references still work.
candiadate_to_retrieve = candidate_to_retrieve
k = 2  # number of results per retriever and of chunks used as final context
|
271 |
-
|
272 |
-
# extract_final_answer([pdf_path],"hello world")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|