snsynth committed on
Commit
c08eb8f
·
1 Parent(s): 8cec7ed

remove old files

Browse files
Files changed (3) hide show
  1. rag_app/embeddings.py +0 -46
  2. rag_app/guardrail.gbnf +0 -13
  3. rag_app/rag.py +0 -272
rag_app/embeddings.py DELETED
@@ -1,46 +0,0 @@
1
- from llama_cpp import Llama
2
- from typing import Any, List
3
- from llama_index.core.embeddings import BaseEmbedding
4
- from llama_index.core.bridge.pydantic import PrivateAttr
5
-
6
-
7
class LlamaCppIndexEmbedding(BaseEmbedding):
    """LlamaIndex embedding backend backed by a local llama-cpp-python model.

    The model is loaded once in ``__init__`` with ``embedding=True`` and kept
    in a pydantic private attribute; every embedding request goes through
    ``Llama.create_embedding``.
    """

    _model: Llama = PrivateAttr()

    def __init__(
        self,
        model_path: str = "models/bge-m3-Q4_K_M.gguf",
        **kwargs: Any,
    ) -> None:
        """Load the GGUF file at *model_path*; *kwargs* go to ``BaseEmbedding``."""
        # The base class must be initialised before the private attribute is set.
        super().__init__(**kwargs)
        self._model = Llama(model_path=model_path, embedding=True)

    @classmethod
    def class_name(cls) -> str:
        """Return the identifier string for this embedding class."""
        return "llama-cpp-bge-m3-embeddings"

    async def _aget_query_embedding(self, query: str) -> List[float]:
        """Async variant; simply calls the synchronous query embedding."""
        return self._get_query_embedding(query)

    async def _aget_text_embedding(self, text: str) -> List[float]:
        """Async variant; simply calls the synchronous text embedding."""
        return self._get_text_embedding(text)

    def _get_query_embedding(self, query: str) -> List[float]:
        """Embed a single query string with llama-cpp-python."""
        return self._model.create_embedding(input=query)['data'][0]['embedding']

    def _get_text_embedding(self, text: str) -> List[float]:
        """Embed a single text string with llama-cpp-python."""
        return self._model.create_embedding(input=text)['data'][0]['embedding']

    def _get_text_embeddings(self, texts: List[str]) -> List[List[float]]:
        """Embed each text in *texts* one at a time, preserving order."""
        return [self._get_text_embedding(item) for item in texts]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
rag_app/guardrail.gbnf DELETED
@@ -1,13 +0,0 @@
1
# Constrains the guardrail reply to one JSON-like object:
#   {"flag": "safe"}   or   {"flag": "unsafe"}
# preceded by a single space or newline.
root ::= (" "| "\n") grammar-models
grammar-models ::= category
category ::= "{" "\n" ws "\"flag\"" ":" ws category-flag "\n" ws "}"
category-flag ::= "\"safe\"" | "\"unsafe\""
# Optional run of spaces, tabs and newlines.
ws ::= ([ \t\n] ws)?
 
 
 
 
 
 
 
 
 
 
 
 
 
 
rag_app/rag.py DELETED
@@ -1,272 +0,0 @@
1
- # !pip install pdfplumber
2
- # !pip install rank_bm25
3
- # !pip install langchain
4
- # pip install sentence_transformers
5
- # conda install -c conda-forge faiss-cpu
6
-
7
- import pdfplumber
8
- import pandas as pd
9
- import numpy as np
10
- import re
11
- import os
12
- from ast import literal_eval
13
- import faiss
14
- from llama_cpp import Llama, LlamaGrammar
15
- from rank_bm25 import BM25Okapi
16
- from langchain.text_splitter import RecursiveCharacterTextSplitter
17
- from sentence_transformers import SentenceTransformer, util
18
- from sklearn.metrics.pairwise import cosine_similarity
19
- import PyPDF2
20
-
21
# Both models are created once, when this module is imported.
# `embedding_model` is handed to the retrieval helpers by extract_final_answer;
# `llm` is called directly by answer_question.
embedding_model = SentenceTransformer("models/all-MiniLM-L6-v2/")
llm = Llama(
    model_path="models/Llama-3.2-1B-Instruct-Q4_K_M.gguf",
    n_gpu_layers=-1,
    n_ctx=8000,
)
24
-
25
-
26
def extract_info_from_pdf(pdf_path):
    """Collect paragraphs and tables from every page of a PDF via pdfplumber.

    Returns a list with one dict per page, in page order. Each dict has the
    keys ``"page_number"`` (counted from 1), ``"paragraphs"`` (stripped,
    non-empty pieces of the page text split on blank lines; empty when the
    page yields no text) and ``"tables"`` (one ``pandas.DataFrame`` per
    extracted table).
    """

    def _as_frame(rows):
        # A table with more than one row uses its first row as the header.
        if len(rows) > 1:
            return pd.DataFrame(rows[1:], columns=rows[0])
        return pd.DataFrame(rows)

    pages_out = []
    with pdfplumber.open(pdf_path) as pdf:
        for number, page in enumerate(pdf.pages, start=1):
            raw_text = page.extract_text()
            blocks = []
            if raw_text:
                blocks = [part.strip() for part in raw_text.split("\n\n") if part.strip()]
            frames = [_as_frame(rows) for rows in page.extract_tables()]
            pages_out.append(
                {"page_number": number, "paragraphs": blocks, "tables": frames}
            )
    return pages_out
50
-
51
-
52
def extract_financial_tables_regex(text):
    """Pull the first "Revenue from Operations" block out of *text* as a table.

    The block runs from the literal "Revenue from Operations" up to the next
    blank line. Each non-blank line is split on whitespace; the first line
    supplies the column names and the remaining lines the rows.

    Lines rarely split into the same number of tokens, so the table is made
    rectangular before it is handed to pandas: short rows are padded with
    ``None`` and, when a row is wider than the header, extra columns named
    ``col_<position>`` are appended.

    Returns an empty ``DataFrame`` when no block is found or the block has
    fewer than two non-blank lines.
    """
    pattern = re.compile(r"(Revenue from Operations.*?)\n\n", re.DOTALL)
    found = pattern.search(text)
    if found is None:
        return pd.DataFrame()

    rows = [line.split() for line in found.group(1).split("\n") if line.strip()]
    if len(rows) < 2:
        return pd.DataFrame()

    header, body = rows[0], rows[1:]
    width = max(len(row) for row in rows)
    # Without this, pandas raises ValueError whenever the widest data row and
    # the header disagree on the number of columns.
    columns = header + [f"col_{pos}" for pos in range(len(header), width)]
    padded = [row + [None] * (width - len(row)) for row in body]
    return pd.DataFrame(padded, columns=columns)
65
-
66
-
67
def clean_financial_data(df):
    """Render a financial table as text with its value columns made numeric.

    Every column except the first has commas removed and is converted with
    ``pd.to_numeric``; values that cannot be parsed become NaN.

    The conversion is done on a copy, so the caller's DataFrame is left
    untouched.

    Returns ``""`` for an empty DataFrame, otherwise ``DataFrame.to_string()``
    of the cleaned copy.
    """
    if df.empty:
        return ""
    cleaned = df.copy()
    for col in cleaned.columns[1:]:
        without_commas = cleaned[col].replace({',': ''}, regex=True)
        cleaned[col] = pd.to_numeric(without_commas, errors='coerce')
    return cleaned.to_string()
77
-
78
-
79
def combine_extracted_info(document_data, financial_text_regex=""):
    """Flatten per-page extraction results into one newline-joined string.

    For each page, in order, its paragraphs are emitted first and then each of
    its tables rendered with ``to_string(index=False)``. A non-empty
    *financial_text_regex* is appended as the final segment.
    """
    pieces = []
    for page in document_data:
        pieces.extend(page["paragraphs"])
        pieces.extend(tbl.to_string(index=False) for tbl in page["tables"])
    if financial_text_regex:
        pieces.append(financial_text_regex)
    return "\n".join(pieces)
93
-
94
-
95
def extract_text_from_pdf_pypdf2(pdf_path):
    """Return the text of every page of a PDF, read with PyPDF2.

    Each page's text is followed by a newline. A page for which
    ``extract_text()`` returns nothing contributes just that newline, so the
    function no longer fails with a TypeError on such pages.
    """
    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        # Extraction must happen while the file is still open.
        return "".join((page.extract_text() or "") + "\n" for page in reader.pages)
102
-
103
-
104
def chunk_text(text, chunk_size=500, chunk_overlap=50):
    """Split *text* into chunks with ``RecursiveCharacterTextSplitter``.

    *chunk_size* and *chunk_overlap* are passed straight to the splitter; the
    list it returns from ``split_text`` is returned unchanged.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_text(text)
111
-
112
-
113
def build_faiss_index(chunks, embedding_model):
    """Embed *chunks* and load the vectors into a flat L2 FAISS index.

    Args:
        chunks: texts to embed; at least one is required.
        embedding_model: object whose ``encode(chunks)`` returns a 2-D array
            with one row per chunk.

    Returns:
        ``(index, chunk_embeddings)`` where ``chunk_embeddings`` is the
        float32 matrix that was added to the index.
    """
    # FAISS works on float32, C-contiguous matrices. Converting here keeps the
    # function usable with encoders that return float64 or non-contiguous
    # data; an array that already conforms is passed through without a copy.
    chunk_embeddings = np.ascontiguousarray(
        embedding_model.encode(chunks), dtype="float32"
    )
    dimension = chunk_embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(chunk_embeddings)
    return index, chunk_embeddings
119
-
120
-
121
def retrieve_basic(query, index, chunks, embedding_model, k=5):
    """Return the chunks nearest to *query* according to a FAISS index.

    Args:
        query: the search string.
        index: object whose ``search(vectors, k)`` returns
            ``(distances, indices)`` with one row per query vector.
        chunks: the texts the index was built from, in the same order.
        embedding_model: object whose ``encode([query])`` returns the query
            vector(s).
        k: number of neighbours to request.

    Returns:
        ``(matched_chunks, distances)`` with at most *k* entries each.
    """
    query_embedding = np.asarray(embedding_model.encode([query]))
    distances, indices = index.search(query_embedding, k)
    # When the index holds fewer than k vectors FAISS fills the remaining
    # slots with label -1. Indexing `chunks` with -1 would silently return the
    # last chunk, so those slots are dropped from both outputs.
    hit_mask = indices[0] >= 0
    hit_ids = indices[0][hit_mask]
    return [chunks[i] for i in hit_ids], distances[0][hit_mask]
125
-
126
-
127
def retrieve_bm25(query, chunks, k=5):
    """Rank *chunks* against *query* with BM25 and return the top *k*.

    Chunks and query are lower-cased and split on whitespace before scoring.
    Returns ``(top_chunks, top_scores)``, best score first.
    """
    corpus_tokens = [doc.lower().split() for doc in chunks]
    ranker = BM25Okapi(corpus_tokens)
    scores = ranker.get_scores(query.lower().split())
    best = np.argsort(scores)[::-1][:k]
    return [chunks[pos] for pos in best], scores[best]
134
-
135
-
136
def retrieve_advanced_embedding(query, chunks, embedding_model, k=5):
    """Return the *k* chunks most cosine-similar to *query*.

    All chunks are re-embedded on every call, then compared with the query
    embedding. Returns ``(top_chunks, top_scores)``, highest similarity first.
    """
    chunk_vectors = np.array(embedding_model.encode(chunks))
    query_vector = np.array(embedding_model.encode([query]))
    similarity = cosine_similarity(query_vector, chunk_vectors)[0]
    order = np.argsort(similarity)[::-1][:k]
    return [chunks[pos] for pos in order], similarity[order]
142
-
143
-
144
def rerank_candidates(query, candidate_chunks, embedding_model):
    """Order *candidate_chunks* by cosine similarity to *query*.

    Returns ``(chunks, scores)`` covering every candidate, most similar first.
    """
    candidate_vectors = np.array(embedding_model.encode(candidate_chunks))
    query_vector = np.array(embedding_model.encode([query]))
    similarity = cosine_similarity(query_vector, candidate_vectors)[0]
    order = np.argsort(similarity)[::-1]
    return [candidate_chunks[pos] for pos in order], similarity[order]
155
-
156
-
157
def get_grammar() -> LlamaGrammar:
    """Load the guardrail GBNF grammar and compile it for llama-cpp.

    The grammar is looked up at ``rag_app/guardrail.gbnf`` relative to the
    current working directory. If it is not there, the ``guardrail.gbnf`` file
    sitting next to this module is used instead, so the function also works
    when the process is started from another directory.

    :return: the grammar built by ``LlamaGrammar.from_string``.
    :raises FileNotFoundError: if the grammar file is in neither location.
    """
    from pathlib import Path

    grammar_path = Path("rag_app/guardrail.gbnf")
    module_file = globals().get("__file__")  # absent in interactive sessions
    if not grammar_path.is_file() and module_file:
        sibling = Path(module_file).resolve().parent / "guardrail.gbnf"
        if sibling.is_file():
            grammar_path = sibling
    content = grammar_path.read_text(encoding="utf-8")
    return LlamaGrammar.from_string(content)
166
-
167
-
168
def _is_query_unsafe(query):
    """Run the two-pass guardrail and report whether *query* must be refused."""
    # Pass 1: free-form assessment of the query.
    output = llm(f"""Is this a harmful query: \n Query: {query}. \n\n Answer in 'SAFE'/'UNSAFE'""",
                 max_tokens=1000, stop=[], echo=False)
    # Pass 2: the assessment is condensed into {"flag": ...} under the grammar.
    tag = llm(f"Is this a harmful query. Content:\n {output['choices'][0]['text']} \n\n Answer in 'SAFE'/'UNSAFE'",
              max_tokens=1000, stop=[], echo=False, grammar=get_grammar())
    try:
        flag = literal_eval(tag['choices'][0]['text'].strip())['flag']
    except (ValueError, SyntaxError, KeyError, TypeError):
        # Truncated or malformed guardrail output: fail closed rather than
        # crash or let an unchecked query through.
        return True
    return flag != 'safe'


def answer_question(query, context=None, max_length=5000):
    """Answer *query* with the module-level ``llm``, after a safety check.

    Args:
        query: the user's question.
        context: retrieved document text to ground the answer. When it is
            ``None`` or empty the model answers without any context; when it
            is whitespace only a fixed "insufficient context" message is
            returned.
        max_length: ``max_tokens`` for the context-grounded answer.

    Returns:
        The generated answer text, or a fixed refusal / insufficiency message.
    """
    # guardrails logic
    if _is_query_unsafe(query):
        return "This question has been categorized as harmful. I can't help with these types of queries."

    if not context:
        output = llm(
            f"""You're a helpful assistant. Answer the user query's in a professional tone.
Query: \n {query}""",
            max_tokens=200,
            stop=[],
            echo=False
        )
        return output['choices'][0]['text']

    if not context.strip():
        return "Insufficient context to generate an answer."

    prompt = f"""Your tone should be of a finance new reporter who comes at 7 PM Prime time. Questions would be
regarding a company's financials. Under context you have the relevant snapshot of that query from the
annual report. All you need to do is synthesize your response to the question based on the content of
these document snapshots.

# Context:
{context}\n\n
# Question: {query}
\nAnswer:
"""
    output = llm(
        prompt,
        max_tokens=max_length,
        stop=[],
        echo=False
    )
    return output['choices'][0]['text']
209
-
210
-
211
def extract_final_answer(pdf_files, query):
    """Run the full retrieve-rerank-answer pipeline over *pdf_files*.

    For each PDF the pdfplumber extraction and the regex-extracted financial
    table are combined into one text. The text is chunked, searched three ways
    (FAISS, BM25, cosine similarity), and the merged candidates are re-ranked;
    the best ``k`` chunks become the context passed to ``answer_question``.

    Chunking and retrieval sizes come from the module-level ``chunk_size``,
    ``chunk_overlap`` and ``k``. Progress and intermediate results are
    printed.

    Args:
        pdf_files: iterable of PDF paths.
        query: the question to answer.

    Returns:
        The answer string from ``answer_question``, or the fixed
        "insufficient context" message when no text could be extracted.
    """
    combined_text = ""
    for pdf_path in pdf_files:
        print("reading:", pdf_path)
        document_data = extract_info_from_pdf(pdf_path)
        print("document_data:", len(document_data))

        basic_text = extract_text_from_pdf_pypdf2(pdf_path)
        financial_df = extract_financial_tables_regex(basic_text)
        cleaned_financial_text = clean_financial_data(financial_df)

        combined_text = combined_text + "\n" + combine_extracted_info(document_data, cleaned_financial_text)
        print("Combined text length:", len(combined_text))

    chunks = chunk_text(combined_text, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    print(f"Total chunks created: {len(chunks)}")

    if not chunks:
        # Nothing to index: building the FAISS index on zero chunks would fail.
        final_answer = "Insufficient context to generate an answer."
        print("\n--- Final Answer ---")
        print(final_answer)
        return final_answer

    faiss_index, _ = build_faiss_index(chunks, embedding_model)
    basic_results, basic_distances = retrieve_basic(query, faiss_index, chunks, embedding_model, k=k)
    print("\n--- Basic RAG Results (FAISS) ---\n\n\n")
    for chunk, dist in zip(basic_results, basic_distances):
        print(f"Distance: {dist:.4f}\n")
        print(f"Chunk: {chunk}\n{'-' * 40}")

    bm25_results, bm25_scores = retrieve_bm25(query, chunks, k=k)
    adv_emb_results, adv_emb_scores = retrieve_advanced_embedding(query, chunks, embedding_model, k=k)

    print("\n--- Advanced RAG BM25 Results ---")
    for chunk, score in zip(bm25_results, bm25_scores):
        print(f"BM25 Score: {score:.4f}\nChunk: {chunk}\n{'-' * 40}")

    print("\n--- Advanced RAG Embedding Results ---")
    for chunk, score in zip(adv_emb_results, adv_emb_scores):
        print(f"Embedding Similarity: {score:.4f}\nChunk: {chunk}\n{'-' * 40}")

    # dict.fromkeys removes duplicates while keeping first-seen order; a set
    # would make the candidate order vary between runs.
    candidate_set = list(dict.fromkeys(basic_results + bm25_results + adv_emb_results))
    print(f"\nTotal unique candidate chunks: {len(candidate_set)}")

    reranked_chunks, reranked_scores = rerank_candidates(query, candidate_set, embedding_model)

    print("\n--- Re-ranked Candidate Chunks ---")
    for chunk, score in zip(reranked_chunks, reranked_scores):
        print(f"Re-ranked Score: {score:.4f}\nChunk: {chunk}\n{'-' * 40}")

    top_context = "\n".join(reranked_chunks[:k])
    final_answer = answer_question(query, top_context)

    print("\n--- Final Answer ---")
    print(final_answer)
    return final_answer
261
-
262
-
263
-
264
- # Define paths, query, and parameters
265
- # pdf_path = "reliance-jio-infocomm-limited-annual-report-fy-2023-24.pdf" # Update with your file path
266
- # query = "What is the company's net revenue last year?" # Example query
267
# Pipeline parameters read by extract_final_answer.
chunk_size = 500  # passed to chunk_text as chunk_size
chunk_overlap = 50  # passed to chunk_text as chunk_overlap
candidate_to_retrieve = 10  # Number of candidates to retrieve
# Original (misspelled) name, kept so existing references keep working.
candiadate_to_retrieve = candidate_to_retrieve
k = 2  # results requested from each retriever and chunks kept as final context
271
-
272
- # extract_final_answer([pdf_path],"hello world")