utkarsh1797 committed
Commit 09c0fe4 · verified · 1 Parent(s): be32662

Upload 3 files

Files changed (3):
  1. README (1).md +26 -0
  2. app (1).py +204 -0
  3. requirements.txt +10 -0
README (1).md ADDED
@@ -0,0 +1,26 @@
+ ---
+ title: Financial RAG
+ emoji: 📊
+ colorFrom: blue
+ colorTo: green
+ sdk: gradio
+ sdk_version: "4.11.0"
+ app_file: app.py
+ pinned: false
+ ---
+
+ # 📊 Financial RAG Model
+
+ This is a Retrieval-Augmented Generation (RAG) model for answering financial queries based on company financial statements.
+
+ ## 🛠 How to Use
+ 1. **Upload a Financial PDF** (e.g., balance sheet, income statement).
+ 2. **Ask a Financial Question** related to the document.
+ 3. **Get an AI-generated response** based on relevant financial data.
+
+ ## 🚀 Built With
+ - **FAISS & BM25** for document retrieval.
+ - **Google Gemini API** for answer generation.
+ - **Gradio** for the web interface.
+
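+ ## 🔑 Setup
+ The app reads a `GEMINI_API_KEY` environment variable (add it as a secret in the Space settings); it raises an error at startup if the key is missing.
+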
+ 🔗 **Try it out in the Hugging Face Space!**
app (1).py ADDED
@@ -0,0 +1,204 @@
+ import os
+ import re
+ import faiss
+ import numpy as np
+ import requests
+ import pdfplumber
+ import spacy
+ from sentence_transformers import SentenceTransformer, CrossEncoder
+ from rank_bm25 import BM25Okapi
+ import gradio as gr
+
+
+ # ✅ Load Models
+ try:
+     nlp = spacy.load("en_core_web_sm")
+ except OSError:
+     # Model not installed yet: download it once, then load.
+     spacy.cli.download("en_core_web_sm")
+     nlp = spacy.load("en_core_web_sm")
+ embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
+ cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-12-v2")
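+ # NOTE: cross_encoder is loaded here but not referenced again in this file.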
+
+ # ✅ Load API Key from Hugging Face Secrets
+ GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
+
+ if not GEMINI_API_KEY:
+     raise ValueError("🚨 Please set the Google API Key in Hugging Face Secrets!")
+
+ GEMINI_API_URL = "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent"
+
+ # ✅ Financial Keywords for Filtering
+ FINANCIAL_KEYWORDS = [
+     "revenue", "profit", "loss", "balance sheet", "cash flow",
+     "earnings", "expenses", "investment", "financial", "liability",
+     "assets", "equity", "debt", "capital", "tax", "dividends",
+     "reserves", "net income", "operating income"
+ ]
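+ # NOTE: FINANCIAL_KEYWORDS is defined for filtering, but no function below applies it yet.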
+
+ # ✅ Global Variables for FAISS & BM25
+ bm25, chunk_texts, faiss_index = None, [], None
+
+
+ # 🔹 1. Extract and Clean Text from PDF
+ def extract_text_from_pdf(pdf_path):
+     text = ""
+     with pdfplumber.open(pdf_path) as pdf:
+         for page in pdf.pages:
+             extracted = page.extract_text()
+             if extracted:
+                 text += extracted + "\n"
+     return clean_text(text)
+
+
+ # 🔹 2. Clean Extracted Text
+ def clean_text(text):
+     text = re.sub(r"https?://\S+", "", text)  # Remove URLs
+     text = re.sub(r"^\d{2}/\d{2}/\d{4}.*$", "", text, flags=re.MULTILINE)  # Remove timestamps
+     text = re.sub(r"(?i)this data can be easily copy pasted.*?", "", text, flags=re.MULTILINE)  # Remove metadata
+     text = re.sub(r"(?i)moneycontrol.com.*?", "", text, flags=re.MULTILINE)  # Remove source attribution
+     text = re.sub(r"(\n\s*)+", "\n", text)  # Remove extra blank lines
+     return text.strip()
+
+
+ # 🔹 3. Chunking Extracted Text
+ def chunk_text(text, max_tokens=64):
+     doc = nlp(text)
+     sentences = [sent.text for sent in doc.sents]
+
+     chunks, current_chunk = [], []
+     token_count = 0
+
+     for sentence in sentences:
+         tokens = sentence.split()  # whitespace words as an approximate token count
+         if token_count + len(tokens) > max_tokens:
+             if current_chunk:  # avoid emitting an empty chunk when one sentence exceeds the limit
+                 chunks.append(" ".join(current_chunk))
+             current_chunk = []
+             token_count = 0
+         current_chunk.append(sentence)
+         token_count += len(tokens)
+
+     if current_chunk:
+         chunks.append(" ".join(current_chunk))
+
+     return chunks
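+ # Example: with max_tokens=64, three consecutive 50-word sentences become three
+ # ~50-word chunks, since a sentence that would push the count past the limit starts a new chunk.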
+
+
+ # 🔹 4. Store Chunks in FAISS & BM25
+ def store_in_faiss(chunks):
+     global bm25, chunk_texts, faiss_index
+     embeddings = embed_model.encode(chunks, convert_to_numpy=True)
+
+     # Create FAISS index
+     faiss_index = faiss.IndexFlatL2(embeddings.shape[1])
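+     # IndexFlatL2 performs exact (brute-force) L2 search; adequate for a single report's chunks.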
+     faiss_index.add(embeddings)
+
+     chunk_texts = chunks
+     bm25 = BM25Okapi([chunk.split() for chunk in chunks])
+     return faiss_index
+
+
+ # 🔹 5. Retrieve Chunks using BM25 with Scores
+ def retrieve_bm25(query, top_k=2):
+     tokenized_query = query.split()
+     scores = bm25.get_scores(tokenized_query)
+     top_indices = np.argsort(scores)[-top_k:][::-1]  # Get top indices
+
+     # Normalize BM25 scores
+     min_score, max_score = np.min(scores), np.max(scores)
+     normalized_scores = [(scores[i] - min_score) / (max_score - min_score) if max_score != min_score else 1 for i in top_indices]
+
+     retrieved_chunks = [(chunk_texts[i], normalized_scores[idx]) for idx, i in enumerate(top_indices)]
+     return retrieved_chunks
+
+
+ # 🔹 6. Generate Response Using Google Gemini
+ def refine_with_gemini(query, retrieved_text):
+     if not retrieved_text.strip():
+         return "❌ No relevant financial data found for your query."
+
+     payload = {
+         "contents": [{
+             "parts": [{
+                 "text": f"You are an expert financial analyst. Based on the provided data, extract only the relevant financial details related to the query: '{query}' and present them in a clear format.\n\nData:\n{retrieved_text}"
+             }]
+         }]
+     }
+
+     try:
+         response = requests.post(
+             f"{GEMINI_API_URL}?key={GEMINI_API_KEY}",
+             json=payload, headers={"Content-Type": "application/json"}
+         )
+         response_json = response.json()
+
+         if response.status_code != 200:
+             print("🚨 Gemini API Error Response:", response_json)
+             return f"⚠️ Gemini API Error: {response_json.get('error', {}).get('message', 'Unknown error')}"
+
+         print("✅ Gemini API Response:", response_json)
+         return response_json.get("candidates", [{}])[0].get("content", {}).get("parts", [{}])[0].get("text", "⚠️ Error generating response.")
+
+     except Exception as e:
+         print("🚨 Exception in Gemini API Call:", str(e))
+         return "⚠️ Gemini API Exception: Unable to fetch response."
+
+
+ # 🔹 7. Final Retrieval Function with Confidence Score
+ def retrieve_and_generate_secure(query):
+     print("🔍 Query Received:", query)
+     if bm25 is None or not chunk_texts:
+         return "❌ No PDF data loaded. Please upload a PDF first."
+
+     bm25_results = retrieve_bm25(query)
+     if not bm25_results:
+         return "❌ No relevant financial data found for your query."
+
+     # Extract text and confidence scores
+     retrieved_texts, bm25_confidences = zip(*bm25_results)
+
+     # Average BM25 Confidence Score
+     avg_bm25_confidence = sum(bm25_confidences) / len(bm25_confidences)
+
+     # Get FAISS Similarity Score
+     query_embedding = embed_model.encode([query])
+     D, I = faiss_index.search(query_embedding, 1)  # Top-1 FAISS retrieval
+     faiss_confidence = 1 / (1 + D[0][0]) if D[0][0] != 0 else 1  # Convert distance to similarity
+
+     # Combine Confidence Scores (Weighted Average)
+     final_confidence = (0.6 * avg_bm25_confidence) + (0.4 * faiss_confidence)
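+     # Example: avg BM25 confidence 0.8 and FAISS similarity 0.5 give 0.6*0.8 + 0.4*0.5 = 0.68, reported as 68.0%.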
+
+     # Generate Final Answer
+     final_answer = refine_with_gemini(query, "\n".join(retrieved_texts))
+
+     return f"💬 Answer: {final_answer}\n\n🔹 Confidence Score: {round(final_confidence * 100, 2)}%"
+
+
+ # 🔹 8. Load PDF and Process Data
+ def process_uploaded_pdf(pdf_file):
+     global faiss_index
+     # gr.File(type="filepath") passes a plain path string, not a file object
+     text = extract_text_from_pdf(pdf_file)
+     chunks = chunk_text(text)
+     faiss_index = store_in_faiss(chunks)
+     return "✅ PDF Processed Successfully! Now you can ask financial questions."
+
+
+ # 🔹 9. Build Gradio UI
+ with gr.Blocks() as app:
+     gr.Markdown("# 📊 Financial RAG Model")
+     gr.Markdown("Upload a company financial report PDF and ask relevant financial questions.")
+
+     with gr.Row():
+         pdf_input = gr.File(label="📂 Upload Financial PDF", type="filepath")
+         process_button = gr.Button("📜 Process PDF")
+
+     status_output = gr.Textbox(label="Processing Status", interactive=False)
+
+     with gr.Row():
+         query_input = gr.Textbox(label="❓ Ask a financial question")
+         answer_output = gr.Textbox(label="💬 Answer", interactive=False)
+
+     query_button = gr.Button("🔍 Get Answer")
+
+     # Events
+     process_button.click(process_uploaded_pdf, inputs=pdf_input, outputs=status_output)
+     query_button.click(retrieve_and_generate_secure, inputs=query_input, outputs=answer_output)
+
+ # 🔹 10. Launch UI
+ app.launch()
requirements.txt ADDED
@@ -0,0 +1,10 @@
+ gradio
+ faiss-cpu
+ numpy
+ scipy
+ sentence-transformers
+ torch
+ spacy
+ pdfplumber
+ rank-bm25
+ requests
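
Once the Space is running, it can also be called outside the browser. The snippet below is a minimal sketch using `gradio_client`: the Space ID is a placeholder, the endpoint name assumes Gradio 4's default of exposing a click handler under its function name, and a financial PDF must already have been processed through the web UI.

```python
# Minimal sketch: query the deployed Space programmatically with gradio_client.
# Assumptions: "your-username/financial-rag" is a placeholder Space ID, and the
# click handler is exposed under its function name (Gradio 4's default api_name).
from gradio_client import Client

client = Client("your-username/financial-rag")  # hypothetical Space ID
answer = client.predict(
    "What was the net profit in the latest financial year?",  # query_input textbox
    api_name="/retrieve_and_generate_secure",
)
print(answer)
```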