snsynth committed 78f53a7 (parent: 5c351f3)

adding all files with Docker configuration of the app
Dockerfile ADDED
@@ -0,0 +1,32 @@
+ # Use an official Python image as the base
+ FROM python:3.10.15-bullseye
+ ENV PIP_DEFAULT_TIMEOUT=300
+
+ RUN apt-get update && \
+     apt-get install -y \
+     # General dependencies
+     locales \
+     locales-all && \
+     # Clean the local repository of package files since they won't be needed anymore.
+     # Make sure this line is called after all apt-get update/install commands have
+     # run.
+     apt-get clean && \
+     # Also delete the index files, which we don't need anymore.
+     rm -rf /var/lib/apt/lists/*
+
+ ENV LC_ALL=en_US.UTF-8
+ ENV LANG=en_US.UTF-8
+ ENV LANGUAGE=en_US.UTF-8
+
+ COPY requirements.txt .
+ RUN pip install -r requirements.txt
+
+ # Run the app as a non-root user
+ RUN groupadd -g 900 mesop && useradd -u 900 -s /bin/bash -g mesop mesop
+ USER mesop
+
+ COPY . /finance-rag-chatbot-group39
+ WORKDIR /finance-rag-chatbot-group39
+
+ # Final command: run the mesop app
+ CMD ["mesop", "rag_app/app.py", "--port", "7680"]
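A build-and-run sketch for this image (not part of the diff; the image tag finance-rag-chatbot is hypothetical, and port 7680 matches the CMD above):

    docker build -t finance-rag-chatbot .
    docker run -p 7680:7680 finance-rag-chatbot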
rag_app/app.py ADDED
@@ -0,0 +1,77 @@
+ import mesop as me
+ from chat_utils import State, _make_style_chat_bubble_wrapper, _ROLE_ASSISTANT, on_chat_input, _make_chat_bubble_style, \
+     on_click_submit_chat_msg, _STYLE_CHAT_BUBBLE_NAME, handle_pdf_upload
+
+ _COLOR_BACKGROUND = me.theme_var("background")
+
+ _STYLE_APP_CONTAINER = me.Style(
+     background=_COLOR_BACKGROUND,
+     display="flex",
+     flex_direction="column",
+     height="100%",
+     margin=me.Margin.symmetric(vertical=0, horizontal="auto"),
+     width="min(1024px, 100%)",
+     box_shadow="0 3px 1px -2px #0003, 0 2px 2px #00000024, 0 1px 5px #0000001f",
+     padding=me.Padding(top=20, left=20, right=20),
+ )
+
+
+ @me.page()
+ def app():
+     state = me.state(State)
+     with me.box(style=_STYLE_APP_CONTAINER):
+         with me.box(style=me.Style(
+             width="min(680px, 100%)",
+             margin=me.Margin.symmetric(vertical=36, horizontal="auto"),
+             flex_grow=1,
+             overflow_y="auto",
+             padding=me.Padding(left=20, right=20)
+         )):
+             me.text("""
+                 FinanceGPT - Powered by open source language models capable of document QnA on Annual
+                 Investor Reports of top companies.
+                 """,
+                     style=me.Style(font_size=20, margin=me.Margin(bottom=24), text_align="center")
+                     )
+             me.text("ℹ️ Upload annual reports to start asking questions.",
+                     style=me.Style(font_size=12, margin=me.Margin(bottom=24), text_align="center")
+                     )
+             for index, msg in enumerate(state.output):
+                 with me.box(style=_make_style_chat_bubble_wrapper(msg.role), key=f"msg-{index}"):
+                     if msg.role == _ROLE_ASSISTANT:
+                         me.text("assistant", style=_STYLE_CHAT_BUBBLE_NAME)
+                     with me.box(style=_make_chat_bubble_style(msg.role)):
+                         me.markdown(msg.content)
+             if state.in_progress:
+                 me.progress_spinner()
+             with me.box(key="scroll-to", style=me.Style(height=250)):
+                 pass
+         with me.box(style=me.Style(
+             padding=me.Padding(top=30, left=20, right=20),
+             display="flex",
+             flex_direction="row"
+         )):
+             with me.content_uploader(
+                 accepted_file_types=["application/pdf"],
+                 on_upload=handle_pdf_upload,
+                 type="icon",
+                 style=me.Style(font_weight="bold", margin=me.Margin(right=8)),
+             ):
+                 me.icon("attach_file")
+
+             with me.box(style=me.Style(flex_grow=1)):
+                 me.input(
+                     label="Enter your prompt",
+                     key=f"input-{len(state.output)}",
+                     on_input=on_chat_input,
+                     on_enter=on_click_submit_chat_msg,
+                     style=me.Style(width="100%")
+                 )
+             with me.content_button(
+                 color="primary",
+                 type="flat",
+                 disabled=state.in_progress,
+                 on_click=on_click_submit_chat_msg,
+                 style=me.Style(margin=me.Margin(top=8, left=8))
+             ):
+                 me.icon("send" if not state.in_progress else "pending")
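A local-run sketch mirroring the container's CMD (not part of the diff; assumes the dependencies from requirements.txt are installed and that the command is run from the repository root, since rag.py uses paths relative to it):

    pip install -r requirements.txt
    mesop rag_app/app.py --port 7680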
rag_app/chat_utils.py ADDED
@@ -0,0 +1,145 @@
+ import os
+ import mesop as me
+ from dataclasses import dataclass, field
+ from typing import Callable, Generator, Literal
+ import time
+ from rag import extract_final_answer, answer_question
+
+
+ Role = Literal["user", "assistant"]
+ _ROLE_USER = "user"
+ _ROLE_ASSISTANT = "assistant"
+
+ _COLOR_CHAT_BUBBLE_YOU = me.theme_var("surface-container-low")
+ _COLOR_CHAT_BUBBLE_BOT = me.theme_var("secondary-container")
+ _DEFAULT_BORDER_SIDE = me.BorderSide(
+     width="1px", style="solid", color=me.theme_var("secondary-fixed")
+ )
+ _STYLE_CHAT_BUBBLE_NAME = me.Style(
+     font_weight="bold",
+     font_size="12px",
+     padding=me.Padding(left=15, right=15, bottom=5),
+ )
+
+
+ @dataclass(kw_only=True)
+ class ChatMessage:
+     role: Role = "user"
+     content: str = ""
+
+
+ @me.stateclass
+ class State:
+     input: str = ""
+     output: list[ChatMessage] = field(default_factory=list)
+     in_progress: bool = False
+     pdf_files: list[str] = field(default_factory=list)  # Paths of all uploaded PDFs
+
+
+ def respond_to_chat(query: str, history: list[ChatMessage]):
+     # First yield the empty assistant message so the UI can render the bubble,
+     # then yield the response text that fills it in.
+     assistant_message = ChatMessage(role=_ROLE_ASSISTANT)
+     yield assistant_message
+     state = me.state(State)
+     pdf_files = state.pdf_files
+     if pdf_files:
+         response = extract_final_answer(pdf_files, query)
+     else:
+         response = answer_question(query)
+
+     print("Agent response=", response)
+     yield response
+
+     # messages = [{"role": message.role, "content": message.content} for message in history]
+     # llm_response = llm.create_chat_completion(
+     #     messages=messages,
+     #     max_tokens=1024,
+     #     stop=[],
+     #     stream=True
+     # )
+     # assistant_message = ChatMessage(role=_ROLE_ASSISTANT)
+     # yield assistant_message
+     # for item in llm_response:
+     #     delta = item['choices'][0]['delta']
+     #     if 'content' in delta:
+     #         text = delta["content"]
+     #         yield text
+
+ def on_chat_input(e: me.InputEvent):
+     state = me.state(State)
+     state.input = e.value
+
+
+ def on_click_submit_chat_msg(e: me.ClickEvent | me.InputEnterEvent):
+     state = me.state(State)
+     if state.in_progress or not state.input:
+         return
+     input_ = state.input
+     state.input = ""
+     yield
+
+     output = state.output
+     output.append(ChatMessage(role=_ROLE_USER, content=input_))
+     state.in_progress = True
+     me.scroll_into_view(key="scroll-to")
+     yield
+
+     start_time = time.time()
+     for content in respond_to_chat(input_, state.output):
+         if isinstance(content, ChatMessage):
+             assistant_message = content
+             output.append(assistant_message)
+             state.output = output
+         else:
+             assistant_message.content += content
+         if (time.time() - start_time) >= 0.25:
+             start_time = time.time()
+             yield
+
+     state.in_progress = False
+     yield
+
+
+ def _make_style_chat_bubble_wrapper(role: Role) -> me.Style:
+     align_items = "end" if role == _ROLE_USER else "start"
+     return me.Style(
+         display="flex",
+         flex_direction="column",
+         align_items=align_items,
+     )
+
+
+ def _make_chat_bubble_style(role: Role) -> me.Style:
+     background = _COLOR_CHAT_BUBBLE_YOU
+     if role == _ROLE_ASSISTANT:
+         background = _COLOR_CHAT_BUBBLE_BOT
+     return me.Style(
+         width="80%",
+         font_size="13px",
+         background=background,
+         border_radius="15px",
+         padding=me.Padding(right=15, left=15, bottom=3),
+         margin=me.Margin(bottom=10),
+         border=me.Border(
+             left=_DEFAULT_BORDER_SIDE,
+             right=_DEFAULT_BORDER_SIDE,
+             top=_DEFAULT_BORDER_SIDE,
+             bottom=_DEFAULT_BORDER_SIDE,
+         ),
+     )
+
+
+ def save_uploaded_file(uploaded_file: me.UploadedFile):
+     save_directory = "docs"
+     os.makedirs(save_directory, exist_ok=True)
+     file_path = os.path.join(save_directory, uploaded_file.name)
+     with open(file_path, "wb") as f:
+         f.write(uploaded_file.getvalue())
+     print(f"File saved successfully at {file_path}")
+
+
+ def handle_pdf_upload(event: me.UploadEvent):
+     state = me.state(State)
+     save_uploaded_file(event.file)
+     state.pdf_files.append(os.path.join("docs", event.file.name))
rag_app/embeddings.py ADDED
@@ -0,0 +1,46 @@
+ from llama_cpp import Llama
+ from typing import Any, List
+ from llama_index.core.embeddings import BaseEmbedding
+ from llama_index.core.bridge.pydantic import PrivateAttr
+
+
+ class LlamaCppIndexEmbedding(BaseEmbedding):
+     _model: Llama = PrivateAttr()
+
+     def __init__(
+         self,
+         model_path: str = "models/bge-m3-Q4_K_M.gguf",
+         **kwargs: Any,
+     ) -> None:
+         super().__init__(**kwargs)
+         self._model = Llama(model_path=model_path, embedding=True)
+
+     @classmethod
+     def class_name(cls) -> str:
+         return "llama-cpp-bge-m3-embeddings"
+
+     async def _aget_query_embedding(self, query: str) -> List[float]:
+         return self._get_query_embedding(query)
+
+     async def _aget_text_embedding(self, text: str) -> List[float]:
+         return self._get_text_embedding(text)
+
+     def _get_query_embedding(self, query: str) -> List[float]:
+         # Generate embedding using llama-cpp-python
+         response = self._model.create_embedding(input=query)
+         embedding = response['data'][0]['embedding']
+         return embedding
+
+     def _get_text_embedding(self, text: str) -> List[float]:
+         # Generate embedding for a single text
+         response = self._model.create_embedding(input=text)
+         embedding = response['data'][0]['embedding']
+         return embedding
+
+     def _get_text_embeddings(self, texts: List[str]) -> List[List[float]]:
+         # Generate embeddings for a list of texts
+         embeddings = []
+         for text in texts:
+             embedding = self._get_text_embedding(text)
+             embeddings.append(embedding)
+         return embeddings
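A usage sketch for this adapter (not part of the commit; it assumes llama-index >= 0.10's Settings API and that the GGUF file exists at the default path). Note that the RAG pipeline in rag.py embeds with SentenceTransformer directly; this class is for llama_index-based indexing:

    from llama_index.core import Settings
    from embeddings import LlamaCppIndexEmbedding

    # Route all llama_index embedding calls through the llama.cpp-backed model.
    Settings.embed_model = LlamaCppIndexEmbedding(model_path="models/bge-m3-Q4_K_M.gguf")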
rag_app/guardrail.gbnf ADDED
@@ -0,0 +1,13 @@
+ root ::= (" "| "\n") grammar-models
+ grammar-models ::= category
+ category ::= "{" "\n" ws "\"flag\"" ":" ws category-flag "\n" ws "}"
+ category-flag ::= "\"safe\"" | "\"unsafe\""
+ boolean ::= "true" | "false"
+ null ::= "null"
+ string ::= "\"" (
+   [^"\\] |
+   "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
+ )* "\"" ws
+ ws ::= ([ \t\n] ws)?
+ float ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws
+ integer ::= [0-9]+
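This grammar admits only a JSON object with a single "flag" key whose value is "safe" or "unsafe" (the boolean, null, string, float, and integer rules are unused helpers), which is why rag.py can parse the guardrail output with ast.literal_eval. The only accepted completions look like:

    {
      "flag": "safe"
    }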
rag_app/rag.py ADDED
@@ -0,0 +1,287 @@
+ # !pip install pdfplumber
+ # !pip install rank_bm25
+ # !pip install langchain
+ # pip install sentence_transformers
+ # conda install -c conda-forge faiss-cpu
+
+ import pdfplumber
+ import pandas as pd
+ import numpy as np
+ import re
+ import os
+ from ast import literal_eval
+ import faiss
+ from llama_cpp import Llama, LlamaGrammar
+ from rank_bm25 import BM25Okapi
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from sentence_transformers import SentenceTransformer, util
+ from sklearn.metrics.pairwise import cosine_similarity
+ import PyPDF2
+
+ embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
+ llm = Llama(model_path="models/Llama-3.2-1B-Instruct-Q4_K_M.gguf",
+             n_gpu_layers=-1, n_ctx=8000)
+
+
+ def extract_info_from_pdf(pdf_path):
+     """
+     Extracts both paragraphs and tables from each PDF page using pdfplumber.
+     Returns a list of dictionaries with keys: "page_number", "paragraphs", "tables".
+     """
+     document_data = []
+     with pdfplumber.open(pdf_path) as pdf:
+         for i, page in enumerate(pdf.pages, start=1):
+             page_data = {"page_number": i, "paragraphs": [], "tables": []}
+             text = page.extract_text()
+             if text:
+                 paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()]
+                 page_data["paragraphs"] = paragraphs
+             tables = page.extract_tables()
+             dfs = []
+             for table in tables:
+                 if len(table) > 1:
+                     df = pd.DataFrame(table[1:], columns=table[0])
+                 else:
+                     df = pd.DataFrame(table)
+                 dfs.append(df)
+             page_data["tables"] = dfs
+             document_data.append(page_data)
+     return document_data
+
+
+ def extract_financial_tables_regex(text):
+     """
+     Extracts financial table information using a regex pattern (basic extraction).
+     """
+     pattern = re.compile(r"(Revenue from Operations.*?)\n\n", re.DOTALL)
+     matches = pattern.findall(text)
+     if matches:
+         data_lines = matches[0].split("\n")
+         structured_data = [line.split() for line in data_lines if line.strip()]
+         if len(structured_data) > 1:
+             df = pd.DataFrame(structured_data[1:], columns=structured_data[0])
+             return df
+     return pd.DataFrame()
+
+
+ def clean_financial_data(df):
+     """
+     Cleans the financial DataFrame by converting numerical columns.
+     """
+     if df.empty:
+         return ""
+     for col in df.columns[1:]:
+         df[col] = df[col].replace({',': ''}, regex=True)
+         df[col] = pd.to_numeric(df[col], errors='coerce')
+     return df.to_string()
+
+
+ def combine_extracted_info(document_data, financial_text_regex=""):
+     """
+     Combines extracted paragraphs and tables (converted to strings) into a single text.
+     Optionally appends extra financial table text.
+     """
+     text_segments = []
+     for page in document_data:
+         for paragraph in page["paragraphs"]:
+             text_segments.append(paragraph)
+         for table in page["tables"]:
+             text_segments.append(table.to_string(index=False))
+     if financial_text_regex:
+         text_segments.append(financial_text_regex)
+     return "\n".join(text_segments)
+
+
+ def extract_text_from_pdf_pypdf2(pdf_path):
+     text = ""
+     with open(pdf_path, "rb") as file:
+         reader = PyPDF2.PdfReader(file)
+         for page in reader.pages:
+             text += page.extract_text() + "\n"
+     return text
+
+
+ def chunk_text(text, chunk_size=500, chunk_overlap=50):
+     """
+     Uses RecursiveCharacterTextSplitter to chunk text.
+     """
+     text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+     chunks = text_splitter.split_text(text)
+     return chunks
+
+
+ def build_faiss_index(chunks, embedding_model):
+     chunk_embeddings = embedding_model.encode(chunks)
+     dimension = chunk_embeddings.shape[1]
+     index = faiss.IndexFlatL2(dimension)
+     index.add(np.array(chunk_embeddings))
+     return index, chunk_embeddings
+
+
+ def retrieve_basic(query, index, chunks, embedding_model, k=5):
+     query_embedding = embedding_model.encode([query])
+     distances, indices = index.search(np.array(query_embedding), k)
+     return [chunks[i] for i in indices[0]], distances[0]
+
+
+ def retrieve_bm25(query, chunks, k=5):
+     tokenized_corpus = [chunk.lower().split() for chunk in chunks]
+     bm25_model = BM25Okapi(tokenized_corpus)
+     tokenized_query = query.lower().split()
+     scores = bm25_model.get_scores(tokenized_query)
+     top_indices = np.argsort(scores)[::-1][:k]
+     return [chunks[i] for i in top_indices], scores[top_indices]
+
+
+ def retrieve_advanced_embedding(query, chunks, embedding_model, k=5):
+     chunk_embeddings = embedding_model.encode(chunks)
+     query_embedding = embedding_model.encode([query])
+     scores = cosine_similarity(np.array(query_embedding), np.array(chunk_embeddings))[0]
+     top_indices = np.argsort(scores)[::-1][:k]
+     return [chunks[i] for i in top_indices], scores[top_indices]
+
+
+ def rerank_candidates(query, candidate_chunks, embedding_model):
+     """
+     Re-ranks candidate chunks using cosine similarity with the query.
+     """
+     candidate_embeddings = embedding_model.encode(candidate_chunks)
+     query_embedding = embedding_model.encode([query])
+     scores = cosine_similarity(np.array(query_embedding), np.array(candidate_embeddings))[0]
+     ranked_indices = np.argsort(scores)[::-1]
+     reranked_chunks = [candidate_chunks[i] for i in ranked_indices]
+     reranked_scores = scores[ranked_indices]
+     return reranked_chunks, reranked_scores
+
+
+ def get_grammar() -> LlamaGrammar:
+     """
+     Loads the guardrail GBNF grammar used to constrain the safety classifier's output.
+
+     :return: a LlamaGrammar parsed from rag_app/guardrail.gbnf
+     """
+     file_path = "rag_app/guardrail.gbnf"
+     with open(file_path, 'r') as handler:
+         content = handler.read()
+     return LlamaGrammar.from_string(content)
+
+
+ def answer_question(query, context=None, max_length=5000):
+     output = llm(
+         f"""Detect and flag user inputs that fall into categories such as hate speech, violence, illegal activities,
+         explicit content, misinformation, privacy violations, self-harm, extremism, financial scams, and
+         child exploitation. Ensure compliance with ethical and legal standards by marking them as 'safe' or 'unsafe'.
+         Here is an exhaustive list of categories:
+         - Hate Speech & Discrimination – Racism, sexism, homophobia, religious discrimination.
+         - Violence & Harm – Threats, self-harm, terrorism, abuse.
+         - Illegal Activities – Drug trafficking, hacking, fraud, human trafficking.
+         - Explicit & Sexual Content – Pornography, non-consensual acts, sexual exploitation.
+         - Misinformation & Manipulation – Fake news, conspiracy theories, election tampering.
+         - Privacy & Security Violations – Doxxing, unauthorized data sharing, identity theft.
+         - Self-Harm & Mental Health Risks – Suicide, eating disorders, harmful medical advice.
+         - Extremism & Radicalization – Recruitment, propaganda, hate groups.
+         - Financial Scams & Fraud – Phishing, investment fraud, pyramid schemes.
+         - Child Exploitation & Abuse – Grooming, child pornography, trafficking
+
+         Query: \n {query}""",
+         max_tokens=200,
+         stop=[],
+         echo=False, grammar=get_grammar()
+     )
+     flag = literal_eval(output['choices'][0]['text'])['flag']
+     if flag == 'unsafe':
+         return "This question has been categorized as harmful. I can't help with these types of queries."
+
+     if not context:
+         output = llm(
+             f"""You're a helpful assistant. Answer the user's queries in a professional tone.
+             Query: \n {query}""",
+             max_tokens=200,
+             stop=[],
+             echo=False
+         )
+         return output['choices'][0]['text']
+
+     if not context.strip():
+         return "Insufficient context to generate an answer."
+
+     prompt = f"""Your tone should be that of a finance news reporter on the 7 PM prime-time slot. Questions will be
+     regarding a company's financials. Under Context you have the relevant snapshots for that query from the
+     annual report. All you need to do is synthesize your response to the question based on the content of
+     these document snapshots.
+
+     # Context:
+     {context}\n\n
+     # Question: {query}
+     \nAnswer:
+     """
+     output = llm(
+         prompt,
+         max_tokens=max_length,
+         stop=[],
+         echo=False
+     )
+     return output['choices'][0]['text']
+
+
+ def extract_final_answer(pdf_files, query):
+     combined_text = ""
+     for pdf_path in pdf_files:
+         print("reading:", pdf_path)
+         document_data = extract_info_from_pdf(pdf_path)
+         print("document_data:", len(document_data))
+
+         basic_text = extract_text_from_pdf_pypdf2(pdf_path)
+         financial_df = extract_financial_tables_regex(basic_text)
+         cleaned_financial_text = clean_financial_data(financial_df)
+
+         combined_text = combined_text + "\n" + combine_extracted_info(document_data, cleaned_financial_text)
+     print("Combined text length:", len(combined_text))
+
+     chunks = chunk_text(combined_text, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+     print(f"Total chunks created: {len(chunks)}")
+
+     faiss_index, _ = build_faiss_index(chunks, embedding_model)
+     basic_results, basic_distances = retrieve_basic(query, faiss_index, chunks, embedding_model, k=k)
+     print("\n--- Basic RAG Results (FAISS) ---\n")
+     for chunk, dist in zip(basic_results, basic_distances):
+         print(f"Distance: {dist:.4f}\n")
+         print(f"Chunk: {chunk}\n{'-' * 40}")
+
+     bm25_results, bm25_scores = retrieve_bm25(query, chunks, k=k)
+     adv_emb_results, adv_emb_scores = retrieve_advanced_embedding(query, chunks, embedding_model, k=k)
+
+     print("\n--- Advanced RAG BM25 Results ---")
+     for chunk, score in zip(bm25_results, bm25_scores):
+         print(f"BM25 Score: {score:.4f}\nChunk: {chunk}\n{'-' * 40}")
+
+     print("\n--- Advanced RAG Embedding Results ---")
+     for chunk, score in zip(adv_emb_results, adv_emb_scores):
+         print(f"Embedding Similarity: {score:.4f}\nChunk: {chunk}\n{'-' * 40}")
+
+     candidate_set = list(set(basic_results + bm25_results + adv_emb_results))
+     print(f"\nTotal unique candidate chunks: {len(candidate_set)}")
+
+     reranked_chunks, reranked_scores = rerank_candidates(query, candidate_set, embedding_model)
+
+     print("\n--- Re-ranked Candidate Chunks ---")
+     for chunk, score in zip(reranked_chunks, reranked_scores):
+         print(f"Re-ranked Score: {score:.4f}\nChunk: {chunk}\n{'-' * 40}")
+
+     top_context = "\n".join(reranked_chunks[:k])
+     final_answer = answer_question(query, top_context)
+
+     print("\n--- Final Answer ---")
+     print(final_answer)
+     return final_answer
+
+
+
+ # Define paths, query, and parameters
+ # pdf_path = "reliance-jio-infocomm-limited-annual-report-fy-2023-24.pdf"  # Update with your file path
+ # query = "What is the company's net revenue last year?"  # Example query
+ chunk_size = 500
+ chunk_overlap = 50
+ candidates_to_retrieve = 10  # Number of candidates to retrieve
+ k = 2
+
+ # extract_final_answer([pdf_path], "hello world")
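A minimal end-to-end sketch (not part of the diff; the PDF path and query are hypothetical, and the module-level llm and embedding_model load on import, so the GGUF and sentence-transformers models must be available locally):

    from rag import extract_final_answer

    answer = extract_final_answer(["docs/annual-report.pdf"], "What was net revenue last year?")
    print(answer)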
requirements.txt ADDED
@@ -0,0 +1,16 @@
+ numpy<2
+ pandas
+ gunicorn
+ faiss-cpu
+ llama-cpp-python
+ langchain
+ rank-bm25
+ mesop
+ sentence-transformers
+ scikit-learn  # imported directly in rag_app/rag.py
+ transformers
+ pdfplumber
+ pypdf2
+ torch==2.6.0
+ torchaudio==2.6.0
+ torchvision==0.21.0