Spaces:

snsynth
/

finance-rag-chatbot-group39

Sleeping

App Files Files Community

snsynth committed on Mar 15

Commit

78f53a7

1 Parent(s): 5c351f3

adding all files with Docker configuration of the app

Browse files

Files changed (7) hide show

Dockerfile +31 -0
rag_app/app.py +77 -0
rag_app/chat_utils.py +143 -0
rag_app/embeddings.py +46 -0
rag_app/guardrail.gbnf +13 -0
rag_app/rag.py +287 -0
requirements.txt +15 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,31 @@

+# Use the official Python 3.10 (Debian bullseye) image as the base
+FROM python:3.10.15-bullseye
+# Raise pip's network timeout (seconds) for the dependency install below.
+ENV PIP_DEFAULT_TIMEOUT=300
+RUN apt-get update && \
+  apt-get install -y \
+  # General dependencies
+  locales \
+  locales-all && \
+  # Clean local repository of package files since they won't be needed anymore.
+  # Make sure this line is called after all apt-get update/install commands have
+  # run.
+  apt-get clean && \
+  # Also delete the index files which we also don't need anymore.
+  # NOTE: no trailing backslash here, otherwise the next ENV instruction is
+  # swallowed into this RUN command as extra arguments to rm.
+  rm -rf /var/lib/apt/lists/*
+ENV LC_ALL=en_US.UTF-8
+ENV LANG=en_US.UTF-8
+ENV LANGUAGE=en_US.UTF-8
+COPY requirements.txt .
+RUN pip install -r requirements.txt
+# -m creates /home/mesop so libraries that cache under $HOME (e.g. model
+# downloads) have a writable directory.
+RUN groupadd -g 900 mesop && useradd -m -u 900 -s /bin/bash -g mesop mesop
+USER mesop
+# --chown hands the application tree to the runtime user; the app creates a
+# docs/ directory for uploaded PDFs inside it at runtime.
+COPY --chown=mesop:mesop . /finance-rag-chatbot-group39
+WORKDIR /finance-rag-chatbot-group39
+# Final command: run the mesop script
+CMD ["mesop", "rag_app/app.py", "--port", "7680"]

rag_app/app.py ADDED Viewed

	@@ -0,0 +1,77 @@

+import mesop as me
+from chat_utils import State, _make_style_chat_bubble_wrapper, _ROLE_ASSISTANT, on_chat_input, _make_chat_bubble_style, \
+    on_click_submit_chat_msg, _STYLE_CHAT_BUBBLE_NAME, handle_pdf_upload
+_COLOR_BACKGROUND = me.theme_var("background")
+_STYLE_APP_CONTAINER = me.Style(
+    background=_COLOR_BACKGROUND,
+    display="flex",
+    flex_direction="column",
+    height="100%",
+    margin=me.Margin.symmetric(vertical=0, horizontal="auto"),
+    width="min(1024px, 100%)",
+    box_shadow="0 3px 1px -2px #0003, 0 2px 2px #00000024, 0 1px 5px #0000001f",
+    padding=me.Padding(top=20, left=20, right=20),
+)
+@me.page()
+def app():
+    state = me.state(State)
+    with me.box(style=_STYLE_APP_CONTAINER):
+        with me.box(style=me.Style(
+                width="min(680%, 100%)",
+                margin=me.Margin.symmetric(vertical=36, horizontal="auto"),
+                flex_grow=1,
+                overflow_y="auto",
+                padding=me.Padding(left=20, right=20)
+        )):
+            me.text("""
+            FinanceGPT - Powered by open source language models capable of document QnA on Annual
+            Investor Reports of top companies.
+            """,
+                    style=me.Style(font_size=20, margin=me.Margin(bottom=24), text_align="center")
+                    )
+            me.text("ℹ️ Upload annual reports to start asking questions.",
+                    style=me.Style(font_size=12, margin=me.Margin(bottom=24), text_align="center")
+                    )
+            for index, msg in enumerate(state.output):
+                with me.box(style=_make_style_chat_bubble_wrapper(msg.role), key=f"msg-{index}"):
+                    if msg.role == _ROLE_ASSISTANT:
+                        me.text("assistant", style=_STYLE_CHAT_BUBBLE_NAME)
+                    with me.box(style=_make_chat_bubble_style(msg.role)):
+                        me.markdown(msg.content)
+            if state.in_progress:
+                me.progress_spinner()
+                with me.box(key="scroll-to", style=me.Style(height=250)):
+                    pass
+        with me.box(style=me.Style(
+                padding=me.Padding(top=30, left=20, right=20),
+                display="flex",
+                flex_direction="row"
+        )):
+            with me.content_uploader(
+                    accepted_file_types=["application/pdf"],
+                    on_upload=handle_pdf_upload,
+                    type="icon",
+                    style=me.Style(font_weight="bold", margin=me.Margin(right=8)),
+            ):
+                me.icon("attach_file")
+            with me.box(style=me.Style(flex_grow=1)):
+                me.input(
+                    label="Enter your prompt",
+                    key=f"input-{len(state.output)}",
+                    on_input=on_chat_input,
+                    on_enter=on_click_submit_chat_msg,
+                    style=me.Style(width="100%")
+                )
+            with me.content_button(
+                    color="primary",
+                    type="flat",
+                    disabled=state.in_progress,
+                    on_click=on_click_submit_chat_msg,
+                    style=me.Style(margin=me.Margin(top=8, left=8))
+            ):
+                me.icon("send" if not state.in_progress else "pending")

rag_app/chat_utils.py ADDED Viewed

	@@ -0,0 +1,143 @@

+import os
+import mesop as me
+from dataclasses import dataclass, field
+from typing import Callable, Generator, Literal
+import time
+from rag import extract_final_answer, answer_question
+Role = Literal["user", "assistant"]
+_ROLE_USER = "user"
+_ROLE_ASSISTANT = "assistant"
+_COLOR_CHAT_BUBBLE_YOU = me.theme_var("surface-container-low")
+_COLOR_CHAT_BUBBLE_BOT = me.theme_var("secondary-container")
+_DEFAULT_BORDER_SIDE = me.BorderSide(
+    width="1px", style="solid", color=me.theme_var("secondary-fixed")
+)
+_STYLE_CHAT_BUBBLE_NAME = me.Style(
+    font_weight="bold",
+    font_size="12px",
+    padding=me.Padding(left=15, right=15, bottom=5),
+)
+@dataclass(kw_only=True)
+class ChatMessage:
+    """A single chat entry: who wrote it and its text."""
+    # Author of the message; defaults to the user.
+    role: Role = "user"
+    # Message text; the submit handler appends to it as response text arrives.
+    content: str = ""
+@me.stateclass
+class State:
+    """Mesop session state for the chat page."""
+    # Current text of the prompt input box.
+    input: str = ""
+    # Conversation history, in display order.
+    output: list[ChatMessage] = field(default_factory=list)
+    # True while a response is being generated.
+    in_progress: bool = False
+    # Paths of the PDFs uploaded so far.
+    pdf_files: list[str] = field(default_factory=list)
+def respond_to_chat(query: str, history: list[ChatMessage]):
+    assistant_message = ChatMessage(role=_ROLE_ASSISTANT)
+    yield assistant_message
+    state = me.state(State)
+    pdf_files = state.pdf_files
+    if pdf_files:
+        response = extract_final_answer(pdf_files, query)
+    else:
+        response = answer_question(query)
+    print("Agent response=", response)
+    yield response
+    # messages = [{"role": message.role, "content": message.content} for message in history]
+    # llm_response = llm.create_chat_completion(
+    #     messages=messages,
+    #     max_tokens=1024,
+    #     stop=[],
+    #     stream=True
+    # )
+    # assistant_message = ChatMessage(role=_ROLE_ASSISTANT)
+    # yield assistant_message
+    # for item in llm_response:
+    #     delta = item['choices'][0]['delta']
+    #     if 'content' in delta:
+    #         text = delta["content"]
+    #         yield text
+def on_chat_input(e: me.InputEvent):
+    state = me.state(State)
+    state.input = e.value
+def on_click_submit_chat_msg(e: me.ClickEvent | me.InputEnterEvent):
+    state = me.state(State)
+    if state.in_progress or not state.input:
+        return
+    input_ = state.input
+    state.input = ""
+    yield
+    output = state.output
+    output.append(ChatMessage(role=_ROLE_USER, content=input_))
+    state.in_progress = True
+    me.scroll_into_view(key="scroll-to")
+    yield
+    start_time = time.time()
+    for content in respond_to_chat(input_, state.output):
+        if isinstance(content, ChatMessage):
+            assistant_message = content
+            output.append(assistant_message)
+            state.output = output
+        else:
+            assistant_message.content += content
+            if (time.time() - start_time) >= 0.25:
+                start_time = time.time()
+                yield
+    state.in_progress = False
+    yield
+def _make_style_chat_bubble_wrapper(role: Role) -> me.Style:
+    align_items = "end" if role == _ROLE_USER else "start"
+    return me.Style(
+        display="flex",
+        flex_direction="column",
+        align_items=align_items,
+    )
+def _make_chat_bubble_style(role: Role) -> me.Style:
+    background = _COLOR_CHAT_BUBBLE_YOU
+    if role == _ROLE_ASSISTANT:
+        background = _COLOR_CHAT_BUBBLE_BOT
+    return me.Style(
+        width="80%",
+        font_size="13px",
+        background=background,
+        border_radius="15px",
+        padding=me.Padding(right=15, left=15, bottom=3),
+        margin=me.Margin(bottom=10),
+        border=me.Border(
+            left=_DEFAULT_BORDER_SIDE,
+            right=_DEFAULT_BORDER_SIDE,
+            top=_DEFAULT_BORDER_SIDE,
+            bottom=_DEFAULT_BORDER_SIDE,
+        ),
+    )
+def save_uploaded_file(uploaded_file: me.UploadedFile):
+    save_directory = "docs"
+    os.makedirs(save_directory, exist_ok=True)
+    file_path = os.path.join(save_directory, uploaded_file.name)
+    with open(file_path, "wb") as f:
+        f.write(uploaded_file.getvalue())
+    print(f"File saved successfully at {file_path}")
+def handle_pdf_upload(event: me.UploadEvent):
+    state = me.state(State)
+    save_uploaded_file(event.file)
+    state.pdf_files.append(os.path.join("docs", event.file.name))

rag_app/embeddings.py ADDED Viewed

	@@ -0,0 +1,46 @@

+from llama_cpp import Llama
+from typing import Any, List
+from llama_index.core.embeddings import BaseEmbedding
+from llama_index.core.bridge.pydantic import PrivateAttr
+class LlamaCppIndexEmbedding(BaseEmbedding):
+    _model: Llama = PrivateAttr()
+    def __init__(
+        self,
+        model_path: str = "models/bge-m3-Q4_K_M.gguf",
+        **kwargs: Any,
+    ) -> None:
+        super().__init__(**kwargs)
+        self._model = Llama(model_path=model_path, embedding=True)
+    @classmethod
+    def class_name(cls) -> str:
+        return "llama-cpp-bge-m3-embeddings"
+    async def _aget_query_embedding(self, query: str) -> List[float]:
+        return self._get_query_embedding(query)
+    async def _aget_text_embedding(self, text: str) -> List[float]:
+        return self._get_text_embedding(text)
+    def _get_query_embedding(self, query: str) -> List[float]:
+        # Generate embedding using llama-cpp-python
+        response = self._model.create_embedding(input=query)
+        embedding = response['data'][0]['embedding']
+        return embedding
+    def _get_text_embedding(self, text: str) -> List[float]:
+        # Generate embedding for a single text
+        response = self._model.create_embedding(input=text)
+        embedding = response['data'][0]['embedding']
+        return embedding
+    def _get_text_embeddings(self, texts: List[str]) -> List[List[float]]:
+        # Generate embeddings for a list of texts
+        embeddings = []
+        for text in texts:
+            embedding = self._get_text_embedding(text)
+            embeddings.append(embedding)
+        return embeddings

rag_app/guardrail.gbnf ADDED Viewed

	@@ -0,0 +1,13 @@

+# Guardrail output grammar: constrains the model to emit one leading space or
+# newline followed by an object of the form {"flag": "safe"} or {"flag": "unsafe"}.
+root ::= (" "| "\n") grammar-models
+grammar-models ::= category
+# The object: opening brace, newline, the "flag" key, its value, closing brace.
+category ::= "{" "\n"  ws "\"flag\"" ":" ws category-flag "\n" ws "}"
+# The only two values the flag may take.
+category-flag ::= "\"safe\"" | "\"unsafe\""
+# NOTE(review): boolean, null, string, float and integer below are not
+# reachable from root; they look like leftovers from a generic JSON grammar.
+boolean ::= "true" | "false"
+null ::= "null"
+string ::= "\"" (
+        [^"\\] |
+        "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
+      )* "\"" ws
+# Optional run of spaces, tabs and newlines.
+ws ::= ([ \t\n] ws)?
+float ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws
+integer ::= [0-9]+

rag_app/rag.py ADDED Viewed

	@@ -0,0 +1,287 @@

+# !pip install pdfplumber
+# !pip install rank_bm25
+# !pip install langchain
+# pip install sentence_transformers
+# conda install -c conda-forge faiss-cpu
+import pdfplumber
+import pandas as pd
+import numpy as np
+import re
+import os
+from ast import literal_eval
+import faiss
+from llama_cpp import Llama, LlamaGrammar
+from rank_bm25 import BM25Okapi
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from sentence_transformers import SentenceTransformer, util
+from sklearn.metrics.pairwise import cosine_similarity
+import PyPDF2
+embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
+llm = Llama(model_path="models/Llama-3.2-1B-Instruct-Q4_K_M.gguf",
+            n_gpu_layers=-1, n_ctx=8000)
+def extract_info_from_pdf(pdf_path):
+    """
+    Extracts both paragraphs and tables from each PDF page using pdfplumber.
+    Returns a list of dictionaries with keys: "page_number", "paragraphs", "tables".
+    """
+    document_data = []
+    with pdfplumber.open(pdf_path) as pdf:
+        for i, page in enumerate(pdf.pages, start=1):
+            page_data = {"page_number": i, "paragraphs": [], "tables": []}
+            text = page.extract_text()
+            if text:
+                paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()]
+                page_data["paragraphs"] = paragraphs
+            tables = page.extract_tables()
+            dfs = []
+            for table in tables:
+                if len(table) > 1:
+                    df = pd.DataFrame(table[1:], columns=table[0])
+                else:
+                    df = pd.DataFrame(table)
+                dfs.append(df)
+            page_data["tables"] = dfs
+            document_data.append(page_data)
+    return document_data
+def extract_financial_tables_regex(text):
+    """
+    Extracts financial table information using a regex pattern (basic extraction).
+    """
+    pattern = re.compile(r"(Revenue from Operations.*?)\n\n", re.DOTALL)
+    matches = pattern.findall(text)
+    if matches:
+        data_lines = matches[0].split("\n")
+        structured_data = [line.split() for line in data_lines if line.strip()]
+        if len(structured_data) > 1:
+            df = pd.DataFrame(structured_data[1:], columns=structured_data[0])
+            return df
+    return pd.DataFrame()
+def clean_financial_data(df):
+    """
+    Cleans the financial DataFrame by converting numerical columns.
+    """
+    if df.empty:
+        return ""
+    for col in df.columns[1:]:
+        df[col] = df[col].replace({',': ''}, regex=True)
+        df[col] = pd.to_numeric(df[col], errors='coerce')
+    return df.to_string()
+def combine_extracted_info(document_data, financial_text_regex=""):
+    """
+    Combines extracted paragraphs and tables (converted to strings) into a single text.
+    Optionally appends extra financial table text.
+    """
+    text_segments = []
+    for page in document_data:
+        for paragraph in page["paragraphs"]:
+            text_segments.append(paragraph)
+        for table in page["tables"]:
+            text_segments.append(table.to_string(index=False))
+    if financial_text_regex:
+        text_segments.append(financial_text_regex)
+    return "\n".join(text_segments)
+def extract_text_from_pdf_pypdf2(pdf_path):
+    text = ""
+    with open(pdf_path, "rb") as file:
+        reader = PyPDF2.PdfReader(file)
+        for page in reader.pages:
+            text += page.extract_text() + "\n"
+    return text
+def chunk_text(text, chunk_size=500, chunk_overlap=50):
+    """
+    Uses RecursiveCharacterTextSplitter to chunk text.
+    """
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+    chunks = text_splitter.split_text(text)
+    return chunks
+def build_faiss_index(chunks, embedding_model):
+    chunk_embeddings = embedding_model.encode(chunks)
+    dimension = chunk_embeddings.shape[1]
+    index = faiss.IndexFlatL2(dimension)
+    index.add(np.array(chunk_embeddings))
+    return index, chunk_embeddings
+def retrieve_basic(query, index, chunks, embedding_model, k=5):
+    query_embedding = embedding_model.encode([query])
+    distances, indices = index.search(np.array(query_embedding), k)
+    return [chunks[i] for i in indices[0]], distances[0]
+def retrieve_bm25(query, chunks, k=5):
+    tokenized_corpus = [chunk.lower().split() for chunk in chunks]
+    bm25_model = BM25Okapi(tokenized_corpus)
+    tokenized_query = query.lower().split()
+    scores = bm25_model.get_scores(tokenized_query)
+    top_indices = np.argsort(scores)[::-1][:k]
+    return [chunks[i] for i in top_indices], scores[top_indices]
+def retrieve_advanced_embedding(query, chunks, embedding_model, k=5):
+    chunk_embeddings = embedding_model.encode(chunks)
+    query_embedding = embedding_model.encode([query])
+    scores = cosine_similarity(np.array(query_embedding), np.array(chunk_embeddings))[0]
+    top_indices = np.argsort(scores)[::-1][:k]
+    return [chunks[i] for i in top_indices], scores[top_indices]
+def rerank_candidates(query, candidate_chunks, embedding_model):
+    """
+    Re-ranks candidate chunks using cosine similarity with the query.
+    """
+    candidate_embeddings = embedding_model.encode(candidate_chunks)
+    query_embedding = embedding_model.encode([query])
+    scores = cosine_similarity(np.array(query_embedding), np.array(candidate_embeddings))[0]
+    ranked_indices = np.argsort(scores)[::-1]
+    reranked_chunks = [candidate_chunks[i] for i in ranked_indices]
+    reranked_scores = scores[ranked_indices]
+    return reranked_chunks, reranked_scores
+def get_grammar() -> LlamaGrammar:
+    """
+    :return:
+    """
+    file_path = "rag_app/guardrail.gbnf"
+    with open(file_path, 'r') as handler:
+        content = handler.read()
+        return LlamaGrammar.from_string(content)
+def answer_question(query, context=None, max_length=5000):
+    """
+    Screens the query with a guardrail prompt, then answers it with the local LLM.
+
+    Steps:
+      1. The query is classified under the guardrail grammar; if the parsed
+         "flag" is 'unsafe' a fixed refusal message is returned.
+      2. If context is None or empty, the query is answered on its own
+         (at most 200 generated tokens).
+      3. If context contains only whitespace, a fixed "insufficient context"
+         message is returned.
+      4. Otherwise the context and query are placed in a finance-reporter
+         prompt and up to max_length tokens are generated.
+
+    :param query: user question
+    :param context: retrieved document text, or None for a context-free answer
+    :param max_length: max_tokens for the context-based completion
+    :return: answer text, or one of the fixed messages described above
+    """
+    # Guardrail pass: the grammar restricts the completion to {"flag": "safe"|"unsafe"}.
+    output = llm(
+        f"""Detect and flag user inputs that fall into categories such as hate speech, violence, illegal activities,
+        explicit content, misinformation, privacy violations, self-harm, extremism, financial scams, and
+        child exploitation. Ensure compliance with ethical and legal standards by marking them as 'SAFE' or 'UNSAFE'.
+        Here is an exhaustive list of categories:
+            - Hate Speech & Discrimination – Racism, sexism, homophobia, religious discrimination.
+            - Violence & Harm – Threats, self-harm, terrorism, abuse.
+            - Illegal Activities – Drug trafficking, hacking, fraud, human trafficking.
+            - Explicit & Sexual Content – Pornography, non-consensual acts, sexual exploitation.
+            - Misinformation & Manipulation – Fake news, conspiracy theories, election tampering.
+            - Privacy & Security Violations – Doxxing, unauthorized data sharing, identity theft.
+            - Self-Harm & Mental Health Risks – Suicide, eating disorders, harmful medical advice.
+            - Extremism & Radicalization – Recruitment, propaganda, hate groups.
+            - Financial Scams & Fraud – Phishing, investment fraud, pyramid schemes.
+            - Child Exploitation & Abuse – Grooming, child pornography, trafficking
+        Query: \n {query}""",
+        max_tokens=200,
+        stop=[],
+        echo=False, grammar=get_grammar()
+    )
+    # The constrained output is a dict literal, so literal_eval can parse it.
+    # NOTE(review): json.loads would be the more conventional parser here — confirm.
+    flag = literal_eval(output['choices'][0]['text'])['flag']
+    if flag == 'unsafe':
+        return "This question has been categorized as harmful. I can't help with these types of queries."
+    # No context (None or ""): answer from the model alone.
+    if not context:
+        output = llm(
+            f"""You're a helpful assistant. Answer the user query's in a professional tone.
+            Query: \n {query}""",
+            max_tokens=200,
+            stop=[],
+            echo=False
+        )
+        return output['choices'][0]['text']
+    # Only a non-empty, whitespace-only context reaches this branch.
+    if not context.strip():
+        return "Insufficient context to generate an answer."
+    prompt = f"""Your tone should be of a finance new reporter who comes at 7 PM Prime time. Questions would be
+              regarding a company's financials. Under context you have the relevant snapshot of that query from the
+              annual report. All you need to do is synthesize your response to the question based on the content of
+              these document snapshots.
+              # Context:
+              {context}\n\n
+              # Question: {query}
+              \nAnswer:
+              """
+    # NOTE(review): max_length defaults to 5000 while the model is loaded with
+    # n_ctx=8000; a long prompt plus this budget may exceed the context window — confirm.
+    output = llm(
+        prompt,
+        max_tokens=max_length,
+        stop=[],
+        echo=False
+    )
+    return output['choices'][0]['text']
+def extract_final_answer(pdf_files, query):
+    combined_text = ""
+    for pdf_path in pdf_files:
+        print("reading:", pdf_path)
+        document_data = extract_info_from_pdf(pdf_path)
+        print("document_data:", len(document_data))
+        basic_text = extract_text_from_pdf_pypdf2(pdf_path)
+        financial_df = extract_financial_tables_regex(basic_text)
+        cleaned_financial_text = clean_financial_data(financial_df)
+        combined_text = combined_text + "\n" + combine_extracted_info(document_data, cleaned_financial_text)
+    print("Combined text length:", len(combined_text))
+    chunks = chunk_text(combined_text, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+    print(f"Total chunks created: {len(chunks)}")
+    faiss_index, _ = build_faiss_index(chunks, embedding_model)
+    basic_results, basic_distances = retrieve_basic(query, faiss_index, chunks, embedding_model, k=k)
+    print("\n--- Basic RAG Results (FAISS) ---\n\n\n")
+    for chunk, dist in zip(basic_results, basic_distances):
+        print(f"Distance: {dist:.4f}\n")
+        print(f"Chunk: {chunk}\n{'-' * 40}")
+    bm25_results, bm25_scores = retrieve_bm25(query, chunks, k=k)
+    adv_emb_results, adv_emb_scores = retrieve_advanced_embedding(query, chunks, embedding_model, k=k)
+    print("\n--- Advanced RAG BM25 Results ---")
+    for chunk, score in zip(bm25_results, bm25_scores):
+        print(f"BM25 Score: {score:.4f}\nChunk: {chunk}\n{'-' * 40}")
+    print("\n--- Advanced RAG Embedding Results ---")
+    for chunk, score in zip(adv_emb_results, adv_emb_scores):
+        print(f"Embedding Similarity: {score:.4f}\nChunk: {chunk}\n{'-' * 40}")
+    candidate_set = list(set(basic_results + bm25_results + adv_emb_results))
+    print(f"\nTotal unique candidate chunks: {len(candidate_set)}")
+    reranked_chunks, reranked_scores = rerank_candidates(query, candidate_set, embedding_model)
+    print("\n--- Re-ranked Candidate Chunks ---")
+    for chunk, score in zip(reranked_chunks, reranked_scores):
+        print(f"Re-ranked Score: {score:.4f}\nChunk: {chunk}\n{'-' * 40}")
+    top_context = "\n".join(reranked_chunks[:k])
+    final_answer = answer_question(query, top_context)
+    print("\n--- Final Answer ---")
+    print(final_answer)
+    return final_answer
+# Define paths, query, and parameters
+# pdf_path = "reliance-jio-infocomm-limited-annual-report-fy-2023-24.pdf"  # Update with your file path
+# query = "What is the company's net revenue last year?"  # Example query
+# Module-level settings read by extract_final_answer at call time.
+chunk_size = 500  # chunk_size passed to chunk_text
+chunk_overlap = 50  # chunk_overlap passed to chunk_text
+# NOTE(review): "candiadate" is a typo for "candidate"; the name is not referenced
+# anywhere else in the visible code — confirm before renaming or removing.
+candiadate_to_retrieve = 10  # Number of candidates to retrieve
+k = 2  # results requested from each retriever, and re-ranked chunks used as context
+# extract_final_answer([pdf_path],"hello world")

requirements.txt ADDED Viewed

	@@ -0,0 +1,15 @@

+numpy<2
+pandas
+gunicorn
+faiss-cpu
+llama-cpp-python
+langchain
+rank-bm25
+mesop
+sentence-transformers
+transformers
+pdfplumber
+pypdf2
+# Imported directly by rag_app/rag.py (sklearn.metrics.pairwise); declared
+# explicitly instead of relying on it arriving as a transitive dependency.
+scikit-learn
+# Imported by rag_app/embeddings.py (llama_index.core).
+llama-index-core
+torch==2.6.0
+torchaudio==2.6.0
+torchvision==0.21.0