Spaces:

Rabbit-Innotech
/

GBVR_Chatbot

Sleeping

App Files Files Community

Rabbit-Innotech committed on Apr 10

Commit

5b7f559

verified ·

1 Parent(s): 2e78714

Update app.py

Browse files

Files changed (1) hide show

app.py +704 -59

app.py CHANGED Viewed

@@ -1,64 +1,709 @@
 import gradio as gr
-from huggingface_hub import InferenceClient
-"""
-For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
-"""
-client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
-def respond(
-    message,
-    history: list[tuple[str, str]],
-    system_message,
-    max_tokens,
-    temperature,
-    top_p,
-):
-    messages = [{"role": "system", "content": system_message}]
-    for val in history:
-        if val[0]:
-            messages.append({"role": "user", "content": val[0]})
-        if val[1]:
-            messages.append({"role": "assistant", "content": val[1]})
-    messages.append({"role": "user", "content": message})
-    response = ""
-    for message in client.chat_completion(
-        messages,
-        max_tokens=max_tokens,
-        stream=True,
-        temperature=temperature,
-        top_p=top_p,
-    ):
-        token = message.choices[0].delta.content
-        response += token
-        yield response
-"""
-For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
-"""
-demo = gr.ChatInterface(
-    respond,
-    additional_inputs=[
-        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(
-            minimum=0.1,
-            maximum=1.0,
-            value=0.95,
-            step=0.05,
-            label="Top-p (nucleus sampling)",
-        ),
-    ],
 )
 if __name__ == "__main__":
-    demo.launch()

+import os
+import PyPDF2
+from PyPDF2 import PdfReader
+import pandas as pd
+## Embedding model!
+from langchain_huggingface import HuggingFaceEmbeddings
# Sentence-embedding model used to index every text snippet.
embed_model = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")

folder_path = "./"
context_data = []

# List all files in the folder
files = os.listdir(folder_path)

# CSV and Excel files; the extension is compared case-insensitively so that
# names such as "DATA.CSV" or "Sheet.XLSX" are not silently skipped.
data_files = [f for f in files if f.lower().endswith(('.csv', '.xlsx', '.xls'))]

# Process each file
for f, file in enumerate(data_files, 1):
    print(f"\nProcessing file {f}: {file}")
    file_path = os.path.join(folder_path, file)
    try:
        # Read the file based on its extension
        if file.lower().endswith('.csv'):
            df = pd.read_csv(file_path)
        else:
            df = pd.read_excel(file_path)
        # Keep the non-empty values of the third column (positional index 2)
        # as strings. A file with fewer than three columns raises here and is
        # reported by the handler below instead of stopping the start-up.
        context_data.extend(df.iloc[:, 2].dropna().astype(str).tolist())
    except Exception as e:
        # Best effort: one unreadable file must not prevent the others loading.
        print(f"Error processing file {file}: {str(e)}")
+import os
+import PyPDF2
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.schema import Document
def extract_text_from_pdf(pdf_path: str) -> str:
    """Extract the text of every page of a local PDF file.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The page texts joined by newlines, or "" when the file cannot be
        opened or parsed (the error is printed, not raised).
    """
    try:
        with open(pdf_path, "rb") as file:
            reader = PyPDF2.PdfReader(file)
            # Join with a newline so the last word of one page is not fused
            # with the first word of the next; pages without text count as "".
            return "\n".join(page.extract_text() or "" for page in reader.pages)
    except Exception as e:
        # Best effort: callers treat "" as "nothing to index".
        print(f"Error with {pdf_path}: {e}")
        return ""
# PDF files found in the working folder (extension compared case-insensitively).
pdf_files = [name for name in files if name.lower().endswith(".pdf")]

# Wrap each PDF that yields text in a Document tagged with its file name.
documents = []
for file in pdf_files:
    print(f"Processing: {file}")
    pdf_path = os.path.join(folder_path, file)
    text = extract_text_from_pdf(pdf_path)
    if not text:
        continue
    documents.append(Document(page_content=text, metadata={"source": file}))

# Split into chunks of at most 500 characters with 50 characters of overlap,
# preferring paragraph, line, sentence and clause boundaries in that order.
text_splitter = RecursiveCharacterTextSplitter(
    separators=['\n\n', '\n', '.', ','],
    chunk_size=500,
    chunk_overlap=50,
)
chunks = text_splitter.split_documents(documents)

# Plain-text view of the chunks, without metadata.
text_only_chunks = [piece.page_content for piece in chunks]
+from urllib.parse import urljoin, urlparse
+import requests
+from io import BytesIO
+from bs4 import BeautifulSoup
+from langchain_core.prompts import ChatPromptTemplate
 import gradio as gr
def scrape_websites(base_urls):
    """Scrape each base URL and the same-site pages it links to (one level deep).

    Args:
        base_urls: Iterable of URL strings; blank entries are skipped.

    Returns:
        A dict mapping each successfully scraped URL to its text: cleaned
        body text for HTML pages, extracted text for links ending in ".pdf".
        Returns {} if an unexpected error interrupts the crawl.
    """
    try:
        visited_links = set()  # To avoid revisiting the same link
        content_by_url = {}  # Store content from each URL
        for base_url in base_urls:
            if not base_url.strip():
                continue  # Skip empty or invalid URLs
            print(f"Scraping base URL: {base_url}")
            html_content = fetch_page_content(base_url)
            if not html_content:
                continue
            content_by_url[base_url] = clean_body_content(html_content)
            visited_links.add(base_url)

            # Extract and process all internal links
            soup = BeautifulSoup(html_content, "html.parser")
            for link in extract_internal_links(base_url, soup):
                if link in visited_links:
                    continue
                # Route PDFs first: fetching them as HTML would decode binary
                # data as text and store the garbage as page content.
                if link.lower().endswith('.pdf'):
                    print(f"Extracting PDF content from: {link}")
                    pdf_content = extract_pdf_text(link)
                    if pdf_content:
                        content_by_url[link] = pdf_content
                        visited_links.add(link)
                    continue
                print(f"Scraping link: {link}")
                page_content = fetch_page_content(link)
                if page_content:
                    content_by_url[link] = clean_body_content(page_content)
                    visited_links.add(link)
        return content_by_url
    except Exception as e:
        print(f"Error during scraping: {e}")
        return {}
def fetch_page_content(url):
    """Download *url* and return the response body as text.

    Uses a 10-second timeout. Returns None (after printing the error) when
    the request fails or the server answers with an HTTP error status.
    """
    try:
        reply = requests.get(url, timeout=10)
        reply.raise_for_status()
    except requests.exceptions.RequestException as err:
        print(f"Error fetching {url}: {err}")
        return None
    return reply.text
def extract_internal_links(base_url, soup):
    """Collect the absolute same-host URLs referenced by <a href> tags.

    Args:
        base_url: URL of the page that was parsed; relative hrefs are
            resolved against it.
        soup: Parsed document exposing ``find_all("a", href=True)``.

    Returns:
        A set of absolute URLs, with any ``#fragment`` removed, whose host
        matches that of *base_url*.
    """
    from urllib.parse import urldefrag

    links = set()
    for anchor in soup.find_all("a", href=True):
        # Drop the fragment: "page#top" and "page#contact" are the same
        # document and must not be scraped as separate pages.
        full_url, _fragment = urldefrag(urljoin(base_url, anchor["href"]))
        if is_internal_link(base_url, full_url):
            links.add(full_url)
    return links
def is_internal_link(base_url, link_url):
    """Return True when *link_url* points at the same host as *base_url*.

    Hosts are compared case-insensitively because host names are not
    case-sensitive. URLs without a network location (e.g. ``mailto:``)
    never match a base URL that has one.
    """
    base_netloc = urlparse(base_url).netloc.lower()
    link_netloc = urlparse(link_url).netloc.lower()
    return base_netloc == link_netloc
def extract_pdf_text(pdf_url):
    """Download a PDF and return the concatenated text of its pages.

    Args:
        pdf_url: URL of the PDF document.

    Returns:
        The extracted text, or None when the download fails, the document
        cannot be parsed, or no text could be extracted. Errors are printed,
        not raised.
    """
    try:
        # Same 10-second timeout as the HTML fetcher, so one unresponsive
        # server cannot block the crawl indefinitely.
        response = requests.get(pdf_url, timeout=10)
        response.raise_for_status()
        # Open the PDF from the response content
        with BytesIO(response.content) as file:
            reader = PdfReader(file)
            # A page without extractable text contributes "" instead of
            # aborting the whole document.
            pdf_text = "".join(page.extract_text() or "" for page in reader.pages)
        return pdf_text if pdf_text else None
    except requests.exceptions.RequestException as e:
        print(f"Error fetching PDF {pdf_url}: {e}")
        return None
    except Exception as e:
        print(f"Error reading PDF {pdf_url}: {e}")
        return None
def clean_body_content(html_content):
    """Reduce an HTML document to its visible text.

    <script> and <style> elements are removed, then the remaining text is
    returned one non-empty, whitespace-trimmed line per row.
    """
    parsed = BeautifulSoup(html_content, "html.parser")

    # Remove scripts and styles
    for tag in parsed(["script", "style"]):
        tag.extract()

    # Keep only the lines that still contain something after trimming.
    kept_lines = []
    for raw_line in parsed.get_text(separator="\n").splitlines():
        trimmed = raw_line.strip()
        if trimmed:
            kept_lines.append(trimmed)
    return "\n".join(kept_lines)
+# if __name__ == "__main__":
+#     website = [
+#                #"https://www.rib.gov.rw/index.php?id=371",
+#                "https://haguruka.org.rw/our-work/"
+#                ]
+#     all_content = scrape_websites(website)
+#     # Temporary list to store (url, content) tuples
+#     temp_list = []
+#     # Process and store each URL with its content
+#     for url, content in all_content.items():
+#         temp_list.append((url, content))
+# processed_texts = []
+# # Process each element in the temporary list
+# for element in temp_list:
+#     if isinstance(element, tuple):
+#         url, content = element  # Unpack the tuple
+#         processed_texts.append(f"url: {url}, content: {content}")
+#     elif isinstance(element, str):
+#         processed_texts.append(element)
+#     else:
+#         processed_texts.append(str(element))
+# def chunk_string(s, chunk_size=2000):
+#     return [s[i:i+chunk_size] for i in range(0, len(s), chunk_size)]
+# # List to store the chunks
+# chunked_texts = []
+# for text in processed_texts:
+#   chunked_texts.extend(chunk_string(text))
# Texts to index. Only the spreadsheet column values are enabled; the PDF
# chunks and scraped-website chunks below are currently switched off.
data = []
data.extend(context_data)
#data.extend([item for item in text_only_chunks if item not in data])
#data.extend([item for item in chunked_texts if item not in data])

#from langchain_community.vectorstores import Chroma
from langchain_chroma import Chroma

vectorstore = Chroma(
    collection_name="Dataset",
    embedding_function=embed_model,
)

# Add data to the vector store. Skip the call when nothing was loaded:
# NOTE(review): an empty batch appears to be rejected by the underlying
# collection, which would stop the app at start-up - confirm.
if data:
    vectorstore.add_texts(data)
else:
    print("No context data found; the vector store is empty.")

# OpenRouter API key, read from the 'V1' environment variable (None if unset).
api = os.environ.get('V1')
+from openai import OpenAI
+from langchain_core.prompts import PromptTemplate
+from langchain_core.output_parsers import StrOutputParser
+from langchain_core.runnables import RunnablePassthrough
+import gradio as gr
+from typing import Iterator
+import time
# Default prompt for the GBV support chatbot. Placeholders filled at format
# time: {conversation_history}, {context}, {first_name} and {question}.
template = """
You are a compassionate and supportive AI assistant specializing in helping individuals affected by Gender-Based Violence (GBV). Your primary goal is to provide emotionally intelligent support while maintaining appropriate boundaries.
You are a conversational AI. Respond directly and naturally to the user's input without displaying any system messages, backend processes, or 'thinking...' responses. Only provide the final response in a human-like and engaging manner.
When responding follow these guidelines:
1. **Emotional Intelligence**
   - Validate feelings without judgment (e.g., "It is completely understandable to feel this way")
   - Offer reassurance when appropriate, always centered on empowerment
   - Adjust your tone based on the emotional state conveyed
2. **Personalized Communication**
   - Avoid contractions (e.g., use I am instead of I'm)
   - Incorporate thoughtful pauses or reflective questions when the conversation involves difficult topics
   - Use selective emojis (😊, 🤗, ❤️) only when tone-appropriate and not during crisis discussions
   - Balance warmth with professionalism
3. **Conversation Management**
   - Refer to {conversation_history} to maintain continuity and avoid repetition
   - Keep responses concise unless greater detail is explicitly requested
   - Use clear paragraph breaks for readability
   - Prioritize immediate concerns before addressing secondary issues
4. **Information Delivery**
   - Extract only relevant information from {context} that directly addresses the question
   - Present information in accessible, non-technical language
   - Organize resource recommendations in order of relevance and accessibility
   - Provide links [URL] only when specifically requested, prefaced with clear descriptions
   - When information is unavailable, respond with: "I don't have that specific information right now, {first_name}. Would it be helpful if I focus on [alternative support option]?"
5. **Safety and Ethics**
   - Prioritize user safety in all responses
   - Never generate speculative content about their specific situation
   - Avoid phrases that could minimize experiences or create pressure
   - Include gentle reminders about professional help when discussing serious issues
Your response should balance emotional support with practical guidance.
    **Context:** {context}
    **User's Question:** {question}
    **Your Response:**
"""

# Prompt object built from the default template.
rag_prompt = PromptTemplate.from_template(template)

# Retriever over the indexed texts, created with the store's default settings.
retriever = vectorstore.as_retriever()
+import requests
# Hugging Face Inference API settings for the NLLB translation model.
API_TOKEN = os.environ.get('Token')
model_name = "facebook/nllb-200-distilled-600M"
url = f"https://api-inference.huggingface.co/models/{model_name}"
headers = {
    "Authorization": f"Bearer {API_TOKEN}"
}


def translate_text(text, src_lang, tgt_lang):
    """Translate *text* with the Hugging Face Inference API.

    Args:
        text: Text to translate.
        src_lang: Source language code sent as ``src_lang`` (e.g. "eng_Latn").
        tgt_lang: Target language code sent as ``tgt_lang`` (e.g. "kin_Latn").

    Returns:
        The translated text. The original *text* is returned unchanged when
        it is empty or blank, when the request fails or times out, when the
        API answers with a non-200 status, or when the response payload does
        not contain a translation. This function never raises for those cases.
    """
    # Nothing to translate: avoid a pointless network round trip.
    if not text or not text.strip():
        return text

    try:
        response = requests.post(
            url,
            headers=headers,
            json={
                "inputs": text,
                "parameters": {
                    "src_lang": src_lang,
                    "tgt_lang": tgt_lang
                }
            },
            timeout=30,
        )
    except requests.exceptions.RequestException as e:
        # Connection problems and timeouts fall back to the untranslated text
        # instead of breaking the chat handler.
        print(f"Translation error: {e}")
        return text

    if response.status_code != 200:
        print(f"Translation error: {response.status_code}, {response.text}")
        return text  # Return original text if translation fails

    try:
        result = response.json()
        # The API answers either with a list of results or a single object.
        if isinstance(result, list):
            result = result[0]
        return result['translation_text']
    except (ValueError, LookupError, TypeError) as e:
        # Invalid JSON, an empty list, or an object without a translation.
        print(f"Translation error: unexpected response payload ({e})")
        return text
class OpenRouterLLM:
    """Streaming chat client for OpenRouter's OpenAI-compatible endpoint."""

    def __init__(self, key: str):
        """Create the API client.

        Args:
            key: OpenRouter API key.

        Raises:
            Exception: Any error raised while building the client is printed
                and re-raised.
        """
        try:
            self.client = OpenAI(
                base_url="https://openrouter.ai/api/v1",
                api_key=key
            )
            # NOTE(review): these headers are stored but never attached to a
            # request in this class - confirm whether they should be sent.
            self.headers = {
                "HTTP-Referer": "http://localhost:3000",
                "X-Title": "Local Development"
            }
        except Exception as e:
            print(f"Initialization error: {e}")
            raise

    def stream(self, prompt: str) -> Iterator[str]:
        """Send *prompt* as a single user message and yield the reply pieces.

        Args:
            prompt: Full prompt text.

        Yields:
            Each non-empty content fragment of the streamed completion. If
            the request or the stream fails, one final
            "Streaming error: ..." string is yielded instead of raising.
        """
        try:
            completion = self.client.chat.completions.create(
                #model="deepseek/deepseek-r1-distill-llama-70b:free",
                model="meta-llama/llama-3.3-70b-instruct:free",
                #model="google/gemini-2.5-pro-exp-03-25:free",
                messages=[{"role": "user", "content": prompt}],
                stream=True
            )
            for chunk in completion:
                # Some chunks carry no choices at all; indexing them would
                # raise and cut the answer short with an error message.
                if not chunk.choices:
                    continue
                content = getattr(chunk.choices[0].delta, "content", None)
                if content:
                    yield content
        except Exception as e:
            yield f"Streaming error: {str(e)}"
class UserSession:
    """Holds the current user, the welcome message and the chat history."""

    def __init__(self, llm: "OpenRouterLLM"):
        """Store the LLM used to generate the welcome message.

        Args:
            llm: Object whose ``stream(prompt)`` yields text fragments.
        """
        self.current_user = None
        self.welcome_message = None
        self.conversation_history = []  # list of {"role", "content"} dicts
        self.llm = llm

    def set_user(self, user_info):
        """Register *user_info* and restart the history with a welcome message.

        Args:
            user_info: Dict describing the user; its "Nickname" entry is used
                for the welcome message ("Guest" when missing).
        """
        self.current_user = user_info
        self.set_welcome_message(user_info.get("Nickname", "Guest"))
        # Initialize conversation history with welcome message
        welcome = self.get_welcome_message()
        self.conversation_history = [
            {"role": "assistant", "content": welcome},
        ]

    def get_user(self):
        """Return the dict passed to ``set_user``, or None if none was set."""
        return self.current_user

    def set_welcome_message(self, Nickname, src_lang="eng_Latn", tgt_lang="kin_Latn"):
        """Generate, translate and store an HTML-formatted welcome message.

        Args:
            Nickname: Name to greet.
            src_lang: Language code of the generated text.
            tgt_lang: Language code the message is translated into.
        """
        import html

        prompt = (
            f"Create a very brief welcome message for {Nickname}. "
            f"The message should: "
            f"1. Welcome {Nickname} warmly and professionally. "
            f"2. Emphasize that this is a safe and trusted space. "
            f"3. Highlight specialized support for gender-based violence (GBV) and legal assistance. "
            f"4. Use a tone that is warm, reassuring, and professional. "
            f"5. Keep the message concise and impactful."
        )
        # Use the LLM to generate the message, then translate it
        welcome = "".join(self.llm.stream(prompt))
        welcome_text = translate_text(welcome, src_lang, tgt_lang)
        # The text derives from a user-supplied nickname and model output, so
        # escape it before embedding it in markup.
        self.welcome_message = (
            f"<div style='font-size: 20px;'>"
            f"{html.escape(welcome_text, quote=False)}"
            f"</div>"
        )

    def get_welcome_message(self):
        """Return the stored welcome message (None before it is generated)."""
        return self.welcome_message

    def add_to_history(self, role, message):
        """Add a message to the conversation history."""
        self.conversation_history.append({"role": role, "content": message})

    def get_conversation_history(self):
        """Get the full conversation history as a list of dicts."""
        return self.conversation_history

    def get_formatted_history(self):
        """Get the conversation history formatted as a string for the LLM.

        Each entry becomes "User: ..." or "Assistant: ..." followed by a
        blank line; any role other than "user" is labelled "Assistant".
        """
        return "".join(
            f"{'User' if entry['role'] == 'user' else 'Assistant'}: {entry['content']}\n\n"
            for entry in self.conversation_history
        )
# Module-wide singletons: one LLM client and one session object.
# NOTE(review): this single UserSession is read and written by every request
# handled by this process, so nickname and history do not appear to be
# isolated per visitor - confirm whether per-session state is required.
api_key = api
llm_instance = OpenRouterLLM(key=api_key)
#llm_instance = model
user_session = UserSession(llm_instance)
def collect_user_info(Nickname):
    """Register the nickname and switch the UI from registration to chat.

    Args:
        Nickname: Text entered in the nickname box.

    Returns:
        A 4-tuple matching the click handler's outputs: the message to show,
        an update for the chatbot container, an update for the registration
        container, and the initial chat history. When the nickname is empty
        or blank, an error message is returned and the registration form
        stays visible.
    """
    # Treat None and whitespace-only input as "no nickname".
    Nickname = (Nickname or "").strip()
    if not Nickname:
        return "Nickname is required to proceed.", gr.update(visible=False), gr.update(visible=True), []

    # Store user info for chat session
    user_info = {
        "Nickname": Nickname,
        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
    }

    # Set user in session (this also generates the welcome message)
    user_session.set_user(user_info)
    welcome_message = user_session.get_welcome_message()

    # Start the visible conversation with the welcome message
    chat_history = add_initial_message([(None, welcome_message)])

    # Return welcome message and update UI
    return welcome_message, gr.update(visible=True), gr.update(visible=False), chat_history
def add_initial_message(chatbot):
    """Return the initial chat history for a new conversation.

    Currently a pass-through hook: no extra opening message is appended,
    and the list that was passed in is returned as-is.
    """
    return chatbot
def create_rag_chain(retriever, template, api_key):
    """Build the retrieval-augmented answer function.

    Args:
        retriever: Object whose ``invoke(question)`` returns documents with a
            ``page_content`` attribute.
        template: Prompt template text with the placeholders ``context``,
            ``question``, ``first_name`` and ``conversation_history``.
        api_key: Key used to create the OpenRouter client.

    Returns:
        A function taking ``{"question": ...}`` and returning an iterator of
        response text fragments.
    """
    chat_llm = OpenRouterLLM(api_key)
    prompt_template = PromptTemplate.from_template(template)

    def stream_func(input_dict):
        question = input_dict["question"]

        # Retrieved passages, one per line, form the context.
        retrieved_docs = retriever.invoke(question)
        context_str = "\n".join(doc.page_content for doc in retrieved_docs)

        # Nickname of the session user; "User" when nobody has registered.
        session_user = user_session.get_user() or {}

        filled_prompt = prompt_template.format(
            context=context_str,
            question=question,
            first_name=session_user.get("Nickname", "User"),
            conversation_history=user_session.get_formatted_history(),
        )
        # Stream response
        return chat_llm.stream(filled_prompt)

    return stream_func
+# def rag_memory_stream(message, history):
+#     # Add user message to history
+#     user_session.add_to_history("user", message)
+#     # Initialize with empty response
+#     partial_text = ""
+#     full_response = ""
+#     # Use the rag_chain with the question
+#     for new_text in rag_chain({"question": message}):
+#         partial_text += new_text
+#         full_response = partial_text
+#         yield partial_text
+#     # After generating the complete response, add it to history
+#     user_session.add_to_history("assistant", full_response)
def rag_memory_stream(message, history, user_lang="kin_Latn", system_lang="eng_Latn"):
    """Answer one chat message, translating in and out of the user's language.

    Args:
        message: Text typed by the user, in *user_lang*.
        history: Chat history supplied by the chat UI; not used here because
            the conversation is tracked in ``user_session``.
        user_lang: Language code of the user's messages.
        system_lang: Language code used for retrieval and generation.

    Yields:
        Exactly one string: the complete answer translated to *user_lang*.
        The answer is not streamed piecewise because the whole text is
        translated at once.
    """
    english_message = translate_text(message, user_lang, system_lang)

    # The chain formats its prompt when it is called and embeds the stored
    # history next to the question, so record this turn only afterwards;
    # otherwise the question is sent twice.
    full_response = "".join(rag_chain({"question": english_message}))

    user_session.add_to_history("user", english_message)
    user_session.add_to_history("assistant", full_response)

    yield translate_text(full_response, system_lang, user_lang)
+import gradio as gr
+api_key = api
def chatbot_interface():
    """Build the Gradio app: a nickname form followed by the chat view.

    Side effects: rebinds the module-level ``template`` to the strict,
    context-only prompt below and (re)creates the module-level ``rag_chain``.

    Returns:
        The ``gr.Blocks`` application, with its CSS attached.
    """
    global template, rag_chain

    template = """
        You are a compassionate and supportive AI assistant specializing in helping individuals affected by Gender-Based Violence (GBV). Your responses must be based EXCLUSIVELY on the information provided in the context. Your primary goal is to provide emotionally intelligent support while maintaining appropriate boundaries.
        **Previous conversation:** {conversation_history}
        **Context information:** {context}
        **User's Question:** {question}
        When responding follow these guidelines:
        1. **Strict Context Adherence**
           - Only use information that appears in the provided {context}
           - If the answer is not found in the context, state "I don't have that information in my available resources" rather than generating a response
        2. **Personalized Communication**
           - Avoid contractions (e.g., use I am instead of I'm)
           - Incorporate thoughtful pauses or reflective questions when the conversation involves difficult topics
           - Use selective emojis (😊, 🤗, ❤️) only when tone-appropriate and not during crisis discussions
           - Balance warmth with professionalism
        3. **Emotional Intelligence**
           - Validate feelings without judgment
           - Offer reassurance when appropriate, always centered on empowerment
           - Adjust your tone based on the emotional state conveyed
        4. **Conversation Management**
           - Refer to {conversation_history} to maintain continuity and avoid repetition
           - Use clear paragraph breaks for readability
        5. **Information Delivery**
           - Extract only relevant information from {context} that directly addresses the question
           - Present information in accessible, non-technical language
           - When information is unavailable, respond with: "I don't have that specific information right now, {first_name}. Would it be helpful if I focus on [alternative support option]?"
        6. **Safety and Ethics**
           - Do not generate any speculative content or advice not supported by the context
           - If the context contains safety information, prioritize sharing that information
        Your response must come entirely from the provided context, maintaining the supportive tone while never introducing information from outside the provided materials.
        **Context:** {context}
        **User's Question:** {question}
        **Your Response:**
    """
    rag_chain = create_rag_chain(retriever, template, api)

    # Stylesheet attached to the app once the layout has been built.
    page_css = """
    :root {
        --background: #f0f0f0;
        --text: #000000;
    }
    body, .gradio-container {
        margin: 0;
        padding: 0;
        width: 100vw;
        height: 100vh;
        display: flex;
        flex-direction: column;
        justify-content: center;
        align-items: center;
        background: var(--background);
        color: var(--text);
    }
    .gradio-container {
        max-width: 100%;
        max-height: 100%;
    }
    .gr-box {
        background: var(--background);
        color: var(--text);
        border-radius: 12px;
        padding: 2rem;
        border: 1px solid rgba(0, 0, 0, 0.1);
        box-shadow: 0 4px 6px rgba(0, 0, 0, 0.05);
    }
    .gr-button-primary {
        background: var(--background);
        color: var(--text);
        padding: 12px 24px;
        border-radius: 8px;
        transition: all 0.3s ease;
        border: 1px solid rgba(0, 0, 0, 0.1);
    }
    .gr-button-primary:hover {
        transform: translateY(-1px);
        box-shadow: 0 4px 12px rgba(0, 0, 0, 0.2);
    }
    footer {
        text-align: center;
        color: var(--text);
        opacity: 0.7;
        padding: 1rem;
        font-size: 0.9em;
    }
    .gr-markdown h3 {
        color: var(--text);
        margin-bottom: 1rem;
    }
    .registration-markdown, .chat-title h1 {
        color: var(--text);
    }
    """

    with gr.Blocks() as demo:
        # Step 1: registration form, visible on load.
        with gr.Column(visible=True, elem_id="registration_container") as registration_container:
            gr.Markdown("### Your privacy matters to us! Just share a nickname you feel comfy with to start chatting..")
            with gr.Row():
                nickname_box = gr.Textbox(
                    label="Nickname",
                    placeholder="Enter your Nickname You feel comfy",
                    scale=1,
                    elem_id="input_nickname",
                )
            with gr.Row():
                start_btn = gr.Button("Start Chatting", variant="primary", scale=2)
            status_md = gr.Markdown()

        # Step 2: chat view, hidden until a nickname has been accepted.
        with gr.Column(visible=False, elem_id="chatbot_container") as chatbot_container:
            chat_interface = gr.ChatInterface(
                fn=rag_memory_stream,
                title="Chat with GBVR",
                fill_height=True,
            )
            # Footer with version info
            gr.Markdown("Ijwi ry'Ubufasha Chatbot v1.0.0 © 2025")

        # Submitting the nickname shows the chat, hides the form and seeds
        # the chatbot with the welcome message.
        start_btn.click(
            collect_user_info,
            inputs=[nickname_box],
            outputs=[status_md, chatbot_container, registration_container, chat_interface.chatbot],
        )

    demo.css = page_css
    return demo
# Entry point: build the interface and serve it with a public share link.
if __name__ == "__main__":
    app = chatbot_interface()
    app.launch(share=True)