Rabbit-Innotech committed on
Commit
bd67e9b
·
verified ·
1 Parent(s): 5f75e1c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +327 -663
app.py CHANGED
@@ -1,284 +1,93 @@
1
  import os
2
- import requests
3
  import time
4
- from io import BytesIO
5
- from typing import Iterator, List, Dict, Any, Optional
6
- from urllib.parse import urljoin, urlparse
7
-
8
- # Data processing imports
9
  import pandas as pd
10
- import PyPDF2
11
- from PyPDF2 import PdfReader
12
- from bs4 import BeautifulSoup
13
-
14
- # AI and NLP imports
15
- from openai import OpenAI
16
  from langchain_huggingface import HuggingFaceEmbeddings
17
- from langchain_chroma import Chroma
18
  from langchain_core.prompts import PromptTemplate
19
  from langchain_core.output_parsers import StrOutputParser
20
  from langchain_core.runnables import RunnablePassthrough
21
- from langchain.text_splitter import RecursiveCharacterTextSplitter
22
- from langchain.schema import Document
23
 
24
- # UI import
 
 
 
 
 
 
 
 
 
 
25
  import gradio as gr
 
26
 
27
- class DataProcessor:
28
- """Handles processing of various data sources including CSV, Excel, PDF, and web content."""
29
-
30
- def __init__(self, folder_path: str = "./"):
31
- self.folder_path = folder_path
32
- self.files = os.listdir(folder_path)
33
-
34
- def process_tabular_data(self) -> List[str]:
35
- """Process CSV and Excel files to extract data."""
36
- context_data = []
37
- data_files = [f for f in self.files if f.endswith(('.csv', '.xlsx', '.xls'))]
38
-
39
- for f, file in enumerate(data_files, 1):
40
- print(f"\nProcessing file {f}: {file}")
41
- file_path = os.path.join(self.folder_path, file)
42
-
43
- try:
44
- # Read file based on extension
45
- if file.endswith('.csv'):
46
- df = pd.read_csv(file_path)
47
- else:
48
- df = pd.read_excel(file_path)
49
-
50
- # Extract non-empty values from column 2
51
- context_data.extend(df.iloc[:, 2].dropna().astype(str).tolist())
52
- except Exception as e:
53
- print(f"Error processing file {file}: {str(e)}")
54
-
55
- return context_data
56
-
57
- def extract_text_from_pdf(self, pdf_path: str) -> str:
58
- """Extract text content from a PDF file."""
59
- try:
60
- with open(pdf_path, "rb") as file:
61
- reader = PyPDF2.PdfReader(file)
62
- return "".join(page.extract_text() or "" for page in reader.pages)
63
- except Exception as e:
64
- print(f"Error with {pdf_path}: {e}")
65
- return ""
66
-
67
- def process_pdf_files(self) -> List[Document]:
68
- """Process all PDF files and return documents."""
69
- pdf_files = [f for f in self.files if f.lower().endswith(".pdf")]
70
- documents = []
71
-
72
- for file in pdf_files:
73
- print(f"Processing: {file}")
74
- pdf_path = os.path.join(self.folder_path, file)
75
- text = self.extract_text_from_pdf(pdf_path)
76
- if text:
77
- documents.append(Document(page_content=text, metadata={"source": file}))
78
-
79
- return documents
80
-
81
- def split_documents(self, documents: List[Document], chunk_size: int = 500) -> List[str]:
82
- """Split documents into manageable chunks."""
83
- text_splitter = RecursiveCharacterTextSplitter(
84
- separators=['\n\n', '\n', '.', ','],
85
- chunk_size=chunk_size,
86
- chunk_overlap=50
87
- )
88
- chunks = text_splitter.split_documents(documents)
89
- return [chunk.page_content for chunk in chunks]
90
-
91
- def extract_pdf_text_from_url(self, pdf_url: str) -> Optional[str]:
92
- """Extract text from a PDF URL."""
93
- try:
94
- response = requests.get(pdf_url)
95
- response.raise_for_status()
96
-
97
- with BytesIO(response.content) as file:
98
- reader = PdfReader(file)
99
- pdf_text = ""
100
- for page in reader.pages:
101
- pdf_text += page.extract_text()
102
-
103
- return pdf_text if pdf_text else None
104
- except requests.exceptions.RequestException as e:
105
- print(f"Error fetching PDF {pdf_url}: {e}")
106
- return None
107
- except Exception as e:
108
- print(f"Error reading PDF {pdf_url}: {e}")
109
- return None
110
 
 
 
111
 
112
- class WebScraper:
113
- """Web scraping functionality for collecting data from websites."""
114
-
115
- def scrape_websites(self, base_urls: List[str]) -> Dict[str, str]:
116
- """Scrape content from a list of base URLs and their internal links."""
117
- try:
118
- visited_links = set()
119
- content_by_url = {}
120
-
121
- for base_url in base_urls:
122
- if not base_url.strip():
123
- continue
124
-
125
- print(f"Scraping base URL: {base_url}")
126
- html_content = self.fetch_page_content(base_url)
127
- if html_content:
128
- cleaned_content = self.clean_body_content(html_content)
129
- content_by_url[base_url] = cleaned_content
130
- visited_links.add(base_url)
131
-
132
- # Extract and process internal links
133
- soup = BeautifulSoup(html_content, "html.parser")
134
- links = self.extract_internal_links(base_url, soup)
135
-
136
- for link in links:
137
- if link not in visited_links:
138
- print(f"Scraping link: {link}")
139
- page_content = self.fetch_page_content(link)
140
- if page_content:
141
- cleaned_content = self.clean_body_content(page_content)
142
- content_by_url[link] = cleaned_content
143
- visited_links.add(link)
144
-
145
- # Extract PDF content if link is a PDF
146
- if link.lower().endswith('.pdf'):
147
- print(f"Extracting PDF content from: {link}")
148
- pdf_processor = DataProcessor()
149
- pdf_content = pdf_processor.extract_pdf_text_from_url(link)
150
- if pdf_content:
151
- content_by_url[link] = pdf_content
152
-
153
- return content_by_url
154
- except Exception as e:
155
- print(f"Error during scraping: {e}")
156
- return {}
157
-
158
- def fetch_page_content(self, url: str) -> Optional[str]:
159
- """Fetch HTML content from a URL."""
160
- try:
161
- response = requests.get(url, timeout=10)
162
- response.raise_for_status()
163
- return response.text
164
- except requests.exceptions.RequestException as e:
165
- print(f"Error fetching {url}: {e}")
166
- return None
167
-
168
- def extract_internal_links(self, base_url: str, soup: BeautifulSoup) -> set:
169
- """Extract internal links from a BeautifulSoup object."""
170
- links = set()
171
- for anchor in soup.find_all("a", href=True):
172
- href = anchor["href"]
173
- full_url = urljoin(base_url, href)
174
- if self.is_internal_link(base_url, full_url):
175
- links.add(full_url)
176
- return links
177
 
178
- def is_internal_link(self, base_url: str, link_url: str) -> bool:
179
- """Check if a link is internal to the base URL."""
180
- base_netloc = urlparse(base_url).netloc
181
- link_netloc = urlparse(link_url).netloc
182
- return base_netloc == link_netloc
183
 
184
- def clean_body_content(self, html_content: str) -> str:
185
- """Clean HTML content to extract useful text."""
186
- soup = BeautifulSoup(html_content, "html.parser")
187
 
188
- # Remove scripts and styles
189
- for script_or_style in soup(["script", "style"]):
190
- script_or_style.extract()
191
-
192
- # Get text and clean up
193
- cleaned_content = soup.get_text(separator="\n")
194
- cleaned_content = "\n".join(
195
- line.strip() for line in cleaned_content.splitlines() if line.strip()
196
- )
197
- return cleaned_content
198
-
199
-
200
- class TranslationService:
201
- """Translation service using Hugging Face API."""
202
-
203
- def __init__(self, api_token: str, model_name: str = "facebook/nllb-200-distilled-600M"):
204
- self.model_name = model_name
205
- self.url = f"https://api-inference.huggingface.co/models/{model_name}"
206
- self.headers = {"Authorization": f"Bearer {api_token}"}
207
-
208
- def translate_text(self, text: str, src_lang: str, tgt_lang: str) -> str:
209
- """Translate text using Hugging Face API."""
210
  try:
211
- response = requests.post(
212
- self.url,
213
- headers=self.headers,
214
- json={
215
- "inputs": text,
216
- "parameters": {
217
- "src_lang": src_lang,
218
- "tgt_lang": tgt_lang
219
- }
220
- }
221
- )
222
 
223
- if response.status_code == 200:
224
- result = response.json()
225
- if isinstance(result, list) and len(result) > 0:
226
- return result[0]['translation_text']
227
- return result['translation_text']
 
 
228
  else:
229
- print(f"Translation error: {response.status_code}, {response.text}")
230
- return text # Return original text if translation fails
231
  except Exception as e:
232
- print(f"Translation error: {e}")
233
- return text # Return original text if translation fails
234
-
235
 
236
- class OpenRouterLLM:
237
- """LLM service using OpenRouter API."""
 
 
 
238
 
239
- def __init__(self, key: str):
240
- try:
241
- self.client = OpenAI(
242
- base_url="https://openrouter.ai/api/v1",
243
- api_key=key
244
- )
245
- self.headers = {
246
- "HTTP-Referer": "http://localhost:3000",
247
- "X-Title": "Local Development"
248
- }
249
- except Exception as e:
250
- print(f"Initialization error: {e}")
251
- raise
252
 
253
- def stream(self, prompt: str) -> Iterator[str]:
254
- """Stream response from LLM."""
255
- try:
256
- completion = self.client.chat.completions.create(
257
- # model="meta-llama/llama-3.3-70b-instruct:free",
258
- model="meta-llama/llama-4-maverick:free",
259
- messages=[{"role": "user", "content": prompt}],
260
- stream=True
261
- )
262
-
263
- for chunk in completion:
264
- delta = chunk.choices[0].delta
265
- if hasattr(delta, "content") and delta.content:
266
- yield delta.content
267
- except Exception as e:
268
- yield f"Streaming error: {str(e)}"
269
-
270
 
 
271
  class UserSession:
272
- """Manage user session information and conversation history."""
273
-
274
- def __init__(self, llm: OpenRouterLLM):
275
  self.current_user = None
276
  self.welcome_message = None
277
  self.conversation_history = []
278
  self.llm = llm
279
-
280
- def set_user(self, user_info: Dict[str, Any]) -> None:
281
- """Set current user and initialize welcome message."""
282
  self.current_user = user_info
283
  self.set_welcome_message(user_info.get("Nickname", "Guest"))
284
  # Initialize conversation history with welcome message
@@ -286,454 +95,309 @@ class UserSession:
286
  self.conversation_history = [
287
  {"role": "assistant", "content": welcome},
288
  ]
289
-
290
- def get_user(self) -> Dict[str, Any]:
291
- """Get current user information."""
292
  return self.current_user
293
-
294
- def set_welcome_message(self, nickname: str, src_lang: str = "eng_Latn", tgt_lang: str = "kin_Latn") -> None:
295
  """Set a dynamic welcome message using the LLM."""
 
296
  prompt = (
297
- f"Create a very brief welcome message for {nickname}. "
298
  f"The message should: "
299
  f"1. Welcome {nickname} warmly and professionally. "
300
  f"2. Emphasize that this is a safe and trusted space. "
301
  f"3. Highlight specialized support for gender-based violence (GBV) and legal assistance. "
302
  f"4. Use a tone that is warm, reassuring, and professional. "
303
- f"5. Keep the message concise and impactful."
304
  )
305
-
306
- try:
307
- # Use the LLM to generate the message
308
- welcome = "".join(list(self.llm.stream(prompt)))
309
-
310
- # Get translation service and translate welcome message
311
- api_token = os.environ.get('Token')
312
- if not api_token:
313
- self.welcome_message = f"Welcome {nickname}! This is a safe space where you can find support and resources."
314
- return
315
-
316
- translator = TranslationService(api_token)
317
- welcome_text = translator.translate_text(welcome, src_lang, tgt_lang)
318
-
319
- # Format the message with HTML styling
320
- self.welcome_message = welcome_text
321
- except Exception as e:
322
- print(f"Error generating welcome message: {e}")
323
- self.welcome_message = f"Welcome {nickname}! This is a safe space where you can find support and resources."
324
-
325
- def get_welcome_message(self) -> str:
326
- """Get the welcome message."""
327
- return self.welcome_message or "Welcome! This is a safe space where you can find support."
328
-
329
- def add_to_history(self, role: str, message: str) -> None:
330
- """Add a message to the conversation history."""
331
  self.conversation_history.append({"role": role, "content": message})
332
-
333
- def get_conversation_history(self) -> List[Dict[str, str]]:
334
- """Get the full conversation history."""
335
  return self.conversation_history
336
-
337
- def get_formatted_history(self) -> str:
338
- """Get conversation history formatted as a string for the LLM."""
339
  formatted_history = ""
340
  for entry in self.conversation_history:
341
  role = "User" if entry["role"] == "user" else "Assistant"
342
  formatted_history += f"{role}: {entry['content']}\n\n"
343
  return formatted_history
344
 
 
 
 
345
 
346
- class GBVSupportChatbot:
347
- """Main chatbot application class."""
 
 
348
 
349
- def __init__(self):
350
- self.api_key = os.environ.get('A1')
351
- self.api_token = os.environ.get('Token')
352
-
353
- # Add fallback for missing environment variables
354
- if not self.api_key:
355
- print("Warning: V2 API key not found in environment variables.")
356
- self.api_key = "demo_key" # Use a placeholder value
357
-
358
- if not self.api_token:
359
- print("Warning: Token not found in environment variables.")
360
- self.api_token = "demo_token" # Use a placeholder value
361
-
362
- self.llm_instance = OpenRouterLLM(key=self.api_key)
363
- self.user_session = UserSession(self.llm_instance)
364
- self.translator = TranslationService(self.api_token)
365
-
366
- # Initialize embedding model
367
- try:
368
- self.embed_model = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")
369
-
370
- # Initialize vector store
371
- self.vectorstore = Chroma(
372
- collection_name="Dataset",
373
- embedding_function=self.embed_model,
374
- )
375
- except Exception as e:
376
- print(f"Error initializing embeddings: {e}")
377
- # Create a simple placeholder for vectorstore if initialization fails
378
- self.vectorstore = None
379
-
380
- # Template for GBV support chatbot
381
- self.template = """
382
- You are a compassionate and supportive AI assistant specializing in helping individuals affected by Gender-Based Violence (GBV). Your responses must be based EXCLUSIVELY on the information provided in the context. Your primary goal is to provide emotionally intelligent support while maintaining appropriate boundaries.
383
-
384
- **Previous conversation:** {conversation_history}
385
- **Context information:** {context}
386
- **User's Question:** {question}
387
 
388
- When responding follow these guidelines:
 
389
 
390
- 1. **Strict Context Adherence**
391
- - Only use information that appears in the provided {context}
392
- - If the answer is not found in the context, state "I don't have that information in my available resources" rather than generating a response
393
 
394
- 2. **Personalized Communication**
395
- - Avoid contractions (e.g., use I am instead of I'm)
396
- - Incorporate thoughtful pauses or reflective questions when the conversation involves difficult topics
397
- - Use selective emojis (😊, 🤗, ❤️) only when tone-appropriate and not during crisis discussions
398
- - Balance warmth with professionalism
399
-
400
- 3. **Emotional Intelligence**
401
- - Validate feelings without judgment
402
- - Offer reassurance when appropriate, always centered on empowerment
403
- - Adjust your tone based on the emotional state conveyed
404
-
405
- 4. **Conversation Management**
406
- - Refer to {conversation_history} to maintain continuity and avoid repetition
407
- - Use clear paragraph breaks for readability
408
-
409
- 5. **Information Delivery**
410
- - Extract only relevant information from {context} that directly addresses the question
411
- - Present information in accessible, non-technical language
412
- - When information is unavailable, respond with: "I don't have that specific information right now, {first_name}. Would it be helpful if I focus on [alternative support option]?"
413
-
414
- 6. **Safety and Ethics**
415
- - Do not generate any speculative content or advice not supported by the context
416
- - If the context contains safety information, prioritize sharing that information
417
-
418
- Your response must come entirely from the provided context, maintaining the supportive tone while never introducing information from outside the provided materials.
419
-
420
- **Context:** {context}
421
- **User's Question:** {question}
422
- **Your Response:**
423
- """
424
-
425
- def load_data(self) -> None:
426
- """Load and process all data sources."""
427
- if not self.vectorstore:
428
- print("Warning: Vector store not initialized. Skipping data loading.")
429
- return
430
-
431
- try:
432
- # Process all data sources
433
- data_processor = DataProcessor()
434
- context_data = data_processor.process_tabular_data()
435
-
436
- # Process PDFs
437
- pdf_documents = data_processor.process_pdf_files()
438
- text_chunks = data_processor.split_documents(pdf_documents)
439
-
440
- # Combine all data
441
- all_data = []
442
- all_data.extend(context_data)
443
- all_data.extend([item for item in text_chunks if item not in all_data])
444
-
445
- if all_data:
446
- # Add data to vector store
447
- self.vectorstore.add_texts(all_data)
448
- else:
449
- print("Warning: No data found to load into vector store.")
450
- except Exception as e:
451
- print(f"Error loading data: {e}")
452
-
453
- def create_rag_chain(self):
454
- """Create RAG chain with user context and conversation history."""
455
- try:
456
- if self.vectorstore:
457
- retriever = self.vectorstore.as_retriever()
458
- else:
459
- # Create a simple fallback if vectorstore is not available
460
- retriever = FallbackRetriever()
461
-
462
- rag_prompt = PromptTemplate.from_template(self.template)
463
-
464
- def stream_func(input_dict):
465
- try:
466
- # Get context using the retriever's invoke method
467
- if self.vectorstore:
468
- context = retriever.invoke(input_dict["question"])
469
- context_str = "\n".join([doc.page_content for doc in context])
470
- else:
471
- context_str = "No specific information available on this topic."
472
-
473
- # Get user info from the session
474
- user_info = self.user_session.get_user() or {}
475
- first_name = user_info.get("Nickname", "User")
476
-
477
- # Get conversation history
478
- conversation_history = self.user_session.get_formatted_history()
479
-
480
- # Format prompt with user context and conversation history
481
- prompt = rag_prompt.format(
482
- context=context_str,
483
- question=input_dict["question"],
484
- first_name=first_name,
485
- conversation_history=conversation_history
486
- )
487
-
488
- # Stream response
489
- return self.llm_instance.stream(prompt)
490
- except Exception as e:
491
- print(f"Error in RAG chain: {e}")
492
- yield f"I apologize, but I'm having trouble processing your request. Please try again or rephrase your question."
493
-
494
- return stream_func
495
- except Exception as e:
496
- print(f"Error creating RAG chain: {e}")
497
-
498
- # Return a simple fallback function
499
- def fallback_func(input_dict):
500
- yield "I apologize, but I'm having technical difficulties. Please try again later."
501
-
502
- return fallback_func
503
-
504
- def collect_user_info(self, nickname: str):
505
- """Collect and process user information."""
506
- if not nickname:
507
- return "Nickname is required to proceed.", gr.update(visible=False), gr.update(visible=True), []
508
-
509
- # Store user info for chat session
510
- user_info = {
511
- "Nickname": nickname,
512
- "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
513
  }
514
-
515
- # Set user in session
516
- self.user_session.set_user(user_info)
517
-
518
- # Generate welcome message
519
- welcome_message = self.user_session.get_welcome_message()
520
-
521
- # Create welcome message in the new messages format for Gradio chatbot
522
- chat_history = [{"role": "assistant", "content": welcome_message}]
523
-
524
- # Return welcome message and update UI
525
- return welcome_message, gr.update(visible=True), gr.update(visible=False), chat_history
526
 
527
- def rag_memory_stream(self, message: str, history):
528
- """Process user message, translate, and generate response."""
529
- try:
530
- # First, yield the current history to show user message
531
- history_copy = history.copy()
532
- history_copy.append({"role": "user", "content": message})
533
- yield history_copy, ""
534
-
535
- # Translate user message to English (from Kinyarwanda by default)
536
- try:
537
- english_message = self.translator.translate_text(message, "kin_Latn", "eng_Latn")
538
- except Exception as e:
539
- print(f"Translation error: {e}")
540
- english_message = message # Fallback to original message if translation fails
541
-
542
- # Add translated message to history
543
- self.user_session.add_to_history("user", english_message)
544
-
545
- # Generate response using RAG chain
546
- full_response = ""
547
- rag_chain = self.create_rag_chain()
548
-
549
- # Generate chunks of response and update as they come
550
- for new_text in rag_chain({"question": english_message}):
551
- full_response += new_text
552
-
553
- # Translate response back to user language (Kinyarwanda by default)
554
- try:
555
- translated_response = self.translator.translate_text(full_response, "eng_Latn", "kin_Latn")
556
- except Exception as e:
557
- print(f"Translation error: {e}")
558
- translated_response = full_response # Fallback to original message if translation fails
559
-
560
- # Update history with current response
561
- current_history = history_copy.copy()
562
- current_history.append({"role": "assistant", "content": translated_response})
563
- yield current_history, ""
564
-
565
- # Add final response to session history
566
- self.user_session.add_to_history("assistant", full_response)
567
-
568
- except Exception as e:
569
- print(f"Error in chat processing: {e}")
570
- # Provide a fallback response if something goes wrong
571
- error_history = history.copy()
572
- error_history.append({"role": "user", "content": message})
573
- error_history.append({
574
- "role": "assistant",
575
- "content": "I apologize, but I'm having trouble processing your request. Please try again."
576
- })
577
- yield error_history, ""
578
 
579
- def create_chatbot_interface(self):
580
- """Create and configure the chatbot UI."""
581
- with gr.Blocks() as demo:
582
- # User registration section
583
- with gr.Column(visible=True, elem_id="registration_container") as registration_container:
584
- gr.Markdown("### Your privacy matters to us! Just share a nickname you feel comfy with to start chatting..")
585
-
586
- with gr.Row():
587
- first_name = gr.Textbox(
588
- label="Nickname",
589
- placeholder="Enter a nickname you feel comfortable with",
590
- scale=1,
591
- elem_id="input_nickname"
592
- )
593
-
594
- with gr.Row():
595
- submit_btn = gr.Button("Start Chatting", variant="primary", scale=2)
596
-
597
- response_message = gr.Markdown()
598
-
599
- # Chatbot section (initially hidden)
600
- with gr.Column(visible=False, elem_id="chatbot_container") as chatbot_container:
601
- # Use the new messages format for the chatbot
602
- chatbot = gr.Chatbot(
603
- label="Chat with GBVR",
604
- height=500,
605
- show_label=True,
606
- elem_id="chat_interface",
607
- type="messages" # Use messages format instead of tuples
608
- )
609
-
610
- with gr.Row():
611
- msg = gr.Textbox(
612
- placeholder="Type your message here...",
613
- label="Your message",
614
- show_label=False,
615
- container=False,
616
- scale=7
617
- )
618
- send_btn = gr.Button("Send", variant="primary", scale=1)
619
-
620
- # Configure event handlers
621
- msg_event = msg.submit(
622
- self.rag_memory_stream,
623
- inputs=[msg, chatbot],
624
- outputs=[chatbot, msg]
625
- )
626
- send_event = send_btn.click(
627
- self.rag_memory_stream,
628
- inputs=[msg, chatbot],
629
- outputs=[chatbot, msg]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
630
  )
631
-
632
- # Clear textbox after sending
633
- msg_event.then(lambda: "", None, msg)
634
- send_event.then(lambda: "", None, msg)
635
-
636
- # Footer with version info
637
- gr.Markdown("Ijwi ry'Ubufasha Chatbot v1.0.0 © 2025")
638
-
639
- # Handle user registration
640
- submit_btn.click(
641
- self.collect_user_info,
642
- inputs=[first_name],
643
- outputs=[response_message, chatbot_container, registration_container, chatbot]
644
  )
645
-
646
- # Add CSS styles
647
- demo.css = """
648
- :root {
649
- --background: #f0f0f0;
650
- --text: #000000;
651
- }
652
 
653
- body, .gradio-container {
654
- margin: 0;
655
- padding: 0;
656
- width: 100%;
657
- height: 100vh;
658
- display: flex;
659
- flex-direction: column;
660
- justify-content: center;
661
- align-items: center;
662
- background: var(--background);
663
- color: var(--text);
664
- }
665
 
666
- .gradio-container {
667
- max-width: 100%;
668
- max-height: 100%;
669
- }
 
 
670
 
671
- .gr-box {
672
- background: var(--background);
673
- color: var(--text);
674
- border-radius: 12px;
675
- padding: 2rem;
676
- border: 1px solid rgba(0, 0, 0, 0.1);
677
- box-shadow: 0 4px 6px rgba(0, 0, 0, 0.05);
678
- }
679
 
680
- .gr-button-primary {
681
- background: var(--background);
682
- color: var(--text);
683
- padding: 12px 24px;
684
- border-radius: 8px;
685
- transition: all 0.3s ease;
686
- border: 1px solid rgba(0, 0, 0, 0.1);
687
- }
 
 
 
 
688
 
689
- .gr-button-primary:hover {
690
- transform: translateY(-1px);
691
- box-shadow: 0 4px 12px rgba(0, 0, 0, 0.2);
692
- }
693
 
694
- footer {
695
- text-align: center;
696
- color: var(--text);
697
- opacity: 0.7;
698
- padding: 1rem;
699
- font-size: 0.9em;
700
- }
 
701
 
702
- .gr-markdown h3 {
703
- color: var(--text);
704
- margin-bottom: 1rem;
705
- }
 
 
 
 
706
 
707
- .registration-markdown, .chat-title h1 {
708
- color: var(--text);
709
- }
710
- """
711
-
712
- return demo
713
 
 
 
 
 
 
 
 
714
 
715
- # Fallback retriever class for when vectorstore is not available
716
- class FallbackRetriever:
717
- def invoke(self, query):
718
- # Return a list of document-like objects with empty content
719
- return [Document(page_content="No specific information available on this topic.", metadata={})]
720
 
 
 
 
 
721
 
722
- # Main execution function
723
- def main():
724
- # Initialize the chatbot
725
- chatbot = GBVSupportChatbot()
726
-
727
- try:
728
- # Load data
729
- chatbot.load_data()
730
-
731
- # Create and launch the interface
732
- demo = chatbot.create_chatbot_interface()
733
- demo.launch(share=True)
734
- except Exception as e:
735
- print(f"Error in main execution: {e}")
736
-
737
 
 
738
  if __name__ == "__main__":
739
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
 
2
  import time
 
 
 
 
 
3
  import pandas as pd
4
+ import gradio as gr
5
+ from langchain_groq import ChatGroq
 
 
 
 
6
  from langchain_huggingface import HuggingFaceEmbeddings
7
+ from langchain_community.vectorstores import Chroma
8
  from langchain_core.prompts import PromptTemplate
9
  from langchain_core.output_parsers import StrOutputParser
10
  from langchain_core.runnables import RunnablePassthrough
 
 
11
 
12
+ import os
13
+ from langchain_groq import ChatGroq
14
+ from langchain.prompts import ChatPromptTemplate, PromptTemplate
15
+ from langchain.output_parsers import ResponseSchema, StructuredOutputParser
16
+ from urllib.parse import urljoin, urlparse
17
+ import requests
18
+ from io import BytesIO
19
+ from langchain_chroma import Chroma
20
+ import requests
21
+ from bs4 import BeautifulSoup
22
+ from langchain_core.prompts import ChatPromptTemplate
23
  import gradio as gr
24
+ from PyPDF2 import PdfReader
25
 
26
# Groq API key taken from the environment. The conventionally spelled
# GROQ_API_KEY is preferred; the historical misspelling 'grop_API_KEY' is still
# honoured so deployments that configured the old secret name keep working.
groq_api_key = os.environ.get('GROQ_API_KEY') or os.environ.get('grop_API_KEY')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
 
28
# Module-level embedding model, created once at import time and shared by the
# code below that needs an embedding function.
embed_model = HuggingFaceEmbeddings(
    model_name="mixedbread-ai/mxbai-embed-large-v1",
)
30
 
31
+ # Process tabular data files (CSV/Excel) found in the app's working directory
32
def process_data_files(folder_path="./"):
    """Collect text chunks from the third column of every CSV/Excel file in a folder.

    Args:
        folder_path: Directory to scan (not recursive). Defaults to ``"./"``,
            the location that was previously hard-coded.

    Returns:
        A list of dicts, one per non-empty cell of the third column, shaped
        ``{"page_content": <cell as str>,
        "metadata": {"source": <file name>, "row": <1-based data row>}}``.

    Files that cannot be read, or that have fewer than three columns, are
    reported with ``print`` and skipped so that one bad file does not abort
    the whole load.
    """
    context_data = []

    # os.listdir returns names in arbitrary order; sort so the chunk order is
    # reproducible between runs.
    data_files = sorted(
        name
        for name in os.listdir(folder_path)
        if name.lower().endswith(('.csv', '.xlsx', '.xls'))
    )

    for file_name in data_files:
        file_path = os.path.join(folder_path, file_name)

        try:
            if file_name.lower().endswith('.csv'):
                df = pd.read_csv(file_path)
            else:
                df = pd.read_excel(file_path)

            if df.shape[1] <= 2:
                print(f"Warning: File {file_name} has fewer than 3 columns.")
                continue

            column = df.iloc[:, 2]
            present = column.notna()
            texts = column[present].astype(str).tolist()
            # Number rows by their position in the file rather than among the
            # surviving values, so "row" still points at the right record when
            # earlier cells in the column are empty.
            row_numbers = [pos for pos, keep in enumerate(present, start=1) if keep]

            context_data.extend(
                {"page_content": text, "metadata": {"source": file_name, "row": row}}
                for row, text in zip(row_numbers, texts)
            )
        except Exception as e:
            # Deliberately broad: loading is best-effort per file, and parser
            # errors vary by format. Report and move on to the next file.
            print(f"Error processing file {file_name}: {e}")

    return context_data
65
 
66
+ # Create vectorstore
67
def create_vectorstore(data):
    """Build the "GBVRS" Chroma collection from prepared text chunks.

    Args:
        data: Iterable of dicts, each holding a ``"page_content"`` string and
            a ``"metadata"`` dict.

    Returns:
        The Chroma vector store, configured with the module-level
        ``embed_model`` as its embedding function. When ``data`` is empty the
        store is returned without any texts added.
    """
    # Single pass so one-shot iterables (e.g. generators) are handled too.
    texts = []
    metadatas = []
    for item in data:
        texts.append(item["page_content"])
        metadatas.append(item["metadata"])

    vectorstore = Chroma(
        collection_name="GBVRS",
        embedding_function=embed_model,
    )

    # Skip the insert when nothing was loaded: there is nothing to embed, and
    # the underlying collection may reject an empty batch.
    if texts:
        vectorstore.add_texts(texts, metadatas=metadatas)
    return vectorstore
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
 
82
+ # User session management
83
  class UserSession:
84
+ def __init__(self, llm):
 
 
85
  self.current_user = None
86
  self.welcome_message = None
87
  self.conversation_history = []
88
  self.llm = llm
89
+
90
+ def set_user(self, user_info):
 
91
  self.current_user = user_info
92
  self.set_welcome_message(user_info.get("Nickname", "Guest"))
93
  # Initialize conversation history with welcome message
 
95
  self.conversation_history = [
96
  {"role": "assistant", "content": welcome},
97
  ]
98
+
99
+ def get_user(self):
 
100
  return self.current_user
101
+
102
+ def set_welcome_message(self, nickname):
103
  """Set a dynamic welcome message using the LLM."""
104
+ # Define a prompt for the LLM to generate a welcome message
105
  prompt = (
106
+ f"Create a very brief welcome message for {nickname} that fits in 3 lines. "
107
  f"The message should: "
108
  f"1. Welcome {nickname} warmly and professionally. "
109
  f"2. Emphasize that this is a safe and trusted space. "
110
  f"3. Highlight specialized support for gender-based violence (GBV) and legal assistance. "
111
  f"4. Use a tone that is warm, reassuring, and professional. "
112
+ f"5. Keep the message concise and impactful, ensuring it fits within the character limit."
113
  )
114
+
115
+ # Use the LLM to generate the message
116
+ response = self.llm.invoke(prompt)
117
+ welcome = response.content
118
+
119
+ # Format the message with HTML styling
120
+ self.welcome_message = (
121
+ f"<div style='font-size: 24px; font-weight: bold; color: #2E86C1;'>"
122
+ f"<div style='font-size: 20px;'>"
123
+ f"{welcome}"
124
+ f"</div>"
125
+ )
126
+
127
+ def get_welcome_message(self):
128
+ return self.welcome_message
129
+
130
+ def add_to_history(self, role, message):
131
+ """Add a message to the conversation history"""
 
 
 
 
 
 
 
 
132
  self.conversation_history.append({"role": role, "content": message})
133
+
134
+ def get_conversation_history(self):
135
+ """Get the full conversation history"""
136
  return self.conversation_history
137
+
138
def get_formatted_history(self):
    """Render the conversation history as plain text for the LLM prompt.

    Each turn becomes ``"<Speaker>: <content>"`` followed by a blank line,
    where the speaker is ``User`` for role ``"user"`` and ``Assistant`` for
    every other role.
    """
    return "".join(
        "{}: {}\n\n".format(
            "User" if turn["role"] == "user" else "Assistant",
            turn["content"],
        )
        for turn in self.conversation_history
    )
145
 
146
+ # Format context from documents
147
def format_context(retrieved_docs):
    """Join the ``page_content`` of every retrieved document with newlines."""
    contents = (document.page_content for document in retrieved_docs)
    return "\n".join(contents)
149
 
150
+ # RAG Chain creation with updated approach
151
def create_rag_chain(retriever, template, api_key):
    """Assemble the retrieval-augmented generation pipeline.

    Args:
        retriever: Object whose ``invoke(query)`` returns the documents used
            as context.
        template: Prompt template text with ``{context}``, ``{question}``,
            ``{first_name}`` and ``{conversation_history}`` placeholders.
        api_key: Groq API key handed to ``ChatGroq``.

    Returns:
        A runnable that maps a user query to the model's answer as a string.
    """
    chat_model = ChatGroq(model="llama-3.3-70b-versatile", api_key=api_key)
    prompt = PromptTemplate.from_template(template)

    def get_context_and_question(query):
        # Session data comes from the module-level ``user_session``.
        session_user = user_session.get_user() or {}
        nickname = session_user.get("Nickname", "User")
        history_text = user_session.get_formatted_history()

        # Fetch supporting documents and flatten them into one context string.
        documents = retriever.invoke(query)

        return {
            "context": format_context(documents),
            "question": query,
            "first_name": nickname,
            "conversation_history": history_text,
        }

    # query -> prompt inputs -> rendered prompt -> model -> plain string
    return (
        RunnablePassthrough()
        | get_context_and_question
        | prompt
        | chat_model
        | StrOutputParser()
    )
186
+
187
+ # RAG memory function for user interaction (without translation)
188
def rag_memory_stream(message, history):
    """Answer one chat message through the RAG chain and record the exchange.

    Args:
        message: The user's latest message.
        history: Chat history supplied by the Gradio chat interface; unused,
            because the module-level ``user_session`` keeps its own history.

    Yields:
        The complete response string, yielded once.
    """
    # Run the chain BEFORE recording the user turn: the chain reads the
    # session history while building the prompt, so recording first would put
    # the current question into the prompt twice (in the history and as the
    # question). It also avoids leaving an unanswered user turn in the
    # history when the chain raises.
    response = rag_chain.invoke(message)

    user_session.add_to_history("user", message)
    user_session.add_to_history("assistant", response)

    yield response
200
+
201
+ # Add initial message to start the conversation
202
def add_initial_message(chatbot):
    """Return the given chat history unchanged (placeholder hook for seeding the chat)."""
    seeded_history = chatbot
    return seeded_history
204
+
205
+ # Store user details and handle session
206
def collect_user_info(nickname):
    """Register the user's nickname and switch the UI to the chat view.

    Args:
        nickname: Text entered in the nickname box.

    Returns:
        A 4-tuple ``(message, chatbot_update, registration_update,
        chat_history)``: the text to display, visibility updates for the
        chatbot and registration containers, and the initial chat history.
    """
    # Treat None and whitespace-only input the same as an empty nickname.
    nickname = (nickname or "").strip()
    if not nickname:
        return (
            "Nickname is required to proceed.",
            gr.update(visible=False),
            gr.update(visible=True),
            [],
        )

    # Store user info for the chat session.
    user_info = {
        "Nickname": nickname,
        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
    }
    user_session.set_user(user_info)

    # Seed the chat with the welcome message as the first assistant turn.
    welcome_message = user_session.get_welcome_message()
    chat_history = add_initial_message([(None, welcome_message)])

    # Reveal the chatbot and hide the registration form.
    return (
        welcome_message,
        gr.update(visible=True),
        gr.update(visible=False),
        chat_history,
    )
227
+
228
+ # Gradio Interface Setup with improved UX
229
def chatbot_interface():
    """Build the Gradio UI: a nickname registration form and a hidden chat view.

    Side effects:
        Assigns the module-level ``template`` (the RAG prompt text).

    Returns:
        The configured ``gr.Blocks`` app, not yet launched.
    """
    global template, rag_chain

    template = """
    **Role**: Compassionate Legal Assistance and GBV Support Specialist with Emotional Awareness.
    You are a friendly and empathetic chatbot designed to assist users in a conversational and human-like manner. Your goal is to provide accurate, helpful, and emotionally supportive responses based on the provided context: {context}. Follow these guidelines:

    1. **Emotional Awareness**
       - Acknowledge the user's emotions and respond with empathy.
       - Use phrases like "I understand how you feel," "That sounds challenging," or "I'm here to support you."
       - If the user expresses negative emotions, offer comfort and reassurance.

    2. **Contextual Interaction**
       - Begin with a warm and empathetic welcome message.
       - Extract precise details from the provided context: {context}.
       - Respond directly to the user's question: {question}.
       - Only provide detailed information if user requests it.
       - Remember the user's name is {first_name}.

    3. **Communication Guidelines**
       - Maintain a warm, conversational tone (avoid over-familiarity).
       - Use occasional emojis for engagement (e.g., 😊, 🤗, ❤️).
       - Provide clear, concise, and emotionally supportive information.

    4. **Response Strategies**
       - Greet users naturally and ask about their wellbeing (e.g., "Welcome, {first_name}! 😊 How are you feeling today?", "Hello {first_name}! 🤗 What's on your mind?").
       - Always start with a check-in about the user's wellbeing or current situation.
       - Provide a concise summary with only relevant information.
       - Avoid generating content beyond the context.
       - Handle missing information transparently.

    5. **No Extra Content**
       - If no information in {context} matches the user's request {question} :
         * Respond politely: "I don't have that information at the moment, {first_name}. 😊"
         * Offer alternative assistance options.
       - Strictly avoid generating unsupported content.
       - Prevent information padding or speculation.

    6. **Extracting Relevant Links**
       - If the user asks for a link related to their request `{question}`, extract the most relevant URL from `{context}` and provide it directly.
       - Example response:
         - "Here is the link you requested, [URL]"

    7. **Real-Time Awareness**
       - Acknowledge the current context when appropriate.
       - Stay focused on the user's immediate needs.

    8. **Previous Conversation Context**
       - Consider the conversation history: {conversation_history}
       - Maintain continuity with previous exchanges.

    **Context:** {context}
    **User's Question:** {question}
    **Your Response:**
    """

    with gr.Blocks() as demo:
        # User registration section
        with gr.Column(visible=True, elem_id="registration_container") as registration_container:
            gr.Markdown("### Your privacy is our concern, please provide your nickname.")

            with gr.Row():
                first_name = gr.Textbox(
                    label="Nickname",
                    placeholder="Enter your Nickname",
                    scale=1,
                    elem_id="input_nickname"
                )

            with gr.Row():
                submit_btn = gr.Button("Start Chatting", variant="primary", scale=2)

            response_message = gr.Markdown()

        # Chatbot section (initially hidden; revealed by collect_user_info)
        with gr.Column(visible=False, elem_id="chatbot_container") as chatbot_container:
            chat_interface = gr.ChatInterface(
                fn=rag_memory_stream,
                title="Chat with GBVR",
                fill_height=True
            )

        # Footer with version info
        gr.Markdown("Ijwi ry'Ubufasha v1.0.0 © 2025")

        # Handle user registration: the callback returns the status text, the
        # two container visibility updates and the seeded chat history.
        submit_btn.click(
            collect_user_info,
            inputs=[first_name],
            outputs=[response_message, chatbot_container, registration_container, chat_interface.chatbot]
        )

        demo.css = """
        :root {
            --background: #f0f0f0;
            --text: #000000;
        }

        body, .gradio-container {
            margin: 0;
            padding: 0;
            width: 100vw;
            height: 100vh;
            display: flex;
            flex-direction: column;
            justify-content: center;
            align-items: center;
            background: var(--background);
            color: var(--text);
        }

        .gradio-container {
            max-width: 100%;
            max-height: 100%;
        }

        .gr-box {
            background: var(--background);
            color: var(--text);
            border-radius: 12px;
            padding: 2rem;
            border: 1px solid rgba(0, 0, 0, 0.1);
            box-shadow: 0 4px 6px rgba(0, 0, 0, 0.05);
        }

        .gr-button-primary {
            background: var(--background);
            color: var(--text);
            padding: 12px 24px;
            border-radius: 8px;
            transition: all 0.3s ease;
            border: 1px solid rgba(0, 0, 0, 0.1);
        }

        .gr-button-primary:hover {
            transform: translateY(-1px);
            box-shadow: 0 4px 12px rgba(0, 0, 0, 0.2);
        }

        footer {
            text-align: center;
            color: var(--text);
            opacity: 0.7;
            padding: 1rem;
            font-size: 0.9em;
        }

        .gr-markdown h3 {
            color: var(--text);
            margin-bottom: 1rem;
        }

        .registration-markdown, .chat-title h1 {
            color: var(--text);
        }
        """

    return demo
 
 
 
 
 
 
 
 
 
 
 
 
 
 
387
 
388
+ # Main execution
389
if __name__ == "__main__":
    # Process data and create the vectorstore-backed retriever.
    data = process_data_files()
    vectorstore = create_vectorstore(data)
    retriever = vectorstore.as_retriever()

    # Initialize the LLM used by the user session (welcome-message generation).
    llm = ChatGroq(model="llama-3.3-70b-versatile", api_key=groq_api_key)
    user_session = UserSession(llm)

    # Build the interface BEFORE the RAG chain: in the code visible here the
    # global ``template`` is assigned only inside chatbot_interface(), so
    # reading it earlier raises NameError. Building the UI does not run any
    # callback, so it does not need ``rag_chain`` yet.
    demo = chatbot_interface()

    # Create the RAG chain from the prompt template defined above.
    rag_chain = create_rag_chain(retriever, template, groq_api_key)

    # Launch the interface.
    demo.launch(share=True)