Rabbit-Innotech committed on
Commit
aef0372
·
verified ·
1 Parent(s): 9787937

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +474 -569
app.py CHANGED
@@ -1,358 +1,242 @@
1
  import os
2
- import PyPDF2
3
- from PyPDF2 import PdfReader
4
- import pandas as pd
5
-
6
- ## Embedding model!
7
- from langchain_huggingface import HuggingFaceEmbeddings
8
- embed_model = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")
9
-
10
-
11
-
12
- folder_path = "./"
13
- context_data = []
14
-
15
- # List all files in the folder
16
- files = os.listdir(folder_path)
17
-
18
- # Get list of CSV and Excel files
19
- data_files = [f for f in files if f.endswith(('.csv', '.xlsx', '.xls'))]
20
-
21
- # Process each file
22
- for f, file in enumerate(data_files, 1):
23
- print(f"\nProcessing file {f}: {file}")
24
- file_path = os.path.join(folder_path, file)
25
-
26
- try:
27
- # Read the file based on its extension
28
- if file.endswith('.csv'):
29
- df = pd.read_csv(file_path)
30
- else:
31
- df = pd.read_excel(file_path)
32
-
33
- # Extract non-empty values from column 2 and append them
34
- context_data.extend(df.iloc[:, 2].dropna().astype(str).tolist())
35
-
36
- except Exception as e:
37
- print(f"Error processing file {file}: {str(e)}")
38
-
39
-
40
-
41
-
42
- import os
43
- import PyPDF2
44
- from langchain.text_splitter import RecursiveCharacterTextSplitter
45
- from langchain.schema import Document
46
-
47
- def extract_text_from_pdf(pdf_path):
48
- """Extract text from a PDF file."""
49
- try:
50
- with open(pdf_path, "rb") as file:
51
- reader = PyPDF2.PdfReader(file)
52
- return "".join(page.extract_text() or "" for page in reader.pages)
53
- except Exception as e:
54
- print(f"Error with {pdf_path}: {e}")
55
- return ""
56
-
57
- pdf_files = [f for f in files if f.lower().endswith(".pdf")]
58
-
59
- # Process PDFs
60
- documents = []
61
- for file in pdf_files:
62
- print(f"Processing: {file}")
63
- pdf_path = os.path.join(folder_path, file)
64
- text = extract_text_from_pdf(pdf_path)
65
- if text:
66
- documents.append(Document(page_content=text, metadata={"source": file}))
67
-
68
- # Split into chunks
69
- text_splitter = RecursiveCharacterTextSplitter(
70
- separators=['\n\n', '\n', '.', ','],
71
- chunk_size=500,
72
- chunk_overlap=50
73
- )
74
- chunks = text_splitter.split_documents(documents)
75
- text_only_chunks = [chunk.page_content for chunk in chunks]
76
-
77
-
78
- from urllib.parse import urljoin, urlparse
79
  import requests
 
80
  from io import BytesIO
 
 
81
 
 
 
 
 
82
  from bs4 import BeautifulSoup
83
- from langchain_core.prompts import ChatPromptTemplate
84
- import gradio as gr
85
-
86
-
87
- def scrape_websites(base_urls):
88
- try:
89
- visited_links = set() # To avoid revisiting the same link
90
- content_by_url = {} # Store content from each URL
91
-
92
- for base_url in base_urls:
93
- if not base_url.strip():
94
- continue # Skip empty or invalid URLs
95
-
96
- print(f"Scraping base URL: {base_url}")
97
- html_content = fetch_page_content(base_url)
98
- if html_content:
99
- cleaned_content = clean_body_content(html_content)
100
- content_by_url[base_url] = cleaned_content
101
- visited_links.add(base_url)
102
-
103
- # Extract and process all internal links
104
- soup = BeautifulSoup(html_content, "html.parser")
105
- links = extract_internal_links(base_url, soup)
106
-
107
- for link in links:
108
- if link not in visited_links:
109
- print(f"Scraping link: {link}")
110
- page_content = fetch_page_content(link)
111
- if page_content:
112
- cleaned_content = clean_body_content(page_content)
113
- content_by_url[link] = cleaned_content
114
- visited_links.add(link)
115
-
116
- # If the link is a PDF file, extract its content
117
- if link.lower().endswith('.pdf'):
118
- print(f"Extracting PDF content from: {link}")
119
- pdf_content = extract_pdf_text(link)
120
- if pdf_content:
121
- content_by_url[link] = pdf_content
122
-
123
- return content_by_url
124
-
125
- except Exception as e:
126
- print(f"Error during scraping: {e}")
127
- return {}
128
-
129
-
130
- def fetch_page_content(url):
131
- try:
132
- response = requests.get(url, timeout=10)
133
- response.raise_for_status()
134
- return response.text
135
- except requests.exceptions.RequestException as e:
136
- print(f"Error fetching {url}: {e}")
137
- return None
138
-
139
-
140
- def extract_internal_links(base_url, soup):
141
- links = set()
142
- for anchor in soup.find_all("a", href=True):
143
- href = anchor["href"]
144
- full_url = urljoin(base_url, href)
145
- if is_internal_link(base_url, full_url):
146
- links.add(full_url)
147
- return links
148
-
149
-
150
- def is_internal_link(base_url, link_url):
151
- base_netloc = urlparse(base_url).netloc
152
- link_netloc = urlparse(link_url).netloc
153
- return base_netloc == link_netloc
154
-
155
-
156
- def extract_pdf_text(pdf_url):
157
- try:
158
- response = requests.get(pdf_url)
159
- response.raise_for_status()
160
-
161
- # Open the PDF from the response content
162
- with BytesIO(response.content) as file:
163
- reader = PdfReader(file)
164
- pdf_text = ""
165
- for page in reader.pages:
166
- pdf_text += page.extract_text()
167
-
168
- return pdf_text if pdf_text else None
169
- except requests.exceptions.RequestException as e:
170
- print(f"Error fetching PDF {pdf_url}: {e}")
171
- return None
172
- except Exception as e:
173
- print(f"Error reading PDF {pdf_url}: {e}")
174
- return None
175
-
176
-
177
- def clean_body_content(html_content):
178
- soup = BeautifulSoup(html_content, "html.parser")
179
-
180
- # Remove scripts and styles
181
- for script_or_style in soup(["script", "style"]):
182
- script_or_style.extract()
183
-
184
- # Get text and clean up
185
- cleaned_content = soup.get_text(separator="\n")
186
- cleaned_content = "\n".join(
187
- line.strip() for line in cleaned_content.splitlines() if line.strip()
188
- )
189
- return cleaned_content
190
-
191
-
192
-
193
- # if __name__ == "__main__":
194
- # website = [
195
- # #"https://www.rib.gov.rw/index.php?id=371",
196
- # "https://haguruka.org.rw/our-work/"
197
- # ]
198
- # all_content = scrape_websites(website)
199
-
200
- # # Temporary list to store (url, content) tuples
201
- # temp_list = []
202
-
203
- # # Process and store each URL with its content
204
- # for url, content in all_content.items():
205
- # temp_list.append((url, content))
206
-
207
-
208
-
209
- # processed_texts = []
210
-
211
- # # Process each element in the temporary list
212
- # for element in temp_list:
213
- # if isinstance(element, tuple):
214
- # url, content = element # Unpack the tuple
215
- # processed_texts.append(f"url: {url}, content: {content}")
216
- # elif isinstance(element, str):
217
- # processed_texts.append(element)
218
- # else:
219
- # processed_texts.append(str(element))
220
-
221
- # def chunk_string(s, chunk_size=2000):
222
- # return [s[i:i+chunk_size] for i in range(0, len(s), chunk_size)]
223
-
224
- # # List to store the chunks
225
- # chunked_texts = []
226
-
227
- # for text in processed_texts:
228
- # chunked_texts.extend(chunk_string(text))
229
-
230
- data = []
231
- data.extend(context_data)
232
- #data.extend([item for item in text_only_chunks if item not in data])
233
- #data.extend([item for item in chunked_texts if item not in data])
234
-
235
-
236
-
237
- #from langchain_community.vectorstores import Chroma
238
- from langchain_chroma import Chroma
239
-
240
-
241
-
242
- vectorstore = Chroma(
243
- collection_name="Dataset",
244
- embedding_function=embed_model,
245
- )
246
-
247
- vectorstore.get().keys()
248
-
249
- # add data to vector store
250
- vectorstore.add_texts(data)
251
-
252
-
253
- api= os.environ.get('V2')
254
-
255
 
 
256
  from openai import OpenAI
 
 
257
  from langchain_core.prompts import PromptTemplate
258
  from langchain_core.output_parsers import StrOutputParser
259
  from langchain_core.runnables import RunnablePassthrough
260
- import gradio as gr
261
- from typing import Iterator
262
- import time
263
-
264
-
265
-
266
- #template for GBV support chatbot
267
- template = ("""
268
- You are a compassionate and supportive AI assistant specializing in helping individuals affected by Gender-Based Violence (GBV). Your primary goal is to provide emotionally intelligent support while maintaining appropriate boundaries.
269
- You are a conversational AI. Respond directly and naturally to the user's input without displaying any system messages, backend processes, or 'thinking...' responses. Only provide the final response in a human-like and engaging manner.
270
-
271
- When responding follow these guidelines:
272
-
273
- 1. **Emotional Intelligence**
274
- - Validate feelings without judgment (e.g., "It is completely understandable to feel this way")
275
- - Offer reassurance when appropriate, always centered on empowerment
276
- - Adjust your tone based on the emotional state conveyed
277
-
278
- 2. **Personalized Communication**
279
- - Avoid contractions (e.g., use I am instead of I'm)
280
- - Incorporate thoughtful pauses or reflective questions when the conversation involves difficult topics
281
- - Use selective emojis (😊, 🤗, ❤️) only when tone-appropriate and not during crisis discussions
282
- - Balance warmth with professionalism
283
-
284
- 3. **Conversation Management**
285
- - Refer to {conversation_history} to maintain continuity and avoid repetition
286
- - Keep responses concise unless greater detail is explicitly requested
287
- - Use clear paragraph breaks for readability
288
- - Prioritize immediate concerns before addressing secondary issues
289
-
290
- 4. **Information Delivery**
291
- - Extract only relevant information from {context} that directly addresses the question
292
- - Present information in accessible, non-technical language
293
- - Organize resource recommendations in order of relevance and accessibility
294
- - Provide links [URL] only when specifically requested, prefaced with clear descriptions
295
- - When information is unavailable, respond with: "I don't have that specific information right now, {first_name}. Would it be helpful if I focus on [alternative support option]?"
296
-
297
- 5. **Safety and Ethics**
298
- - Prioritize user safety in all responses
299
- - Never generate speculative content about their specific situation
300
- - Avoid phrases that could minimize experiences or create pressure
301
- - Include gentle reminders about professional help when discussing serious issues
302
-
303
- Your response should balance emotional support with practical guidance.
304
-
305
- **Context:** {context}
306
- **User's Question:** {question}
307
- **Your Response:**
308
- """)
309
-
310
- rag_prompt = PromptTemplate.from_template(template)
311
-
312
- retriever = vectorstore.as_retriever()
313
 
314
- import requests
 
315
 
316
- API_TOKEN = os.environ.get('Token')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
317
 
318
- model_name = "facebook/nllb-200-distilled-600M"
319
 
320
- url = f"https://api-inference.huggingface.co/models/{model_name}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
321
 
322
- headers = {
323
- "Authorization": f"Bearer {API_TOKEN}"
324
- }
325
 
326
- def translate_text(text, src_lang, tgt_lang):
327
- """Translate text using Hugging Face API"""
328
- response = requests.post(
329
- url,
330
- headers=headers,
331
- json={
332
- "inputs": text,
333
- "parameters": {
334
- "src_lang": src_lang,
335
- "tgt_lang": tgt_lang
 
 
 
 
 
 
 
 
 
336
  }
337
- }
338
- )
339
-
340
- if response.status_code == 200:
341
- result = response.json()
342
- if isinstance(result, list) and len(result) > 0:
343
- return result[0]['translation_text']
344
- return result['translation_text']
345
- else:
346
- print(f"Translation error: {response.status_code}, {response.text}")
347
- return text # Return original text if translation fails
348
 
349
 
350
  class OpenRouterLLM:
 
 
351
  def __init__(self, key: str):
352
  try:
353
  self.client = OpenAI(
354
  base_url="https://openrouter.ai/api/v1",
355
- api_key=key
356
  )
357
  self.headers = {
358
  "HTTP-Referer": "http://localhost:3000",
@@ -363,11 +247,10 @@ class OpenRouterLLM:
363
  raise
364
 
365
  def stream(self, prompt: str) -> Iterator[str]:
 
366
  try:
367
  completion = self.client.chat.completions.create(
368
- #model="deepseek/deepseek-r1-distill-llama-70b:free",
369
  model="meta-llama/llama-3.3-70b-instruct:free",
370
- #model="google/gemini-2.5-pro-exp-03-25:free",
371
  messages=[{"role": "user", "content": prompt}],
372
  stream=True
373
  )
@@ -381,13 +264,16 @@ class OpenRouterLLM:
381
 
382
 
383
  class UserSession:
384
- def __init__(self, llm: OpenRouterLLM): # Accept an instance of OpenRouterLLM
 
 
385
  self.current_user = None
386
  self.welcome_message = None
387
- self.conversation_history = [] # Add conversation history storage
388
- self.llm = llm # Store the LLM instance
389
-
390
- def set_user(self, user_info):
 
391
  self.current_user = user_info
392
  self.set_welcome_message(user_info.get("Nickname", "Guest"))
393
  # Initialize conversation history with welcome message
@@ -395,164 +281,80 @@ class UserSession:
395
  self.conversation_history = [
396
  {"role": "assistant", "content": welcome},
397
  ]
398
-
399
- def get_user(self):
 
400
  return self.current_user
401
-
402
- def set_welcome_message(self, Nickname, src_lang="eng_Latn", tgt_lang="kin_Latn"):
403
- """Set a dynamic welcome message using the OpenRouterLLM."""
404
  prompt = (
405
- f"Create a very brief welcome message for {Nickname}. "
406
  f"The message should: "
407
- f"1. Welcome {Nickname} warmly and professionally. "
408
  f"2. Emphasize that this is a safe and trusted space. "
409
  f"3. Highlight specialized support for gender-based violence (GBV) and legal assistance. "
410
  f"4. Use a tone that is warm, reassuring, and professional. "
411
  f"5. Keep the message concise and impactful."
412
  )
413
-
414
- # Use the OpenRouterLLM to generate the message
415
- welcome = "".join(self.llm.stream(prompt)) # Stream and concatenate the response
416
- welcome_text=translate_text(welcome, src_lang, tgt_lang)
417
-
 
 
 
 
418
  # Format the message with HTML styling
419
  self.welcome_message = (
420
  f"<div style='font-size: 20px;'>"
421
  f"{welcome_text}"
422
  f"</div>"
423
  )
424
-
425
- def get_welcome_message(self):
 
426
  return self.welcome_message
427
-
428
- def add_to_history(self, role, message):
429
- """Add a message to the conversation history"""
430
  self.conversation_history.append({"role": role, "content": message})
431
-
432
- def get_conversation_history(self):
433
- """Get the full conversation history"""
434
  return self.conversation_history
435
-
436
- def get_formatted_history(self):
437
- """Get conversation history formatted as a string for the LLM"""
438
  formatted_history = ""
439
  for entry in self.conversation_history:
440
  role = "User" if entry["role"] == "user" else "Assistant"
441
  formatted_history += f"{role}: {entry['content']}\n\n"
442
  return formatted_history
443
 
444
- api_key =api
445
- llm_instance = OpenRouterLLM(key=api_key)
446
- #llm_instance = model
447
- user_session = UserSession(llm_instance)
448
-
449
-
450
- def collect_user_info(Nickname):
451
- if not Nickname:
452
- return "Nickname is required to proceed.", gr.update(visible=False), gr.update(visible=True), []
453
-
454
- # Store user info for chat session
455
- user_info = {
456
- "Nickname": Nickname,
457
- "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
458
- }
459
-
460
- # Set user in session
461
- user_session.set_user(user_info)
462
-
463
- # Generate welcome message
464
- welcome_message = user_session.get_welcome_message()
465
-
466
- # Add initial message to start the conversation
467
- chat_history = add_initial_message([(None, welcome_message)])
468
 
469
- # Return welcome message and update UI
470
- return welcome_message, gr.update(visible=True), gr.update(visible=False), chat_history
471
-
472
- # Add initial message to start the conversation
473
- def add_initial_message(chatbot):
474
- #initial_message = (" "
475
- # )
476
- return chatbot #+ [(None, initial_message)]
477
-
478
- # Create RAG chain with user context and conversation history
479
- def create_rag_chain(retriever, template, api_key):
480
- llm = OpenRouterLLM(api_key)
481
- rag_prompt = PromptTemplate.from_template(template)
482
-
483
- def stream_func(input_dict):
484
- # Get context using the retriever's invoke method
485
- context = retriever.invoke(input_dict["question"])
486
- context_str = "\n".join([doc.page_content for doc in context])
487
-
488
- # Get user info from the session
489
- user_info = user_session.get_user() or {}
490
- first_name = user_info.get("Nickname", "User")
491
 
492
- # Get conversation history
493
- conversation_history = user_session.get_formatted_history()
494
-
495
- # Format prompt with user context and conversation history
496
- prompt = rag_prompt.format(
497
- context=context_str,
498
- question=input_dict["question"],
499
- first_name=first_name,
500
- conversation_history=conversation_history
501
  )
502
-
503
- # Stream response
504
- return llm.stream(prompt)
505
-
506
- return stream_func
507
-
508
- # def rag_memory_stream(message, history):
509
- # # Add user message to history
510
- # user_session.add_to_history("user", message)
511
-
512
- # # Initialize with empty response
513
- # partial_text = ""
514
- # full_response = ""
515
-
516
- # # Use the rag_chain with the question
517
- # for new_text in rag_chain({"question": message}):
518
- # partial_text += new_text
519
- # full_response = partial_text
520
- # yield partial_text
521
-
522
- # # After generating the complete response, add it to history
523
- # user_session.add_to_history("assistant", full_response)
524
-
525
-
526
- def rag_memory_stream(message, history, user_lang="kin_Latn", system_lang="eng_Latn"):
527
- english_message = translate_text(message, user_lang, system_lang)
528
-
529
- user_session.add_to_history("user", english_message)
530
-
531
- full_response = ""
532
-
533
- for new_text in rag_chain({"question": english_message}):
534
- full_response += new_text
535
-
536
-
537
- translated_response = translate_text(full_response, system_lang, user_lang)
538
-
539
- user_session.add_to_history("assistant", full_response)
540
-
541
- yield translated_response
542
-
543
-
544
-
545
- import gradio as gr
546
-
547
-
548
- api_key = api
549
-
550
- def chatbot_interface():
551
- api_key = api
552
-
553
- global template
554
-
555
- template = """
556
  You are a compassionate and supportive AI assistant specializing in helping individuals affected by Gender-Based Violence (GBV). Your responses must be based EXCLUSIVELY on the information provided in the context. Your primary goal is to provide emotionally intelligent support while maintaining appropriate boundaries.
557
 
558
  **Previous conversation:** {conversation_history}
@@ -584,7 +386,6 @@ def chatbot_interface():
584
  - Extract only relevant information from {context} that directly addresses the question
585
  - Present information in accessible, non-technical language
586
  - When information is unavailable, respond with: "I don't have that specific information right now, {first_name}. Would it be helpful if I focus on [alternative support option]?"
587
-
588
 
589
  6. **Safety and Ethics**
590
  - Do not generate any speculative content or advice not supported by the context
@@ -595,115 +396,219 @@ def chatbot_interface():
595
  **Context:** {context}
596
  **User's Question:** {question}
597
  **Your Response:**
598
- """
599
-
600
-
601
- global rag_chain
602
- rag_chain = create_rag_chain(retriever, template, api_key)
603
-
604
- with gr.Blocks() as demo:
605
- # User registration section
606
- with gr.Column(visible=True, elem_id="registration_container") as registration_container:
607
- gr.Markdown("### Your privacy matters to us! Just share a nickname you feel comfy with to start chatting..")
608
-
609
- with gr.Row():
610
- first_name = gr.Textbox(
611
- label="Nickname",
612
- placeholder="Enter your Nickname You feel comfy",
613
- scale=1,
614
- elem_id="input_nickname"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
615
  )
616
-
617
- with gr.Row():
618
- submit_btn = gr.Button("Start Chatting", variant="primary", scale=2)
619
-
620
- response_message = gr.Markdown()
621
-
622
- # Chatbot section (initially hidden)
623
- with gr.Column(visible=False, elem_id="chatbot_container") as chatbot_container:
624
- chat_interface = gr.ChatInterface(
625
- fn=rag_memory_stream,
626
- title="Chat with GBVR",
627
- fill_height=True
628
  )
 
 
 
 
 
 
 
629
 
630
- # Footer with version info
631
- gr.Markdown("Ijwi ry'Ubufasha Chatbot v1.0.0 © 2025")
632
-
633
- # Handle user registration
634
- submit_btn.click(
635
- collect_user_info,
636
- inputs=[first_name],
637
- outputs=[response_message, chatbot_container, registration_container, chat_interface.chatbot]
638
- )
639
-
640
- demo.css = """
641
- :root {
642
- --background: #f0f0f0;
643
- --text: #000000;
644
- }
645
 
646
- body, .gradio-container {
647
- margin: 0;
648
- padding: 0;
649
- width: 100vw;
650
- height: 100vh;
651
- display: flex;
652
- flex-direction: column;
653
- justify-content: center;
654
- align-items: center;
655
- background: var(--background);
656
- color: var(--text);
657
- }
658
 
659
- .gradio-container {
660
- max-width: 100%;
661
- max-height: 100%;
662
- }
 
 
 
 
663
 
664
- .gr-box {
665
- background: var(--background);
666
- color: var(--text);
667
- border-radius: 12px;
668
- padding: 2rem;
669
- border: 1px solid rgba(0, 0, 0, 0.1);
670
- box-shadow: 0 4px 6px rgba(0, 0, 0, 0.05);
671
- }
672
 
673
- .gr-button-primary {
674
- background: var(--background);
675
- color: var(--text);
676
- padding: 12px 24px;
677
- border-radius: 8px;
678
- transition: all 0.3s ease;
679
- border: 1px solid rgba(0, 0, 0, 0.1);
680
- }
681
 
682
- .gr-button-primary:hover {
683
- transform: translateY(-1px);
684
- box-shadow: 0 4px 12px rgba(0, 0, 0, 0.2);
685
- }
 
 
 
686
 
687
- footer {
688
- text-align: center;
689
- color: var(--text);
690
- opacity: 0.7;
691
- padding: 1rem;
692
- font-size: 0.9em;
693
- }
694
 
695
- .gr-markdown h3 {
696
- color: var(--text);
697
- margin-bottom: 1rem;
698
- }
 
 
699
 
700
- .registration-markdown, .chat-title h1 {
701
- color: var(--text);
702
- }
703
- """
 
 
 
704
 
705
- return demo
 
 
706
 
707
- # Launch the interface
708
  if __name__ == "__main__":
709
- chatbot_interface().launch(share=True)
 
1
  import os
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  import requests
3
+ import time
4
  from io import BytesIO
5
+ from typing import Iterator, List, Dict, Any, Optional
6
+ from urllib.parse import urljoin, urlparse
7
 
8
+ # Data processing imports
9
+ import pandas as pd
10
+ import PyPDF2
11
+ from PyPDF2 import PdfReader
12
  from bs4 import BeautifulSoup
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
+ # AI and NLP imports
15
  from openai import OpenAI
16
+ from langchain_huggingface import HuggingFaceEmbeddings
17
+ from langchain_chroma import Chroma
18
  from langchain_core.prompts import PromptTemplate
19
  from langchain_core.output_parsers import StrOutputParser
20
  from langchain_core.runnables import RunnablePassthrough
21
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
22
+ from langchain.schema import Document
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
+ # UI import
25
+ import gradio as gr
26
 
27
class DataProcessor:
    """Extract text from local CSV/Excel/PDF files and from PDFs served over HTTP.

    The directory listing of ``folder_path`` is captured once, when the
    instance is created; files added afterwards are not picked up.
    """

    def __init__(self, folder_path: str = "./"):
        self.folder_path = folder_path
        self.files = os.listdir(folder_path)

    @staticmethod
    def _join_page_text(pages) -> str:
        """Concatenate the text of PDF pages, treating pages without text as empty.

        ``extract_text()`` is guarded with ``or ""`` so that a page yielding
        ``None`` cannot break the concatenation.
        """
        return "".join(page.extract_text() or "" for page in pages)

    def process_tabular_data(self) -> List[str]:
        """Collect the non-empty values of the third column of every CSV/Excel file.

        Returns:
            The values (as strings) of column index 2 of each ``.csv``,
            ``.xlsx`` and ``.xls`` file in the folder, in listing order.
            Files that cannot be read, or that have fewer than three
            columns, are reported on stdout and skipped.
        """
        context_data: List[str] = []
        data_files = [name for name in self.files if name.endswith(('.csv', '.xlsx', '.xls'))]

        for index, file in enumerate(data_files, 1):
            print(f"\nProcessing file {index}: {file}")
            file_path = os.path.join(self.folder_path, file)

            try:
                # Read file based on extension
                if file.endswith('.csv'):
                    df = pd.read_csv(file_path)
                else:
                    df = pd.read_excel(file_path)

                # Extract non-empty values from column 2 (zero-based index)
                context_data.extend(df.iloc[:, 2].dropna().astype(str).tolist())
            except Exception as e:
                # Best effort: one unreadable file must not abort the whole load.
                print(f"Error processing file {file}: {str(e)}")

        return context_data

    def extract_text_from_pdf(self, pdf_path: str) -> str:
        """Return the text of the PDF at ``pdf_path``, or ``""`` if it cannot be read."""
        try:
            with open(pdf_path, "rb") as file:
                reader = PyPDF2.PdfReader(file)
                return self._join_page_text(reader.pages)
        except Exception as e:
            print(f"Error with {pdf_path}: {e}")
            return ""

    def process_pdf_files(self) -> List["Document"]:
        """Build one ``Document`` per readable PDF in the folder.

        PDFs from which no text could be extracted are omitted. Each
        document records its file name under the ``"source"`` metadata key.
        """
        pdf_files = [name for name in self.files if name.lower().endswith(".pdf")]
        documents = []

        for file in pdf_files:
            print(f"Processing: {file}")
            pdf_path = os.path.join(self.folder_path, file)
            text = self.extract_text_from_pdf(pdf_path)
            if text:
                documents.append(Document(page_content=text, metadata={"source": file}))

        return documents

    def split_documents(
        self,
        documents: List["Document"],
        chunk_size: int = 500,
        chunk_overlap: int = 50,
    ) -> List[str]:
        """Split documents into text chunks and return the chunk texts.

        Args:
            documents: Documents to split.
            chunk_size: Maximum chunk size passed to the splitter.
            chunk_overlap: Overlap between consecutive chunks.
        """
        text_splitter = RecursiveCharacterTextSplitter(
            separators=['\n\n', '\n', '.', ','],
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap
        )
        chunks = text_splitter.split_documents(documents)
        return [chunk.page_content for chunk in chunks]

    def extract_pdf_text_from_url(self, pdf_url: str, timeout: float = 30) -> Optional[str]:
        """Download a PDF and return its text.

        Args:
            pdf_url: Address of the PDF to download.
            timeout: Seconds to wait for the server before giving up, so a
                stalled connection cannot block the caller indefinitely.

        Returns:
            The extracted text, or ``None`` when the download fails, the
            PDF cannot be parsed, or it contains no extractable text.
        """
        try:
            response = requests.get(pdf_url, timeout=timeout)
            response.raise_for_status()

            with BytesIO(response.content) as file:
                pdf_text = self._join_page_text(PdfReader(file).pages)

            return pdf_text if pdf_text else None
        except requests.exceptions.RequestException as e:
            print(f"Error fetching PDF {pdf_url}: {e}")
            return None
        except Exception as e:
            print(f"Error reading PDF {pdf_url}: {e}")
            return None
110
 
 
111
 
112
class WebScraper:
    """Collect page text from websites and the same-host pages they link to."""

    def scrape_websites(self, base_urls: List[str]) -> Dict[str, str]:
        """Scrape each base URL plus the internal links found on it (one level deep).

        Args:
            base_urls: Start pages; blank entries are ignored.

        Returns:
            Mapping of URL to extracted text. HTML pages contribute their
            cleaned body text; links whose path ends in ``.pdf`` contribute
            the text extracted from the PDF. URLs that could not be fetched
            are absent. If an unexpected error interrupts the crawl, the
            content gathered up to that point is returned.
        """
        visited_links = set()
        content_by_url: Dict[str, str] = {}
        pdf_processor = None  # created on first use; construction lists a directory

        try:
            for base_url in base_urls:
                if not base_url.strip():
                    continue

                print(f"Scraping base URL: {base_url}")
                html_content = self.fetch_page_content(base_url)
                visited_links.add(base_url)
                if not html_content:
                    continue

                content_by_url[base_url] = self.clean_body_content(html_content)

                # Extract and process internal links
                soup = BeautifulSoup(html_content, "html.parser")
                for link in self.extract_internal_links(base_url, soup):
                    if link in visited_links:
                        continue
                    # Mark before fetching so a failing URL is not retried.
                    visited_links.add(link)

                    if self._is_pdf_link(link):
                        # PDFs are binary: skip the HTML path entirely so that
                        # decoded binary noise is never stored as page text.
                        print(f"Extracting PDF content from: {link}")
                        if pdf_processor is None:
                            pdf_processor = DataProcessor()
                        pdf_content = pdf_processor.extract_pdf_text_from_url(link)
                        if pdf_content:
                            content_by_url[link] = pdf_content
                        continue

                    print(f"Scraping link: {link}")
                    page_content = self.fetch_page_content(link)
                    if page_content:
                        content_by_url[link] = self.clean_body_content(page_content)
        except Exception as e:
            # Keep whatever was collected instead of discarding the whole crawl.
            print(f"Error during scraping: {e}")

        return content_by_url

    @staticmethod
    def _is_pdf_link(link: str) -> bool:
        """Return True when the URL path (ignoring query/fragment) ends in ``.pdf``."""
        return urlparse(link).path.lower().endswith(".pdf")

    def fetch_page_content(self, url: str, timeout: float = 10) -> Optional[str]:
        """Return the response body of ``url`` as text, or ``None`` on any request error."""
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            return response.text
        except requests.exceptions.RequestException as e:
            print(f"Error fetching {url}: {e}")
            return None

    def extract_internal_links(self, base_url: str, soup: "BeautifulSoup") -> set:
        """Return the absolute, fragment-free URLs of same-host links in ``soup``.

        Fragments are dropped because ``/page#a`` and ``/page#b`` are the
        same document; keeping them would fetch that document repeatedly.
        """
        links = set()
        for anchor in soup.find_all("a", href=True):
            full_url = urljoin(base_url, anchor["href"]).split("#", 1)[0]
            if self.is_internal_link(base_url, full_url):
                links.add(full_url)
        return links

    def is_internal_link(self, base_url: str, link_url: str) -> bool:
        """Return True when both URLs have the same network location (host[:port])."""
        return urlparse(base_url).netloc == urlparse(link_url).netloc

    def clean_body_content(self, html_content: str) -> str:
        """Strip scripts, styles and markup from HTML and return its non-blank lines."""
        soup = BeautifulSoup(html_content, "html.parser")

        # Remove scripts and styles so their source does not leak into the text
        for script_or_style in soup(["script", "style"]):
            script_or_style.extract()

        raw_text = soup.get_text(separator="\n")
        stripped_lines = (line.strip() for line in raw_text.splitlines())
        return "\n".join(line for line in stripped_lines if line)
198
 
 
 
 
199
 
200
class TranslationService:
    """Translate text through a model hosted on the Hugging Face Inference API."""

    def __init__(
        self,
        api_token: str,
        model_name: str = "facebook/nllb-200-distilled-600M",
        timeout: float = 30,
    ):
        """Store the endpoint, auth header and request timeout.

        Args:
            api_token: Token sent as a ``Bearer`` authorization header.
            model_name: Model identifier appended to the inference URL.
            timeout: Seconds to wait for a response before giving up.
        """
        self.model_name = model_name
        self.url = f"https://api-inference.huggingface.co/models/{model_name}"
        self.headers = {"Authorization": f"Bearer {api_token}"}
        self.timeout = timeout

    @staticmethod
    def _extract_translation(result: Any, fallback: str) -> str:
        """Pull ``translation_text`` out of a decoded API payload.

        Accepts either a dict or a list whose first element is a dict.
        Any other shape (empty list, error object, missing key) yields
        ``fallback`` rather than raising.
        """
        candidate = result
        if isinstance(candidate, list):
            candidate = candidate[0] if candidate else None
        if isinstance(candidate, dict):
            translated = candidate.get("translation_text")
            if isinstance(translated, str):
                return translated
        print(f"Translation error: unexpected response payload: {result!r}")
        return fallback

    def translate_text(self, text: str, src_lang: str, tgt_lang: str) -> str:
        """Translate ``text`` from ``src_lang`` to ``tgt_lang``.

        Returns:
            The translated text. The original ``text`` is returned
            unchanged when it is empty/blank, when the request fails or
            times out, when the API answers with a non-200 status, or when
            the response body is not in a recognised shape.
        """
        # Nothing to translate: avoid a pointless network round-trip.
        if not text or not text.strip():
            return text

        try:
            response = requests.post(
                self.url,
                headers=self.headers,
                json={
                    "inputs": text,
                    "parameters": {
                        "src_lang": src_lang,
                        "tgt_lang": tgt_lang
                    }
                },
                timeout=self.timeout,
            )
        except requests.exceptions.RequestException as e:
            print(f"Translation error: {e}")
            return text  # Return original text if translation fails

        if response.status_code != 200:
            print(f"Translation error: {response.status_code}, {response.text}")
            return text  # Return original text if translation fails

        try:
            result = response.json()
        except ValueError as e:
            # Body was not valid JSON.
            print(f"Translation error: {e}")
            return text

        return self._extract_translation(result, text)
 
230
 
231
 
232
  class OpenRouterLLM:
233
+ """LLM service using OpenRouter API."""
234
+
235
  def __init__(self, key: str):
236
  try:
237
  self.client = OpenAI(
238
  base_url="https://openrouter.ai/api/v1",
239
+ api_key=key
240
  )
241
  self.headers = {
242
  "HTTP-Referer": "http://localhost:3000",
 
247
  raise
248
 
249
  def stream(self, prompt: str) -> Iterator[str]:
250
+ """Stream response from LLM."""
251
  try:
252
  completion = self.client.chat.completions.create(
 
253
  model="meta-llama/llama-3.3-70b-instruct:free",
 
254
  messages=[{"role": "user", "content": prompt}],
255
  stream=True
256
  )
 
264
 
265
 
266
  class UserSession:
267
+ """Manage user session information and conversation history."""
268
+
269
def __init__(self, llm: OpenRouterLLM):
    """Start an empty session bound to the given LLM client.

    Args:
        llm: Client kept on the session as ``self.llm``.
    """
    self.llm = llm
    # No user, welcome text or turns exist until set_user() is called.
    self.conversation_history = []
    self.welcome_message = None
    self.current_user = None
274
+
275
+ def set_user(self, user_info: Dict[str, Any]) -> None:
276
+ """Set current user and initialize welcome message."""
277
  self.current_user = user_info
278
  self.set_welcome_message(user_info.get("Nickname", "Guest"))
279
  # Initialize conversation history with welcome message
 
281
  self.conversation_history = [
282
  {"role": "assistant", "content": welcome},
283
  ]
284
+
285
def get_user(self) -> Dict[str, Any]:
    """Return the object stored in ``self.current_user``."""
    active_user = self.current_user
    return active_user
288
+
289
def set_welcome_message(self, nickname: str, src_lang: str = "eng_Latn", tgt_lang: str = "kin_Latn") -> None:
    """Generate, translate and store the HTML welcome message.

    ``self.llm`` is prompted for a short welcome addressed to ``nickname``;
    the streamed pieces are concatenated, translated by a
    ``TranslationService`` built from the ``Token`` environment variable,
    and the result is wrapped in a styled ``<div>`` and saved on
    ``self.welcome_message``.

    Args:
        nickname: Name the welcome is addressed to.
        src_lang: Language code the generated text is translated from.
        tgt_lang: Language code the generated text is translated into.
    """
    guidelines = (
        f"1. Welcome {nickname} warmly and professionally. ",
        "2. Emphasize that this is a safe and trusted space. ",
        "3. Highlight specialized support for gender-based violence (GBV) and legal assistance. ",
        "4. Use a tone that is warm, reassuring, and professional. ",
        "5. Keep the message concise and impactful.",
    )
    prompt = (
        f"Create a very brief welcome message for {nickname}. "
        "The message should: "
    ) + "".join(guidelines)

    # Collect the streamed LLM output into one string.
    generated = "".join(self.llm.stream(prompt))

    # Translate the generated text with a service built from the env token.
    hf_token = os.environ.get('Token')
    welcome_text = TranslationService(hf_token).translate_text(generated, src_lang, tgt_lang)

    # Store the message wrapped in HTML styling.
    self.welcome_message = f"<div style='font-size: 20px;'>{welcome_text}</div>"
315
+
316
def get_welcome_message(self) -> str:
    """Return the value stored in ``self.welcome_message``."""
    stored_message = self.welcome_message
    return stored_message
319
+
320
def add_to_history(self, role: str, message: str) -> None:
    """Append one turn to ``self.conversation_history``.

    Args:
        role: Stored under the ``"role"`` key of the new entry.
        message: Stored under the ``"content"`` key of the new entry.
    """
    turn = {"role": role, "content": message}
    self.conversation_history.append(turn)
323
+
324
def get_conversation_history(self) -> List[Dict[str, str]]:
    """Return the session's history list itself (not a copy)."""
    turns = self.conversation_history
    return turns
327
+
328
def get_formatted_history(self) -> str:
    """Render the conversation history as plain text.

    Each entry becomes ``"<Speaker>: <content>"`` followed by a blank line,
    where the speaker is ``User`` for entries whose role is ``"user"`` and
    ``Assistant`` for every other role.

    Returns:
        The concatenated transcript, or an empty string when there is no
        history.
    """
    rendered_turns = []
    for turn in self.conversation_history:
        if turn["role"] == "user":
            speaker = "User"
        else:
            speaker = "Assistant"
        rendered_turns.append(f"{speaker}: {turn['content']}\n\n")
    return "".join(rendered_turns)
335
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
336
 
337
+ class GBVSupportChatbot:
338
+ """Main chatbot application class."""
339
+
340
+ def __init__(self):
341
+ self.api_key = os.environ.get('V2')
342
+ self.api_token = os.environ.get('Token')
343
+ self.llm_instance = OpenRouterLLM(key=self.api_key)
344
+ self.user_session = UserSession(self.llm_instance)
345
+ self.translator = TranslationService(self.api_token)
 
 
 
 
 
 
 
 
 
 
 
 
 
346
 
347
+ # Initialize embedding model
348
+ self.embed_model = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")
349
+
350
+ # Initialize vector store
351
+ self.vectorstore = Chroma(
352
+ collection_name="Dataset",
353
+ embedding_function=self.embed_model,
 
 
354
  )
355
+
356
+ # Template for GBV support chatbot
357
+ self.template = """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
358
  You are a compassionate and supportive AI assistant specializing in helping individuals affected by Gender-Based Violence (GBV). Your responses must be based EXCLUSIVELY on the information provided in the context. Your primary goal is to provide emotionally intelligent support while maintaining appropriate boundaries.
359
 
360
  **Previous conversation:** {conversation_history}
 
386
  - Extract only relevant information from {context} that directly addresses the question
387
  - Present information in accessible, non-technical language
388
  - When information is unavailable, respond with: "I don't have that specific information right now, {first_name}. Would it be helpful if I focus on [alternative support option]?"
 
389
 
390
  6. **Safety and Ethics**
391
  - Do not generate any speculative content or advice not supported by the context
 
396
  **Context:** {context}
397
  **User's Question:** {question}
398
  **Your Response:**
399
+ """
400
+
401
def load_data(self) -> None:
    """Load the tabular context data and add it to the vector store.

    A ``DataProcessor`` supplies the tabular texts, which are added to
    ``self.vectorstore``. PDF files are still processed and split, but the
    resulting chunks are not indexed. When there is no tabular data the
    vector store is left untouched instead of being handed an empty batch.
    """
    data_processor = DataProcessor()
    context_data = data_processor.process_tabular_data()

    # NOTE(review): the PDF pipeline is run but its chunks are not indexed;
    # the line that merged them into the dataset was commented out in the
    # original. Confirm whether PDF content should be searchable before
    # re-enabling it.
    pdf_documents = data_processor.process_pdf_files()
    data_processor.split_documents(pdf_documents)

    # Tolerate a processor that returns None as well as an empty sequence.
    all_data = list(context_data or [])
    if not all_data:
        print("No tabular context data found; vector store left empty.")
        return

    self.vectorstore.add_texts(all_data)
418
+
419
def create_rag_chain(self):
    """Build the callable that answers one question with retrieved context.

    Returns:
        A function that takes a dict with a ``"question"`` key, fills the
        prompt template with retrieved context, the user's nickname and the
        formatted conversation history, and returns whatever
        ``self.llm_instance.stream`` returns for that prompt.
    """
    retriever = self.vectorstore.as_retriever()
    rag_prompt = PromptTemplate.from_template(self.template)

    def stream_func(input_dict):
        question = input_dict["question"]

        # Retrieve supporting documents and flatten them to one text block.
        retrieved_docs = retriever.invoke(question)
        context_str = "\n".join(doc.page_content for doc in retrieved_docs)

        # Fall back to a generic name when no user has been registered.
        session_user = self.user_session.get_user() or {}
        first_name = session_user.get("Nickname", "User")

        history_text = self.user_session.get_formatted_history()

        filled_prompt = rag_prompt.format(
            context=context_str,
            question=question,
            first_name=first_name,
            conversation_history=history_text,
        )
        return self.llm_instance.stream(filled_prompt)

    return stream_func
448
+
449
def collect_user_info(self, nickname: str):
    """Register the visitor's nickname and switch the UI to the chat view.

    Args:
        nickname: Text from the nickname box. Surrounding whitespace is
            ignored; ``None``, empty and whitespace-only input is rejected.

    Returns:
        A 4-tuple in the order of the outputs wired to the submit button:
        the message to display, a visibility update for the chatbot
        container, a visibility update for the registration container, and
        the initial chat history.
    """
    # A name made only of spaces is as unusable as an empty one, so
    # normalise before validating and storing.
    nickname = (nickname or "").strip()
    if not nickname:
        return (
            "Nickname is required to proceed.",
            gr.update(visible=False),
            gr.update(visible=True),
            [],
        )

    # Store user info for the chat session.
    user_info = {
        "Nickname": nickname,
        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
    }
    self.user_session.set_user(user_info)

    # Read back the welcome message stored on the session.
    welcome_message = self.user_session.get_welcome_message()

    # Seed the chat with the welcome as an assistant-only first turn.
    chat_history = [(None, welcome_message)]

    return (
        welcome_message,
        gr.update(visible=True),
        gr.update(visible=False),
        chat_history,
    )
471
+
472
def rag_memory_stream(self, message: str, history, user_lang: str = "kin_Latn", system_lang: str = "eng_Latn"):
    """Answer one chat message, translating in and out of the system language.

    The message is translated from ``user_lang`` to ``system_lang`` and
    recorded in the session history, the RAG chain output is collected in
    full, and the untranslated answer is recorded in the history. The
    generator then yields a single item: the answer translated back to
    ``user_lang``.

    Args:
        message: The user's message.
        history: Accepted but not read by this method.
        user_lang: Language code of the user's messages.
        system_lang: Language code used for the prompt and stored history.

    Yields:
        The translated full response, once.
    """
    # Translate the incoming message and record it before prompting.
    question = self.translator.translate_text(message, user_lang, system_lang)
    self.user_session.add_to_history("user", question)

    # Drain the streamed answer into one string.
    answer_stream = self.create_rag_chain()({"question": question})
    answer = "".join(answer_stream)

    # The history keeps the untranslated answer; the user sees the translation.
    localized_answer = self.translator.translate_text(answer, system_lang, user_lang)
    self.user_session.add_to_history("assistant", answer)

    yield localized_answer
494
+
495
def create_chatbot_interface(self):
    """Create and configure the chatbot UI.

    Builds a Gradio ``Blocks`` app with a registration column (nickname box
    and submit button) that starts visible and a chat column that starts
    hidden. Clicking the submit button calls ``self.collect_user_info``,
    whose four return values are routed to the status markdown, the two
    containers and the chat widget.

    Returns:
        The configured ``gr.Blocks`` object (not yet launched).
    """
    with gr.Blocks() as demo:
        # User registration section
        with gr.Column(visible=True, elem_id="registration_container") as registration_container:
            gr.Markdown("### Your privacy matters to us! Just share a nickname you feel comfy with to start chatting..")

            with gr.Row():
                first_name = gr.Textbox(
                    label="Nickname",
                    placeholder="Enter a nickname you feel comfortable with",
                    scale=1,
                    elem_id="input_nickname"
                )

            with gr.Row():
                submit_btn = gr.Button("Start Chatting", variant="primary", scale=2)

            # Receives the first value returned by collect_user_info.
            response_message = gr.Markdown()

        # Chatbot section (initially hidden)
        with gr.Column(visible=False, elem_id="chatbot_container") as chatbot_container:
            chat_interface = gr.ChatInterface(
                fn=self.rag_memory_stream,
                title="Chat with GBVR",
                fill_height=True
            )

        # Footer with version info
        gr.Markdown("Ijwi ry'Ubufasha Chatbot v1.0.0 © 2025")

        # Handle user registration.
        # The order of this outputs list must match the 4-tuple returned by
        # collect_user_info.
        submit_btn.click(
            self.collect_user_info,
            inputs=[first_name],
            outputs=[response_message, chatbot_container, registration_container, chat_interface.chatbot]
        )

    # Add CSS styles
    # NOTE(review): the stylesheet is attached by assigning ``demo.css`` after
    # construction rather than through a constructor argument; confirm the
    # installed Gradio version honours this.
    demo.css = """
    :root {
        --background: #f0f0f0;
        --text: #000000;
    }

    body, .gradio-container {
        margin: 0;
        padding: 0;
        width: 100vw;
        height: 100vh;
        display: flex;
        flex-direction: column;
        justify-content: center;
        align-items: center;
        background: var(--background);
        color: var(--text);
    }

    .gradio-container {
        max-width: 100%;
        max-height: 100%;
    }

    .gr-box {
        background: var(--background);
        color: var(--text);
        border-radius: 12px;
        padding: 2rem;
        border: 1px solid rgba(0, 0, 0, 0.1);
        box-shadow: 0 4px 6px rgba(0, 0, 0, 0.05);
    }

    .gr-button-primary {
        background: var(--background);
        color: var(--text);
        padding: 12px 24px;
        border-radius: 8px;
        transition: all 0.3s ease;
        border: 1px solid rgba(0, 0, 0, 0.1);
    }

    .gr-button-primary:hover {
        transform: translateY(-1px);
        box-shadow: 0 4px 12px rgba(0, 0, 0, 0.2);
    }

    footer {
        text-align: center;
        color: var(--text);
        opacity: 0.7;
        padding: 1rem;
        font-size: 0.9em;
    }

    .gr-markdown h3 {
        color: var(--text);
        margin-bottom: 1rem;
    }

    .registration-markdown, .chat-title h1 {
        color: var(--text);
    }
    """

    return demo
600
 
601
+ # Main execution function
602
def main():
    """Build the chatbot, load its data, and launch the UI with ``share=True``."""
    bot = GBVSupportChatbot()

    # The vector store must be populated before the interface is served.
    bot.load_data()

    interface = bot.create_chatbot_interface()
    interface.launch(share=True)
612
 
 
613
# Run the application only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()