Macdensten91 committed on
Commit
1e6efef
·
verified ·
1 Parent(s): 6ac4ea2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +43 -112
app.py CHANGED
@@ -23,10 +23,10 @@ if ENABLE_SCRAPING:
23
  options.headless = True # Run browser in headless mode.
24
  driver = webdriver.Chrome(options=options)
25
  driver.get(url)
26
- # Use explicit waits in production code; here we use a simple sleep.
27
  time.sleep(5)
28
  try:
29
- # Customize the selector as per your site's HTML structure.
30
  content_element = driver.find_element(By.ID, "content")
31
  page_text = content_element.text
32
  except Exception as e:
@@ -43,135 +43,66 @@ else:
43
 
44
  # --- Multilingual Helpers ---
45
 
46
- # Language-specific greeting detection
47
  def is_greeting(query: str, lang: str) -> bool:
48
  greetings = {
49
  "en": ["hello", "hi", "hey", "good morning", "good afternoon", "good evening"],
50
  "fr": ["bonjour", "salut", "coucou", "bonsoir"],
51
  "am": ["ሰላም", "ሰላም እንደምን", "እንዴት"]
52
  }
53
- # Retrieve greetings for the provided language; default to English if unavailable.
54
  greet_list = greetings.get(lang, greetings["en"])
55
- # For Amharic, no transformation; for Latin scripts, convert to lower case.
56
  if lang != "am":
57
  query = query.lower()
58
  return any(query.startswith(greet) for greet in greet_list)
59
 
60
- # Language-specific out-of-scope messages
61
- def get_out_of_scope_message(lang: str) -> str:
62
- messages = {
63
- "en": [
64
- "I appreciate your curiosity. However, my expertise lies exclusively in agricultural and agro-investment insights. Could you please frame your question accordingly?",
65
- "That’s an interesting thought, but I'm tailored specifically for topics concerning agriculture and agro-investment. Please ask a question within that realm.",
66
- "While I value your inquiry, I'm optimized to provide insights solely on agriculture and related investment matters. Could you rephrase your query to align with these topics?",
67
- "It appears your question may not be directly tied to agriculture or agro-investment. Please ask something along those lines so I can assist effectively."
68
- ],
69
- "fr": [
70
- "J'apprécie votre curiosité. Cependant, mon expertise se limite exclusivement aux informations sur l'agriculture et les investissements agroalimentaires. Pourriez-vous reformuler votre question en ce sens ?",
71
- "C'est une pensée intéressante, mais je suis spécialisé dans les domaines de l'agriculture et des investissements agroalimentaires. Merci de poser une question dans ce domaine.",
72
- "Bien que votre question soit pertinente, je me concentre uniquement sur l'agriculture et les investissements associés. Pourriez-vous reformuler votre demande en conséquence ?",
73
- "Votre interrogation semble éloignée de l'agriculture ou des investissements agroalimentaires. Merci de poser une question dans ces domaines pour que je puisse vous aider efficacement."
74
- ],
75
- "am": [
76
- "እባክዎ ልጠይቁት ጥያቄ በተለይ በግብርናና በአገልግሎት ስርዓተ-ቢዝነስ ዙሪያ መሆኑን አላስቀምጥም። እባኮትን ጥያቄዎን እንደዚህ በማቅረብ ደግሞ ይሞክሩ።",
77
- "ልዩ ጥያቄዎችን ማቅረብ ይፈልጋሉ እንጂ፣ እኔ በተለይ በግብርናና በአገልግሎት ስርዓተ-ቢዝነስ ጥያቄዎች ላይ ብቻ እንደሚሰራ ተዘጋጅቻለሁ። እባክዎ ጥያቄዎን በእነዚህ ክስተቶች ውስጥ ያቅርቡ።",
78
- "እንደምታዩት ጥያቄዎ በግብርና ወይም በአገልግሎት ስርዓተ-ቢዝነስ ላይ የተመረጠ አይደለም። እባክዎ በዚህ አውድ የሆነ ጥያቄ ይጠይቁ።"
79
- ]
80
  }
81
- # Return a random message for the given language; default to English if not available.
82
- return random.choice(messages.get(lang, messages["en"]))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
 
84
- # Helper to determine if a query is relevant to our domain (English check only; can be expanded).
85
  def is_domain_query(query: str) -> bool:
86
  domain_keywords = [
87
  "agriculture", "farming", "crop", "agro", "investment", "soil",
88
  "irrigation", "harvest", "organic", "sustainable", "agribusiness",
89
- "livestock", # additional English keywords
90
- "agriculture", "agroalimentaire", "agriculture durable" # French terms can also be included
91
  ]
92
  return any(re.search(r"\b" + keyword + r"\b", query, re.IGNORECASE) for keyword in domain_keywords)
93
 
94
  def retrieve_relevant_snippet(query: str, text: str, max_length: int = 300) -> str:
95
- """
96
- A simple retrieval function that searches for sentences in the text
97
- containing domain keywords from the query.
98
- Returns a snippet limited to max_length characters.
99
- """
100
  sentences = re.split(r'[.?!]', text)
101
- for sentence in sentences:
102
- if is_domain_query(sentence) and all(word.lower() in sentence.lower() for word in query.split()):
103
- snippet = sentence.strip()
104
- return snippet[:max_length] + "..." if len(snippet) > max_length else snippet
105
- return ""
106
-
107
- # --- Chat Assistant Response Function ---
108
- client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
109
-
110
- def respond(message, history: list[tuple[str, str]], system_message, max_tokens, temperature, top_p, language):
111
- # language is expected as a string code: "en", "fr", or "am"
112
-
113
- # Check for a greeting in the appropriate language.
114
- if is_greeting(message, language):
115
- greetings = {
116
- "en": "Hello! How can I assist you today with your agriculture or agro-investment inquiries?",
117
- "fr": "Bonjour! Comment puis-je vous aider aujourd'hui en matière d'agriculture ou d'investissements agroalimentaires?",
118
- "am": "ሰላም! ዛሬ ስለ ግብርና ወይም ስለ አገልግሎት ስርዓተ-ቢዝነስ ጥያቄዎች እንዴት ልረዳዎት?"
119
- }
120
- yield greetings.get(language, greetings["en"])
121
- return
122
-
123
- # If the query is not recognized as domain related, return an out-of-scope message.
124
- if not is_domain_query(message):
125
- yield get_out_of_scope_message(language)
126
- return
127
-
128
- # Build conversation context starting with the system message.
129
- messages_context = [{"role": "system", "content": system_message}]
130
- for user_msg, assistant_msg in history:
131
- if user_msg:
132
- messages_context.append({"role": "user", "content": user_msg})
133
- if assistant_msg:
134
- messages_context.append({"role": "assistant", "content": assistant_msg})
135
-
136
- # Optional: Append retrieved context from scraped site content.
137
- if knowledge_base:
138
- snippet = retrieve_relevant_snippet(message, knowledge_base)
139
- if snippet:
140
- retrieval_context = f"Reference info from Agri Future Investment platform: {snippet}"
141
- messages_context.insert(0, {"role": "system", "content": retrieval_context})
142
-
143
- # Append the new user message.
144
- messages_context.append({"role": "user", "content": message})
145
-
146
- # Stream the model's reply token-by-token.
147
- response = ""
148
- for message_resp in client.chat_completion(
149
- messages_context,
150
- max_tokens=max_tokens,
151
- stream=True,
152
- temperature=temperature,
153
- top_p=top_p,
154
- ):
155
- token = message_resp.choices[0].delta.content
156
- response += token
157
- yield response
158
-
159
- # --- Gradio Chat Interface ---
160
-
161
- # The language selection dropdown uses language codes: "en" for English, "fr" for French, "am" for Amharic.
162
- demo = gr.ChatInterface(
163
- respond,
164
- additional_inputs=[
165
- gr.Textbox(
166
- value="You are AgriFutureBot, designed to help visitors of the Agri Future Investment platform understand content about the site and answer questions strictly related to agriculture and agro-investment topics.",
167
- label="System Message"
168
- ),
169
- gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max New Tokens"),
170
- gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
171
- gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (Nucleus Sampling)"),
172
- gr.Dropdown(choices=["en", "fr", "am"], value="en", label="Language (en, fr, am)")
173
- ],
174
- )
175
-
176
- if __name__ == "__main__":
177
- demo.launch()
 
23
  options.headless = True # Run browser in headless mode.
24
  driver = webdriver.Chrome(options=options)
25
  driver.get(url)
26
+ # Use explicit waits in production; here we use a basic sleep.
27
  time.sleep(5)
28
  try:
29
+ # Customize the selector based on your site's HTML structure.
30
  content_element = driver.find_element(By.ID, "content")
31
  page_text = content_element.text
32
  except Exception as e:
 
43
 
44
  # --- Multilingual Helpers ---
45
 
 
46
  def is_greeting(query: str, lang: str) -> bool:
47
  greetings = {
48
  "en": ["hello", "hi", "hey", "good morning", "good afternoon", "good evening"],
49
  "fr": ["bonjour", "salut", "coucou", "bonsoir"],
50
  "am": ["ሰላም", "ሰላም እንደምን", "እንዴት"]
51
  }
 
52
  greet_list = greetings.get(lang, greetings["en"])
53
+ # For languages using Latin script, convert to lower case.
54
  if lang != "am":
55
  query = query.lower()
56
  return any(query.startswith(greet) for greet in greet_list)
57
 
58
+ # Rather than using fixed out-of-scope messages, use the model via Hugging Face to generate them.
59
+ def generate_dynamic_out_of_scope_message(language: str) -> str:
60
+ # Define language-specific system prompts for generating a dynamic out-of-scope message.
61
+ system_prompts = {
62
+ "en": (
63
+ "You are a helpful chatbot specializing in agriculture and agro-investment. "
64
+ "A user just asked a question that is not related to these topics. "
65
+ "Generate a friendly, varied, and intelligent out-of-scope response in English that kindly encourages the user to ask about agriculture or agro-investment."
66
+ ),
67
+ "fr": (
68
+ "Vous êtes un chatbot utile spécialisé dans l'agriculture et les investissements agroalimentaires. "
69
+ "Un utilisateur vient de poser une question qui ne concerne pas ces sujets. "
70
+ "Générez une réponse élégante, variée et intelligente en français pour indiquer que la question est hors de portée, en invitant l'utilisateur à poser une question sur l'agriculture ou les investissements agroalimentaires."
71
+ ),
72
+ "am": (
73
+ "እርስዎ በግብርናና በአገልግሎት ስርዓተ-ቢዝነስ ውስጥ በተለይ የተሞሉ ቻትቦት ናቸው። "
74
+ "ተጠቃሚው ለግብርና ወይም ለአገልግሎት ስርዓተ-ቢዝነስ ተያይዞ ያልሆነ ጥያቄ አስቀድመዋል። "
75
+ "በአማርኛ በተለያዩ መልኩ የውጭ ክፍል መልእክት ፍጥረት ያድርጉ፤ እባኮትን ተጠቃሚውን ለግብርና ወይም ለአገልግሎት ጥያቄዎች ለመጠየቅ ያነጋግሩ።"
76
+ )
 
77
  }
78
+ prompt = system_prompts.get(language, system_prompts["en"])
79
+ messages = [{"role": "system", "content": prompt}]
80
+
81
+ # Call the model without streaming to generate the complete message.
82
+ response = client.chat_completion(
83
+ messages,
84
+ max_tokens=80,
85
+ stream=False,
86
+ temperature=0.7,
87
+ top_p=0.95,
88
+ )
89
+ # Depending on the client structure, adjust the extraction of the generated text.
90
+ try:
91
+ out_message = response.choices[0].message.content
92
+ except AttributeError:
93
+ # If the response structure differs, do a fallback conversion.
94
+ out_message = str(response)
95
+ return out_message.strip()
96
 
97
+ # A helper to determine domain relevance (basic implementation; can be expanded).
98
  def is_domain_query(query: str) -> bool:
99
  domain_keywords = [
100
  "agriculture", "farming", "crop", "agro", "investment", "soil",
101
  "irrigation", "harvest", "organic", "sustainable", "agribusiness",
102
+ "livestock", "agroalimentaire", "agriculture durable"
 
103
  ]
104
  return any(re.search(r"\b" + keyword + r"\b", query, re.IGNORECASE) for keyword in domain_keywords)
105
 
106
  def retrieve_relevant_snippet(query: str, text: str, max_length: int = 300) -> str:
 
 
 
 
 
107
  sentences = re.split(r'[.?!]', text)
108
+ for sentence in sentences: