Update app.py
Browse files
app.py
CHANGED
@@ -1,93 +1,177 @@
|
|
1 |
-
import random
|
2 |
import re
|
|
|
|
|
|
|
3 |
from huggingface_hub import InferenceClient
|
4 |
|
5 |
-
#
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
|
11 |
-
# Multilingual
|
12 |
-
greetings = {
|
13 |
-
"en": ["hello", "hi", "hey", "good morning", "good afternoon", "good evening"],
|
14 |
-
"fr": ["bonjour", "salut", "coucou", "bonsoir"],
|
15 |
-
"am": ["ሰላም", "ሰላም እንደምን", "እንዴት"]
|
16 |
-
}
|
17 |
|
|
|
18 |
def is_greeting(query: str, lang: str) -> bool:
|
19 |
-
|
20 |
-
|
21 |
-
|
|
|
|
|
|
|
22 |
greet_list = greetings.get(lang, greetings["en"])
|
23 |
-
#
|
24 |
if lang != "am":
|
25 |
query = query.lower()
|
26 |
return any(query.startswith(greet) for greet in greet_list)
|
27 |
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
"
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
"
|
41 |
-
"
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
"
|
46 |
-
"
|
47 |
-
|
48 |
-
)
|
49 |
}
|
50 |
-
|
51 |
-
messages
|
52 |
-
|
53 |
-
# Call the model to generate the response
|
54 |
-
response = client.chat_completion(
|
55 |
-
messages,
|
56 |
-
max_tokens=80,
|
57 |
-
temperature=0.7,
|
58 |
-
top_p=0.95,
|
59 |
-
)
|
60 |
-
# Extract the generated message content
|
61 |
-
try:
|
62 |
-
out_message = response.choices[0].message.content
|
63 |
-
except AttributeError:
|
64 |
-
out_message = str(response)
|
65 |
-
return out_message.strip()
|
66 |
|
|
|
67 |
def is_domain_query(query: str) -> bool:
|
68 |
-
"""
|
69 |
-
Determine if the query is related to agriculture or agro-investment.
|
70 |
-
"""
|
71 |
domain_keywords = [
|
72 |
"agriculture", "farming", "crop", "agro", "investment", "soil",
|
73 |
"irrigation", "harvest", "organic", "sustainable", "agribusiness",
|
74 |
-
"livestock",
|
|
|
75 |
]
|
76 |
return any(re.search(r"\b" + keyword + r"\b", query, re.IGNORECASE) for keyword in domain_keywords)
|
77 |
|
78 |
-
def
|
79 |
"""
|
80 |
-
|
|
|
|
|
81 |
"""
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import re
|
2 |
+
import time
|
3 |
+
import random
|
4 |
+
import gradio as gr
|
5 |
from huggingface_hub import InferenceClient
|
6 |
|
7 |
+
# Optional: enable scraping once the site is deployed and Selenium/Chrome are available.
ENABLE_SCRAPING = False
SITE_URL = "https://your-agri-future-site.com"

# Text scraped from SITE_URL; stays empty when scraping is disabled or fails.
knowledge_base = ""

# --- Optional: Scraping Functionality ---
if ENABLE_SCRAPING:
    try:
        # Selenium is imported only when scraping is requested, so the app
        # still starts on machines where it is not installed.
        from selenium import webdriver
        from selenium.webdriver.chrome.options import Options
        from selenium.webdriver.common.by import By

        def scrape_site(url):
            """Return the visible text of the element with id ``content`` at *url*.

            Returns an empty string when the element cannot be read, so that
            an error message is never mistaken for site content. The browser
            is always closed, even if loading the page raises.
            """
            options = Options()
            # Pass the command-line switch instead of assigning
            # ``options.headless``: that setter was deprecated and later
            # removed in Selenium 4.x, where the assignment has no effect.
            options.add_argument("--headless")
            driver = webdriver.Chrome(options=options)
            try:
                driver.get(url)
                # Use explicit waits in production code; here we use a simple sleep.
                time.sleep(5)
                try:
                    # Customize the selector as per your site's HTML structure.
                    return driver.find_element(By.ID, "content").text
                except Exception as e:
                    print("Error encountered during scraping:", e)
                    return ""
            finally:
                # Release the browser process on every exit path.
                driver.quit()

        knowledge_base = scrape_site(SITE_URL)
        if knowledge_base:
            print("Scraped knowledge base successfully.")
        else:
            print("Scraping returned no content; proceeding without scraped site content.")
    except Exception as e:
        print("Scraping failed or Selenium is not configured:", e)
else:
    print("Scraping is disabled; proceeding without scraped site content.")
|
43 |
|
44 |
+
# --- Multilingual Helpers ---
|
|
|
|
|
|
|
|
|
|
|
45 |
|
46 |
+
# Language-specific greeting detection
def is_greeting(query: str, lang: str) -> bool:
    """Return True when *query* opens with a greeting in language *lang*.

    Args:
        query: Raw user message.
        lang: Language code ("en", "fr" or "am"); any other value falls back
            to the English greeting list.

    Returns:
        True if the message, ignoring leading whitespace, starts with a known
        greeting that is followed by the end of the text or by a
        non-alphanumeric character. This keeps words that merely begin with a
        greeting ("history", "high yield") from being treated as greetings.
    """
    greetings = {
        "en": ["hello", "hi", "hey", "good morning", "good afternoon", "good evening"],
        "fr": ["bonjour", "salut", "coucou", "bonsoir"],
        "am": ["ሰላም", "ሰላም እንደምን", "እንዴት"]
    }
    # Retrieve greetings for the provided language; default to English if unavailable.
    greet_list = greetings.get(lang, greetings["en"])
    text = query.strip()
    # For Amharic, no case transformation; for Latin scripts, convert to lower case.
    if lang != "am":
        text = text.lower()
    for greet in greet_list:
        if not text.startswith(greet):
            continue
        # Character right after the greeting ("" at end of text); a letter or
        # digit here means the greeting is only a prefix of a longer word.
        following = text[len(greet):len(greet) + 1]
        if not following.isalnum():
            return True
    return False
|
59 |
|
60 |
+
# Language-specific out-of-scope messages
def get_out_of_scope_message(lang: str) -> str:
    """Pick a random "please stay on topic" reply for the given language.

    Args:
        lang: Language code ("en", "fr" or "am").

    Returns:
        One randomly chosen message in that language; English is used when
        the code is not one of the known languages.
    """
    english_replies = [
        "I appreciate your curiosity. However, my expertise lies exclusively in agricultural and agro-investment insights. Could you please frame your question accordingly?",
        "That’s an interesting thought, but I'm tailored specifically for topics concerning agriculture and agro-investment. Please ask a question within that realm.",
        "While I value your inquiry, I'm optimized to provide insights solely on agriculture and related investment matters. Could you rephrase your query to align with these topics?",
        "It appears your question may not be directly tied to agriculture or agro-investment. Please ask something along those lines so I can assist effectively.",
    ]
    french_replies = [
        "J'apprécie votre curiosité. Cependant, mon expertise se limite exclusivement aux informations sur l'agriculture et les investissements agroalimentaires. Pourriez-vous reformuler votre question en ce sens ?",
        "C'est une pensée intéressante, mais je suis spécialisé dans les domaines de l'agriculture et des investissements agroalimentaires. Merci de poser une question dans ce domaine.",
        "Bien que votre question soit pertinente, je me concentre uniquement sur l'agriculture et les investissements associés. Pourriez-vous reformuler votre demande en conséquence ?",
        "Votre interrogation semble éloignée de l'agriculture ou des investissements agroalimentaires. Merci de poser une question dans ces domaines pour que je puisse vous aider efficacement.",
    ]
    amharic_replies = [
        "እባክዎ ልጠይቁት ጥያቄ በተለይ በግብርናና በአገልግሎት ስርዓተ-ቢዝነስ ዙሪያ መሆኑን አላስቀምጥም። እባኮትን ጥያቄዎን እንደዚህ በማቅረብ ደግሞ ይሞክሩ።",
        "ልዩ ጥያቄዎችን ማቅረብ ይፈልጋሉ እንጂ፣ እኔ በተለይ በግብርናና በአገልግሎት ስርዓተ-ቢዝነስ ጥያቄዎች ላይ ብቻ እንደሚሰራ ተዘጋጅቻለሁ። እባክዎ ጥያቄዎን በእነዚህ ክስተቶች ውስጥ ያቅርቡ።",
        "እንደምታዩት ጥያቄዎ በግብርና ወይም በአገልግሎት ስርዓተ-ቢዝነስ ላይ የተመረጠ አይደለም። እባክዎ በዚህ አውድ የሆነ ጥያቄ ይጠይቁ።",
    ]
    replies_by_language = {
        "en": english_replies,
        "fr": french_replies,
        "am": amharic_replies,
    }
    # Unknown language codes fall back to the English replies.
    if lang in replies_by_language:
        candidates = replies_by_language[lang]
    else:
        candidates = english_replies
    return random.choice(candidates)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
83 |
|
84 |
+
# Helper to determine if a query is relevant to our domain
# (English keywords plus a few French terms; can be expanded).
_DOMAIN_KEYWORDS = (
    "agriculture", "farming", "crop", "agro", "investment", "soil",
    "irrigation", "harvest", "organic", "sustainable", "agribusiness",
    "livestock",  # additional English keywords
    "agroalimentaire", "agriculture durable",  # French terms
)

# One alternation compiled once at import time. Keywords are escaped so they
# are matched literally, and an optional trailing "s" lets simple plurals
# ("crops", "investments") count as domain words too.
_DOMAIN_PATTERN = re.compile(
    r"\b(?:" + "|".join(re.escape(keyword) for keyword in _DOMAIN_KEYWORDS) + r")s?\b",
    re.IGNORECASE,
)


def is_domain_query(query: str) -> bool:
    """Return True when *query* mentions an agriculture/agro-investment keyword.

    Matching is case-insensitive and anchored on word boundaries, so a
    keyword embedded in a longer unrelated word (e.g. "crop" inside
    "acropolis") does not count.

    Args:
        query: Text to inspect.

    Returns:
        True if any domain keyword, or its plain "s" plural, occurs as a
        whole word in *query*.
    """
    return _DOMAIN_PATTERN.search(query) is not None
|
93 |
|
94 |
+
def retrieve_relevant_snippet(query: str, text: str, max_length: int = 300) -> str:
    """Return the first domain-related sentence of *text* that contains every query word.

    The text is split into sentences on ".", "?" and "!". A sentence
    qualifies when ``is_domain_query`` accepts it and each word of the query
    occurs in it (case-insensitive substring match).

    Args:
        query: User question; only its word characters are used, so trailing
            punctuation such as "?" does not prevent a match.
        text: Knowledge-base text to search.
        max_length: Maximum number of characters kept from the sentence
            before "..." is appended.

    Returns:
        The stripped sentence, truncated to *max_length* characters plus
        "..." when longer; an empty string if nothing matches or the query
        contains no words.
    """
    # Tokenize on word characters: sentences are split on [.?!], so a raw
    # token like "soil?" could never be found inside any sentence.
    query_terms = re.findall(r"\w+", query.lower())
    if not query_terms:
        return ""
    for sentence in re.split(r"[.?!]", text):
        lowered = sentence.lower()
        if is_domain_query(sentence) and all(term in lowered for term in query_terms):
            snippet = sentence.strip()
            if len(snippet) > max_length:
                return snippet[:max_length] + "..."
            return snippet
    return ""
|
106 |
+
|
107 |
+
# --- Chat Assistant Response Function ---
client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")


def respond(message, history: list[tuple[str, str]], system_message, max_tokens, temperature, top_p, language):
    """Generate the assistant's reply for the chat UI, streaming partial text.

    Args:
        message: The new user message.
        history: Earlier (user, assistant) message pairs.
        system_message: System prompt sent first to the model.
        max_tokens: Forwarded to ``client.chat_completion``.
        temperature: Forwarded to ``client.chat_completion``.
        top_p: Forwarded to ``client.chat_completion``.
        language: Language code, expected to be "en", "fr" or "am".

    Yields:
        A canned greeting or out-of-scope message (a single item), or else
        the model's reply accumulated so far, once per streamed token.
    """
    # Check for a greeting in the appropriate language.
    if is_greeting(message, language):
        greeting_replies = {
            "en": "Hello! How can I assist you today with your agriculture or agro-investment inquiries?",
            "fr": "Bonjour! Comment puis-je vous aider aujourd'hui en matière d'agriculture ou d'investissements agroalimentaires?",
            "am": "ሰላም! ዛሬ ስለ ግብርና ወይም ስለ አገልግሎት ስርዓተ-ቢዝነስ ጥያቄዎች እንዴት ልረዳዎት?"
        }
        yield greeting_replies.get(language, greeting_replies["en"])
        return

    # If the query is not recognized as domain related, return an out-of-scope message.
    if not is_domain_query(message):
        yield get_out_of_scope_message(language)
        return

    # Build conversation context starting with the system message.
    messages_context = [{"role": "system", "content": system_message}]
    for user_msg, assistant_msg in history:
        if user_msg:
            messages_context.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages_context.append({"role": "assistant", "content": assistant_msg})

    # Optional: add retrieved context from scraped site content. It is placed
    # ahead of the system prompt as an extra system message.
    if knowledge_base:
        snippet = retrieve_relevant_snippet(message, knowledge_base)
        if snippet:
            retrieval_context = f"Reference info from Agri Future Investment platform: {snippet}"
            messages_context.insert(0, {"role": "system", "content": retrieval_context})

    # Append the new user message.
    messages_context.append({"role": "user", "content": message})

    # Stream the model's reply token-by-token.
    response = ""
    for chunk in client.chat_completion(
        messages_context,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
    ):
        # Some stream chunks carry no text (empty choices, or a delta whose
        # content is None); concatenating None would raise TypeError.
        if not chunk.choices:
            continue
        token = chunk.choices[0].delta.content
        if not token:
            continue
        response += token
        yield response
|
158 |
+
|
159 |
+
# --- Gradio Chat Interface ---

# Extra controls are passed to respond() in this order: system message,
# max tokens, temperature, top-p, language. The language dropdown uses
# language codes: "en" for English, "fr" for French, "am" for Amharic.
demo = gr.ChatInterface(
    fn=respond,
    additional_inputs=[
        gr.Textbox(
            label="System Message",
            value="You are AgriFutureBot, designed to help visitors of the Agri Future Investment platform understand content about the site and answer questions strictly related to agriculture and agro-investment topics.",
        ),
        gr.Slider(label="Max New Tokens", minimum=1, maximum=2048, step=1, value=512),
        gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.7),
        gr.Slider(label="Top-p (Nucleus Sampling)", minimum=0.1, maximum=1.0, step=0.05, value=0.95),
        gr.Dropdown(label="Language (en, fr, am)", choices=["en", "fr", "am"], value="en"),
    ],
)

# Start the web UI only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()
|