Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,10 +1,10 @@
|
|
1 |
import re
|
2 |
import time
|
|
|
3 |
import gradio as gr
|
4 |
from huggingface_hub import InferenceClient
|
5 |
|
6 |
# Optional: Enable scraping if your site is deployed.
|
7 |
-
# Set this flag to False until your site is available.
|
8 |
ENABLE_SCRAPING = False
|
9 |
SITE_URL = "https://your-agri-future-site.com"
|
10 |
|
@@ -23,10 +23,10 @@ if ENABLE_SCRAPING:
|
|
23 |
options.headless = True # Run browser in headless mode.
|
24 |
driver = webdriver.Chrome(options=options)
|
25 |
driver.get(url)
|
26 |
-
# Use explicit
|
27 |
time.sleep(5)
|
28 |
try:
|
29 |
-
# Customize the selector
|
30 |
content_element = driver.find_element(By.ID, "content")
|
31 |
page_text = content_element.text
|
32 |
except Exception as e:
|
@@ -41,63 +41,112 @@ if ENABLE_SCRAPING:
|
|
41 |
else:
|
42 |
print("Scraping is disabled; proceeding without scraped site content.")
|
43 |
|
44 |
-
# ---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
45 |
def is_domain_query(query: str) -> bool:
|
46 |
-
"""Check if the query is relevant to agriculture and agro-investment."""
|
47 |
domain_keywords = [
|
48 |
"agriculture", "farming", "crop", "agro", "investment", "soil",
|
49 |
"irrigation", "harvest", "organic", "sustainable", "agribusiness",
|
50 |
-
"livestock"
|
|
|
51 |
]
|
52 |
return any(re.search(r"\b" + keyword + r"\b", query, re.IGNORECASE) for keyword in domain_keywords)
|
53 |
|
54 |
def retrieve_relevant_snippet(query: str, text: str, max_length: int = 300) -> str:
|
55 |
"""
|
56 |
-
A simple retrieval function that searches for
|
57 |
-
|
58 |
Returns a snippet limited to max_length characters.
|
59 |
"""
|
60 |
sentences = re.split(r'[.?!]', text)
|
61 |
for sentence in sentences:
|
62 |
if is_domain_query(sentence) and all(word.lower() in sentence.lower() for word in query.split()):
|
63 |
snippet = sentence.strip()
|
64 |
-
if len(snippet) > max_length
|
65 |
-
snippet = snippet[:max_length] + "..."
|
66 |
-
return snippet
|
67 |
return ""
|
68 |
|
69 |
# --- Chat Assistant Response Function ---
|
70 |
client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
|
71 |
|
72 |
-
def respond(message, history: list[tuple[str, str]], system_message, max_tokens, temperature, top_p):
|
73 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
74 |
if not is_domain_query(message):
|
75 |
-
yield
|
76 |
return
|
77 |
|
78 |
-
# Build
|
79 |
-
|
80 |
for user_msg, assistant_msg in history:
|
81 |
if user_msg:
|
82 |
-
|
83 |
if assistant_msg:
|
84 |
-
|
85 |
|
86 |
-
# Optional: Append
|
87 |
if knowledge_base:
|
88 |
snippet = retrieve_relevant_snippet(message, knowledge_base)
|
89 |
if snippet:
|
90 |
-
|
91 |
-
|
92 |
-
messages.insert(0, {"role": "system", "content": retrieval_context})
|
93 |
|
94 |
-
# Append the new user
|
95 |
-
|
96 |
|
97 |
# Stream the model's reply token-by-token.
|
98 |
response = ""
|
99 |
for message_resp in client.chat_completion(
|
100 |
-
|
101 |
max_tokens=max_tokens,
|
102 |
stream=True,
|
103 |
temperature=temperature,
|
@@ -108,6 +157,8 @@ def respond(message, history: list[tuple[str, str]], system_message, max_tokens,
|
|
108 |
yield response
|
109 |
|
110 |
# --- Gradio Chat Interface ---
|
|
|
|
|
111 |
demo = gr.ChatInterface(
|
112 |
respond,
|
113 |
additional_inputs=[
|
@@ -118,6 +169,7 @@ demo = gr.ChatInterface(
|
|
118 |
gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max New Tokens"),
|
119 |
gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
|
120 |
gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (Nucleus Sampling)"),
|
|
|
121 |
],
|
122 |
)
|
123 |
|
|
|
1 |
import re
|
2 |
import time
|
3 |
+
import random
|
4 |
import gradio as gr
|
5 |
from huggingface_hub import InferenceClient
|
6 |
|
7 |
# Optional: Enable scraping if your site is deployed.
|
|
|
8 |
ENABLE_SCRAPING = False
|
9 |
SITE_URL = "https://your-agri-future-site.com"
|
10 |
|
|
|
23 |
options.headless = True # Run browser in headless mode.
|
24 |
driver = webdriver.Chrome(options=options)
|
25 |
driver.get(url)
|
26 |
+
# Use explicit waits in production code; here we use a simple sleep.
|
27 |
time.sleep(5)
|
28 |
try:
|
29 |
+
# Customize the selector as per your site's HTML structure.
|
30 |
content_element = driver.find_element(By.ID, "content")
|
31 |
page_text = content_element.text
|
32 |
except Exception as e:
|
|
|
41 |
else:
|
42 |
print("Scraping is disabled; proceeding without scraped site content.")
|
43 |
|
44 |
+
# --- Multilingual Helpers ---
|
45 |
+
|
46 |
+
# Language-specific greeting detection
|
47 |
+
def is_greeting(query: str, lang: str) -> bool:
    """Return True when *query* opens with a greeting in language *lang*.

    Args:
        query: Raw user message.
        lang: Language code ("en", "fr" or "am"). Unknown codes fall back
            to the English greeting list.

    Returns:
        True if the message starts with one of the known greetings,
        False otherwise.
    """
    greetings = {
        "en": ["hello", "hi", "hey", "good morning", "good afternoon", "good evening"],
        "fr": ["bonjour", "salut", "coucou", "bonsoir"],
        "am": ["ሰላም", "ሰላም እንደምን", "እንዴት"],
    }
    # Retrieve greetings for the provided language; default to English if unavailable.
    greet_list = greetings.get(lang, greetings["en"])

    # Leading whitespace must not hide a greeting.
    text = query.strip()

    if lang == "am":
        # Amharic: no case transformation, plain prefix match.
        return text.startswith(tuple(greet_list))

    text = text.lower()
    for greet in greet_list:
        if not text.startswith(greet):
            continue
        # Require the greeting to end at a word boundary so that words which
        # merely begin with one ("history", "hiring", "salutations") are not
        # mistaken for a greeting.
        remainder = text[len(greet):]
        if not remainder or not remainder[0].isalnum():
            return True
    return False
|
59 |
+
|
60 |
+
# Language-specific out-of-scope messages
|
61 |
+
def get_out_of_scope_message(lang: str) -> str:
    """Pick a random "out of scope" reply for the given language code.

    Args:
        lang: Language code ("en", "fr" or "am"). Any other value uses the
            English replies.

    Returns:
        One of the canned replies for that language, chosen with
        ``random.choice``.
    """
    english = [
        "I appreciate your curiosity. However, my expertise lies exclusively in agricultural and agro-investment insights. Could you please frame your question accordingly?",
        "That’s an interesting thought, but I'm tailored specifically for topics concerning agriculture and agro-investment. Please ask a question within that realm.",
        "While I value your inquiry, I'm optimized to provide insights solely on agriculture and related investment matters. Could you rephrase your query to align with these topics?",
        "It appears your question may not be directly tied to agriculture or agro-investment. Please ask something along those lines so I can assist effectively.",
    ]
    french = [
        "J'apprécie votre curiosité. Cependant, mon expertise se limite exclusivement aux informations sur l'agriculture et les investissements agroalimentaires. Pourriez-vous reformuler votre question en ce sens ?",
        "C'est une pensée intéressante, mais je suis spécialisé dans les domaines de l'agriculture et des investissements agroalimentaires. Merci de poser une question dans ce domaine.",
        "Bien que votre question soit pertinente, je me concentre uniquement sur l'agriculture et les investissements associés. Pourriez-vous reformuler votre demande en conséquence ?",
        "Votre interrogation semble éloignée de l'agriculture ou des investissements agroalimentaires. Merci de poser une question dans ces domaines pour que je puisse vous aider efficacement.",
    ]
    amharic = [
        "እባክዎ ልጠይቁት ጥያቄ በተለይ በግብርናና በአገልግሎት ስርዓተ-ቢዝነስ ዙሪያ መሆኑን አላስቀምጥም። እባኮትን ጥያቄዎን እንደዚህ በማቅረብ ደግሞ ይሞክሩ።",
        "ልዩ ጥያቄዎችን ማቅረብ ይፈልጋሉ እንጂ፣ እኔ በተለይ በግብርናና በአገልግሎት ስርዓተ-ቢዝነስ ጥያቄዎች ላይ ብቻ እንደሚሰራ ተዘጋጅቻለሁ። እባክዎ ጥያቄዎን በእነዚህ ክስተቶች ውስጥ ያቅርቡ።",
        "እንደምታዩት ጥያቄዎ በግብርና ወይም በአገልግሎት ስርዓተ-ቢዝነስ ላይ የተመረጠ አይደለም። እባክዎ በዚህ አውድ የሆነ ጥያቄ ይጠይቁ።",
    ]
    replies_by_language = {"en": english, "fr": french, "am": amharic}

    # Unknown language codes fall back to the English replies.
    if lang in replies_by_language:
        candidates = replies_by_language[lang]
    else:
        candidates = english
    return random.choice(candidates)
|
83 |
+
|
84 |
+
# Helper to determine if a query is relevant to our domain (English keywords plus a few French terms; can be expanded).
|
85 |
# Keywords that mark a query as agriculture / agro-investment related.
_DOMAIN_KEYWORDS = (
    "agriculture", "farming", "crop", "agro", "investment", "soil",
    "irrigation", "harvest", "organic", "sustainable", "agribusiness",
    "livestock",  # additional English keywords
    "agroalimentaire", "agriculture durable",  # French terms
)

# One precompiled, case-insensitive alternation. Keywords are escaped so they
# are always matched literally, and an optional trailing "s" lets simple
# plurals ("crops", "investments", "agroalimentaires") count as matches.
_DOMAIN_PATTERN = re.compile(
    r"\b(?:" + "|".join(re.escape(keyword) for keyword in _DOMAIN_KEYWORDS) + r")s?\b",
    re.IGNORECASE,
)


def is_domain_query(query: str) -> bool:
    """Check whether *query* mentions an agriculture or agro-investment keyword.

    Matching is case-insensitive and whole-word: a keyword embedded in a
    longer word does not count, apart from a simple plural "s".

    Args:
        query: Text to inspect (a user message or a sentence).

    Returns:
        True if at least one domain keyword occurs in the text.
    """
    return _DOMAIN_PATTERN.search(query) is not None
|
93 |
|
94 |
def retrieve_relevant_snippet(query: str, text: str, max_length: int = 300) -> str:
    """Return the first domain-related sentence of *text* that contains every query word.

    The text is split into sentences on ".", "?" and "!". A sentence
    qualifies when ``is_domain_query`` accepts it and every word of the
    query occurs in it (case-insensitive substring match).

    Args:
        query: User question whose words are looked up in the text.
        text: Source text to search.
        max_length: Maximum snippet length in characters; longer snippets
            are cut and suffixed with "...".

    Returns:
        The matching sentence, stripped and possibly truncated, or "" when
        nothing matches or the query contains no words.
    """
    # Tokenise on word characters so punctuation attached to a query word
    # ("rotation?") cannot block a match: sentences are split on that very
    # punctuation and therefore never contain it.
    query_words = re.findall(r"\w+", query.lower())
    if not query_words:
        # Without any words there is nothing to judge relevance against.
        return ""

    for sentence in re.split(r"[.?!]", text):
        lowered = sentence.lower()  # lowercase once per sentence, not once per word
        if is_domain_query(sentence) and all(word in lowered for word in query_words):
            snippet = sentence.strip()
            if len(snippet) > max_length:
                return snippet[:max_length] + "..."
            return snippet
    return ""
|
106 |
|
107 |
# --- Chat Assistant Response Function ---
|
108 |
client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
|
109 |
|
110 |
+
def respond(message, history: list[tuple[str, str]], system_message, max_tokens, temperature, top_p, language):
|
111 |
+
# language is expected as a string code: "en", "fr", or "am"
|
112 |
+
|
113 |
+
# Check for a greeting in the appropriate language.
|
114 |
+
if is_greeting(message, language):
|
115 |
+
greetings = {
|
116 |
+
"en": "Hello! How can I assist you today with your agriculture or agro-investment inquiries?",
|
117 |
+
"fr": "Bonjour! Comment puis-je vous aider aujourd'hui en matière d'agriculture ou d'investissements agroalimentaires?",
|
118 |
+
"am": "ሰላም! ዛሬ ስለ ግብርና ወይም ስለ አገልግሎት ስርዓተ-ቢዝነስ ጥያቄዎች እንዴት ልረዳዎት?"
|
119 |
+
}
|
120 |
+
yield greetings.get(language, greetings["en"])
|
121 |
+
return
|
122 |
+
|
123 |
+
# If the query is not recognized as domain related, return an out-of-scope message.
|
124 |
if not is_domain_query(message):
|
125 |
+
yield get_out_of_scope_message(language)
|
126 |
return
|
127 |
|
128 |
+
# Build conversation context starting with the system message.
|
129 |
+
messages_context = [{"role": "system", "content": system_message}]
|
130 |
for user_msg, assistant_msg in history:
|
131 |
if user_msg:
|
132 |
+
messages_context.append({"role": "user", "content": user_msg})
|
133 |
if assistant_msg:
|
134 |
+
messages_context.append({"role": "assistant", "content": assistant_msg})
|
135 |
|
136 |
+
# Optional: Append retrieved context from scraped site content.
|
137 |
if knowledge_base:
|
138 |
snippet = retrieve_relevant_snippet(message, knowledge_base)
|
139 |
if snippet:
|
140 |
+
retrieval_context = f"Reference info from Agri Future Investment platform: {snippet}"
|
141 |
+
messages_context.insert(0, {"role": "system", "content": retrieval_context})
|
|
|
142 |
|
143 |
+
# Append the new user message.
|
144 |
+
messages_context.append({"role": "user", "content": message})
|
145 |
|
146 |
# Stream the model's reply token-by-token.
|
147 |
response = ""
|
148 |
for message_resp in client.chat_completion(
|
149 |
+
messages_context,
|
150 |
max_tokens=max_tokens,
|
151 |
stream=True,
|
152 |
temperature=temperature,
|
|
|
157 |
yield response
|
158 |
|
159 |
# --- Gradio Chat Interface ---
|
160 |
+
|
161 |
+
# The language selection dropdown uses language codes: "en" for English, "fr" for French, "am" for Amharic.
|
162 |
demo = gr.ChatInterface(
|
163 |
respond,
|
164 |
additional_inputs=[
|
|
|
169 |
gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max New Tokens"),
|
170 |
gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
|
171 |
gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (Nucleus Sampling)"),
|
172 |
+
gr.Dropdown(choices=["en", "fr", "am"], value="en", label="Language (en, fr, am)")
|
173 |
],
|
174 |
)
|
175 |
|