File size: 5,065 Bytes
c010699
 
6ac4ea2
0e6072b
 
 
c010699
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1e6efef
c010699
 
1e6efef
c010699
 
 
 
 
 
 
 
 
 
 
 
 
 
6ac4ea2
 
 
 
 
 
 
 
 
1e6efef
6ac4ea2
 
 
 
1e6efef
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6ac4ea2
1e6efef
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6ac4ea2
1e6efef
c010699
 
 
 
1e6efef
c010699
 
 
 
 
f035190
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import re
import time
import random
import gradio as gr
from huggingface_hub import InferenceClient

# Optional: Enable scraping if your site is deployed.
ENABLE_SCRAPING = False
SITE_URL = "https://your-agri-future-site.com"

# Global variable to hold scraped content; stays empty unless scraping
# is enabled and succeeds.
knowledge_base = ""

# --- Optional: Scraping Functionality ---
# Selenium is imported lazily inside the guarded branch so the module still
# loads when Selenium (or a Chrome driver) is not installed.
if ENABLE_SCRAPING:
    try:
        from selenium import webdriver
        from selenium.webdriver.chrome.options import Options
        from selenium.webdriver.common.by import By

        def scrape_site(url):
            """Return the visible text of the element with id ``content`` at *url*.

            The page is loaded in a headless Chrome session. If the element
            cannot be located or read, a string starting with
            ``"Error encountered during scraping: "`` is returned instead of
            raising. Failures while loading the page itself propagate to the
            caller, but the browser is always shut down first.
            """
            options = Options()
            # The ``Options.headless`` setter was deprecated and later removed
            # from Selenium 4; the command-line switch works on every release.
            options.add_argument("--headless")
            driver = webdriver.Chrome(options=options)
            try:
                driver.get(url)
                # Use explicit waits in production; here we use a basic sleep.
                time.sleep(5)
                try:
                    # Customize the selector based on your site's HTML structure.
                    content_element = driver.find_element(By.ID, "content")
                    page_text = content_element.text
                except Exception as e:
                    # NOTE: the error text is returned as the page content, so
                    # it ends up in ``knowledge_base`` below.
                    page_text = "Error encountered during scraping: " + str(e)
            finally:
                # Always release the browser process, even if navigation fails.
                driver.quit()
            return page_text

        knowledge_base = scrape_site(SITE_URL)
        print("Scraped knowledge base successfully.")
    except Exception as e:
        # Best-effort: a missing Selenium install or driver must not prevent
        # the rest of the module from loading.
        print("Scraping failed or Selenium is not configured:", e)
else:
    print("Scraping is disabled; proceeding without scraped site content.")

# --- Multilingual Helpers ---

def is_greeting(query: str, lang: str) -> bool:
    """Return True when *query* opens with a greeting in language *lang*.

    Args:
        query: Raw user input. Surrounding whitespace is ignored.
        lang: Language code ("en", "fr" or "am"). Any other code falls back
            to the English greeting list.

    Returns:
        True if the query starts with one of the known greetings as a whole
        word or phrase, so "hi there" matches but "history of farming" does
        not.
    """
    greetings = {
        "en": ["hello", "hi", "hey", "good morning", "good afternoon", "good evening"],
        "fr": ["bonjour", "salut", "coucou", "bonsoir"],
        "am": ["ሰላም", "ሰላም እንደምን", "እንዴት"]
    }
    greet_list = greetings.get(lang, greetings["en"])
    text = query.strip()
    # For languages using Latin script, convert to lower case.
    if lang != "am":
        text = text.lower()
    for greet in greet_list:
        if not text.startswith(greet):
            continue
        remainder = text[len(greet):]
        # Require a word boundary after the greeting so that words which
        # merely begin with the same letters are not treated as greetings.
        if not remainder or not remainder[0].isalnum():
            return True
    return False

# Rather than using fixed out-of-scope messages, use the model via Hugging Face to generate them.
def generate_dynamic_out_of_scope_message(language: str) -> str:
    """Ask the chat model for a fresh "out of scope" reply in *language*.

    A language-specific system prompt is sent as the only message through the
    module-level ``client`` (defined elsewhere in this file; presumably a
    ``huggingface_hub.InferenceClient`` -- confirm), using a non-streaming
    ``chat_completion`` call.

    Args:
        language: Language code ("en", "fr" or "am"). Any other code falls
            back to the English prompt.

    Returns:
        The generated message with surrounding whitespace removed. If the
        response does not expose text at ``choices[0].message.content``, the
        string form of the whole response is returned instead.
    """
    # Define language-specific system prompts for generating a dynamic out-of-scope message.
    system_prompts = {
        "en": (
            "You are a helpful chatbot specializing in agriculture and agro-investment. "
            "A user just asked a question that is not related to these topics. "
            "Generate a friendly, varied, and intelligent out-of-scope response in English that kindly encourages the user to ask about agriculture or agro-investment."
        ),
        "fr": (
            "Vous êtes un chatbot utile spécialisé dans l'agriculture et les investissements agroalimentaires. "
            "Un utilisateur vient de poser une question qui ne concerne pas ces sujets. "
            "Générez une réponse élégante, variée et intelligente en français pour indiquer que la question est hors de portée, en invitant l'utilisateur à poser une question sur l'agriculture ou les investissements agroalimentaires."
        ),
        "am": (
            "እርስዎ በግብርናና በአገልግሎት ስርዓተ-ቢዝነስ ውስጥ በተለይ የተሞሉ ቻትቦት ናቸው። "
            "ተጠቃሚው ለግብርና ወይም ለአገልግሎት ስርዓተ-ቢዝነስ ተያይዞ ያልሆነ ጥያቄ አስቀድመዋል። "
            "በአማርኛ በተለያዩ መልኩ የውጭ ክፍል መልእክት ፍጥረት ያድርጉ፤ እባኮትን ተጠቃሚውን ለግብርና ወይም ለአገልግሎት ጥያቄዎች ለመጠየቅ ያነጋግሩ።"
        )
    }
    prompt = system_prompts.get(language, system_prompts["en"])
    messages = [{"role": "system", "content": prompt}]

    # Call the model without streaming to generate the complete message.
    response = client.chat_completion(
        messages,
        max_tokens=80,
        stream=False,
        temperature=0.7,
        top_p=0.95,
    )
    # Depending on the client structure, adjust the extraction of the generated text.
    try:
        out_message = response.choices[0].message.content
    except (AttributeError, IndexError, TypeError):
        # Missing attributes, an empty ``choices`` list, or a response that
        # cannot be indexed all mean the expected structure is absent.
        out_message = None
    if not isinstance(out_message, str):
        # Covers both a differing response structure and a ``None`` content
        # value, which would otherwise fail on ``.strip()`` below.
        out_message = str(response)
    return out_message.strip()

# A helper to determine domain relevance (basic implementation; can be expanded).
# Keywords that mark a query as agriculture / agro-investment related.
_DOMAIN_KEYWORDS = (
    "agriculture", "farming", "crop", "agro", "investment", "soil",
    "irrigation", "harvest", "organic", "sustainable", "agribusiness",
    "livestock", "agroalimentaire", "agriculture durable",
)

# One case-insensitive pattern for all keywords, compiled once at import.
# Keywords are escaped so they are always matched literally, and an optional
# trailing "s" lets simple plurals ("crops", "investments") match as well.
_DOMAIN_PATTERN = re.compile(
    r"\b(?:" + "|".join(re.escape(word) for word in _DOMAIN_KEYWORDS) + r")s?\b",
    re.IGNORECASE,
)


def is_domain_query(query: str) -> bool:
    """Return True when *query* mentions an agriculture-related keyword.

    Matching is case-insensitive and on whole words, so "crop" and "crops"
    match but the "crop" inside "microprocessor" does not.

    Args:
        query: Raw user input.

    Returns:
        True if any domain keyword (or its simple plural) occurs in the
        query as a whole word; False otherwise, including for an empty query.
    """
    return _DOMAIN_PATTERN.search(query) is not None

def retrieve_relevant_snippet(query: str, text: str, max_length: int = 300) -> str:
    sentences = re.split(r'[.?!]', text)
    for sentence in sentences