Spaces:

acecalisto3
/

GitBot

Runtime error

App Files Files Community

acecalisto3 committed on Dec 30, 2024

Commit

b0016c8

verified ·

1 Parent(s): 6e5e86a

Update app.py

Browse files

Files changed (1) hide show

app.py +57 -207

app.py CHANGED Viewed

@@ -1,234 +1,84 @@
-import os
 import gradio as gr
 import requests
-import uuid
-import json
 from huggingface_hub import InferenceClient
-from pypdf import PdfReader
-from bs4 import BeautifulSoup
-import zipfile
-import nltk
-from typing import List, Dict
-import lxml
-# Ensure NLTK resources
-try:
-    nltk.data.find('tokenizers/punkt')
-except LookupError:
-    nltk.download('punkt')
-# Initialize Hugging Face API
 HF_MODEL = "mistralai/Mixtral-8x7B-Instruct-v0.1"
-HF_TOKEN = os.environ.get("HF_TOKEN")
 client = InferenceClient(model=HF_MODEL, token=HF_TOKEN)
-# State to manage datasets
-datasets_queue = []
-def extract_text_from_url(url):
-    try:
-        response = requests.get(url, timeout=10)
-        response.raise_for_status()
-        soup = BeautifulSoup(response.content, "lxml")  # Specify lxml here
-        return soup.get_text()
-    except Exception as e:
-        return f"Error scraping URL: {e}"
-# Helper Functions
-def extract_text_from_pdf(file_path):
-    try:
-        reader = PdfReader(file_path)
-        return "\n".join(page.extract_text() for page in reader.pages)
-    except Exception as e:
-        return f"Error reading PDF: {e}"
-def extract_text_from_url(url):
-    try:
-        response = requests.get(url, timeout=10)
-        response.raise_for_status()
-        soup = BeautifulSoup(response.content, "lxml")
-        return soup.get_text()
-    except Exception as e:
-        return f"Error scraping URL: {e}"
-def process_uploaded_file(file):
-    try:
-        if file.name.endswith(".pdf"):
-            return extract_text_from_pdf(file.name)
-        elif file.name.endswith(".txt"):
-            with open(file.name, "r", encoding="utf-8") as f:
-                return f.read()
-        elif file.name.endswith(".zip"):
-            extracted_data = []
-            with zipfile.ZipFile(file.name, "r") as zip_ref:
-                for file_info in zip_ref.infolist():
-                    if file_info.filename.endswith((".pdf", ".txt")):
-                        with zip_ref.open(file_info) as f:
-                            content = f.read()
-                            if file_info.filename.endswith(".txt"):
-                                extracted_data.append(content.decode("utf-8"))
-                            elif file_info.filename.endswith(".pdf"):
-                                temp_path = f"/tmp/{uuid.uuid4()}"
-                                with open(temp_path, "wb") as temp_file:
-                                    temp_file.write(content)
-                                extracted_data.append(extract_text_from_pdf(temp_path))
-            return "\n".join(extracted_data)
-    except Exception as e:
-        return f"Error processing file: {e}"
-def chunk_text(text, max_chunk_size=2000):
-    sentences = nltk.sent_tokenize(text)
-    chunks, current_chunk = [], ""
-    for sentence in sentences:
-        if len(current_chunk) + len(sentence) + 1 > max_chunk_size:
-            chunks.append(current_chunk.strip())
-            current_chunk = ""
-        current_chunk += sentence + " "
-    if current_chunk:
-        chunks.append(current_chunk.strip())
-    return chunks
-def infer_dataset(data, instructions):
-    extracted = []
-    chunks = chunk_text(data)
-    for i, chunk in enumerate(chunks):
-        try:
-            response = client.text_generation(
-                prompt=instructions.format(history=chunk),
-                max_new_tokens=1024
-            )
-            extracted.append(response["generated_text"])
-        except Exception as e:
-            extracted.append(f"Error in chunk {i}: {e}")
-    return "\n".join(extracted)
-# Gradio Interface
-def scrape_data(instructions, files, urls):
-    combined_data = []
-    # Process uploaded files
-    if files:
-        for file in files:
-            combined_data.append(process_uploaded_file(file))
-    # Process URLs
-    if urls:
-        url_list = [url.strip() for url in urls.split(",") if url.strip()]
-        for url in url_list:
-            combined_data.append(extract_text_from_url(url))
-    # Combine and infer with instructions
-    full_text = "\n".join(combined_data)
-    if instructions:
-        dataset = infer_dataset(full_text, instructions)
-    else:
-        dataset = full_text
-    return dataset
-def add_to_queue(dataset):
-    datasets_queue.append(dataset)
-    return json.dumps(datasets_queue, indent=2)
-def combine_datasets():
-    combined_data = "\n".join(datasets_queue)
-    combined_json = {"combined_dataset": combined_data}
-    combined_file = "/tmp/combined_dataset.json"
-    with open(combined_file, "w") as f:
-        json.dump(combined_json, f, indent=2)
-    return json.dumps(combined_json, indent=2), combined_file
 def train_chatbot(dataset):
-    system_message = {"system": "You are a bot trained on the following dataset:"}
-    system_message["dataset"] = dataset
     return "Chatbot trained successfully!"
-def chat_with_bot(history, user_input):
-    if "dataset" not in system_message:
-        return history + [(user_input, "No dataset loaded for the chatbot.")]
-    bot_response = client.text_generation(
-        prompt=f"{system_message['dataset']} {user_input}",
-        max_new_tokens=128
-    )
-    return history + [(user_input, bot_response["generated_text"])]
-# Gradio Interface
-with gr.Blocks() as app:
-    gr.Markdown("# Intelligent Scraper, Dataset Handler, and Chatbot")
-    with gr.Tab("Scrape / Extract Data"):
-        gr.Markdown("Upload files or enter URLs to scrape data and generate JSON datasets.")
-        instruction_input = gr.Textbox(label="Optional Instructions", placeholder="Enter instructions for scraping.")
-        upload_files = gr.Files(label="Upload Files (PDF, TXT, ZIP)", file_types=[".pdf", ".txt", ".zip"])
-        url_input = gr.Textbox(label="Enter URLs (comma-separated or multiline)")
-        scrape_button = gr.Button("Scrape / Extract Data")
-        extracted_output = gr.Textbox(label="Extracted Output")
-        dataset_button = gr.Button("Add to Dataset Queue")
-        scraped_dataset = gr.Textbox(label="Current Dataset")
-        scrape_button.click(scrape_data, inputs=[instruction_input, upload_files, url_input], outputs=extracted_output)
-        dataset_button.click(add_to_queue, inputs=[extracted_output], outputs=scraped_dataset)
-    with gr.Tab("Combine Datasets"):
-        gr.Markdown("Combine queued datasets into a single JSON dataset.")
-        combine_button = gr.Button("Combine Datasets")
-        combined_output = gr.Textbox(label="Combined Dataset")
-        download_button = gr.Button("Download Combined Dataset")
-        download_output = gr.File(label="Download")
-        combine_button.click(combine_datasets, outputs=[combined_output, download_output])
-    with gr.Tab("Train and Chat"):
-        gr.Markdown("**Train a chatbot with a selected dataset and interact with it.**")
         chat_dataset = gr.Textbox(
             label="Dataset for Training",
-            placeholder="Paste or load a dataset for training.",
             lines=5,
         )
         train_button = gr.Button("Train Chatbot")
-        chatbot = gr.Chatbot(label="Chat with Trained Bot", type="messages")
         user_input = gr.Textbox(
             label="Your Message",
-            placeholder="Type a message and press Enter...",
             lines=1,
         )
-        # Persistent system message with dataset knowledge
-        system_message = {"system": "You are a bot trained on the following dataset:"}
-        bot_knowledge = {"dataset": None}
-        # Train the chatbot by setting the dataset
-        def train_chatbot(dataset):
-            bot_knowledge["dataset"] = dataset
-            return "Chatbot trained successfully!"
-        # Chat function for handling user messages
-        def chat_with_bot(history, user_message):
-            if not bot_knowledge["dataset"]:
-                return history + [{"role": "bot", "content": "No dataset loaded. Please train the bot first."}]
-            # Append user input to history
-            history.append({"role": "user", "content": user_message})
-            # Generate bot response based on the dataset
-            prompt = f"{bot_knowledge['dataset']} {user_message}"
-            response = client.text_generation(prompt=prompt, max_new_tokens=128)["generated_text"]
-            # Append bot response to history
-            history.append({"role": "bot", "content": response})
-            return history
-        # Train button event
-        train_button.click(train_chatbot, inputs=[chat_dataset], outputs=None)
-        # User input submission event
-        user_input.submit(
-            chat_with_bot, inputs=[chatbot, user_input], outputs=chatbot
-        )
-    app.launch()

+import os
 import gradio as gr
 import requests
 from huggingface_hub import InferenceClient
+# Initialize Hugging Face client
 HF_MODEL = "mistralai/Mixtral-8x7B-Instruct-v0.1"
+# Read the API token from the environment (e.g. a Space secret) rather than
+# hard-coding it: a literal placeholder cannot authenticate, and a real token
+# committed to the repository would be leaked.
+HF_TOKEN = os.environ.get("HF_TOKEN")
 client = InferenceClient(model=HF_MODEL, token=HF_TOKEN)
+# Persistent bot knowledge state.
+# NOTE(review): this is module-level state, so it is presumably shared by every
+# user session of the app -- confirm that is intended.
+bot_knowledge = {"dataset": None}
+# Train chatbot by setting the dataset
 def train_chatbot(dataset):
+    """Store ``dataset`` as the bot's knowledge and return a status message.
+
+    A missing or whitespace-only dataset clears the stored knowledge and is
+    reported as such instead of claiming success, so the status agrees with
+    ``chat_with_bot``, which treats a falsy dataset as "not trained".
+    """
+    if not dataset or not dataset.strip():
+        bot_knowledge["dataset"] = None
+        return "No dataset provided. Please paste a dataset before training."
+    bot_knowledge["dataset"] = dataset
     return "Chatbot trained successfully!"
+# Chat function to process user input and generate bot responses
+def chat_with_bot(history, user_input):
+    """Append the user's message and the bot's reply to the chat history.
+
+    Args:
+        history: Messages-format chat history (a list of
+            ``{"role": ..., "content": ...}`` dicts) as supplied by the
+            ``gr.Chatbot`` component; ``None`` or empty is accepted.
+        user_input: Text submitted by the user.
+
+    Returns:
+        A new history list ending with the user's message followed by the
+        assistant's reply (or an explanatory message when no dataset is
+        loaded or generation fails). The list passed in is not mutated.
+    """
+    # Copy so the caller's list is untouched, and tolerate a None history.
+    history = list(history or [])
+    # Record the user's message first so it is shown even on the early return.
+    history.append({"role": "user", "content": user_input})
+    if not bot_knowledge["dataset"]:
+        # Messages-format histories use the role "assistant"; "bot" is not a
+        # valid role.
+        history.append(
+            {"role": "assistant", "content": "No dataset loaded. Please train the bot first."}
+        )
+        return history
+    # Generate bot response
+    prompt = f"{bot_knowledge['dataset']} {user_input}"
+    try:
+        # text_generation() returns the generated text as a plain string by
+        # default (details=False, stream=False), so there is no dict to call
+        # .get() on; fall back to a fixed message only for an empty reply.
+        response = client.text_generation(prompt=prompt, max_new_tokens=128)
+        bot_response = response or "Sorry, I couldn't generate a response."
+    except Exception as e:
+        # UI boundary: surface the failure in the chat instead of crashing.
+        bot_response = f"Error generating response: {e}"
+    # Append bot response to the history
+    history.append({"role": "assistant", "content": bot_response})
+    return history
+# Gradio Interface: a dataset box with a train button and status field, plus a
+# chat window with a message box. Event handlers are wired at the bottom.
+with gr.Blocks(theme="default") as app:
+    gr.Markdown("# **Intelligent Chatbot with Knowledge Training**")
+    gr.Markdown(
+        """
+        Train a chatbot with custom datasets and interact with it dynamically.
+        The bot will persist knowledge from the dataset and answer questions accordingly.
+        """
+    )
+    # Train chatbot section
+    with gr.Row():
         chat_dataset = gr.Textbox(
             label="Dataset for Training",
+            placeholder="Paste a dataset here to train the chatbot.",
             lines=5,
         )
         train_button = gr.Button("Train Chatbot")
+    train_status = gr.Textbox(label="Training Status", interactive=False)
+    # Chat section
+    with gr.Row():
+        chatbot = gr.Chatbot(
+            label="Chat with Trained Bot",
+            type="messages",
+            # gr.Chatbot has no avatar_user/avatar_bot keywords (passing them
+            # raises TypeError at startup); avatars are given as a
+            # (user, bot) pair through avatar_images.
+            avatar_images=(
+                "https://example.com/user-avatar.png",
+                "https://example.com/bot-avatar.png",
+            ),
+        )
         user_input = gr.Textbox(
             label="Your Message",
+            placeholder="Type your message and press Enter...",
             lines=1,
         )
+    # Train chatbot logic: the returned status string is shown in train_status.
+    train_button.click(train_chatbot, inputs=[chat_dataset], outputs=[train_status])
+    # Chat interaction logic: pressing Enter sends the current history and
+    # message to chat_with_bot and replaces the chat window with its result.
+    user_input.submit(chat_with_bot, inputs=[chatbot, user_input], outputs=chatbot)
+# Start the web server only when this file is executed as a script, not when
+# it is imported.
+if __name__ == "__main__":
+    # Listen on all interfaces, port 7860.
+    launch_options = {"server_name": "0.0.0.0", "server_port": 7860}
+    app.launch(**launch_options)