acecalisto3 commited on
Commit
b0016c8
·
verified ·
1 Parent(s): 6e5e86a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +57 -207
app.py CHANGED
@@ -1,234 +1,84 @@
1
- import os
2
  import gradio as gr
3
  import requests
4
- import uuid
5
- import json
6
  from huggingface_hub import InferenceClient
7
- from pypdf import PdfReader
8
- from bs4 import BeautifulSoup
9
- import zipfile
10
- import nltk
11
- from typing import List, Dict
12
- import lxml
13
-
14
- # Ensure NLTK resources
15
- try:
16
- nltk.data.find('tokenizers/punkt')
17
- except LookupError:
18
- nltk.download('punkt')
19
-
20
- # Initialize Hugging Face API
21
  HF_MODEL = "mistralai/Mixtral-8x7B-Instruct-v0.1"
22
- HF_TOKEN = os.environ.get("HF_TOKEN")
23
  client = InferenceClient(model=HF_MODEL, token=HF_TOKEN)
24
 
25
- # State to manage datasets
26
- datasets_queue = []
27
-
28
- def extract_text_from_url(url):
29
- try:
30
- response = requests.get(url, timeout=10)
31
- response.raise_for_status()
32
- soup = BeautifulSoup(response.content, "lxml") # Specify lxml here
33
- return soup.get_text()
34
- except Exception as e:
35
- return f"Error scraping URL: {e}"
36
-
37
- # Helper Functions
38
- def extract_text_from_pdf(file_path):
39
- try:
40
- reader = PdfReader(file_path)
41
- return "\n".join(page.extract_text() for page in reader.pages)
42
- except Exception as e:
43
- return f"Error reading PDF: {e}"
44
-
45
- def extract_text_from_url(url):
46
- try:
47
- response = requests.get(url, timeout=10)
48
- response.raise_for_status()
49
- soup = BeautifulSoup(response.content, "lxml")
50
- return soup.get_text()
51
- except Exception as e:
52
- return f"Error scraping URL: {e}"
53
-
54
- def process_uploaded_file(file):
55
- try:
56
- if file.name.endswith(".pdf"):
57
- return extract_text_from_pdf(file.name)
58
- elif file.name.endswith(".txt"):
59
- with open(file.name, "r", encoding="utf-8") as f:
60
- return f.read()
61
- elif file.name.endswith(".zip"):
62
- extracted_data = []
63
- with zipfile.ZipFile(file.name, "r") as zip_ref:
64
- for file_info in zip_ref.infolist():
65
- if file_info.filename.endswith((".pdf", ".txt")):
66
- with zip_ref.open(file_info) as f:
67
- content = f.read()
68
- if file_info.filename.endswith(".txt"):
69
- extracted_data.append(content.decode("utf-8"))
70
- elif file_info.filename.endswith(".pdf"):
71
- temp_path = f"/tmp/{uuid.uuid4()}"
72
- with open(temp_path, "wb") as temp_file:
73
- temp_file.write(content)
74
- extracted_data.append(extract_text_from_pdf(temp_path))
75
- return "\n".join(extracted_data)
76
- except Exception as e:
77
- return f"Error processing file: {e}"
78
-
79
- def chunk_text(text, max_chunk_size=2000):
80
- sentences = nltk.sent_tokenize(text)
81
- chunks, current_chunk = [], ""
82
- for sentence in sentences:
83
- if len(current_chunk) + len(sentence) + 1 > max_chunk_size:
84
- chunks.append(current_chunk.strip())
85
- current_chunk = ""
86
- current_chunk += sentence + " "
87
- if current_chunk:
88
- chunks.append(current_chunk.strip())
89
- return chunks
90
-
91
- def infer_dataset(data, instructions):
92
- extracted = []
93
- chunks = chunk_text(data)
94
- for i, chunk in enumerate(chunks):
95
- try:
96
- response = client.text_generation(
97
- prompt=instructions.format(history=chunk),
98
- max_new_tokens=1024
99
- )
100
- extracted.append(response["generated_text"])
101
- except Exception as e:
102
- extracted.append(f"Error in chunk {i}: {e}")
103
- return "\n".join(extracted)
104
 
105
- # Gradio Interface
106
- def scrape_data(instructions, files, urls):
107
- combined_data = []
108
-
109
- # Process uploaded files
110
- if files:
111
- for file in files:
112
- combined_data.append(process_uploaded_file(file))
113
-
114
- # Process URLs
115
- if urls:
116
- url_list = [url.strip() for url in urls.split(",") if url.strip()]
117
- for url in url_list:
118
- combined_data.append(extract_text_from_url(url))
119
-
120
- # Combine and infer with instructions
121
- full_text = "\n".join(combined_data)
122
- if instructions:
123
- dataset = infer_dataset(full_text, instructions)
124
- else:
125
- dataset = full_text
126
-
127
- return dataset
128
-
129
- def add_to_queue(dataset):
130
- datasets_queue.append(dataset)
131
- return json.dumps(datasets_queue, indent=2)
132
-
133
- def combine_datasets():
134
- combined_data = "\n".join(datasets_queue)
135
- combined_json = {"combined_dataset": combined_data}
136
- combined_file = "/tmp/combined_dataset.json"
137
- with open(combined_file, "w") as f:
138
- json.dump(combined_json, f, indent=2)
139
- return json.dumps(combined_json, indent=2), combined_file
140
 
 
141
  def train_chatbot(dataset):
142
- system_message = {"system": "You are a bot trained on the following dataset:"}
143
- system_message["dataset"] = dataset
144
  return "Chatbot trained successfully!"
145
 
146
- def chat_with_bot(history, user_input):
147
- if "dataset" not in system_message:
148
- return history + [(user_input, "No dataset loaded for the chatbot.")]
149
-
150
- bot_response = client.text_generation(
151
- prompt=f"{system_message['dataset']} {user_input}",
152
- max_new_tokens=128
153
- )
154
- return history + [(user_input, bot_response["generated_text"])]
155
-
156
- # Gradio Interface
157
- with gr.Blocks() as app:
158
- gr.Markdown("# Intelligent Scraper, Dataset Handler, and Chatbot")
159
-
160
- with gr.Tab("Scrape / Extract Data"):
161
- gr.Markdown("Upload files or enter URLs to scrape data and generate JSON datasets.")
162
 
163
- instruction_input = gr.Textbox(label="Optional Instructions", placeholder="Enter instructions for scraping.")
164
- upload_files = gr.Files(label="Upload Files (PDF, TXT, ZIP)", file_types=[".pdf", ".txt", ".zip"])
165
- url_input = gr.Textbox(label="Enter URLs (comma-separated or multiline)")
166
- scrape_button = gr.Button("Scrape / Extract Data")
167
 
168
- extracted_output = gr.Textbox(label="Extracted Output")
169
- dataset_button = gr.Button("Add to Dataset Queue")
170
- scraped_dataset = gr.Textbox(label="Current Dataset")
171
 
172
- scrape_button.click(scrape_data, inputs=[instruction_input, upload_files, url_input], outputs=extracted_output)
173
- dataset_button.click(add_to_queue, inputs=[extracted_output], outputs=scraped_dataset)
 
 
 
 
 
174
 
175
- with gr.Tab("Combine Datasets"):
176
- gr.Markdown("Combine queued datasets into a single JSON dataset.")
 
177
 
178
- combine_button = gr.Button("Combine Datasets")
179
- combined_output = gr.Textbox(label="Combined Dataset")
180
- download_button = gr.Button("Download Combined Dataset")
181
- download_output = gr.File(label="Download")
182
 
183
- combine_button.click(combine_datasets, outputs=[combined_output, download_output])
 
 
 
 
 
 
 
 
184
 
185
- with gr.Tab("Train and Chat"):
186
- gr.Markdown("**Train a chatbot with a selected dataset and interact with it.**")
187
-
188
  chat_dataset = gr.Textbox(
189
  label="Dataset for Training",
190
- placeholder="Paste or load a dataset for training.",
191
  lines=5,
192
  )
193
  train_button = gr.Button("Train Chatbot")
194
- chatbot = gr.Chatbot(label="Chat with Trained Bot", type="messages")
 
 
 
 
 
 
 
 
 
 
195
  user_input = gr.Textbox(
196
  label="Your Message",
197
- placeholder="Type a message and press Enter...",
198
  lines=1,
199
  )
200
-
201
- # Persistent system message with dataset knowledge
202
- system_message = {"system": "You are a bot trained on the following dataset:"}
203
- bot_knowledge = {"dataset": None}
204
-
205
- # Train the chatbot by setting the dataset
206
- def train_chatbot(dataset):
207
- bot_knowledge["dataset"] = dataset
208
- return "Chatbot trained successfully!"
209
-
210
- # Chat function for handling user messages
211
- def chat_with_bot(history, user_message):
212
- if not bot_knowledge["dataset"]:
213
- return history + [{"role": "bot", "content": "No dataset loaded. Please train the bot first."}]
214
-
215
- # Append user input to history
216
- history.append({"role": "user", "content": user_message})
217
-
218
- # Generate bot response based on the dataset
219
- prompt = f"{bot_knowledge['dataset']} {user_message}"
220
- response = client.text_generation(prompt=prompt, max_new_tokens=128)["generated_text"]
221
-
222
- # Append bot response to history
223
- history.append({"role": "bot", "content": response})
224
- return history
225
-
226
- # Train button event
227
- train_button.click(train_chatbot, inputs=[chat_dataset], outputs=None)
228
-
229
- # User input submission event
230
- user_input.submit(
231
- chat_with_bot, inputs=[chatbot, user_input], outputs=chatbot
232
- )
233
-
234
- app.launch()
 
 
import os

import gradio as gr
import requests
from huggingface_hub import InferenceClient
4
+
5
+ # Initialize Hugging Face client
 
 
 
 
 
 
 
 
 
 
 
 
# Hugging Face Inference API configuration.
HF_MODEL = "mistralai/Mixtral-8x7B-Instruct-v0.1"
# Read the API token from the environment instead of committing a secret
# (or a non-functional placeholder) to source control. The original
# placeholder is kept as the default so behavior is unchanged when the
# HF_TOKEN environment variable is not set.
HF_TOKEN = os.environ.get("HF_TOKEN", "your_hugging_face_api_token")
client = InferenceClient(model=HF_MODEL, token=HF_TOKEN)
9
 
10
# Module-level store for the chatbot's knowledge. The "dataset" entry
# stays None until train_chatbot() is called with training text.
bot_knowledge = {"dataset": None}


def train_chatbot(dataset):
    """Persist *dataset* as the bot's knowledge base.

    Args:
        dataset: Free-form text the bot should answer from.

    Returns:
        A confirmation string displayed in the training-status textbox.
    """
    bot_knowledge["dataset"] = dataset
    return "Chatbot trained successfully!"
18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
# Chat function to process user input and generate bot responses
def chat_with_bot(history, user_input):
    """Append *user_input* and the model's reply to the chat *history*.

    Args:
        history: List of ``{"role": ..., "content": ...}`` message dicts
            (Gradio ``type="messages"`` chatbot format).
        user_input: The user's latest message.

    Returns:
        The updated history list (mutated in place and returned).
    """
    if not bot_knowledge["dataset"]:
        # Gradio's "messages" chatbot accepts only the roles "user" and
        # "assistant"; the original "bot" role is rejected by the component.
        return history + [
            {"role": "assistant", "content": "No dataset loaded. Please train the bot first."}
        ]

    # Append user input to the chat history
    history.append({"role": "user", "content": user_input})

    # Generate bot response, grounding the prompt in the trained dataset.
    prompt = f"{bot_knowledge['dataset']} {user_input}"
    try:
        response = client.text_generation(prompt=prompt, max_new_tokens=128)
        # text_generation returns a plain string by default; calling
        # .get() on it raised AttributeError and forced every call into
        # the except branch. Support both the str and mapping forms.
        if isinstance(response, str):
            bot_response = response
        else:
            bot_response = response.get(
                "generated_text", "Sorry, I couldn't generate a response."
            )
    except Exception as e:
        bot_response = f"Error generating response: {e}"

    # Append bot response to the history
    history.append({"role": "assistant", "content": bot_response})
    return history
39
 
 
 
 
 
40
 
41
# Gradio Interface
with gr.Blocks(theme="default") as app:
    gr.Markdown("# **Intelligent Chatbot with Knowledge Training**")
    gr.Markdown(
        """
        Train a chatbot with custom datasets and interact with it dynamically.
        The bot will persist knowledge from the dataset and answer questions accordingly.
        """
    )

    # Train chatbot section
    with gr.Row():
        chat_dataset = gr.Textbox(
            label="Dataset for Training",
            placeholder="Paste a dataset here to train the chatbot.",
            lines=5,
        )
        train_button = gr.Button("Train Chatbot")

    train_status = gr.Textbox(label="Training Status", interactive=False)

    # Chat section
    with gr.Row():
        # gr.Chatbot has no avatar_user/avatar_bot parameters (passing them
        # raises TypeError); the (user, bot) image pair goes in avatar_images.
        chatbot = gr.Chatbot(
            label="Chat with Trained Bot",
            type="messages",
            avatar_images=(
                "https://example.com/user-avatar.png",  # placeholder user avatar
                "https://example.com/bot-avatar.png",  # placeholder bot avatar
            ),
        )
        user_input = gr.Textbox(
            label="Your Message",
            placeholder="Type your message and press Enter...",
            lines=1,
        )

    # Train chatbot logic
    train_button.click(train_chatbot, inputs=[chat_dataset], outputs=[train_status])

    # Chat interaction logic
    user_input.submit(chat_with_bot, inputs=[chatbot, user_input], outputs=chatbot)

# Launch app only when run as a script (not on import).
if __name__ == "__main__":
    app.launch(server_name="0.0.0.0", server_port=7860)