ACMCMC
committed on
Commit
·
e79532a
1
Parent(s):
326eaaa
Gemini 2.0
Browse files- .gitignore +3 -1
- app.py +37 -6
- requirements.txt +11 -10
- utils.py +53 -0
.gitignore
CHANGED
@@ -1,2 +1,4 @@
|
|
1 |
*.jsonl
|
2 |
-
__pycache__/
|
|
|
|
|
|
1 |
*.jsonl
|
2 |
+
__pycache__/
|
3 |
+
.venv/
|
4 |
+
.vscode/
|
app.py
CHANGED
@@ -11,6 +11,7 @@ import matplotlib.pyplot as plt
|
|
11 |
from utils import (
|
12 |
process_chat_file,
|
13 |
transform_conversations_dataset_into_training_examples,
|
|
|
14 |
)
|
15 |
from validation import check_format_errors, estimate_cost, get_distributions
|
16 |
|
@@ -78,6 +79,7 @@ def file_upload_callback(
|
|
78 |
split_conversation_threshold,
|
79 |
progress=gr.Progress(),
|
80 |
):
|
|
|
81 |
logger.info(f"Processing {files}")
|
82 |
full_system_prompt = f"""# Task
|
83 |
You are a chatbot. Your goal is to simulate realistic, natural chat conversations as if you were me.
|
@@ -186,11 +188,31 @@ The {model_role} and the {user_role} can send multiple messages in a row, as a J
|
|
186 |
# However, if different users are using the app at the same time, we need to make sure that the file is unique AND that no user can access the file of another user.
|
187 |
# We can use a UUID generator to create a unique file name.
|
188 |
uuid = str(uuid4())
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
194 |
|
195 |
# If there's less than 50 training examples, show a warning message
|
196 |
if len(training_examples_ds) < 50:
|
@@ -251,7 +273,7 @@ with gr.Blocks(theme=theme) as demo:
|
|
251 |
label="Upload WhatsApp Chat Files",
|
252 |
type="filepath",
|
253 |
file_count="multiple",
|
254 |
-
file_types=["txt"],
|
255 |
)
|
256 |
|
257 |
system_prompt = gr.Textbox(
|
@@ -268,6 +290,14 @@ with gr.Blocks(theme=theme) as demo:
|
|
268 |
info="Enter your WhatsApp name as it appears in your profile. It needs to match exactly your name. If you're unsure, you can check the chat messages to see it.",
|
269 |
)
|
270 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
271 |
# Advanced parameters section, collapsed by default
|
272 |
with gr.Accordion(label="Advanced Parameters", open=False):
|
273 |
gr.Markdown(
|
@@ -383,6 +413,7 @@ with gr.Blocks(theme=theme) as demo:
|
|
383 |
user_role,
|
384 |
model_role,
|
385 |
whatsapp_name,
|
|
|
386 |
datetime_dayfirst,
|
387 |
message_line_format,
|
388 |
minutes_threshold,
|
|
|
11 |
from utils import (
|
12 |
process_chat_file,
|
13 |
transform_conversations_dataset_into_training_examples,
|
14 |
+
convert_gpt_to_gemini_format, # Add this import
|
15 |
)
|
16 |
from validation import check_format_errors, estimate_cost, get_distributions
|
17 |
|
|
|
79 |
split_conversation_threshold,
|
80 |
progress=gr.Progress(),
|
81 |
):
|
82 |
+
output_format = "GPT"
|
83 |
logger.info(f"Processing {files}")
|
84 |
full_system_prompt = f"""# Task
|
85 |
You are a chatbot. Your goal is to simulate realistic, natural chat conversations as if you were me.
|
|
|
188 |
# However, if different users are using the app at the same time, we need to make sure that the file is unique AND that no user can access the file of another user.
|
189 |
# We can use a UUID generator to create a unique file name.
|
190 |
uuid = str(uuid4())
|
191 |
+
|
192 |
+
# Convert to Gemini format if selected
|
193 |
+
if output_format == "Gemini 2.0":
|
194 |
+
training_examples_gemini = convert_gpt_to_gemini_format(training_examples_ds)
|
195 |
+
validation_examples_gemini = convert_gpt_to_gemini_format(validation_examples_ds)
|
196 |
+
|
197 |
+
# Extract the gemini_format column and save as JSON files
|
198 |
+
training_gemini_list = training_examples_gemini["gemini_format"]
|
199 |
+
validation_gemini_list = validation_examples_gemini["gemini_format"]
|
200 |
+
|
201 |
+
# Save as JSON files with Gemini format
|
202 |
+
file_path = f"training_examples_gemini_{uuid}.json"
|
203 |
+
with open(file_path, 'w', encoding='utf-8') as f:
|
204 |
+
json.dump(training_gemini_list, f, ensure_ascii=False, indent=2)
|
205 |
+
|
206 |
+
file_path_validation = f"validation_examples_gemini_{uuid}.json"
|
207 |
+
with open(file_path_validation, 'w', encoding='utf-8') as f:
|
208 |
+
json.dump(validation_gemini_list, f, ensure_ascii=False, indent=2)
|
209 |
+
else:
|
210 |
+
# Original GPT format - JSONL
|
211 |
+
file_path = f"training_examples_{uuid}.jsonl"
|
212 |
+
training_examples_ds.to_json(path_or_buf=file_path, force_ascii=False)
|
213 |
+
|
214 |
+
file_path_validation = f"validation_examples_{uuid}.jsonl"
|
215 |
+
validation_examples_ds.to_json(path_or_buf=file_path_validation, force_ascii=False)
|
216 |
|
217 |
# If there's less than 50 training examples, show a warning message
|
218 |
if len(training_examples_ds) < 50:
|
|
|
273 |
label="Upload WhatsApp Chat Files",
|
274 |
type="filepath",
|
275 |
file_count="multiple",
|
276 |
+
# file_types=["txt"],
|
277 |
)
|
278 |
|
279 |
system_prompt = gr.Textbox(
|
|
|
290 |
info="Enter your WhatsApp name as it appears in your profile. It needs to match exactly your name. If you're unsure, you can check the chat messages to see it.",
|
291 |
)
|
292 |
|
293 |
+
# Output format selection
|
294 |
+
output_format = gr.Radio(
|
295 |
+
choices=["GPT", "Gemini 2.0"],
|
296 |
+
value="GPT",
|
297 |
+
label="Output Format",
|
298 |
+
info="Choose the format for the generated training examples. GPT format for OpenAI/general use, Gemini 2.0 for Google's Gemini models.",
|
299 |
+
)
|
300 |
+
|
301 |
# Advanced parameters section, collapsed by default
|
302 |
with gr.Accordion(label="Advanced Parameters", open=False):
|
303 |
gr.Markdown(
|
|
|
413 |
user_role,
|
414 |
model_role,
|
415 |
whatsapp_name,
|
416 |
+
# output_format,
|
417 |
datetime_dayfirst,
|
418 |
message_line_format,
|
419 |
minutes_threshold,
|
requirements.txt
CHANGED
@@ -1,12 +1,13 @@
|
|
1 |
-
contextualSpellCheck
|
2 |
-
datasets
|
3 |
es-core-news-sm @ https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.7.0/es_core_news_sm-3.7.0-py3-none-any.whl#sha256=61e6e5530941f5880166855f09f60d7e6ba79ec1e8e45f96244bdb1eb169eb1d
|
4 |
en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl#sha256=86cc141f63942d4b2c5fcee06630fd6f904788d2f0ab005cce45aadb8fb73889
|
5 |
-
matplotlib
|
6 |
-
numpy
|
7 |
-
pandas
|
8 |
-
spacy
|
9 |
-
tiktoken
|
10 |
-
torch
|
11 |
-
transformers
|
12 |
-
pyspellchecker
|
|
|
|
1 |
+
contextualSpellCheck
|
2 |
+
datasets>=2.18.0
|
3 |
es-core-news-sm @ https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.7.0/es_core_news_sm-3.7.0-py3-none-any.whl#sha256=61e6e5530941f5880166855f09f60d7e6ba79ec1e8e45f96244bdb1eb169eb1d
|
4 |
en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl#sha256=86cc141f63942d4b2c5fcee06630fd6f904788d2f0ab005cce45aadb8fb73889
|
5 |
+
matplotlib>=3.8.3
|
6 |
+
numpy>=1.26.4
|
7 |
+
pandas>=2.2.1
|
8 |
+
spacy
|
9 |
+
tiktoken
|
10 |
+
torch>=2.2.1
|
11 |
+
transformers>=4.38.2
|
12 |
+
pyspellchecker
|
13 |
+
gradio==5.34.0
|
utils.py
CHANGED
@@ -521,3 +521,56 @@ def transform_conversations_dataset_into_training_examples(
|
|
521 |
) from e
|
522 |
|
523 |
return examples_filtered_by_length
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
521 |
) from e
|
522 |
|
523 |
return examples_filtered_by_length
|
524 |
+
|
525 |
+
|
526 |
+
def convert_gpt_to_gemini_format(gpt_dataset):
    """Convert a GPT-format dataset into the Gemini 2.0 fine-tuning format.

    GPT format (per example):
        {"messages": [{"role": "system", "content": "..."},
                      {"role": "user", "content": "..."},
                      {"role": "assistant", "content": "..."}]}

    Gemini format (per example):
        {"systemInstruction": {"role": "system", "parts": [{"text": "..."}]},
         "contents": [{"role": "user", "parts": [{"text": "..."}]},
                      {"role": "model", "parts": [{"text": "..."}]}]}

    Args:
        gpt_dataset: A dataset exposing ``.map`` (e.g. ``datasets.Dataset``)
            whose ``"messages"`` column holds either message lists or
            JSON-encoded message lists.

    Returns:
        A dataset with a single ``"gemini_format"`` column containing the
        converted examples.
    """
    # Hoisted to function scope: the original re-executed `import json`
    # inside the per-message loop, and only on the JSON-string path.
    import json

    def process_examples(examples):
        """Batched mapper: convert each messages list to one Gemini example."""
        gemini_examples = []

        for messages in examples["messages"]:
            # Messages may be stored as a JSON string rather than a list.
            if isinstance(messages, str):
                messages = json.loads(messages)

            # Split the system message from the conversational turns.
            system_instruction = None
            contents = []

            for msg in messages:
                if msg["role"] == "system":
                    system_instruction = {
                        "role": "system",
                        "parts": [{"text": msg["content"]}],
                    }
                elif msg["role"] in ("user", "assistant", "model"):
                    # Gemini uses "model" where GPT uses "assistant".
                    role = "model" if msg["role"] == "assistant" else msg["role"]
                    contents.append({
                        "role": role,
                        "parts": [{"text": msg["content"]}],
                    })

            gemini_example = {"contents": contents}
            if system_instruction:
                gemini_example["systemInstruction"] = system_instruction

            gemini_examples.append(gemini_example)

        # Shape expected by Dataset.map with batched=True.
        return {"gemini_format": gemini_examples}

    processed_dataset = gpt_dataset.map(
        process_examples,
        remove_columns=["messages"],
        batched=True,
    )

    return processed_dataset
|