ACMCMC committed on
Commit
e79532a
·
1 Parent(s): 326eaaa

Gemini 2.0

Browse files
Files changed (4) hide show
  1. .gitignore +3 -1
  2. app.py +37 -6
  3. requirements.txt +11 -10
  4. utils.py +53 -0
.gitignore CHANGED
@@ -1,2 +1,4 @@
1
  *.jsonl
2
- __pycache__/
 
 
 
1
  *.jsonl
2
+ __pycache__/
3
+ .venv/
4
+ .vscode/
app.py CHANGED
@@ -11,6 +11,7 @@ import matplotlib.pyplot as plt
11
  from utils import (
12
  process_chat_file,
13
  transform_conversations_dataset_into_training_examples,
 
14
  )
15
  from validation import check_format_errors, estimate_cost, get_distributions
16
 
@@ -78,6 +79,7 @@ def file_upload_callback(
78
  split_conversation_threshold,
79
  progress=gr.Progress(),
80
  ):
 
81
  logger.info(f"Processing {files}")
82
  full_system_prompt = f"""# Task
83
  You are a chatbot. Your goal is to simulate realistic, natural chat conversations as if you were me.
@@ -186,11 +188,31 @@ The {model_role} and the {user_role} can send multiple messages in a row, as a J
186
  # However, if different users are using the app at the same time, we need to make sure that the file is unique AND that no user can access the file of another user.
187
  # We can use a UUID generator to create a unique file name.
188
  uuid = str(uuid4())
189
- file_path = f"training_examples_{uuid}.jsonl"
190
- training_examples_ds.to_json(path_or_buf=file_path, force_ascii=False)
191
-
192
- file_path_validation = f"validation_examples_{uuid}.jsonl"
193
- validation_examples_ds.to_json(path_or_buf=file_path_validation, force_ascii=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
194
 
195
  # If there's less than 50 training examples, show a warning message
196
  if len(training_examples_ds) < 50:
@@ -251,7 +273,7 @@ with gr.Blocks(theme=theme) as demo:
251
  label="Upload WhatsApp Chat Files",
252
  type="filepath",
253
  file_count="multiple",
254
- file_types=["txt"],
255
  )
256
 
257
  system_prompt = gr.Textbox(
@@ -268,6 +290,14 @@ with gr.Blocks(theme=theme) as demo:
268
  info="Enter your WhatsApp name as it appears in your profile. It needs to match exactly your name. If you're unsure, you can check the chat messages to see it.",
269
  )
270
 
 
 
 
 
 
 
 
 
271
  # Advanced parameters section, collapsed by default
272
  with gr.Accordion(label="Advanced Parameters", open=False):
273
  gr.Markdown(
@@ -383,6 +413,7 @@ with gr.Blocks(theme=theme) as demo:
383
  user_role,
384
  model_role,
385
  whatsapp_name,
 
386
  datetime_dayfirst,
387
  message_line_format,
388
  minutes_threshold,
 
11
  from utils import (
12
  process_chat_file,
13
  transform_conversations_dataset_into_training_examples,
14
+ convert_gpt_to_gemini_format, # Add this import
15
  )
16
  from validation import check_format_errors, estimate_cost, get_distributions
17
 
 
79
  split_conversation_threshold,
80
  progress=gr.Progress(),
81
  ):
82
+ output_format = "GPT"
83
  logger.info(f"Processing {files}")
84
  full_system_prompt = f"""# Task
85
  You are a chatbot. Your goal is to simulate realistic, natural chat conversations as if you were me.
 
188
  # However, if different users are using the app at the same time, we need to make sure that the file is unique AND that no user can access the file of another user.
189
  # We can use a UUID generator to create a unique file name.
190
  uuid = str(uuid4())
191
+
192
+ # Convert to Gemini format if selected
193
+ if output_format == "Gemini 2.0":
194
+ training_examples_gemini = convert_gpt_to_gemini_format(training_examples_ds)
195
+ validation_examples_gemini = convert_gpt_to_gemini_format(validation_examples_ds)
196
+
197
+ # Extract the gemini_format column and save as JSON files
198
+ training_gemini_list = training_examples_gemini["gemini_format"]
199
+ validation_gemini_list = validation_examples_gemini["gemini_format"]
200
+
201
+ # Save as JSON files with Gemini format
202
+ file_path = f"training_examples_gemini_{uuid}.json"
203
+ with open(file_path, 'w', encoding='utf-8') as f:
204
+ json.dump(training_gemini_list, f, ensure_ascii=False, indent=2)
205
+
206
+ file_path_validation = f"validation_examples_gemini_{uuid}.json"
207
+ with open(file_path_validation, 'w', encoding='utf-8') as f:
208
+ json.dump(validation_gemini_list, f, ensure_ascii=False, indent=2)
209
+ else:
210
+ # Original GPT format - JSONL
211
+ file_path = f"training_examples_{uuid}.jsonl"
212
+ training_examples_ds.to_json(path_or_buf=file_path, force_ascii=False)
213
+
214
+ file_path_validation = f"validation_examples_{uuid}.jsonl"
215
+ validation_examples_ds.to_json(path_or_buf=file_path_validation, force_ascii=False)
216
 
217
  # If there's less than 50 training examples, show a warning message
218
  if len(training_examples_ds) < 50:
 
273
  label="Upload WhatsApp Chat Files",
274
  type="filepath",
275
  file_count="multiple",
276
+ # file_types=["txt"],
277
  )
278
 
279
  system_prompt = gr.Textbox(
 
290
  info="Enter your WhatsApp name as it appears in your profile. It needs to match exactly your name. If you're unsure, you can check the chat messages to see it.",
291
  )
292
 
293
+ # Output format selection
294
+ output_format = gr.Radio(
295
+ choices=["GPT", "Gemini 2.0"],
296
+ value="GPT",
297
+ label="Output Format",
298
+ info="Choose the format for the generated training examples. GPT format for OpenAI/general use, Gemini 2.0 for Google's Gemini models.",
299
+ )
300
+
301
  # Advanced parameters section, collapsed by default
302
  with gr.Accordion(label="Advanced Parameters", open=False):
303
  gr.Markdown(
 
413
  user_role,
414
  model_role,
415
  whatsapp_name,
416
+ # output_format,
417
  datetime_dayfirst,
418
  message_line_format,
419
  minutes_threshold,
requirements.txt CHANGED
@@ -1,12 +1,13 @@
1
- contextualSpellCheck==0.4.4
2
- datasets==2.18.0
3
  es-core-news-sm @ https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.7.0/es_core_news_sm-3.7.0-py3-none-any.whl#sha256=61e6e5530941f5880166855f09f60d7e6ba79ec1e8e45f96244bdb1eb169eb1d
4
  en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl#sha256=86cc141f63942d4b2c5fcee06630fd6f904788d2f0ab005cce45aadb8fb73889
5
- matplotlib==3.8.3
6
- numpy==1.26.4
7
- pandas==2.2.1
8
- spacy==3.7.4
9
- tiktoken==0.6.0
10
- torch==2.2.1
11
- transformers==4.38.2
12
- pyspellchecker==0.8.1
 
 
1
+ contextualSpellCheck
2
+ datasets>=2.18.0
3
  es-core-news-sm @ https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.7.0/es_core_news_sm-3.7.0-py3-none-any.whl#sha256=61e6e5530941f5880166855f09f60d7e6ba79ec1e8e45f96244bdb1eb169eb1d
4
  en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl#sha256=86cc141f63942d4b2c5fcee06630fd6f904788d2f0ab005cce45aadb8fb73889
5
+ matplotlib>=3.8.3
6
+ numpy>=1.26.4
7
+ pandas>=2.2.1
8
+ spacy
9
+ tiktoken
10
+ torch>=2.2.1
11
+ transformers>=4.38.2
12
+ pyspellchecker
13
+ gradio==5.34.0
utils.py CHANGED
@@ -521,3 +521,56 @@ def transform_conversations_dataset_into_training_examples(
521
  ) from e
522
 
523
  return examples_filtered_by_length
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
521
  ) from e
522
 
523
  return examples_filtered_by_length
524
+
525
+
526
def convert_gpt_to_gemini_format(gpt_dataset):
    """Convert a GPT-format conversations dataset to Gemini 2.0 format.

    GPT format (per example):
        {"messages": [{"role": "system", "content": "..."},
                      {"role": "user", "content": "..."},
                      {"role": "assistant", "content": "..."}]}

    Gemini format (per example):
        {"systemInstruction": {"role": "system", "parts": [{"text": "..."}]},
         "contents": [{"role": "user", "parts": [{"text": "..."}]},
                      {"role": "model", "parts": [{"text": "..."}]}]}

    Args:
        gpt_dataset: a ``datasets.Dataset`` (or any object with a compatible
            ``map`` method) containing a "messages" column. Each row's value
            may be a list of message dicts or a JSON-encoded string of one.

    Returns:
        The mapped dataset, with the "messages" column removed and replaced
        by a "gemini_format" column holding Gemini-style example dicts.
    """
    # Hoisted out of the per-message loop: importing inside the loop body
    # re-ran the import machinery on every string-typed row.
    import json

    def _parts(text):
        # Gemini wraps every message body in a list of text parts.
        return [{"text": text}]

    def process_examples(examples):
        gemini_examples = []

        for messages in examples["messages"]:
            # Rows may arrive JSON-encoded; decode to a list of dicts.
            if isinstance(messages, str):
                messages = json.loads(messages)

            system_instruction = None
            contents = []

            for msg in messages:
                role = msg["role"]
                if role == "system":
                    # Gemini carries the system prompt separately from the turns.
                    system_instruction = {
                        "role": "system",
                        "parts": _parts(msg["content"]),
                    }
                elif role in ("user", "assistant", "model"):
                    # Gemini uses "model" where OpenAI uses "assistant".
                    contents.append({
                        "role": "model" if role == "assistant" else role,
                        "parts": _parts(msg["content"]),
                    })
                # Any other role is silently dropped (same as before).

            gemini_example = {"contents": contents}
            if system_instruction is not None:
                gemini_example["systemInstruction"] = system_instruction
            gemini_examples.append(gemini_example)

        # Shape expected by Dataset.map with batched=True.
        return {"gemini_format": gemini_examples}

    return gpt_dataset.map(
        process_examples,
        remove_columns=["messages"],
        batched=True,
    )