ACMCMC
committed on
Commit
·
e79532a
1
Parent(s):
326eaaa
Gemini 2.0
Browse files- .gitignore +3 -1
- app.py +37 -6
- requirements.txt +11 -10
- utils.py +53 -0
.gitignore
CHANGED
@@ -1,2 +1,4 @@
|
|
1 |
*.jsonl
|
2 |
-
__pycache__/
|
|
|
|
|
|
1 |
*.jsonl
|
2 |
+
__pycache__/
|
3 |
+
.venv/
|
4 |
+
.vscode/
|
app.py
CHANGED
@@ -11,6 +11,7 @@ import matplotlib.pyplot as plt
|
|
11 |
from utils import (
|
12 |
process_chat_file,
|
13 |
transform_conversations_dataset_into_training_examples,
|
|
|
14 |
)
|
15 |
from validation import check_format_errors, estimate_cost, get_distributions
|
16 |
|
@@ -78,6 +79,7 @@ def file_upload_callback(
|
|
78 |
split_conversation_threshold,
|
79 |
progress=gr.Progress(),
|
80 |
):
|
|
|
81 |
logger.info(f"Processing {files}")
|
82 |
full_system_prompt = f"""# Task
|
83 |
You are a chatbot. Your goal is to simulate realistic, natural chat conversations as if you were me.
|
@@ -186,11 +188,31 @@ The {model_role} and the {user_role} can send multiple messages in a row, as a J
|
|
186 |
# However, if different users are using the app at the same time, we need to make sure that the file is unique AND that no user can access the file of another user.
|
187 |
# We can use a UUID generator to create a unique file name.
|
188 |
uuid = str(uuid4())
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
194 |
|
195 |
# If there's less than 50 training examples, show a warning message
|
196 |
if len(training_examples_ds) < 50:
|
@@ -251,7 +273,7 @@ with gr.Blocks(theme=theme) as demo:
|
|
251 |
label="Upload WhatsApp Chat Files",
|
252 |
type="filepath",
|
253 |
file_count="multiple",
|
254 |
-
file_types=["txt"],
|
255 |
)
|
256 |
|
257 |
system_prompt = gr.Textbox(
|
@@ -268,6 +290,14 @@ with gr.Blocks(theme=theme) as demo:
|
|
268 |
info="Enter your WhatsApp name as it appears in your profile. It needs to match exactly your name. If you're unsure, you can check the chat messages to see it.",
|
269 |
)
|
270 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
271 |
# Advanced parameters section, collapsed by default
|
272 |
with gr.Accordion(label="Advanced Parameters", open=False):
|
273 |
gr.Markdown(
|
@@ -383,6 +413,7 @@ with gr.Blocks(theme=theme) as demo:
|
|
383 |
user_role,
|
384 |
model_role,
|
385 |
whatsapp_name,
|
|
|
386 |
datetime_dayfirst,
|
387 |
message_line_format,
|
388 |
minutes_threshold,
|
|
|
11 |
from utils import (
|
12 |
process_chat_file,
|
13 |
transform_conversations_dataset_into_training_examples,
|
14 |
+
convert_gpt_to_gemini_format, # Add this import
|
15 |
)
|
16 |
from validation import check_format_errors, estimate_cost, get_distributions
|
17 |
|
|
|
79 |
split_conversation_threshold,
|
80 |
progress=gr.Progress(),
|
81 |
):
|
82 |
+
output_format = "GPT"
|
83 |
logger.info(f"Processing {files}")
|
84 |
full_system_prompt = f"""# Task
|
85 |
You are a chatbot. Your goal is to simulate realistic, natural chat conversations as if you were me.
|
|
|
188 |
# However, if different users are using the app at the same time, we need to make sure that the file is unique AND that no user can access the file of another user.
|
189 |
# We can use a UUID generator to create a unique file name.
|
190 |
uuid = str(uuid4())
|
191 |
+
|
192 |
+
# Convert to Gemini format if selected
|
193 |
+
if output_format == "Gemini 2.0":
|
194 |
+
training_examples_gemini = convert_gpt_to_gemini_format(training_examples_ds)
|
195 |
+
validation_examples_gemini = convert_gpt_to_gemini_format(validation_examples_ds)
|
196 |
+
|
197 |
+
# Extract the gemini_format column and save as JSON files
|
198 |
+
training_gemini_list = training_examples_gemini["gemini_format"]
|
199 |
+
validation_gemini_list = validation_examples_gemini["gemini_format"]
|
200 |
+
|
201 |
+
# Save as JSON files with Gemini format
|
202 |
+
file_path = f"training_examples_gemini_{uuid}.json"
|
203 |
+
with open(file_path, 'w', encoding='utf-8') as f:
|
204 |
+
json.dump(training_gemini_list, f, ensure_ascii=False, indent=2)
|
205 |
+
|
206 |
+
file_path_validation = f"validation_examples_gemini_{uuid}.json"
|
207 |
+
with open(file_path_validation, 'w', encoding='utf-8') as f:
|
208 |
+
json.dump(validation_gemini_list, f, ensure_ascii=False, indent=2)
|
209 |
+
else:
|
210 |
+
# Original GPT format - JSONL
|
211 |
+
file_path = f"training_examples_{uuid}.jsonl"
|
212 |
+
training_examples_ds.to_json(path_or_buf=file_path, force_ascii=False)
|
213 |
+
|
214 |
+
file_path_validation = f"validation_examples_{uuid}.jsonl"
|
215 |
+
validation_examples_ds.to_json(path_or_buf=file_path_validation, force_ascii=False)
|
216 |
|
217 |
# If there's less than 50 training examples, show a warning message
|
218 |
if len(training_examples_ds) < 50:
|
|
|
273 |
label="Upload WhatsApp Chat Files",
|
274 |
type="filepath",
|
275 |
file_count="multiple",
|
276 |
+
# file_types=["txt"],
|
277 |
)
|
278 |
|
279 |
system_prompt = gr.Textbox(
|
|
|
290 |
info="Enter your WhatsApp name as it appears in your profile. It needs to match exactly your name. If you're unsure, you can check the chat messages to see it.",
|
291 |
)
|
292 |
|
293 |
+
# Output format selection
|
294 |
+
output_format = gr.Radio(
|
295 |
+
choices=["GPT", "Gemini 2.0"],
|
296 |
+
value="GPT",
|
297 |
+
label="Output Format",
|
298 |
+
info="Choose the format for the generated training examples. GPT format for OpenAI/general use, Gemini 2.0 for Google's Gemini models.",
|
299 |
+
)
|
300 |
+
|
301 |
# Advanced parameters section, collapsed by default
|
302 |
with gr.Accordion(label="Advanced Parameters", open=False):
|
303 |
gr.Markdown(
|
|
|
413 |
user_role,
|
414 |
model_role,
|
415 |
whatsapp_name,
|
416 |
+
# output_format,
|
417 |
datetime_dayfirst,
|
418 |
message_line_format,
|
419 |
minutes_threshold,
|
requirements.txt
CHANGED
@@ -1,12 +1,13 @@
|
|
1 |
-
contextualSpellCheck
|
2 |
-
datasets
|
3 |
es-core-news-sm @ https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.7.0/es_core_news_sm-3.7.0-py3-none-any.whl#sha256=61e6e5530941f5880166855f09f60d7e6ba79ec1e8e45f96244bdb1eb169eb1d
|
4 |
en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl#sha256=86cc141f63942d4b2c5fcee06630fd6f904788d2f0ab005cce45aadb8fb73889
|
5 |
-
matplotlib
|
6 |
-
numpy
|
7 |
-
pandas
|
8 |
-
spacy
|
9 |
-
tiktoken
|
10 |
-
torch
|
11 |
-
transformers
|
12 |
-
pyspellchecker
|
|
|
|
1 |
+
contextualSpellCheck
|
2 |
+
datasets>=2.18.0
|
3 |
es-core-news-sm @ https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.7.0/es_core_news_sm-3.7.0-py3-none-any.whl#sha256=61e6e5530941f5880166855f09f60d7e6ba79ec1e8e45f96244bdb1eb169eb1d
|
4 |
en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl#sha256=86cc141f63942d4b2c5fcee06630fd6f904788d2f0ab005cce45aadb8fb73889
|
5 |
+
matplotlib>=3.8.3
|
6 |
+
numpy>=1.26.4
|
7 |
+
pandas>=2.2.1
|
8 |
+
spacy
|
9 |
+
tiktoken
|
10 |
+
torch>=2.2.1
|
11 |
+
transformers>=4.38.2
|
12 |
+
pyspellchecker
|
13 |
+
gradio==5.34.0
|
utils.py
CHANGED
@@ -521,3 +521,56 @@ def transform_conversations_dataset_into_training_examples(
|
|
521 |
) from e
|
522 |
|
523 |
return examples_filtered_by_length
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
521 |
) from e
|
522 |
|
523 |
return examples_filtered_by_length
|
524 |
+
|
525 |
+
|
526 |
+
def convert_gpt_to_gemini_format(gpt_dataset):
    """Convert a GPT-format dataset into the Gemini 2.0 fine-tuning format.

    GPT format (per example):
        {"messages": [{"role": "system", "content": "..."},
                      {"role": "user", "content": "..."},
                      {"role": "assistant", "content": "..."}]}

    Gemini format (per example):
        {"systemInstruction": {"role": "system", "parts": [{"text": "..."}]},
         "contents": [{"role": "user", "parts": [{"text": "..."}]},
                      {"role": "model", "parts": [{"text": "..."}]}]}

    Args:
        gpt_dataset: A dataset exposing ``.map`` (e.g. ``datasets.Dataset``)
            whose ``"messages"`` column holds either message lists or
            JSON-encoded message lists.

    Returns:
        A dataset with a single ``"gemini_format"`` column containing the
        converted examples.
    """
    # Hoisted to function scope: the original re-executed `import json`
    # inside the per-message loop, and only on the JSON-string path.
    import json

    def process_examples(examples):
        """Batched mapper: convert each messages list to one Gemini example."""
        gemini_examples = []

        for messages in examples["messages"]:
            # Messages may be stored as a JSON string rather than a list.
            if isinstance(messages, str):
                messages = json.loads(messages)

            # Split the system message from the conversational turns.
            system_instruction = None
            contents = []

            for msg in messages:
                if msg["role"] == "system":
                    system_instruction = {
                        "role": "system",
                        "parts": [{"text": msg["content"]}],
                    }
                elif msg["role"] in ("user", "assistant", "model"):
                    # Gemini uses "model" where GPT uses "assistant".
                    role = "model" if msg["role"] == "assistant" else msg["role"]
                    contents.append({
                        "role": role,
                        "parts": [{"text": msg["content"]}],
                    })

            gemini_example = {"contents": contents}
            if system_instruction:
                gemini_example["systemInstruction"] = system_instruction

            gemini_examples.append(gemini_example)

        # Shape expected by Dataset.map with batched=True.
        return {"gemini_format": gemini_examples}

    processed_dataset = gpt_dataset.map(
        process_examples,
        remove_columns=["messages"],
        batched=True,
    )

    return processed_dataset
|