"""
Data preparation script for training nanoGPT on the flytech/python-codes-25k dataset.
This script downloads the dataset, tokenizes it, and creates the binary files needed for training.
"""

import os
import pickle
import numpy as np
from datasets import load_dataset
from tqdm import tqdm

def download_and_prepare_code_dataset():
    """Download and prepare the flytech/python-codes-25k dataset for nanoGPT training."""
    
    print("Loading flytech/python-codes-25k dataset...")
    dataset = load_dataset("flytech/python-codes-25k")
    
    print(f"Dataset structure: {dataset}")
    print(f"Available splits: {list(dataset.keys())}")
    print(f"Train split size: {len(dataset['train'])}")
    
    # Debug: Check the first few examples to understand the structure
    print("\nFirst example structure:")
    first_example = dataset['train'][0]
    for key, value in first_example.items():
        preview = str(value)[:200]  # coerce to str in case a field is not plain text
        print(f"  {key}: {preview!r}")
    
    # Create data directory
    data_dir = os.path.join('data', 'python-codes-25k')
    os.makedirs(data_dir, exist_ok=True)
    
    # Extract code content from the dataset
    print("Extracting code content...")
    train_texts = []
    
    # Process training data
    for item in tqdm(dataset['train'], desc="Processing train split"):
        # Try different possible field names for code content
        code = item.get('text', '') or item.get('output', '') or item.get('code', '')
        if code and isinstance(code, str) and len(code.strip()) > 0:
            train_texts.append(code)
    
    print(f"Extracted {len(train_texts)} total samples")

    # Split the extracted samples into train and validation sets (90/10 split).
    # Note: no shuffling is done here, so the split follows the dataset's order.
    print("Splitting data into train and validation sets...")
    total_samples = len(train_texts)
    split_idx = int(0.9 * total_samples)
    
    train_texts_final = train_texts[:split_idx]
    test_texts = train_texts[split_idx:]  # last 10% used as validation
    
    print(f"Final train samples: {len(train_texts_final)}")
    print(f"Validation samples: {len(test_texts)}")
    
    # Combine all texts for vocabulary building
    all_text = '\n'.join(train_texts_final + test_texts)
    print(f"Total characters: {len(all_text):,}")
    
    # Create vocabulary from the text
    print("Creating vocabulary...")
    chars = sorted(list(set(all_text)))
    vocab_size = len(chars)
    print(f"Vocabulary size: {vocab_size}")
    
    # Create character to integer mapping
    stoi = {ch: i for i, ch in enumerate(chars)}
    itos = {i: ch for i, ch in enumerate(chars)}
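
    # Quick sanity check (an added check, not part of the original flow): a short
    # sample should survive an encode/decode round trip through stoi/itos.
    _sample = train_texts_final[0][:80]
    assert ''.join(itos[i] for i in [stoi[c] for c in _sample]) == _sample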
    
    # Save vocabulary metadata
    meta = {
        'vocab_size': vocab_size,
        'itos': itos,
        'stoi': stoi,
    }
    with open(os.path.join(data_dir, 'meta.pkl'), 'wb') as f:
        pickle.dump(meta, f)
    print(f"Saved vocabulary to {os.path.join(data_dir, 'meta.pkl')}")
    
    # Tokenize the training data. Samples are separated with '\n' so the token
    # stream matches the '\n'-joined text the vocabulary was built from and
    # sample boundaries are preserved in the binary file.
    print("Tokenizing training data...")
    train_ids = []
    for text in tqdm(train_texts_final, desc="Tokenizing train"):
        train_ids.extend(stoi[c] for c in text)
        train_ids.append(stoi['\n'])
    
    # Tokenize the validation data the same way
    print("Tokenizing validation data...")
    test_ids = []
    for text in tqdm(test_texts, desc="Tokenizing val"):
        test_ids.extend(stoi[c] for c in text)
        test_ids.append(stoi['\n'])
    
    # Save as binary files
    train_ids = np.array(train_ids, dtype=np.uint16)
    test_ids = np.array(test_ids, dtype=np.uint16)
    
    train_path = os.path.join(data_dir, 'train.bin')
    test_path = os.path.join(data_dir, 'val.bin')  # nanoGPT expects 'val.bin'
    
    train_ids.tofile(train_path)
    test_ids.tofile(test_path)
    
    print(f"Saved training data to {train_path} ({len(train_ids):,} tokens)")
    print(f"Saved validation data to {test_path} ({len(test_ids):,} tokens)")
    
    # Print some statistics
    print(f"\nDataset statistics:")
    print(f"Vocabulary size: {vocab_size}")
    print(f"Training tokens: {len(train_ids):,}")
    print(f"Validation tokens: {len(test_ids):,}")
    print(f"Total tokens: {len(train_ids) + len(test_ids):,}")
    
    # Show some example characters (repr, since the vocabulary includes newlines,
    # tabs, and other non-printing characters)
    print("\nFirst 100 characters in vocabulary:")
    print(repr(''.join(chars[:100])))
    
    return data_dir

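
# Optional sanity-check helper (an addition, not part of the original pipeline):
# it re-opens the files written above the way nanoGPT's training/sampling code
# reads them (np.memmap over the .bin files, pickle for meta.pkl) and decodes a
# short prefix so you can eyeball that the round trip works. It is not called
# from __main__; run it manually after preparation if you want the check, e.g.
# verify_prepared_data('data/python-codes-25k').
def verify_prepared_data(data_dir, num_chars=250):
    """Load the prepared files back and print a decoded sample."""
    with open(os.path.join(data_dir, 'meta.pkl'), 'rb') as f:
        meta = pickle.load(f)
    itos = meta['itos']

    train_data = np.memmap(os.path.join(data_dir, 'train.bin'), dtype=np.uint16, mode='r')
    val_data = np.memmap(os.path.join(data_dir, 'val.bin'), dtype=np.uint16, mode='r')
    print(f"train.bin: {len(train_data):,} tokens, val.bin: {len(val_data):,} tokens")
    print(f"vocab_size: {meta['vocab_size']}")

    decoded = ''.join(itos[int(i)] for i in train_data[:num_chars])
    print("Decoded start of train.bin:")
    print(decoded)

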
if __name__ == '__main__':
    download_and_prepare_code_dataset()