File size: 632 Bytes
4ea6cf9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
import torch
import json
import numpy
from tokenizers import Tokenizer
from pathlib import Path

# Data preparation: tokenize the raw corpus and write train/val token tensors.
#
# Reads  data/tokenizer.json and data/corpus.txt
# Writes data/train.pt and data/val.pt (1-D torch.long tensors of token ids)

# Load tokenizer and record its vocabulary size.
tokenizer = Tokenizer.from_file("data/tokenizer.json")
VOCAB_SIZE = tokenizer.get_vocab_size()

# Load corpus (the whole file is read into memory as one string).
with open("data/corpus.txt", "r", encoding="utf-8") as f:
    text = f.read()

# Encode with BPE tokenizer into a flat list of token ids.
encoded = tokenizer.encode(text).ids

# Convert to tensor and split into train/val: the first 90% of the token
# stream is train, the remaining 10% is val (contiguous, not shuffled).
data = torch.tensor(encoded, dtype=torch.long)
split = int(0.9 * len(data))
# Slicing returns views that share `data`'s storage, and torch.save
# serializes the entire underlying storage of a tensor. Without clone(),
# both output files would each contain the full corpus rather than just
# their own slice, so clone to give every split its own compact storage.
train_data = data[:split].clone()
val_data = data[split:].clone()

# Save outputs
torch.save(train_data, "data/train.pt")
torch.save(val_data, "data/val.pt")