"""Prepare a BPE-tokenized train/validation split from a raw text corpus.

Reads data/corpus.txt, encodes it with a pretrained tokenizer
(data/tokenizer.json), splits the token stream 90/10, and writes the two
halves to data/train.pt and data/val.pt as int64 tensors.
"""
import json
import numpy
from pathlib import Path

import torch
from tokenizers import Tokenizer

DATA_DIR = Path("data")

# Load the trained BPE tokenizer (Tokenizer.from_file takes a str path).
tokenizer = Tokenizer.from_file(str(DATA_DIR / "tokenizer.json"))
VOCAB_SIZE = tokenizer.get_vocab_size()

# Read the raw corpus as UTF-8 text.
text = (DATA_DIR / "corpus.txt").read_text(encoding="utf-8")

# Encode the whole corpus into a flat list of token ids.
encoded = tokenizer.encode(text).ids

# int64 (torch.long) — the dtype torch expects for token indices
# (nn.Embedding / cross_entropy targets).
data = torch.tensor(encoded, dtype=torch.long)

# Contiguous 90/10 train/validation split. No shuffling: token order is
# the signal for language-model context windows.
split = int(0.9 * len(data))
train_data = data[:split]
val_data = data[split:]

# Persist both splits for the training script to load.
torch.save(train_data, DATA_DIR / "train.pt")
torch.save(val_data, DATA_DIR / "val.pt")