import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
# Custom Dataset class for text classification
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        # Tokenize the text (we can use any tokenizer, like the BERT tokenizer)
        encoding = self.tokenizer(text, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')
        input_ids = encoding['input_ids'].squeeze(0)  # Remove the extra batch dimension added by return_tensors='pt'
        return {
            'input_ids': input_ids,
            'labels': torch.tensor(label, dtype=torch.long),
        }
# Define a simple custom model (feed-forward NN for classification)
class SimpleNN(nn.Module):
    def __init__(self, vocab_size, hidden_size, output_size):
        super(SimpleNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, hidden_size)
        self.fc1 = nn.Linear(hidden_size, 128)
        self.fc2 = nn.Linear(128, output_size)
        self.relu = nn.ReLU()

    def forward(self, input_ids):
        embedded = self.embedding(input_ids)
        x = embedded.mean(dim=1)  # Simplified pooling (averaging token embeddings)
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        # Return raw logits: nn.CrossEntropyLoss applies log-softmax internally,
        # so adding an explicit Softmax here would be redundant and hurt training.
        return x
# Example: sample dataset
texts = ["I love programming.", "I hate bugs.", "Python is great.", "I enjoy learning."]
labels = [1, 0, 1, 1]  # For example, 1 for positive sentiment, 0 for negative

# Tokenizer (any tokenizer works; here we use the pretrained BERT tokenizer)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# Split into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2)

# Create datasets and dataloaders
train_dataset = TextDataset(train_texts, train_labels, tokenizer)
val_dataset = TextDataset(val_texts, val_labels, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=2)

# Initialize the model, optimizer, and loss function
model = SimpleNN(vocab_size=30522, hidden_size=256, output_size=2)  # 30522 = bert-base-uncased vocab size; output size 2 for binary classification
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()
# Training loop
for epoch in range(3):  # 3 epochs for example
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids']
        labels = batch['labels']

        # Forward pass
        outputs = model(input_ids)
        loss = criterion(outputs, labels)

        # Backward pass
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch + 1}, Loss: {loss.item()}")
# Save the trained model
torch.save(model.state_dict(), 'custom_model.pth')
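
# Optional: a rough inference sketch (assumed usage, not part of the original script).
# It reloads the saved weights and classifies a new sentence with the same tokenizer;
# the example sentence and variable names below are illustrative only.
loaded_model = SimpleNN(vocab_size=30522, hidden_size=256, output_size=2)
loaded_model.load_state_dict(torch.load('custom_model.pth'))
loaded_model.eval()

sample = tokenizer("Debugging is fun.", truncation=True, padding='max_length', max_length=512, return_tensors='pt')
with torch.no_grad():
    logits = loaded_model(sample['input_ids'])
    prediction = logits.argmax(dim=1).item()
print(f"Predicted label: {prediction}")  # 1 = positive, 0 = negative in this toy setup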