import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer

# Custom Dataset class for text classification
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        # Tokenize the text (any tokenizer works; here we use the BERT tokenizer)
        encoding = self.tokenizer(text, truncation=True, padding='max_length',
                                  max_length=self.max_length, return_tensors='pt')
        input_ids = encoding['input_ids'].squeeze(0)  # Drop the batch dimension added by return_tensors='pt'

        return {
            'input_ids': input_ids,
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Define a simple custom model (feed-forward network for classification)
class SimpleNN(nn.Module):
    def __init__(self, vocab_size, hidden_size, output_size):
        super(SimpleNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, hidden_size)
        self.fc1 = nn.Linear(hidden_size, 128)
        self.fc2 = nn.Linear(128, output_size)
        self.relu = nn.ReLU()

    def forward(self, input_ids):
        embedded = self.embedding(input_ids)
        x = embedded.mean(dim=1)  # Simple pooling: average the token embeddings
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        return x  # Return raw logits: CrossEntropyLoss applies log-softmax internally

# Example: sample dataset
texts = ["I love programming.", "I hate bugs.", "Python is great.", "I enjoy learning."]
labels = [1, 0, 1, 1]  # 1 for positive sentiment, 0 for negative

# Tokenizer (any tokenizer works; here we use the BERT tokenizer)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Split into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2)

# Create datasets and dataloaders
train_dataset = TextDataset(train_texts, train_labels, tokenizer)
val_dataset = TextDataset(val_texts, val_labels, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=2)

# Initialize the model, optimizer, and loss function
model = SimpleNN(vocab_size=30522, hidden_size=256, output_size=2)  # 30522 = bert-base-uncased vocab size; output_size=2 for binary classification
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

# Training loop
for epoch in range(3):  # 3 epochs for the example
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids']
        batch_labels = batch['labels']

        # Forward pass
        outputs = model(input_ids)
        loss = criterion(outputs, batch_labels)

        # Backward pass
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch + 1}, Loss: {loss.item()}")  # Loss of the last batch in the epoch

# Save the trained model
torch.save(model.state_dict(), 'custom_model.pth')
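
# --- Optional: evaluate on the validation split and reuse the saved weights ---
# A minimal sketch, assuming the model, tokenizer, and val_loader defined above.
# Since the model returns raw logits, argmax over dim=1 gives the predicted class.
model.eval()
correct, total = 0, 0
with torch.no_grad():
    for batch in val_loader:
        logits = model(batch['input_ids'])
        preds = logits.argmax(dim=1)
        correct += (preds == batch['labels']).sum().item()
        total += batch['labels'].size(0)
print(f"Validation accuracy: {correct / total:.2f}")

# Hypothetical usage example: reload 'custom_model.pth' (saved above) and
# classify a new sentence. The architecture arguments must match those used at training time.
loaded_model = SimpleNN(vocab_size=30522, hidden_size=256, output_size=2)
loaded_model.load_state_dict(torch.load('custom_model.pth'))
loaded_model.eval()

sample = tokenizer("Debugging can be fun.", truncation=True, padding='max_length',
                   max_length=512, return_tensors='pt')
with torch.no_grad():
    prediction = loaded_model(sample['input_ids']).argmax(dim=1).item()
print(f"Predicted label: {prediction}")  # 1 = positive, 0 = negative in this toy setup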