epicDev123 committed
Commit fb042ae · verified · 1 Parent(s): 3e466a4

Create trainer.py

Files changed (1)
  1. trainer.py +40 -0
trainer.py ADDED
@@ -0,0 +1,40 @@
+ # Import Hugging Face Transformers and Datasets libraries
+ from transformers import Trainer, TrainingArguments, BertForSequenceClassification, BertTokenizer
+ from datasets import load_dataset
+
+ # Load the IMDB sentiment classification dataset
+ dataset = load_dataset("imdb")
+
+ # Preprocess data: load the matching BERT tokenizer
+ tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+
+ def tokenize_function(examples):
+     return tokenizer(examples['text'], padding='max_length', truncation=True)
+
+ # Apply tokenization to every split in batches
+ tokenized_datasets = dataset.map(tokenize_function, batched=True)
+
+ # Load the pre-trained model with a two-label classification head
+ model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
+
+ # Training arguments
+ training_args = TrainingArguments(
+     output_dir='./results',            # Checkpoint path (relative, writable in a Space)
+     evaluation_strategy="epoch",       # Evaluate after each epoch
+     learning_rate=2e-5,                # Learning rate
+     per_device_train_batch_size=8,     # Training batch size
+     per_device_eval_batch_size=8,      # Eval batch size
+     num_train_epochs=3,                # Number of epochs
+     weight_decay=0.01                  # Weight decay for regularization
+ )
+
+ # Trainer setup
+ trainer = Trainer(
+     model=model,
+     args=training_args,
+     train_dataset=tokenized_datasets["train"],
+     eval_dataset=tokenized_datasets["test"]
+ )
+
+ # Fine-tune the model on the IMDB train split
+ trainer.train()
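
Note that with evaluation_strategy="epoch" set but no compute_metrics function passed, the Trainer reports only the eval loss. Below is a minimal sketch of how one might add an accuracy metric and persist the fine-tuned model afterwards; it assumes the `evaluate` library is installed, and the metric function and save path are illustrative rather than part of this commit:

import numpy as np
import evaluate

# Accuracy metric from the Hugging Face `evaluate` library (assumed installed)
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    # The Trainer passes (logits, labels); reduce logits to class predictions
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return accuracy.compute(predictions=predictions, references=labels)

# Pass compute_metrics=compute_metrics when constructing the Trainer above.
# After trainer.train() finishes, the model can be evaluated and saved:
metrics = trainer.evaluate()                        # eval loss + accuracy on the test split
trainer.save_model('./results/final_model')         # illustrative save path
tokenizer.save_pretrained('./results/final_model')  # keep the tokenizer alongside the weights
print(metrics)

save_model writes the weights and config to the given directory, so the classifier can later be reloaded with BertForSequenceClassification.from_pretrained on that path.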