Update finetune3.py
Browse files - finetune3.py +10 -6
finetune3.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
import os
|
2 |
-
import streamlit as st
|
3 |
import pandas as pd
|
|
|
4 |
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
|
5 |
from datasets import Dataset
|
6 |
from sklearn.model_selection import train_test_split
|
@@ -69,14 +69,9 @@ def main():
|
|
69 |
st.write(df.head())
|
70 |
|
71 |
# Prepare data
|
72 |
-
# First, select a model name
|
73 |
model_name = "bert-base-uncased"
|
74 |
-
|
75 |
-
# Initialize the tokenizer and model with a dummy number of labels for now
|
76 |
dummy_num_labels = 5
|
77 |
tokenizer, model = load_tokenizer_and_model(model_name, dummy_num_labels)
|
78 |
-
|
79 |
-
# Prepare the data
|
80 |
dataset, num_labels = prepare_data(df, tokenizer)
|
81 |
|
82 |
# Update the model with the correct number of labels based on the data
|
@@ -96,6 +91,15 @@ def main():
|
|
96 |
train_dataset = create_dataset(train_data)
|
97 |
eval_dataset = create_dataset(eval_data)
|
98 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
99 |
# Fine-tune model
|
100 |
training_args = TrainingArguments(
|
101 |
output_dir='./results',
|
|
|
1 |
import os
|
|
|
2 |
import pandas as pd
|
3 |
+
import streamlit as st
|
4 |
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
|
5 |
from datasets import Dataset
|
6 |
from sklearn.model_selection import train_test_split
|
|
|
69 |
st.write(df.head())
|
70 |
|
71 |
# Prepare data
|
|
|
72 |
model_name = "bert-base-uncased"
|
|
|
|
|
73 |
dummy_num_labels = 5
|
74 |
tokenizer, model = load_tokenizer_and_model(model_name, dummy_num_labels)
|
|
|
|
|
75 |
dataset, num_labels = prepare_data(df, tokenizer)
|
76 |
|
77 |
# Update the model with the correct number of labels based on the data
|
|
|
91 |
train_dataset = create_dataset(train_data)
|
92 |
eval_dataset = create_dataset(eval_data)
|
93 |
|
94 |
+
# Show a sample of the training data
|
95 |
+
st.subheader("Sample of Training Data")
|
96 |
+
train_df = pd.DataFrame({
|
97 |
+
'input_ids': [ids[:10] for ids in train_dataset['input_ids'][:5]], # Show first 10 tokens for brevity
|
98 |
+
'attention_mask': [mask[:10] for mask in train_dataset['attention_mask'][:5]],
|
99 |
+
'labels': train_dataset['labels'][:5]
|
100 |
+
})
|
101 |
+
st.write(train_df)
|
102 |
+
|
103 |
# Fine-tune model
|
104 |
training_args = TrainingArguments(
|
105 |
output_dir='./results',
|