talktorhutika committed · verified
Commit 863be65 · Parent(s): 1db9cf7

Update finetune3.py

Files changed (1): finetune3.py (+10 -6)
finetune3.py CHANGED
@@ -1,6 +1,6 @@
 import os
-import streamlit as st
 import pandas as pd
+import streamlit as st
 from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
 from datasets import Dataset
 from sklearn.model_selection import train_test_split
@@ -69,14 +69,9 @@ def main():
     st.write(df.head())

     # Prepare data
-    # First, select a model name
     model_name = "bert-base-uncased"
-
-    # Initialize the tokenizer and model with a dummy number of labels for now
     dummy_num_labels = 5
     tokenizer, model = load_tokenizer_and_model(model_name, dummy_num_labels)
-
-    # Prepare the data
     dataset, num_labels = prepare_data(df, tokenizer)

     # Update the model with the correct number of labels based on the data
@@ -96,6 +91,15 @@
     train_dataset = create_dataset(train_data)
     eval_dataset = create_dataset(eval_data)

+    # Show a sample of the training data
+    st.subheader("Sample of Training Data")
+    train_df = pd.DataFrame({
+        'input_ids': [ids[:10] for ids in train_dataset['input_ids'][:5]],  # Show first 10 tokens for brevity
+        'attention_mask': [mask[:10] for mask in train_dataset['attention_mask'][:5]],
+        'labels': train_dataset['labels'][:5]
+    })
+    st.write(train_df)
+
     # Fine-tune model
     training_args = TrainingArguments(
         output_dir='./results',
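For context on the second hunk: load_tokenizer_and_model is first called with a placeholder label count because the true number of classes is only known after prepare_data has scanned the uploaded data, after which the script rebuilds the model with the correct count. A minimal sketch of such a helper, assuming it simply wraps the transformers auto classes (the actual implementation lives elsewhere in finetune3.py and is not shown in this diff):

from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Hypothetical sketch; the real helper is defined elsewhere in finetune3.py.
def load_tokenizer_and_model(model_name, num_labels):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # The classification head is created with num_labels outputs, so the model
    # must be reloaded if the label count changes after the data has been
    # inspected -- which is why the script starts with a dummy count.
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, num_labels=num_labels
    )
    return tokenizer, model

# Flow mirrored from the diff: placeholder count first, real count later.
tokenizer, model = load_tokenizer_and_model("bert-base-uncased", 5)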
 
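The preview added in this commit displays raw input_ids, which are opaque integers. A possible refinement, not part of this commit, would decode the ids back to token strings before display; a sketch assuming the same tokenizer and train_dataset objects that appear in the diff:

import pandas as pd
import streamlit as st

# Hypothetical follow-up, not in the commit: decode ids to readable tokens.
# Assumes `tokenizer` and `train_dataset` are built as in finetune3.py.
sample_df = pd.DataFrame({
    'tokens': [tokenizer.convert_ids_to_tokens(ids[:10])
               for ids in train_dataset['input_ids'][:5]],
    'labels': train_dataset['labels'][:5],
})
st.write(sample_df)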