talktorhutika committed on
Commit
1c5a2f2
·
verified ·
1 Parent(s): a1e2c30

Update finetune1.py

Browse files
Files changed (1) hide show
  1. finetune1.py +39 -15
finetune1.py CHANGED
@@ -1,5 +1,6 @@
1
- import streamlit as st
2
  import pandas as pd
 
3
  from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
4
  from datasets import Dataset
5
  from sklearn.model_selection import train_test_split
@@ -60,26 +61,23 @@ def prepare_data(df, tokenizer):
60
  def main():
61
  st.title("Patent Classification with Fine-Tuned BERT")
62
 
 
 
 
63
  # Load data
64
  df = load_data()
65
 
66
- # Show sample data
67
- st.subheader("Sample Data from January 2016")
68
  st.write(df.head())
69
 
70
  # Prepare data
71
- # First, select a model name
72
  model_name = "bert-base-uncased"
73
-
74
- # Initialize the tokenizer and model with a dummy number of labels for now
75
- dummy_num_labels = 5
76
- tokenizer, model = load_tokenizer_and_model(model_name, dummy_num_labels)
77
-
78
- # Prepare the data
79
  dataset, num_labels = prepare_data(df, tokenizer)
80
 
81
  # Update the model with the correct number of labels based on the data
82
- if num_labels != dummy_num_labels:
83
  model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
84
 
85
  # Split the dataset
@@ -95,9 +93,18 @@ def main():
95
  train_dataset = create_dataset(train_data)
96
  eval_dataset = create_dataset(eval_data)
97
 
 
 
 
 
 
 
 
 
 
98
  # Fine-tune model
99
  training_args = TrainingArguments(
100
- output_dir='./results',
101
  evaluation_strategy="epoch",
102
  learning_rate=2e-5,
103
  per_device_train_batch_size=8,
@@ -118,14 +125,31 @@ def main():
118
  if st.button('Train Model'):
119
  with st.spinner('Training in progress...'):
120
  trainer.train()
121
- model.save_pretrained("./finetuned_model")
122
- tokenizer.save_pretrained("./finetuned_model")
123
  st.success("Model training complete and saved.")
124
 
125
  # Display pretrained model data
126
  st.subheader("Pretrained Model")
127
  if st.button('Show Pretrained Model'):
128
- st.write("Pretrained model is `bert-base-uncased`. Fine-tuned model is saved at './finetuned_model'.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
129
 
130
  if __name__ == "__main__":
131
  main()
 
1
+ import os
2
  import pandas as pd
3
+ import streamlit as st
4
  from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
5
  from datasets import Dataset
6
  from sklearn.model_selection import train_test_split
 
61
  def main():
62
  st.title("Patent Classification with Fine-Tuned BERT")
63
 
64
+ # Initialize model directory path
65
+ model_dir = './finetuned_model'
66
+
67
  # Load data
68
  df = load_data()
69
 
70
+ # Show data
71
+ st.subheader("Data from January 2016")
72
  st.write(df.head())
73
 
74
  # Prepare data
 
75
  model_name = "bert-base-uncased"
76
+ tokenizer, model = load_tokenizer_and_model(model_name, num_labels=5)
 
 
 
 
 
77
  dataset, num_labels = prepare_data(df, tokenizer)
78
 
79
  # Update the model with the correct number of labels based on the data
80
+ if num_labels != 5:
81
  model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
82
 
83
  # Split the dataset
 
93
  train_dataset = create_dataset(train_data)
94
  eval_dataset = create_dataset(eval_data)
95
 
96
+ # Show training data
97
+ st.subheader("Training Data")
98
+ train_df = pd.DataFrame({
99
+ 'input_ids': [ids[:10] for ids in train_dataset['input_ids'][:5]],
100
+ 'attention_mask': [mask[:10] for mask in train_dataset['attention_mask'][:5]],
101
+ 'labels': train_dataset['labels'][:5]
102
+ })
103
+ st.write(train_df)
104
+
105
  # Fine-tune model
106
  training_args = TrainingArguments(
107
+ output_dir=model_dir,
108
  evaluation_strategy="epoch",
109
  learning_rate=2e-5,
110
  per_device_train_batch_size=8,
 
125
  if st.button('Train Model'):
126
  with st.spinner('Training in progress...'):
127
  trainer.train()
128
+ model.save_pretrained(model_dir)
129
+ tokenizer.save_pretrained(model_dir)
130
  st.success("Model training complete and saved.")
131
 
132
  # Display pretrained model data
133
  st.subheader("Pretrained Model")
134
  if st.button('Show Pretrained Model'):
135
+ if os.path.exists(model_dir):
136
+ # Show model name
137
+ st.write(f"Model name: `{model_name}`")
138
+
139
+ # List .json files
140
+ json_files = [f for f in os.listdir(model_dir) if f.endswith('.json')]
141
+ if json_files:
142
+ st.write("Available `.json` files:")
143
+ for file in json_files:
144
+ file_path = os.path.join(model_dir, file)
145
+ with open(file_path, 'r', encoding='utf-8') as f:
146
+ file_content = f.read()
147
+ st.write(f"[{file}](data:file/{file})") # Create clickable link
148
+ st.text(file_content) # Display file content
149
+ else:
150
+ st.write("No `.json` files found in `./finetuned_model` directory.")
151
+ else:
152
+ st.write("Directory `./finetuned_model` does not exist.")
153
 
154
  if __name__ == "__main__":
155
  main()