Update finetune3.py
finetune3.py  +66 -36
@@ -1,7 +1,8 @@
 import streamlit as st
 import pandas as pd
 from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
-from datasets import Dataset
 from sklearn.model_selection import train_test_split
 import requests
 from io import BytesIO

@@ -16,10 +17,9 @@ def load_data():
     return df
 
 # Tokenizer and model loading
-def load_tokenizer_and_model(model_name):
     tokenizer = AutoTokenizer.from_pretrained(model_name)
-    model = AutoModelForSequenceClassification.from_pretrained(model_name)
     return tokenizer, model
 
 # Tokenize and prepare the dataset

@@ -27,14 +27,26 @@ def prepare_data(df, tokenizer):
     df['filing_date'] = pd.to_datetime(df['filing_date'])
     jan_2016_df = df[df['filing_date'].dt.to_period('M') == '2016-01']
 
     def tokenize_function(texts):
-        return tokenizer(texts, padding=
     tokenized_data = tokenize_function(texts)
     dataset_dict = {
         'input_ids': [x.tolist() for x in tokenized_data['input_ids']],
         'attention_mask': [x.tolist() for x in tokenized_data['attention_mask']],

@@ -43,16 +55,7 @@ def prepare_data(df, tokenizer):
 
     dataset = Dataset.from_dict(dataset_dict)
 
-    return dataset
-
-# Define a custom compute_loss function
-def compute_loss(model, inputs):
-    labels = inputs.get("labels")
-    outputs = model(**inputs)
-    logits = outputs.logits
-    loss_fct = torch.nn.CrossEntropyLoss()
-    loss = loss_fct(logits, labels)
-    return loss
 
 # Define Streamlit app
 def main():
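
The removed compute_loss helper duplicated what the model already provides: with num_labels set, AutoModelForSequenceClassification computes a cross-entropy loss for single-label classification on its own, and Trainer uses that loss by default. If a custom objective were ever needed, the supported route is overriding compute_loss in a Trainer subclass; a minimal sketch, illustrative only and not part of this commit:

import torch
from transformers import Trainer

class CustomLossTrainer(Trainer):
    # Overriding compute_loss is how a custom objective is plugged into Trainer.
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = torch.nn.CrossEntropyLoss()
        loss = loss_fct(logits.view(-1, model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss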

@@ -65,27 +68,33 @@
     st.subheader("Sample Data from January 2016")
     st.write(df.head())
 
     model_name = "bert-base-uncased"
-    tokenizer, model = load_tokenizer_and_model(model_name)
 
     # Split the dataset
     train_data, eval_data = train_test_split(list(zip(dataset['input_ids'], dataset['attention_mask'], dataset['labels'])), test_size=0.2, random_state=42)
 
-        'labels': [item[2] for item in eval_data]
-    })
 
     # Fine-tune model
     training_args = TrainingArguments(

@@ -103,7 +112,7 @@
         args=training_args,
         train_dataset=train_dataset,
         eval_dataset=eval_dataset,
     )
 
     st.subheader("Training the Model")

@@ -117,7 +126,28 @@
     # Display pretrained model data
     st.subheader("Pretrained Model")
    if st.button('Show Pretrained Model'):
 
 if __name__ == "__main__":
     main()

finetune3.py (updated version):

+import os
 import streamlit as st
 import pandas as pd
 from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
+from datasets import Dataset
 from sklearn.model_selection import train_test_split
 import requests
 from io import BytesIO
(lines 9-16 unchanged)
     return df
 
 # Tokenizer and model loading
+def load_tokenizer_and_model(model_name, num_labels):
     tokenizer = AutoTokenizer.from_pretrained(model_name)
+    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
     return tokenizer, model
 
 # Tokenize and prepare the dataset
(line 26 unchanged)
     df['filing_date'] = pd.to_datetime(df['filing_date'])
     jan_2016_df = df[df['filing_date'].dt.to_period('M') == '2016-01']
 
+    # Get only 5 unique labels
+    unique_labels = jan_2016_df['patent_number'].astype('category').cat.categories[:5]
+    jan_2016_df = jan_2016_df[jan_2016_df['patent_number'].isin(unique_labels)]
+
+    # Re-map labels to integers starting from 0
+    label_mapping = {label: idx for idx, label in enumerate(unique_labels)}
+    jan_2016_df['label'] = jan_2016_df['patent_number'].map(label_mapping)
+
+    texts = jan_2016_df['invention_title'].tolist()
+    labels = jan_2016_df['label'].tolist()
+    num_labels = len(unique_labels)
+
+    # Define tokenization function
     def tokenize_function(texts):
+        return tokenizer(texts, padding=True, truncation=True, return_tensors="pt", max_length=512)
+
+    # Tokenize texts
     tokenized_data = tokenize_function(texts)
+
+    # Create dataset
     dataset_dict = {
         'input_ids': [x.tolist() for x in tokenized_data['input_ids']],
         'attention_mask': [x.tolist() for x in tokenized_data['attention_mask']],
(lines 53-54 unchanged)
 
     dataset = Dataset.from_dict(dataset_dict)
 
+    return dataset, num_labels
 
 # Define Streamlit app
 def main():
(lines 62-67 unchanged)
     st.subheader("Sample Data from January 2016")
     st.write(df.head())
 
+    # Prepare data
+    # First, select a model name
     model_name = "bert-base-uncased"
 
+    # Initialize the tokenizer and model with a dummy number of labels for now
+    dummy_num_labels = 5
+    tokenizer, model = load_tokenizer_and_model(model_name, dummy_num_labels)
+
+    # Prepare the data
+    dataset, num_labels = prepare_data(df, tokenizer)
+
+    # Update the model with the correct number of labels based on the data
+    if num_labels != dummy_num_labels:
+        model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
 
     # Split the dataset
     train_data, eval_data = train_test_split(list(zip(dataset['input_ids'], dataset['attention_mask'], dataset['labels'])), test_size=0.2, random_state=42)
 
+    def create_dataset(data):
+        return Dataset.from_dict({
+            'input_ids': [item[0] for item in data],
+            'attention_mask': [item[1] for item in data],
+            'labels': [item[2] for item in data]
+        })
+
+    train_dataset = create_dataset(train_data)
+    eval_dataset = create_dataset(eval_data)
 
     # Fine-tune model
     training_args = TrainingArguments(
(lines 101-111 unchanged)
         args=training_args,
         train_dataset=train_dataset,
         eval_dataset=eval_dataset,
+        tokenizer=tokenizer
     )
 
     st.subheader("Training the Model")
(lines 119-125 unchanged)
     # Display pretrained model data
     st.subheader("Pretrained Model")
     if st.button('Show Pretrained Model'):
+        model_dir = './finetuned_model'
+
+        # List files in the directory
+        if os.path.exists(model_dir):
+            files = os.listdir(model_dir)
+            st.write("Contents of `./finetuned_model` directory:")
+            st.write(files)
+        else:
+            st.write("Directory `./finetuned_model` does not exist.")
+
+        # Optionally, show the file content of model files (e.g., config.json, pytorch_model.bin)
+        st.subheader("Model File Contents")
+        if st.button('Show Model File Contents'):
+            if os.path.exists(model_dir):
+                files = os.listdir(model_dir)
+                for file in files:
+                    file_path = os.path.join(model_dir, file)
+                    with open(file_path, 'r') as f:
+                        st.write(f"**{file}:**")
+                        st.write(f.read())
+            else:
+                st.write("Directory `./finetuned_model` does not exist.")
 
 if __name__ == "__main__":
     main()
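
Nothing in this diff shows the fine-tuned model being saved; the UI only lists whatever ends up under ./finetuned_model, and reading binary files such as pytorch_model.bin with open(file_path, 'r') will generally raise a decode error, so the file-contents viewer works only for text files like config.json. Assuming the trained model and tokenizer are written to that directory (for example via trainer.save_model('./finetuned_model') and tokenizer.save_pretrained('./finetuned_model')), a minimal sketch of loading the checkpoint back and classifying one invention title into the remapped label ids:

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Assumption: the fine-tuned checkpoint was saved to ./finetuned_model (not shown in this diff).
model_dir = './finetuned_model'
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSequenceClassification.from_pretrained(model_dir)
model.eval()

# Score a single title; the predicted id maps back through label_mapping to a patent_number.
inputs = tokenizer("Example invention title", return_tensors="pt", truncation=True, max_length=512)
with torch.no_grad():
    logits = model(**inputs).logits
print(int(logits.argmax(dim=-1)))

The app itself is launched with: streamlit run finetune3.py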