Update finetune1.py
taking 5 entries only
finetune1.py  CHANGED  (+21 -12)
@@ -5,6 +5,7 @@ from datasets import Dataset
 from sklearn.model_selection import train_test_split
 import requests
 from io import BytesIO
+import numpy as np
 
 # Load the dataset
 @st.cache_data
@@ -17,9 +18,9 @@ def load_data():
 
 # Tokenizer and model loading
 @st.cache_resource
-def load_tokenizer_and_model(model_name):
+def load_tokenizer_and_model(model_name, num_labels):
     tokenizer = AutoTokenizer.from_pretrained(model_name)
-    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=
+    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
     return tokenizer, model
 
 # Tokenize and prepare the dataset
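The new `num_labels` parameter feeds straight into the size of the classification head that `AutoModelForSequenceClassification` stacks on the pretrained encoder; the head itself is freshly initialized, only the encoder weights come from the checkpoint. A minimal standalone sketch (five labels assumed, matching the commit message):

    from transformers import AutoModelForSequenceClassification

    # Only the encoder is pretrained; the classifier layer is randomly
    # initialized with one output logit per label.
    model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=5)
    print(model.config.num_labels)        # 5
    print(model.classifier.out_features)  # 5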
@@ -27,10 +28,17 @@ def prepare_data(df, tokenizer):
     df['filing_date'] = pd.to_datetime(df['filing_date'])
     jan_2016_df = df[df['filing_date'].dt.to_period('M') == '2016-01']
 
-
+    # Get only 5 unique labels
+    unique_labels = jan_2016_df['patent_number'].astype('category').cat.categories[:5]
+    jan_2016_df = jan_2016_df[jan_2016_df['patent_number'].isin(unique_labels)]
 
-    #
-
+    # Re-map labels to integers starting from 0
+    label_mapping = {label: idx for idx, label in enumerate(unique_labels)}
+    jan_2016_df['label'] = jan_2016_df['patent_number'].map(label_mapping)
+
+    texts = jan_2016_df['invention_title'].tolist()
+    labels = jan_2016_df['label'].tolist()
+    num_labels = len(unique_labels)
 
     # Define tokenization function
     def tokenize_function(texts):
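The label construction above can be checked in isolation. Note that `.cat.categories` is sorted, so "the first 5" means the five lexicographically smallest patent numbers, not the five most frequent. A toy sketch (column name as in the diff, data invented):

    import pandas as pd

    df = pd.DataFrame({"patent_number": ["p7", "p1", "p1", "p3", "p9", "p2", "p5", "p4"]})

    # categories are the sorted unique values; keep the first five
    unique_labels = df["patent_number"].astype("category").cat.categories[:5]
    df = df[df["patent_number"].isin(unique_labels)]

    # contiguous ids 0..4, the layout a num_labels-sized head expects
    label_mapping = {label: idx for idx, label in enumerate(unique_labels)}
    df["label"] = df["patent_number"].map(label_mapping)
    print(sorted(set(df["label"].tolist())))  # [0, 1, 2, 3, 4]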
@@ -48,7 +56,7 @@ def prepare_data(df, tokenizer):
 
     dataset = Dataset.from_dict(dataset_dict)
 
-    return dataset
+    return dataset, num_labels
 
 # Define Streamlit app
 def main():
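Because `prepare_data` now returns a pair, a call site left in the old shape would silently bind the whole tuple rather than the `Dataset`; the hunk below updates the single call in `main()` accordingly:

    # old shape: dataset would now be a (Dataset, int) tuple
    # dataset = prepare_data(df, tokenizer)
    dataset, num_labels = prepare_data(df, tokenizer)  # unpack both values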
@@ -61,12 +69,13 @@ def main():
     st.subheader("Sample Data from January 2016")
     st.write(df.head())
 
-    # Load tokenizer and model
-    model_name = "bert-base-uncased"
-    tokenizer, model = load_tokenizer_and_model(model_name)
-
     # Prepare data
-    dataset = prepare_data(df, tokenizer)
+    tokenizer, model = None, None
+    dataset, num_labels = prepare_data(df, tokenizer)
+
+    # Load tokenizer and model with the correct number of labels
+    model_name = "bert-base-uncased"
+    tokenizer, model = load_tokenizer_and_model(model_name, num_labels)
 
     # Split the dataset
     train_data, eval_data = train_test_split(list(zip(dataset['input_ids'], dataset['attention_mask'], dataset['labels'])), test_size=0.2, random_state=42)
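One subtlety in this hunk: `prepare_data` is called with `tokenizer=None` so that `num_labels` exists before the model loads, yet the surrounding hunks show `prepare_data` also producing `input_ids`/`attention_mask`, which requires a live tokenizer; as written, the `None` would fail once tokenization runs. One way to break that cycle, assuming label selection can be split out of `prepare_data` (the `select_labels` helper below is hypothetical, not part of this commit):

    import pandas as pd

    # Hypothetical helper: derive the label set without touching the tokenizer.
    def select_labels(df):
        jan = df[pd.to_datetime(df['filing_date']).dt.to_period('M') == '2016-01'].copy()
        unique_labels = jan['patent_number'].astype('category').cat.categories[:5]
        jan = jan[jan['patent_number'].isin(unique_labels)]
        jan['label'] = jan['patent_number'].map({l: i for i, l in enumerate(unique_labels)})
        return jan, len(unique_labels)

    # Labels first, then tokenizer/model, then tokenization:
    jan_df, num_labels = select_labels(df)
    tokenizer, model = load_tokenizer_and_model("bert-base-uncased", num_labels)
    dataset = prepare_data(jan_df, tokenizer)  # prepare_data would then only tokenize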
@@ -114,4 +123,4 @@ def main():
     st.write("Pretrained model is `bert-base-uncased`. Fine-tuned model is saved at './finetuned_model'.")
 
 if __name__ == "__main__":
-    main()
+    main()
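If this file is the complete app, the change is exercised with `streamlit run finetune1.py`. Note that `@st.cache_resource` keys its cache on the function arguments, so adding `num_labels` to `load_tokenizer_and_model` also means a model with a differently sized head is rebuilt whenever the label count changes.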