Upload 3 files
testing improvised code
- finetune1.py +117 -0
- finetune3.py +123 -0
- requirements.txt +70 -0
finetune1.py
ADDED
@@ -0,0 +1,117 @@
import streamlit as st
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.model_selection import train_test_split
import requests
from io import BytesIO

# Load the dataset
@st.cache_data
def load_data():
    url = "https://huggingface.co/datasets/HUPD/hupd/resolve/main/hupd_metadata_2022-02-22.feather"
    response = requests.get(url)
    data = BytesIO(response.content)
    df = pd.read_feather(data)
    return df

# Tokenizer loading
@st.cache_resource
def load_tokenizer(model_name):
    return AutoTokenizer.from_pretrained(model_name)

# Model loading; num_labels must match the number of label classes produced by prepare_data
@st.cache_resource
def load_model(model_name, num_labels):
    return AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

# Tokenize and prepare the dataset
def prepare_data(df, tokenizer):
    # Restrict to applications filed in January 2016
    df['filing_date'] = pd.to_datetime(df['filing_date'])
    jan_2016_df = df[df['filing_date'].dt.to_period('M') == '2016-01']

    texts = jan_2016_df['invention_title'].tolist()

    # Treat `patent_number` as a categorical target and encode it as integer class ids
    labels = jan_2016_df['patent_number'].astype('category').cat.codes.tolist()
    num_labels = len(set(labels))

    # Tokenize texts (padded to the longest title, truncated at 512 tokens)
    tokenized_data = tokenizer(texts, padding=True, truncation=True, max_length=512)

    # Create dataset
    dataset = Dataset.from_dict({
        'input_ids': tokenized_data['input_ids'],
        'attention_mask': tokenized_data['attention_mask'],
        'labels': labels
    })

    return dataset, num_labels

# Define Streamlit app
def main():
    st.title("Patent Classification with Fine-Tuned BERT")

    # Load data
    df = load_data()

    # Show sample data
    st.subheader("Sample Data from January 2016")
    st.write(df.head())

    # Load tokenizer (needed before the model so the label set can be inferred from the data)
    model_name = "bert-base-uncased"
    tokenizer = load_tokenizer(model_name)

    # Prepare data; also returns the number of distinct label classes
    dataset, num_labels = prepare_data(df, tokenizer)

    # Load model with a classification head sized to the label set
    model = load_model(model_name, num_labels)

    # Split the dataset
    train_data, eval_data = train_test_split(
        list(zip(dataset['input_ids'], dataset['attention_mask'], dataset['labels'])),
        test_size=0.2, random_state=42
    )

    def create_dataset(data):
        return Dataset.from_dict({
            'input_ids': [item[0] for item in data],
            'attention_mask': [item[1] for item in data],
            'labels': [item[2] for item in data]
        })

    train_dataset = create_dataset(train_data)
    eval_dataset = create_dataset(eval_data)

    # Fine-tune model
    training_args = TrainingArguments(
        output_dir='./results',
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=3,
        weight_decay=0.01,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer
    )

    st.subheader("Training the Model")
    if st.button('Train Model'):
        with st.spinner('Training in progress...'):
            trainer.train()
            model.save_pretrained("./finetuned_model")
            tokenizer.save_pretrained("./finetuned_model")
            st.success("Model training complete and saved.")

    # Display pretrained model info
    st.subheader("Pretrained Model")
    if st.button('Show Pretrained Model'):
        st.write("Pretrained model is `bert-base-uncased`. Fine-tuned model is saved at './finetuned_model'.")

if __name__ == "__main__":
    main()
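The app writes the fine-tuned weights and tokenizer to ./finetuned_model. A minimal sketch of reloading them for inference afterwards, assuming the app has already been run and that directory exists; the example title string is made up, and the predicted id maps back to the category codes produced by prepare_data:

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("./finetuned_model")
model = AutoModelForSequenceClassification.from_pretrained("./finetuned_model")
model.eval()

# Classify a new invention title
inputs = tokenizer("Method for wireless battery charging", return_tensors="pt", truncation=True, max_length=512)
with torch.no_grad():
    logits = model(**inputs).logits
predicted_class_id = int(logits.argmax(dim=-1))
print(predicted_class_id)  # integer class id from the training-time label encoding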
finetune3.py
ADDED
@@ -0,0 +1,123 @@
import streamlit as st
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.model_selection import train_test_split
import requests
from io import BytesIO

# Load the dataset
@st.cache_data
def load_data():
    url = "https://huggingface.co/datasets/HUPD/hupd/resolve/main/hupd_metadata_2022-02-22.feather"
    response = requests.get(url)
    data = BytesIO(response.content)
    df = pd.read_feather(data)
    return df

# Tokenizer loading
@st.cache_resource
def load_tokenizer(model_name):
    return AutoTokenizer.from_pretrained(model_name)

# Model loading; the classification head is sized to the number of label classes
@st.cache_resource
def load_model(model_name, num_labels):
    return AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

# Tokenize and prepare the dataset
def prepare_data(df, tokenizer):
    df['filing_date'] = pd.to_datetime(df['filing_date'])
    jan_2016_df = df[df['filing_date'].dt.to_period('M') == '2016-01']

    texts = jan_2016_df['invention_title'].tolist()
    # Encode `patent_number` as integer class ids so it can serve as a classification target
    labels = jan_2016_df['patent_number'].astype('category').cat.codes.tolist()
    num_labels = len(set(labels))

    tokenized_data = tokenizer(texts, padding="max_length", truncation=True, max_length=512)

    dataset = Dataset.from_dict({
        'input_ids': tokenized_data['input_ids'],
        'attention_mask': tokenized_data['attention_mask'],
        'labels': labels
    })

    return dataset, num_labels

# Trainer subclass with a custom loss (explicit cross-entropy over the logits)
class CustomLossTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = torch.nn.CrossEntropyLoss()
        loss = loss_fct(logits.view(-1, model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

# Define Streamlit app
def main():
    st.title("Patent Classification with Fine-Tuned BERT")

    # Load data
    df = load_data()

    # Show sample data
    st.subheader("Sample Data from January 2016")
    st.write(df.head())

    # Load tokenizer
    model_name = "bert-base-uncased"
    tokenizer = load_tokenizer(model_name)

    # Prepare data; also returns the number of distinct label classes
    dataset, num_labels = prepare_data(df, tokenizer)

    # Load model
    model = load_model(model_name, num_labels)

    # Split the dataset
    train_data, eval_data = train_test_split(
        list(zip(dataset['input_ids'], dataset['attention_mask'], dataset['labels'])),
        test_size=0.2, random_state=42
    )

    train_dataset = Dataset.from_dict({
        'input_ids': [item[0] for item in train_data],
        'attention_mask': [item[1] for item in train_data],
        'labels': [item[2] for item in train_data]
    })

    eval_dataset = Dataset.from_dict({
        'input_ids': [item[0] for item in eval_data],
        'attention_mask': [item[1] for item in eval_data],
        'labels': [item[2] for item in eval_data]
    })

    # Fine-tune model
    training_args = TrainingArguments(
        output_dir='./results',
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=3,
        weight_decay=0.01,
    )

    # Use the Trainer subclass so the custom loss is applied
    trainer = CustomLossTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer
    )

    st.subheader("Training the Model")
    if st.button('Train Model'):
        with st.spinner('Training in progress...'):
            trainer.train()
            model.save_pretrained("./finetuned_model")
            tokenizer.save_pretrained("./finetuned_model")
            st.success("Model training complete and saved.")

    # Display pretrained model info
    st.subheader("Pretrained Model")
    if st.button('Show Pretrained Model'):
        st.write("Pretrained model is `bert-base-uncased`. Fine-tuned model is saved at './finetuned_model'.")

if __name__ == "__main__":
    main()
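finetune3.py differs from finetune1.py mainly in the Trainer subclass overriding compute_loss. As written, that override reproduces the cross-entropy the model would compute anyway, so it mostly serves as a hook. A sketch of how the same hook could be extended, for example with class weights for an imbalanced label set; the class_weights tensor is a placeholder, not derived from the HUPD data:

import torch
from transformers import Trainer

class WeightedLossTrainer(Trainer):
    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights  # 1-D tensor, one weight per class (assumed supplied by the caller)

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        # Move the weights to the same device as the logits before building the loss
        weight = self.class_weights.to(logits.device) if self.class_weights is not None else None
        loss_fct = torch.nn.CrossEntropyLoss(weight=weight)
        loss = loss_fct(logits.view(-1, model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

Constructing it with class_weights=torch.tensor([...]) would then upweight rare classes while keeping the rest of the training loop unchanged.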
requirements.txt
ADDED
@@ -0,0 +1,70 @@
accelerate==0.33.0
aiohttp==3.9.5
aiosignal==1.3.1
altair==5.3.0
attrs==23.2.0
blinker==1.8.2
cachetools==5.4.0
certifi==2024.7.4
charset-normalizer==3.3.2
click==8.1.7
datasets==2.20.0
dill==0.3.8
filelock==3.15.4
frozenlist==1.4.1
fsspec==2024.5.0
gitdb==4.0.11
GitPython==3.1.43
huggingface-hub==0.24.2
idna==3.7
Jinja2==3.1.4
joblib==1.4.2
jsonschema==4.23.0
jsonschema-specifications==2023.12.1
markdown-it-py==3.0.0
MarkupSafe==2.1.5
mdurl==0.1.2
mpmath==1.3.0
multidict==6.0.5
multiprocess==0.70.16
networkx==3.3
numpy==1.26.4
packaging==24.1
pandas==2.2.2
pillow==10.4.0
protobuf==5.27.2
psutil==6.0.0
pyarrow==17.0.0
pyarrow-hotfix==0.6
pydeck==0.9.1
Pygments==2.18.0
python-dateutil==2.9.0.post0
pytz==2024.1
PyYAML==6.0.1
referencing==0.35.1
regex==2024.7.24
requests==2.32.3
rich==13.7.1
rpds-py==0.19.1
safetensors==0.4.3
scikit-learn==1.5.1
scipy==1.14.0
setuptools==71.1.0
six==1.16.0
smmap==5.0.1
streamlit==1.37.0
sympy==1.13.1
tenacity==8.5.0
threadpoolctl==3.5.0
tokenizers==0.19.1
toml==0.10.2
toolz==0.12.1
torch==2.4.0
tornado==6.4.1
tqdm==4.66.4
transformers==4.43.2
typing_extensions==4.12.2
tzdata==2024.1
urllib3==2.2.2
xxhash==3.4.1
yarl==1.9.4
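The pins above target transformers 4.43 / datasets 2.20 / torch 2.4 / streamlit 1.37. A quick check, run from Python after installing, that the environment matches those pins (a small sketch, not part of the upload):

import transformers, datasets, torch, streamlit
print(transformers.__version__, datasets.__version__, torch.__version__, streamlit.__version__)
# Expected per requirements.txt: 4.43.2 2.20.0 2.4.0 1.37.0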