talktorhutika committed (verified)
Commit 96cdb5a · 1 Parent(s): f2145a6

Upload 3 files


testing improvised code

Files changed (3)
  1. finetune1.py +117 -0
  2. finetune3.py +123 -0
  3. requirements.txt +70 -0
finetune1.py ADDED
@@ -0,0 +1,117 @@
+ import streamlit as st
+ import pandas as pd
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
+ from datasets import Dataset
+ from sklearn.model_selection import train_test_split
+ import requests
+ from io import BytesIO
+
+ # Load the dataset
+ @st.cache_data
+ def load_data():
+     url = "https://huggingface.co/datasets/HUPD/hupd/resolve/main/hupd_metadata_2022-02-22.feather"
+     response = requests.get(url)
+     data = BytesIO(response.content)
+     df = pd.read_feather(data)
+     return df
+
+ # Tokenizer and model loading
+ @st.cache_resource
+ def load_tokenizer_and_model(model_name, num_labels):
+     tokenizer = AutoTokenizer.from_pretrained(model_name)
+     model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
+     return tokenizer, model
+
+ # Tokenize and prepare the dataset
+ def prepare_data(df, tokenizer):
+     df['filing_date'] = pd.to_datetime(df['filing_date'])
+     jan_2016_df = df[df['filing_date'].dt.to_period('M') == '2016-01']
+
+     texts = jan_2016_df['invention_title'].tolist()
+
+     # `patent_number` is treated as a categorical target and converted to integer label codes
+     labels = jan_2016_df['patent_number'].astype('category').cat.codes.tolist()
+
+     # Define tokenization function
+     def tokenize_function(texts):
+         return tokenizer(texts, padding=True, truncation=True, return_tensors="pt", max_length=512)
+
+     # Tokenize texts
+     tokenized_data = tokenize_function(texts)
+
+     # Create dataset
+     dataset_dict = {
+         'input_ids': [x.tolist() for x in tokenized_data['input_ids']],
+         'attention_mask': [x.tolist() for x in tokenized_data['attention_mask']],
+         'labels': labels
+     }
+
+     dataset = Dataset.from_dict(dataset_dict)
+
+     return dataset
+
+ # Define Streamlit app
+ def main():
+     st.title("Patent Classification with Fine-Tuned BERT")
+
+     # Load data
+     df = load_data()
+
+     # Show sample data
+     st.subheader("Sample Data from January 2016")
+     st.write(df.head())
+
+     # Load tokenizer and model; num_labels must match the label codes built in prepare_data
+     model_name = "bert-base-uncased"
+     num_labels = df.loc[pd.to_datetime(df['filing_date']).dt.to_period('M') == '2016-01', 'patent_number'].nunique()
+     tokenizer, model = load_tokenizer_and_model(model_name, num_labels)
+
+     # Prepare data
+     dataset = prepare_data(df, tokenizer)
+
+     # Split the dataset
+     train_data, eval_data = train_test_split(list(zip(dataset['input_ids'], dataset['attention_mask'], dataset['labels'])), test_size=0.2, random_state=42)
+
+     def create_dataset(data):
+         return Dataset.from_dict({
+             'input_ids': [item[0] for item in data],
+             'attention_mask': [item[1] for item in data],
+             'labels': [item[2] for item in data]
+         })
+
+     train_dataset = create_dataset(train_data)
+     eval_dataset = create_dataset(eval_data)
+
+     # Fine-tune model
+     training_args = TrainingArguments(
+         output_dir='./results',
+         evaluation_strategy="epoch",
+         learning_rate=2e-5,
+         per_device_train_batch_size=8,
+         per_device_eval_batch_size=8,
+         num_train_epochs=3,
+         weight_decay=0.01,
+     )
+
+     trainer = Trainer(
+         model=model,
+         args=training_args,
+         train_dataset=train_dataset,
+         eval_dataset=eval_dataset,
+         tokenizer=tokenizer
+     )
+
+     st.subheader("Training the Model")
+     if st.button('Train Model'):
+         with st.spinner('Training in progress...'):
+             trainer.train()
+             model.save_pretrained("./finetuned_model")
+             tokenizer.save_pretrained("./finetuned_model")
+         st.success("Model training complete and saved.")
+
+     # Display pretrained model data
+     st.subheader("Pretrained Model")
+     if st.button('Show Pretrained Model'):
+         st.write("Pretrained model is `bert-base-uncased`. Fine-tuned model is saved at './finetuned_model'.")
+
+ if __name__ == "__main__":
+     main()
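
For a quick sanity check after training in the app above, the checkpoint saved at ./finetuned_model can be loaded back for a single prediction. This is a minimal sketch, not part of the commit; the invention title is a made-up example, and it assumes the "Train Model" button has already been run so the directory exists:

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Assumes ./finetuned_model was written by the Streamlit app in finetune1.py
tokenizer = AutoTokenizer.from_pretrained("./finetuned_model")
model = AutoModelForSequenceClassification.from_pretrained("./finetuned_model")
model.eval()

# Hypothetical invention title; the predicted id corresponds to a label code built in prepare_data
inputs = tokenizer("Method and apparatus for wireless power transfer",
                   return_tensors="pt", truncation=True, max_length=512)
with torch.no_grad():
    logits = model(**inputs).logits
print("predicted label code:", int(logits.argmax(dim=-1)))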
finetune3.py ADDED
@@ -0,0 +1,123 @@
+ import streamlit as st
+ import pandas as pd
+ import torch
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
+ from datasets import Dataset, DatasetDict
+ from sklearn.model_selection import train_test_split
+ import requests
+ from io import BytesIO
+
+ # Load the dataset
+ @st.cache_data
+ def load_data():
+     url = "https://huggingface.co/datasets/HUPD/hupd/resolve/main/hupd_metadata_2022-02-22.feather"
+     response = requests.get(url)
+     data = BytesIO(response.content)
+     df = pd.read_feather(data)
+     return df
+
+ # Tokenizer and model loading
+ @st.cache_resource
+ def load_tokenizer_and_model(model_name, num_labels):
+     tokenizer = AutoTokenizer.from_pretrained(model_name)
+     model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
+     return tokenizer, model
+
+ # Tokenize and prepare the dataset
+ def prepare_data(df, tokenizer):
+     df['filing_date'] = pd.to_datetime(df['filing_date'])
+     jan_2016_df = df[df['filing_date'].dt.to_period('M') == '2016-01']
+
+     texts = jan_2016_df['invention_title'].tolist()
+     # Convert patent numbers to integer label codes so they work with CrossEntropyLoss
+     labels = jan_2016_df['patent_number'].astype('category').cat.codes.tolist()
+
+     def tokenize_function(texts):
+         return tokenizer(texts, padding="max_length", truncation=True, return_tensors="pt", max_length=512)
+
+     tokenized_data = tokenize_function(texts)
+
+     dataset_dict = {
+         'input_ids': [x.tolist() for x in tokenized_data['input_ids']],
+         'attention_mask': [x.tolist() for x in tokenized_data['attention_mask']],
+         'labels': labels
+     }
+
+     dataset = Dataset.from_dict(dataset_dict)
+
+     return dataset
+
+ # Custom loss: the stock Trainer has no compute_loss argument, so override it in a subclass
+ class CustomLossTrainer(Trainer):
+     def compute_loss(self, model, inputs, return_outputs=False):
+         labels = inputs.get("labels")
+         outputs = model(**inputs)
+         logits = outputs.logits
+         loss_fct = torch.nn.CrossEntropyLoss()
+         loss = loss_fct(logits, labels)
+         return (loss, outputs) if return_outputs else loss
+
+ # Define Streamlit app
+ def main():
+     st.title("Patent Classification with Fine-Tuned BERT")
+
+     # Load data
+     df = load_data()
+
+     # Show sample data
+     st.subheader("Sample Data from January 2016")
+     st.write(df.head())
+
+     # Load tokenizer and model; num_labels must match the label codes built in prepare_data
+     model_name = "bert-base-uncased"
+     num_labels = df.loc[pd.to_datetime(df['filing_date']).dt.to_period('M') == '2016-01', 'patent_number'].nunique()
+     tokenizer, model = load_tokenizer_and_model(model_name, num_labels)
+
+     # Prepare data
+     dataset = prepare_data(df, tokenizer)
+
+     # Split the dataset
+     train_data, eval_data = train_test_split(list(zip(dataset['input_ids'], dataset['attention_mask'], dataset['labels'])), test_size=0.2, random_state=42)
+
+     train_dataset = Dataset.from_dict({
+         'input_ids': [item[0] for item in train_data],
+         'attention_mask': [item[1] for item in train_data],
+         'labels': [item[2] for item in train_data]
+     })
+
+     eval_dataset = Dataset.from_dict({
+         'input_ids': [item[0] for item in eval_data],
+         'attention_mask': [item[1] for item in eval_data],
+         'labels': [item[2] for item in eval_data]
+     })
+
+     # Fine-tune model
+     training_args = TrainingArguments(
+         output_dir='./results',
+         evaluation_strategy="epoch",
+         learning_rate=2e-5,
+         per_device_train_batch_size=8,
+         per_device_eval_batch_size=8,
+         num_train_epochs=3,
+         weight_decay=0.01,
+     )
+
+     # The custom cross-entropy loss comes from the CustomLossTrainer override above
+     trainer = CustomLossTrainer(
+         model=model,
+         args=training_args,
+         train_dataset=train_dataset,
+         eval_dataset=eval_dataset
+     )
+
+     st.subheader("Training the Model")
+     if st.button('Train Model'):
+         with st.spinner('Training in progress...'):
+             trainer.train()
+             model.save_pretrained("./finetuned_model")
+             tokenizer.save_pretrained("./finetuned_model")
+         st.success("Model training complete and saved.")
+
+     # Display pretrained model data
+     st.subheader("Pretrained Model")
+     if st.button('Show Pretrained Model'):
+         st.write("Pretrained model is `bert-base-uncased`. Fine-tuned model is saved at './finetuned_model'.")
+
+ if __name__ == "__main__":
+     main()
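
The custom loss in finetune3.py is wired in by subclassing Trainer, since the Trainer constructor itself does not accept a compute_loss argument. Below is a minimal, self-contained sketch of that pattern on a dummy batch; the model name, titles, and label count are placeholders for illustration, not values taken from the commit:

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer

class CustomLossTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        # Explicit cross-entropy over the classification logits
        labels = inputs.get("labels")
        outputs = model(**inputs)
        loss = torch.nn.CrossEntropyLoss()(outputs.logits, labels)
        return (loss, outputs) if return_outputs else loss

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)

# With no TrainingArguments passed, Trainer falls back to output_dir="tmp_trainer"
trainer = CustomLossTrainer(model=model)

# Dummy batch with made-up titles and label ids, moved to wherever Trainer placed the model
batch = tokenizer(["solar panel mounting bracket", "drug delivery device"],
                  padding=True, return_tensors="pt")
batch = {k: v.to(model.device) for k, v in batch.items()}
batch["labels"] = torch.tensor([0, 2], device=model.device)

print("loss on dummy batch:", float(trainer.compute_loss(model, batch)))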
requirements.txt ADDED
@@ -0,0 +1,70 @@
+ accelerate==0.33.0
+ aiohttp==3.9.5
+ aiosignal==1.3.1
+ altair==5.3.0
+ attrs==23.2.0
+ blinker==1.8.2
+ cachetools==5.4.0
+ certifi==2024.7.4
+ charset-normalizer==3.3.2
+ click==8.1.7
+ datasets==2.20.0
+ dill==0.3.8
+ filelock==3.15.4
+ frozenlist==1.4.1
+ fsspec==2024.5.0
+ gitdb==4.0.11
+ GitPython==3.1.43
+ huggingface-hub==0.24.2
+ idna==3.7
+ Jinja2==3.1.4
+ joblib==1.4.2
+ jsonschema==4.23.0
+ jsonschema-specifications==2023.12.1
+ markdown-it-py==3.0.0
+ MarkupSafe==2.1.5
+ mdurl==0.1.2
+ mpmath==1.3.0
+ multidict==6.0.5
+ multiprocess==0.70.16
+ networkx==3.3
+ numpy==1.26.4
+ packaging==24.1
+ pandas==2.2.2
+ pillow==10.4.0
+ protobuf==5.27.2
+ psutil==6.0.0
+ pyarrow==17.0.0
+ pyarrow-hotfix==0.6
+ pydeck==0.9.1
+ Pygments==2.18.0
+ python-dateutil==2.9.0.post0
+ pytz==2024.1
+ PyYAML==6.0.1
+ referencing==0.35.1
+ regex==2024.7.24
+ requests==2.32.3
+ rich==13.7.1
+ rpds-py==0.19.1
+ safetensors==0.4.3
+ scikit-learn==1.5.1
+ scipy==1.14.0
+ setuptools==71.1.0
+ six==1.16.0
+ smmap==5.0.1
+ streamlit==1.37.0
+ sympy==1.13.1
+ tenacity==8.5.0
+ threadpoolctl==3.5.0
+ tokenizers==0.19.1
+ toml==0.10.2
+ toolz==0.12.1
+ torch==2.4.0
+ tornado==6.4.1
+ tqdm==4.66.4
+ transformers==4.43.2
+ typing_extensions==4.12.2
+ tzdata==2024.1
+ urllib3==2.2.2
+ xxhash==3.4.1
+ yarl==1.9.4