talktorhutika committed
Commit e19d553 · verified · 1 Parent(s): 7ea55f9

Update finetune3.py

Files changed (1)
  1. finetune3.py +66 -36
finetune3.py CHANGED
@@ -1,7 +1,8 @@
+import os
 import streamlit as st
 import pandas as pd
 from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
-from datasets import Dataset, DatasetDict
+from datasets import Dataset
 from sklearn.model_selection import train_test_split
 import requests
 from io import BytesIO
@@ -16,10 +17,9 @@ def load_data():
     return df
 
 # Tokenizer and model loading
-@st.cache_resource
-def load_tokenizer_and_model(model_name):
+def load_tokenizer_and_model(model_name, num_labels):
     tokenizer = AutoTokenizer.from_pretrained(model_name)
-    model = AutoModelForSequenceClassification.from_pretrained(model_name)
+    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
     return tokenizer, model
 
 # Tokenize and prepare the dataset
@@ -27,14 +27,26 @@ def prepare_data(df, tokenizer):
     df['filing_date'] = pd.to_datetime(df['filing_date'])
     jan_2016_df = df[df['filing_date'].dt.to_period('M') == '2016-01']
 
-    texts = jan_2016_df['invention_title'].tolist()
-    labels = jan_2016_df['patent_number'].tolist()
+    # Get only 5 unique labels
+    unique_labels = jan_2016_df['patent_number'].astype('category').cat.categories[:5]
+    jan_2016_df = jan_2016_df[jan_2016_df['patent_number'].isin(unique_labels)]
 
+    # Re-map labels to integers starting from 0
+    label_mapping = {label: idx for idx, label in enumerate(unique_labels)}
+    jan_2016_df['label'] = jan_2016_df['patent_number'].map(label_mapping)
+
+    texts = jan_2016_df['invention_title'].tolist()
+    labels = jan_2016_df['label'].tolist()
+    num_labels = len(unique_labels)
+
+    # Define tokenization function
     def tokenize_function(texts):
-        return tokenizer(texts, padding="max_length", truncation=True, return_tensors="pt", max_length=512)
-
+        return tokenizer(texts, padding=True, truncation=True, return_tensors="pt", max_length=512)
+
+    # Tokenize texts
     tokenized_data = tokenize_function(texts)
-
+
+    # Create dataset
     dataset_dict = {
         'input_ids': [x.tolist() for x in tokenized_data['input_ids']],
         'attention_mask': [x.tolist() for x in tokenized_data['attention_mask']],
@@ -43,16 +55,7 @@ def prepare_data(df, tokenizer):
 
     dataset = Dataset.from_dict(dataset_dict)
 
-    return dataset
-
-# Define a custom compute_loss function
-def compute_loss(model, inputs):
-    labels = inputs.get("labels")
-    outputs = model(**inputs)
-    logits = outputs.logits
-    loss_fct = torch.nn.CrossEntropyLoss()
-    loss = loss_fct(logits, labels)
-    return loss
+    return dataset, num_labels
 
 # Define Streamlit app
 def main():
@@ -65,27 +68,33 @@ def main():
     st.subheader("Sample Data from January 2016")
     st.write(df.head())
 
-    # Load tokenizer and model
+    # Prepare data
+    # First, select a model name
     model_name = "bert-base-uncased"
-    tokenizer, model = load_tokenizer_and_model(model_name)
 
-    # Prepare data
-    dataset = prepare_data(df, tokenizer)
+    # Initialize the tokenizer and model with a dummy number of labels for now
+    dummy_num_labels = 5
+    tokenizer, model = load_tokenizer_and_model(model_name, dummy_num_labels)
+
+    # Prepare the data
+    dataset, num_labels = prepare_data(df, tokenizer)
+
+    # Update the model with the correct number of labels based on the data
+    if num_labels != dummy_num_labels:
+        model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
 
     # Split the dataset
     train_data, eval_data = train_test_split(list(zip(dataset['input_ids'], dataset['attention_mask'], dataset['labels'])), test_size=0.2, random_state=42)
 
-    train_dataset = Dataset.from_dict({
-        'input_ids': [item[0] for item in train_data],
-        'attention_mask': [item[1] for item in train_data],
-        'labels': [item[2] for item in train_data]
-    })
-
-    eval_dataset = Dataset.from_dict({
-        'input_ids': [item[0] for item in eval_data],
-        'attention_mask': [item[1] for item in eval_data],
-        'labels': [item[2] for item in eval_data]
-    })
+    def create_dataset(data):
+        return Dataset.from_dict({
+            'input_ids': [item[0] for item in data],
+            'attention_mask': [item[1] for item in data],
+            'labels': [item[2] for item in data]
+        })
+
+    train_dataset = create_dataset(train_data)
+    eval_dataset = create_dataset(eval_data)
 
     # Fine-tune model
     training_args = TrainingArguments(
@@ -103,7 +112,7 @@ def main():
         args=training_args,
         train_dataset=train_dataset,
        eval_dataset=eval_dataset,
-        compute_loss=compute_loss  # Use the custom loss function
+        tokenizer=tokenizer
     )
 
     st.subheader("Training the Model")
@@ -117,7 +126,28 @@ def main():
    # Display pretrained model data
     st.subheader("Pretrained Model")
     if st.button('Show Pretrained Model'):
-        st.write("Pretrained model is `bert-base-uncased`. Fine-tuned model is saved at './finetuned_model'.")
+        model_dir = './finetuned_model'
+
+        # List files in the directory
+        if os.path.exists(model_dir):
+            files = os.listdir(model_dir)
+            st.write("Contents of `./finetuned_model` directory:")
+            st.write(files)
+        else:
+            st.write("Directory `./finetuned_model` does not exist.")
+
+    # Optionally, show the file content of model files (e.g., config.json, pytorch_model.bin)
+    st.subheader("Model File Contents")
+    if st.button('Show Model File Contents'):
+        if os.path.exists(model_dir):
+            files = os.listdir(model_dir)
+            for file in files:
+                file_path = os.path.join(model_dir, file)
+                with open(file_path, 'r') as f:
+                    st.write(f"**{file}:**")
+                    st.write(f.read())
+        else:
+            st.write("Directory `./finetuned_model` does not exist.")
 
 if __name__ == "__main__":
     main()
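
Note: the label handling is the core change in this commit. Below is a minimal standalone sketch of that step on a toy frame; the column names follow the script, but the rows and print statements are invented purely for illustration.

    import pandas as pd

    # Toy stand-in for the January-2016 slice of the patent data.
    df = pd.DataFrame({
        "invention_title": ["Widget A", "Widget B", "Widget C", "Widget D", "Widget E", "Widget F"],
        "patent_number":   ["P100", "P200", "P100", "P300", "P200", "P400"],
    })

    # Keep only the first 5 distinct patent numbers, mirroring prepare_data().
    unique_labels = df["patent_number"].astype("category").cat.categories[:5]
    df = df[df["patent_number"].isin(unique_labels)].copy()

    # Re-map the surviving patent numbers to integer class ids 0..num_labels-1.
    label_mapping = {label: idx for idx, label in enumerate(unique_labels)}
    df["label"] = df["patent_number"].map(label_mapping)
    num_labels = len(unique_labels)

    print(num_labels)                      # 4 distinct labels in this toy frame
    print(df[["patent_number", "label"]])  # P100 -> 0, P200 -> 1, P300 -> 2, P400 -> 3

With integer labels in the 0..num_labels-1 range and num_labels passed to AutoModelForSequenceClassification.from_pretrained, the model computes the cross-entropy loss itself whenever a batch contains labels, which is why the custom compute_loss function and the unsupported compute_loss= constructor argument to Trainer could be removed in this commit.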