talktorhutika committed on
Commit
ee5f145
·
verified ·
1 Parent(s): 96cdb5a

Update finetune1.py

Browse files

Restrict the dataset to 5 unique patent-number labels only

Files changed (1) hide show
  1. finetune1.py +21 -12
finetune1.py CHANGED
@@ -5,6 +5,7 @@ from datasets import Dataset
5
  from sklearn.model_selection import train_test_split
6
  import requests
7
  from io import BytesIO
 
8
 
9
  # Load the dataset
10
  @st.cache_data
@@ -17,9 +18,9 @@ def load_data():
17
 
18
  # Tokenizer and model loading
19
  @st.cache_resource
20
- def load_tokenizer_and_model(model_name):
21
  tokenizer = AutoTokenizer.from_pretrained(model_name)
22
- model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2) # Adjust num_labels as needed
23
  return tokenizer, model
24
 
25
  # Tokenize and prepare the dataset
@@ -27,10 +28,17 @@ def prepare_data(df, tokenizer):
27
  df['filing_date'] = pd.to_datetime(df['filing_date'])
28
  jan_2016_df = df[df['filing_date'].dt.to_period('M') == '2016-01']
29
 
30
- texts = jan_2016_df['invention_title'].tolist()
 
 
31
 
32
- # Assuming `patent_number` is categorical or needs to be converted to labels
33
- labels = jan_2016_df['patent_number'].astype('category').cat.codes.tolist()
 
 
 
 
 
34
 
35
  # Define tokenization function
36
  def tokenize_function(texts):
@@ -48,7 +56,7 @@ def prepare_data(df, tokenizer):
48
 
49
  dataset = Dataset.from_dict(dataset_dict)
50
 
51
- return dataset
52
 
53
  # Define Streamlit app
54
  def main():
@@ -61,12 +69,13 @@ def main():
61
  st.subheader("Sample Data from January 2016")
62
  st.write(df.head())
63
 
64
- # Load tokenizer and model
65
- model_name = "bert-base-uncased"
66
- tokenizer, model = load_tokenizer_and_model(model_name)
67
-
68
  # Prepare data
69
- dataset = prepare_data(df, tokenizer)
 
 
 
 
 
70
 
71
  # Split the dataset
72
  train_data, eval_data = train_test_split(list(zip(dataset['input_ids'], dataset['attention_mask'], dataset['labels'])), test_size=0.2, random_state=42)
@@ -114,4 +123,4 @@ def main():
114
  st.write("Pretrained model is `bert-base-uncased`. Fine-tuned model is saved at './finetuned_model'.")
115
 
116
  if __name__ == "__main__":
117
- main()
 
5
  from sklearn.model_selection import train_test_split
6
  import requests
7
  from io import BytesIO
8
+ import numpy as np
9
 
10
  # Load the dataset
11
  @st.cache_data
 
18
 
19
  # Tokenizer and model loading
20
  @st.cache_resource
21
+ def load_tokenizer_and_model(model_name, num_labels):
22
  tokenizer = AutoTokenizer.from_pretrained(model_name)
23
+ model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
24
  return tokenizer, model
25
 
26
  # Tokenize and prepare the dataset
 
28
  df['filing_date'] = pd.to_datetime(df['filing_date'])
29
  jan_2016_df = df[df['filing_date'].dt.to_period('M') == '2016-01']
30
 
31
+ # Get only 5 unique labels
32
+ unique_labels = jan_2016_df['patent_number'].astype('category').cat.categories[:5]
33
+ jan_2016_df = jan_2016_df[jan_2016_df['patent_number'].isin(unique_labels)]
34
 
35
+ # Re-map labels to integers starting from 0
36
+ label_mapping = {label: idx for idx, label in enumerate(unique_labels)}
37
+ jan_2016_df['label'] = jan_2016_df['patent_number'].map(label_mapping)
38
+
39
+ texts = jan_2016_df['invention_title'].tolist()
40
+ labels = jan_2016_df['label'].tolist()
41
+ num_labels = len(unique_labels)
42
 
43
  # Define tokenization function
44
  def tokenize_function(texts):
 
56
 
57
  dataset = Dataset.from_dict(dataset_dict)
58
 
59
+ return dataset, num_labels
60
 
61
  # Define Streamlit app
62
  def main():
 
69
  st.subheader("Sample Data from January 2016")
70
  st.write(df.head())
71
 
 
 
 
 
72
  # Prepare data
73
+ tokenizer, model = None, None
74
+ dataset, num_labels = prepare_data(df, tokenizer)
75
+
76
+ # Load tokenizer and model with the correct number of labels
77
+ model_name = "bert-base-uncased"
78
+ tokenizer, model = load_tokenizer_and_model(model_name, num_labels)
79
 
80
  # Split the dataset
81
  train_data, eval_data = train_test_split(list(zip(dataset['input_ids'], dataset['attention_mask'], dataset['labels'])), test_size=0.2, random_state=42)
 
123
  st.write("Pretrained model is `bert-base-uncased`. Fine-tuned model is saved at './finetuned_model'.")
124
 
125
  if __name__ == "__main__":
126
+ main()