#!pip install -q transformers datasets torch gradio console_logging numpy scikit-learn
import numpy as np
import torch
import gradio as gr
from datasets import load_dataset
from console_logging.console import Console
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    EvalPrediction,
)

console = Console()
dataset = load_dataset("zeroshot/twitter-financial-news-sentiment")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
labels = ["Bearish", "Bullish", "Neutral"]
def preprocess_data(examples):
    # take a batch of texts
    text = examples["text"]
    # encode them
    encoding = tokenizer(text, padding="max_length", truncation=True, max_length=128)
    # build one-hot label vectors: the dataset provides one integer label per
    # example, but the multi-label head expects one column per class
    labels_batch = {"Bearish": [], "Bullish": [], "Neutral": []}
    for label_id in examples["label"]:
        labels_batch["Bearish"].append(label_id == 0)
        labels_batch["Bullish"].append(label_id == 1)
        labels_batch["Neutral"].append(label_id not in (0, 1))
    # create numpy array of shape (batch_size, num_labels)
    labels_matrix = np.zeros((len(text), len(labels)))
    # fill numpy array
    for idx, label in enumerate(labels):
        labels_matrix[:, idx] = labels_batch[label]
    encoding["labels"] = labels_matrix.tolist()
    return encoding
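# Illustrative sanity check (not part of the original script): encode a tiny
# hand-made batch and confirm the label vector is one-hot per example.
_demo = preprocess_data({"text": ["Stocks rally after earnings"], "label": [1]})
print(_demo["labels"])  # expected: [[0.0, 1.0, 0.0]] -> index 1 == "Bullish"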
encoded_dataset = dataset.map(preprocess_data, batched=True, remove_columns=dataset['train'].column_names)
encoded_dataset.set_format("torch")
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}
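# e.g. id2label == {0: "Bearish", 1: "Bullish", 2: "Neutral"}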
# problem_type="multi_label_classification" makes the model train with
# BCEWithLogitsLoss, which is what the one-hot float labels above expect
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    problem_type="multi_label_classification",
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
)
batch_size = 8
metric_name = "f1"
args = TrainingArguments(
    "bert-finetuned-sem_eval-english",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    # push_to_hub=True,
)
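# Note: the Trainer prefixes logged metrics with "eval_", so
# metric_for_best_model="f1" is matched against the "eval_f1" value
# returned by compute_metrics below.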
# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average="micro")
    roc_auc = roc_auc_score(y_true, y_pred, average="micro")
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {"f1": f1_micro_average, "roc_auc": roc_auc, "accuracy": accuracy}
    return metrics
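# Illustrative check (not in the original script): with well-separated logits,
# sigmoid + 0.5 threshold recovers the labels exactly, so all three metrics
# should come out as 1.0.
_demo_logits = np.array([[3.0, -2.0, -2.0], [-2.0, 3.0, -2.0]])
_demo_labels = np.array([[1, 0, 0], [0, 1, 0]])
print(multi_label_metrics(_demo_logits, _demo_labels))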
def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    return multi_label_metrics(predictions=preds, labels=p.label_ids)
# REMOVE THIS IN COLAB #############
title = "Text market sentiment"
text_ = "Bitcoin to the moon"
# load the fine-tuned model saved after training (see torch.save below)
model = torch.load("./model.pt", map_location=torch.device("cpu"))
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
encoding = tokenizer(text_, return_tensors="pt")
with torch.no_grad():
    outputs = model(**encoding)
logits = outputs.logits  # shape: (1, num_labels)
# apply sigmoid + threshold
sigmoid = torch.nn.Sigmoid()
probs = sigmoid(logits.squeeze().cpu())
predictions = np.zeros(probs.shape)
predictions[np.where(probs >= 0.5)] = 1
# turn predicted ids into actual label names
predicted_labels = [id2label[idx] for idx, label in enumerate(predictions) if label == 1.0]
console.log(predicted_labels)
# wrap the inference steps above in a function for the Gradio demo
def predict_sentiment(text):
    encoding = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        logits = model(**encoding).logits
    probs = torch.sigmoid(logits.squeeze().cpu())
    return ", ".join(id2label[idx] for idx, p in enumerate(probs) if p >= 0.5)

interface = gr.Interface(
    fn=predict_sentiment,
    inputs=gr.Textbox(label="Tweet", placeholder="Enter a financial tweet"),
    outputs=gr.Textbox(label="Sentiment"),
    title=title,
    examples=[[text_]],
).launch()
###############
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer.train()
trainer.evaluate()
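# The inference block above loads "./model.pt". One way to produce that file
# after training (an assumption; the original script does not show the save):
torch.save(trainer.model, "./model.pt")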