In [1]:
from transformers import pipeline
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer
from tqdm import tqdm
import re
from datasets import Dataset
from transformers import AutoModelForSequenceClassification
import torch
import numpy as np
from typing import Dict
from transformers import AutoModel
from typing import List
from transformers import TrainingArguments, Trainer
from collections import defaultdict

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
USED_MODEL = "distilbert-base-cased"

In [9]:
def read_json(json_filename):
    with open(json_filename, 'r') as f:
        return json.loads(f.read())


def save_json(json_object, json_filename, indent=4):
    with open(json_filename, 'w') as f:
        json.dump(json_object, f, separators=(',', ':'), indent=indent)

**–î–∞–Ω–Ω—ã–µ –±–µ—Ä–µ–º –æ—Ç—Å—é–¥–∞: https://www.kaggle.com/datasets/neelshah18/arxivdataset**

In [10]:
arxiv_data = read_json('arxivData.json')

In [11]:
arxiv_data[0]

{'author': "[{'name': 'Ahmed Osman'}, {'name': 'Wojciech Samek'}]",
 'day': 1,
 'id': '1802.00209v1',
 'link': "[{'rel': 'alternate', 'href': 'http://arxiv.org/abs/1802.00209v1', 'type': 'text/html'}, {'rel': 'related', 'href': 'http://arxiv.org/pdf/1802.00209v1', 'type': 'application/pdf', 'title': 'pdf'}]",
 'month': 2,
 'summary': 'We propose an architecture for VQA which utilizes recurrent layers to\ngenerate visual and textual attention. The memory characteristic of the\nproposed recurrent attention units offers a rich joint embedding of visual and\ntextual features and enables the model to reason relations between several\nparts of the image and question. Our single model outperforms the first place\nwinner on the VQA 1.0 dataset, performs within margin to the current\nstate-of-the-art ensemble model. We also experiment with replacing attention\nmechanisms in other state-of-the-art models with our implementation and show\nincreased accuracy. In both cases, our recurrent attention

**–•–æ—Ç–∏–º –ø–æ –Ω–∞–∑–≤–∞–Ω–∏—é —Å—Ç–∞—Ç—å–∏ + abstract –≤—ã–¥–∞–≤–∞—Ç—å –Ω–∞–∏–±–æ–ª–µ–µ –≤–µ—Ä–æ—è—Ç–Ω—É—é —Ç–µ–º–∞—Ç–∏–∫—É —Å—Ç–∞—Ç—å–∏, —Å–∫–∞–∂–µ–º, —Ñ–∏–∑–∏–∫–∞, –±–∏–æ–ª–æ–≥–∏—è –∏–ª–∏ computer science** 

In [2]:
# Manually prepared dataframe with arxiv topics
arxiv_topics_df = pd.read_csv('arxiv_topics.csv')
print(len(arxiv_topics_df))
arxiv_topics_df.head(5)

155


Unnamed: 0,tag,topic,category
0,cs.AI,Artificial Intelligence,Computer Science
1,cs.AR,Hardware Architecture,Computer Science
2,cs.CC,Computational Complexity,Computer Science
3,cs.CE,"Computational Engineering, Finance, and Science",Computer Science
4,cs.CG,Computational Geometry,Computer Science


In [3]:
tag_to_index = {}
tag_to_category = {}
for i, row in arxiv_topics_df.iterrows():
    tag_to_index[row['tag']] = i
    tag_to_category[row['tag']] = row['category']
index_to_tag = {value: key for key, value in tag_to_index.items()}

**–ì–æ—Ç–æ–≤–∏–º –¥–∞–Ω–Ω—ã–µ –∫ –æ–±—É—á–µ–Ω–∏—é**

In [49]:
def is_valid_tag(tag: str) -> bool:
    return tag in tag_to_index

total_categories_count = 0
total_tags_count = 0
records = []
for arxiv_record in tqdm(arxiv_data):
    record = {
        'title': arxiv_record['title'],
        'summary': arxiv_record['summary'],
        'title_and_summary': arxiv_record['title'] + ' $ ' + arxiv_record['summary'],
        'tags': sorted([current_tag['term'] for current_tag in eval(arxiv_record['tag']) if is_valid_tag(current_tag['term'])], key=lambda x: tag_to_index[x])
    }
    categories = set(tag_to_category[tag] for tag in record['tags'])
    total_categories_count += len(categories)
    total_tags_count += len(record['tags'])
    record['tags_indices'] = [tag_to_index[tag] for tag in record['tags']]
    assert len(record['tags']) > 0
    records.append(record)

print(f'–°—Ä–µ–¥–Ω–µ–µ —á–∏—Å–ª–æ –∫–∞—Ç–µ–≥–æ—Ä–∏–π –≤ –æ–¥–Ω–æ–π —Å—Ç–∞—Ç—å–µ: {total_categories_count / len(arxiv_data)}')
print(f'–°—Ä–µ–¥–Ω–µ–µ —á–∏—Å–ª–æ —Ç–µ–≥–æ–≤ –≤ –æ–¥–Ω–æ–π —Å—Ç–∞—Ç—å–µ: {total_tags_count / len(arxiv_data)}')

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 41000/41000 [00:01<00:00, 33941.59it/s]

–°—Ä–µ–¥–Ω–µ–µ —á–∏—Å–ª–æ –∫–∞—Ç–µ–≥–æ—Ä–∏–π –≤ –æ–¥–Ω–æ–π —Å—Ç–∞—Ç—å–µ: 1.3301219512195122
–°—Ä–µ–¥–Ω–µ–µ —á–∏—Å–ª–æ —Ç–µ–≥–æ–≤ –≤ –æ–¥–Ω–æ–π —Å—Ç–∞—Ç—å–µ: 1.8489024390243902





–ö–∞–∫ –≤–∏–¥–∏–º, –ø–µ—Ä–µ–¥ –Ω–∞–º–∏ –∑–∞–¥–∞—á–∞ –º—É–ª—å—Ç–∏–±–∏–Ω–∞—Ä–Ω–æ–π –∫–ª–∞—Å—Å–∏—Ñ–∏–∫–∞—Ü–∏–∏.

–¢–µ–≥–æ–≤ —É –æ–¥–Ω–æ–π —Å—Ç–∞—Ç—å–∏ –±—ã–≤–∞–µ—Ç –º–Ω–æ–≥–æ, —ç—Ç–æ –ø–æ–Ω—è—Ç–Ω–æ, –Ω–æ –∏ –∫–∞—Ç–µ–≥–æ—Ä–∏–π —Ç–æ–∂–µ –±—ã–≤–∞–µ—Ç –º–Ω–æ–≥–æ. –¢–æ –µ—Å—Ç—å, —É—Å–ª–æ–≤–Ω–æ —Å—Ç–∞—Ç—å—è –º–æ–∂–µ—Ç –±—ã—Ç—å –ø–æ—Å–≤—è—â–µ–Ω–∞ –∏ —Ñ–∏–∑–∏–∫–µ –∏ –±–∏–æ–ª–æ–≥–∏–∏ –æ–¥–Ω–æ–≤—Ä–µ–º–µ–Ω–Ω–æ.

–ü–æ–ø—Ä–æ–±—É–µ–º –æ–±—É—á–∏—Ç—å –º–æ–¥–µ–ª—å –æ–ø—Ä–µ–¥–µ–ª—è—Ç—å —Ç–µ–≥–∏ - —Ç–∞–∫ –æ–Ω–∞ –ø–æ—Ç–µ–Ω—Ü–∏–∞–ª—å–Ω–æ –º–æ–∂–µ—Ç —Å–æ—Ö—Ä–∞–Ω–∏—Ç—å –≤ —Å–µ–±–µ –±–æ–ª—å—à–µ –∏–Ω—Ñ–æ—Ä–º–∞—Ü–∏–∏, —á–µ–º –µ—Å–ª–∏ –µ–µ –æ–±—É—á–∏—Ç—å –æ–ø—Ä–µ–¥–µ–ª—è—Ç—å –∫–∞—Ç–µ–≥–æ—Ä–∏–∏ (–∫–æ—Ç–æ—Ä—ã—Ö –≥–æ—Ä–∞–∑–¥–æ –º–µ–Ω—å—à–µ).

**–°–æ–µ–¥–∏–Ω—è–µ–º title –∏ summary –∏—Å–ø–æ–ª—å–∑—É—è —Å–∏–º–≤–æ–ª `$` - –æ–Ω —Ä–µ–¥–∫–∏–π, –ø—Ä–∏ —ç—Ç–æ–º –µ–≥–æ –∑–Ω–∞–µ—Ç —Ç–æ–∫–µ–Ω–∞–π–∑–µ—Ä, –ø–æ—ç—Ç–æ–º—É –Ω–µ –ø—Ä–∏–¥–µ—Ç—Å—è —Å –Ω–∏–º –¥–æ–ø–æ–ª–Ω–∏—Ç–µ–ª—å–Ω–æ –≤–æ–∑–∏—Ç—å—Å—è**

In [50]:
full_data_df = pd.DataFrame(records)
print(len(full_data_df))
full_data_df.head(5)

41000


Unnamed: 0,title,summary,title_and_summary,tags,tags_indices
0,Dual Recurrent Attention Units for Visual Ques...,We propose an architecture for VQA which utili...,Dual Recurrent Attention Units for Visual Ques...,"[cs.AI, cs.CL, cs.CV, cs.NE, stat.ML]","[0, 5, 7, 28, 152]"
1,Sequential Short-Text Classification with Recu...,Recent approaches based on artificial neural n...,Sequential Short-Text Classification with Recu...,"[cs.AI, cs.CL, cs.LG, cs.NE, stat.ML]","[0, 5, 22, 28, 152]"
2,Multiresolution Recurrent Neural Networks: An ...,We introduce the multiresolution recurrent neu...,Multiresolution Recurrent Neural Networks: An ...,"[cs.AI, cs.CL, cs.LG, cs.NE, stat.ML]","[0, 5, 22, 28, 152]"
3,Learning what to share between loosely related...,Multi-task learning is motivated by the observ...,Learning what to share between loosely related...,"[cs.AI, cs.CL, cs.LG, cs.NE, stat.ML]","[0, 5, 22, 28, 152]"
4,A Deep Reinforcement Learning Chatbot,We present MILABOT: a deep reinforcement learn...,A Deep Reinforcement Learning Chatbot $ We pre...,"[cs.AI, cs.CL, cs.LG, cs.NE, stat.ML]","[0, 5, 22, 28, 152]"


–ü–æ—Å–º–æ—Ç—Ä–∏–º –Ω–∞ —Ä–∞—Å–ø—Ä–µ–¥–µ–ª–µ–Ω–∏–µ —Ç–µ–≥–æ–≤ –∏ –∫–∞—Ç–µ–≥–æ—Ä–∏–π –≤ –¥–∞–Ω–Ω—ã—Ö

In [57]:
tag_to_count = defaultdict(int)
category_to_count = defaultdict(int)
for i, row in full_data_df.iterrows():
    found_categories = set()
    for tag in row['tags']:
        tag_to_count[tag] += 1
        found_categories.add(tag_to_category[tag])
    for category in found_categories:
        category_to_count[category] += 1
print(category_to_count)
print(tag_to_count)

defaultdict(<class 'int'>, {'Statistics': 10618, 'Computer Science': 39251, 'Physics': 1208, 'Mathematics': 2263, 'Quantitative Biology': 896, 'Electrical Engineering and Systems Science': 220, 'Quantitative Finance': 66, 'Economics': 13})
defaultdict(<class 'int'>, {'cs.AI': 10481, 'cs.CL': 6417, 'cs.CV': 13902, 'cs.NE': 3819, 'stat.ML': 10326, 'cs.LG': 13735, 'physics.soc-ph': 293, 'stat.AP': 360, 'cs.RO': 973, 'cs.SE': 180, 'cs.MA': 268, 'math.OC': 1020, 'cs.IR': 1443, 'cond-mat.dis-nn': 126, 'stat.ME': 458, 'physics.chem-ph': 16, 'cs.DC': 404, 'stat.CO': 260, 'q-bio.NC': 513, 'cs.GT': 318, 'cs.MM': 345, 'cs.CG': 94, 'cs.CR': 411, 'cs.HC': 434, 'cs.GL': 10, 'eess.AS': 89, 'cs.SD': 389, 'math.DS': 49, 'cs.GR': 225, 'math.NA': 172, 'cs.CY': 376, 'physics.data-an': 187, 'math.ST': 336, 'stat.TH': 336, 'cs.IT': 543, 'math.IT': 543, 'quant-ph': 142, 'astro-ph.GA': 6, 'astro-ph.IM': 76, 'cs.SI': 639, 'cs.DB': 327, 'cs.LO': 643, 'nlin.AO': 119, 'cs.PF': 35, 'cs.ET': 85, 'eess.IV': 85, 'cs.

**–ö–∞–∫ –≤–∏–¥–∏–º, Computer science –≤—Å—Ç—Ä–µ—á–∞–µ—Ç—Å—è –æ—á–µ–Ω—å —á–∞—Å—Ç–æ. –ê, –Ω–∞–ø—Ä–∏–º–µ—Ä, —ç–∫–æ–Ω–æ–º–∏–∫–∞ - —Å–æ–≤—Å–µ–º —Ä–µ–¥–∫–æ**

**–≠—Ç–æ –ø–æ-—Ö–æ—Ä–æ—à–µ–º—É –Ω—É–∂–Ω–æ —É—á–µ—Å—Ç—å, –Ω–æ –≤ —Ä–∞–º–∫–∞—Ö –¥–∞–Ω–Ω–æ–≥–æ –Ω–æ—É—Ç–±—É–∫–∞ –º—ã —ç—Ç–æ –¥–µ–ª–∞—Ç—å –Ω–µ –±—É–¥–µ–º**

In [10]:
text_data = list(full_data_df['title_and_summary'])
tags_indices = list(full_data_df['tags_indices'])

In [11]:
X_train_val, X_test, y_train_val, y_test = train_test_split(text_data, tags_indices, test_size=0.1, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=2/9, random_state=42)
print(len(X_train), len(X_val), len(X_test))
# Train is 70%, val is 20%, test is 10%

28700 8200 4100


In [12]:
tokenizer = DistilBertTokenizer.from_pretrained(USED_MODEL)

In [13]:
def tokenize_function(text):
    return tokenizer(text, padding="max_length", truncation=True)

In [17]:
print(text_data[0])
tokenize_function(text_data[0])

Dual Recurrent Attention Units for Visual Question Answering $ We propose an architecture for VQA which utilizes recurrent layers to
generate visual and textual attention. The memory characteristic of the
proposed recurrent attention units offers a rich joint embedding of visual and
textual features and enables the model to reason relations between several
parts of the image and question. Our single model outperforms the first place
winner on the VQA 1.0 dataset, performs within margin to the current
state-of-the-art ensemble model. We also experiment with replacing attention
mechanisms in other state-of-the-art models with our implementation and show
increased accuracy. In both cases, our recurrent attention mechanism improves
performance in tasks requiring sequential or relational reasoning on the VQA
dataset.


{'input_ids': [101, 27791, 11336, 21754, 1335, 5208, 2116, 21687, 1111, 12071, 22171, 26018, 1158, 109, 1284, 17794, 1126, 4220, 1111, 159, 4880, 1592, 1134, 24242, 1231, 21754, 8798, 1106, 9509, 5173, 1105, 3087, 4746, 2209, 119, 1109, 2962, 7987, 1104, 1103, 3000, 1231, 21754, 2209, 2338, 3272, 170, 3987, 4091, 9712, 4774, 3408, 1104, 5173, 1105, 3087, 4746, 1956, 1105, 13267, 1103, 2235, 1106, 2255, 4125, 1206, 1317, 2192, 1104, 1103, 3077, 1105, 2304, 119, 3458, 1423, 2235, 1149, 3365, 13199, 1116, 1103, 1148, 1282, 2981, 1113, 1103, 159, 4880, 1592, 122, 119, 121, 2233, 9388, 117, 10383, 1439, 7464, 1106, 1103, 1954, 1352, 118, 1104, 118, 1103, 118, 1893, 9525, 2235, 119, 1284, 1145, 7886, 1114, 5861, 2209, 10748, 1107, 1168, 1352, 118, 1104, 118, 1103, 118, 1893, 3584, 1114, 1412, 7249, 1105, 1437, 2569, 10893, 119, 1130, 1241, 2740, 117, 1412, 1231, 21754, 2209, 6978, 4607, 1116, 2099, 1107, 8249, 8753, 14516, 21967, 1137, 6796, 1348, 14417, 1113, 1103, 159, 4880, 1592, 2233, 93

In [18]:
train_encodings = tokenize_function(X_train)
val_encodings = tokenize_function(X_val)
test_encodings = tokenize_function(X_test)

In [19]:
print(type(train_encodings))
print(dir(train_encodings))
print(len(train_encodings))

<class 'transformers.tokenization_utils_base.BatchEncoding'>
['_MutableMapping__marker', '__abstractmethods__', '__class__', '__class_getitem__', '__contains__', '__copy__', '__delattr__', '__delitem__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattr__', '__getattribute__', '__getitem__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__ior__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__or__', '__reduce__', '__reduce_ex__', '__repr__', '__reversed__', '__ror__', '__setattr__', '__setitem__', '__setstate__', '__sizeof__', '__slots__', '__str__', '__subclasshook__', '__weakref__', '_abc_impl', '_encodings', '_n_sequences', 'char_to_token', 'char_to_word', 'clear', 'convert_to_tensors', 'copy', 'data', 'encodings', 'fromkeys', 'get', 'is_fast', 'items', 'keys', 'n_sequences', 'pop', 'popitem', 'sequence_ids', 'setdefault', 'to', 'token_to_chars', 'token_to_sequence', 'token_to_word', 'tokens', '

In [20]:
def get_labels(y: List[List[int]]):
  labels = np.zeros((len(y), len(tag_to_index)))
  for i in tqdm(range(len(y))):
    labels[i, y[i]] = 1
  return labels.tolist()

In [21]:
labels_train = get_labels(y_train)
labels_val = get_labels(y_val)
labels_test = get_labels(y_test)

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 28700/28700 [00:00<00:00, 388780.42it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 8200/8200 [00:00<00:00, 223262.03it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4100/4100 [00:00<00:00, 165215.75it/s]


In [22]:
train_encodings['labels'] = labels_train
val_encodings['labels'] = labels_val
test_encodings['labels'] = labels_test

**–Ø –∏—Å–ø–æ–ª—å–∑–æ–≤–∞–ª –ø—Ä–∏–º–µ—Ä –æ—Ç—Å—é–¥–∞ —á—Ç–æ–±—ã –ø–æ–Ω–∏–º–∞—Ç—å, –∫–∞–∫–æ–π –Ω—É–∂–µ–Ω —Ñ–æ—Ä–º–∞—Ç –¥–∞–Ω–Ω—ã—Ö https://github.com/NielsRogge/Transformers-Tutorials/blob/master/BERT/Fine_tuning_BERT_(and_friends)_for_multi_label_text_classification.ipynb**

In [23]:
train_dataset = Dataset.from_dict(train_encodings)
val_dataset = Dataset.from_dict(val_encodings)
test_dataset = Dataset.from_dict(test_encodings)

In [24]:
model = AutoModelForSequenceClassification.from_pretrained(
    USED_MODEL, 
    problem_type="multi_label_classification", 
    num_labels=len(tag_to_index),
    id2label=index_to_tag,
    label2id=tag_to_index
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
batch_size = 8
metric_name = "f1"

In [None]:
args = TrainingArguments(
    output_dir=f'train-{USED_MODEL}-baseline',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    push_to_hub=False
)



In [27]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch
    
# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, 
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds, 
        labels=p.label_ids)
    return result

In [28]:
train_dataset.set_format("torch")
test_dataset.set_format("torch")

In [29]:
trainer = Trainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [30]:
trainer.train()

Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,0.024,0.022899,0.652954,0.770167,0.410366
2,0.0204,0.02073,0.673765,0.785226,0.426829
3,0.0179,0.019692,0.700292,0.812313,0.425
4,0.0161,0.019695,0.701593,0.812366,0.433171
5,0.0148,0.019767,0.701193,0.81271,0.431707


TrainOutput(global_step=17940, training_loss=0.02238395190159214, metrics={'train_runtime': 1927.2238, 'train_samples_per_second': 74.459, 'train_steps_per_second': 9.309, 'total_flos': 1.906093867776e+16, 'train_loss': 0.02238395190159214, 'epoch': 5.0})

In [32]:
trainer.evaluate(eval_dataset=val_dataset)

{'eval_loss': 0.019695421680808067,
 'eval_f1': 0.7015928686248721,
 'eval_roc_auc': 0.8123655228058703,
 'eval_accuracy': 0.43317073170731707,
 'eval_runtime': 34.8656,
 'eval_samples_per_second': 235.189,
 'eval_steps_per_second': 29.399,
 'epoch': 5.0}

In [33]:
trainer.evaluate(eval_dataset=test_dataset)

{'eval_loss': 0.019682902842760086,
 'eval_f1': 0.6966158423205653,
 'eval_roc_auc': 0.8081637343174538,
 'eval_accuracy': 0.4370731707317073,
 'eval_runtime': 16.5771,
 'eval_samples_per_second': 247.329,
 'eval_steps_per_second': 30.946,
 'epoch': 5.0}

–ò—Å—Ö–æ–¥–Ω–∞—è –∑–∞–¥–∞—á–∞ —É –Ω–∞—Å –∑–≤—É—á–∞–ª–∞ –∫–∞–∫ "—Ö–æ—Ç–∏–º —É–≤–∏–¥–µ—Ç—å —Ç–æ–ø-95%* —Ç–µ–º–∞—Ç–∏–∫, –æ—Ç—Å–æ—Ä—Ç–∏—Ä–æ–≤–∞–Ω–Ω—ã—Ö –ø–æ —É–±—ã–≤–∞–Ω–∏—é –≤–µ—Ä–æ—è—Ç–Ω–æ—Å—Ç–∏", –≥–¥–µ –ø–æ–¥ —Ç–µ–º–∞—Ç–∏–∫–∞–º–∏ –∏–º–µ–ª–∏—Å—å –≤–≤–∏–¥—É –∫–∞—Ç–µ–≥–æ—Ä–∏–∏ (—Ñ–∏–∑–∏–∫–∞, –±–∏–æ–ª–æ–≥–∏—è –∏ —Ç–∞–∫ –¥–∞–ª–µ–µ)

–ë—É–¥–µ–º –¥–µ–ª–∞—Ç—å —Å–ª–µ–¥—É—é—â–µ–µ:
- –Ω–∞—à–∞ –º–æ–¥–µ–ª—å –≤—ã–¥–∞–µ—Ç –ª–æ–≥–∏—Ç—ã —Ç–µ–≥–æ–≤
- –ø–æ—Å—á–∏—Ç–∞–µ–º —Å –∏—Ö –ø–æ–º–æ—â—å—é –≤–µ—Ä–æ—è—Ç–Ω–æ—Å—Ç—å –∫–∞–∂–¥–æ–≥–æ —Ç–µ–≥–∞, —Å—á–∏—Ç–∞—è —Å—É–º–º—É –≤–µ—Ä–æ—è—Ç–Ω–æ—Å—Ç–µ–π —Ä–∞–≤–Ω–æ–π 1
- –ø–æ—Å—á–∏—Ç–∞–µ–º –≤–µ—Ä–æ—è—Ç–Ω–æ—Å—Ç—å –∫–∞—Ç–µ–≥–æ—Ä–∏–∏ –∫–∞–∫ —Å—É–º–º—É –≤–µ—Ä–æ—è—Ç–Ω–æ—Å—Ç–µ–π —Ç–µ–≥–æ–≤
- –≤—ã–≤–µ–¥–µ–º —Ç—Ä–µ–±—É–µ–º—ã–µ —Ç–æ–ø-95% —Ç–µ–º–∞—Ç–∏–∫

In [5]:
model = AutoModelForSequenceClassification.from_pretrained(
    "train_distilbert-base-cased/checkpoint-17940", 
    problem_type="multi_label_classification", 
    num_labels=len(tag_to_index),
    id2label=index_to_tag,
    label2id=tag_to_index
).to(torch.device('cuda'))

In [10]:
model(**{key: torch.tensor(value).to(model.device).unsqueeze(0) for key, value in tokenize_function('Maths is cool $ In our article we prove that maths is the coolest subject at school').items()})

SequenceClassifierOutput(loss=None, logits=tensor([[-1.3623, -5.3834, -3.3988, -3.4555, -3.7096, -4.5285, -5.1323, -2.3077,
         -3.6645, -4.6847, -4.2481, -5.0417, -3.5121, -2.7808, -5.9767, -4.8864,
         -5.6730, -4.6838, -3.8588, -5.2819, -3.9295, -2.7704,  0.4331, -4.5505,
         -5.2648, -4.9248, -4.2074, -3.4895, -3.2717, -5.2713, -5.7536, -7.2749,
         -4.8728, -5.2606, -4.5935, -4.7103, -5.4628, -5.4589, -5.3678, -3.5648,
         -5.1455, -8.8455, -9.1583, -6.4358, -4.7737, -4.7821, -8.9264, -5.8790,
         -4.7536, -5.4549, -5.3879, -6.1918, -4.1667, -7.1828, -7.3235, -5.4470,
         -4.6688, -4.7201, -6.2949, -7.5401, -6.6242, -6.1022, -5.5325, -3.1546,
         -9.4200, -5.2060, -5.3880, -6.8743, -3.3176, -7.2654, -7.4301, -3.0929,
         -3.2351, -9.0408, -5.4315, -6.3230, -9.5853, -5.7075, -3.6443, -5.5524,
         -6.0723, -6.0414, -7.3201, -3.9738, -5.5964, -4.0455, -5.2017, -5.8061,
         -7.8401, -7.5268, -7.4576, -4.4483, -6.4790, -5.9085, -6.

In [19]:
@torch.no_grad
def get_category_probs_dict(model, title: str, summary: str) -> Dict[str, float]:
    text = f'{title} $ {summary}'
    tags_logits = model(**{key: torch.tensor(value).to(model.device).unsqueeze(0) for key, value in tokenize_function(text).items()}).logits
    sigmoid = torch.nn.Sigmoid()
    tags_probs = sigmoid(tags_logits.squeeze().cpu()).numpy()
    tags_probs /= tags_probs.sum()
    category_probs_dict = {category: 0.0 for category in set(arxiv_topics_df['category'])}
    for index in range(len(index_to_tag)):
        category_probs_dict[tag_to_category[index_to_tag[index]]] += float(tags_probs[index])
    return category_probs_dict

In [16]:
def get_most_probable_keys(probs_dict: Dict[str, float], target_probability: float, print_probabilities: bool) -> List[str]:
    current_p = 0
    probs_list = sorted([(value, key) for key, value in probs_dict.items()])[::-1]
    current_index = 0
    answer = []
    while current_p <= target_probability:
        current_p += probs_list[current_index][0]
        if not print_probabilities:
            answer.append(probs_list[current_index][1])
        else:
            answer.append(f'{probs_list[current_index][1]} ({probs_list[current_index][0]})')
        current_index += 1
        if current_index >= len(probs_list):
            break
    return answer

–¢–µ–ø–µ—Ä—å –Ω—É–∂–Ω–æ –∫–∞–∫-—Ç–æ —Å–æ—Ö—Ä–∞–Ω–∏—Ç—å –º–æ–¥–µ–ª—å, —á—Ç–æ–±—ã –ø–æ—Ç–æ–º –º–æ–∂–Ω–æ –±—ã–ª–æ –µ—ë –∏—Å–ø–æ–ª—å–∑–æ–≤–∞—Ç—å –≤ huggingface space

In [6]:
model.push_to_hub("bumchik2/train_distilbert-base-cased-tags-classification-simple")

model.safetensors: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 264M/264M [00:31<00:00, 8.47MB/s] 


CommitInfo(commit_url='https://huggingface.co/bumchik2/train_distilbert-base-cased-tags-classification-simple/commit/98a87d7c96e0647dd557a9d47be03ddd30e0c964', commit_message='Upload DistilBertForSequenceClassification', commit_description='', oid='98a87d7c96e0647dd557a9d47be03ddd30e0c964', pr_url=None, repo_url=RepoUrl('https://huggingface.co/bumchik2/train_distilbert-base-cased-tags-classification-simple', endpoint='https://huggingface.co', repo_type='model', repo_id='bumchik2/train_distilbert-base-cased-tags-classification-simple'), pr_revision=None, pr_num=None)

–¢–µ–ø–µ—Ä—å —è —Å–º–æ–≥—É –∑–∞–≥—Ä—É–∂–∞—Ç—å —Å–≤–æ—é –º–æ–¥–µ–ª—å –æ—Ç—Ç—É–¥–∞

In [7]:
model = AutoModelForSequenceClassification.from_pretrained(
    "bumchik2/train_distilbert-base-cased-tags-classification-simple", 
    problem_type="multi_label_classification", 
    num_labels=len(tag_to_index),
    id2label=index_to_tag,
    label2id=tag_to_index
).to(torch.device('cuda'))