In [1]:
import json
from tqdm.auto import tqdm
from datasets import load_dataset
import pandas as pd
import numpy as np
import torch
import os

# Restrict this process to the GPU with index 1.
# NOTE(review): presumably this has to run before anything initialises CUDA
# for it to take effect — confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [2]:
import requests
from bs4 import BeautifulSoup

# Scrape the arXiv category taxonomy page into a {tag: category name} mapping
# and persist it as JSON.
# The timeout and the explicit status check make a failed download raise here
# instead of silently yielding an empty mapping that would overwrite
# tag_to_name.json.
page = requests.get('https://arxiv.org/category_taxonomy', timeout=30)
page.raise_for_status()
soup = BeautifulSoup(page.content)
tag_to_name = {}
# The first <h4> on the page is skipped; each remaining heading is split into
# its first whitespace-separated token (the tag) and the rest (the name).
for tag_html in soup.find_all('h4')[1:]:
    tag, name = tag_html.text.split(maxsplit=1)
    # Drop the first and last character of the name (presumably the
    # surrounding parentheses — verify against the page markup).
    tag_to_name[tag] = name[1:-1]
if not tag_to_name:
    # An empty mapping would later produce a classifier with zero labels.
    raise RuntimeError('No categories were parsed from the arXiv taxonomy page')
with open('tag_to_name.json', 'w') as fout:
    json.dump(tag_to_name, fout)

In [3]:
# Give every taxonomy tag a consecutive integer id, following the key order of
# tag_to_name.
tag_to_label = {tag: idx for idx, tag in enumerate(tag_to_name)}

In [4]:
def add_labels(row):
    """Derive label ids and tag names for the known arXiv categories of a row.

    Args:
        row: Mapping whose 'tag' entry is the string form of a Python list of
            dicts, each carrying a 'term' key.

    Returns:
        Dict with 'label_ids' (indices looked up in ``tag_to_label``) and
        'label_tags' (the matching terms), both in the order the terms appear
        in the row. Terms that are not keys of ``tag_to_label`` are dropped.

    Raises:
        ValueError, SyntaxError: If the 'tag' string is not a Python literal.
    """
    # Imported locally so the function carries its own dependency.
    import ast

    # The 'tag' string comes straight from the data file. literal_eval only
    # accepts Python literals, so unlike eval() it cannot execute code that is
    # embedded in a malformed or malicious record.
    tag_list = ast.literal_eval(row['tag'])
    label_tags = [
        tag_dict['term'] for tag_dict in tag_list
        if tag_dict['term'] in tag_to_label
    ]
    label_ids = [tag_to_label[term] for term in label_tags]
    return {'label_ids': label_ids, 'label_tags': label_tags}

In [5]:
# Read the raw arXiv dump, derive the label columns with add_labels in eight
# worker processes, then drop the listed metadata columns.
dataset = (
    load_dataset("json", data_files="arxivData.json", split="train")
    .map(add_labels, num_proc=8)
    .remove_columns(['author', 'day', 'id', 'link', 'month', 'tag', 'year'])
)

Using custom data configuration default-60d1f0f90275ae1e
Found cached dataset json (/root/.cache/huggingface/datasets/json/default-60d1f0f90275ae1e/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)


 

Loading cached processed dataset at /root/.cache/huggingface/datasets/json/default-60d1f0f90275ae1e/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-66945521f8e38136.arrow


 

Loading cached processed dataset at /root/.cache/huggingface/datasets/json/default-60d1f0f90275ae1e/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-5298549794823409.arrow


 

Loading cached processed dataset at /root/.cache/huggingface/datasets/json/default-60d1f0f90275ae1e/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-6c93a706327f5678.arrow


 

Loading cached processed dataset at /root/.cache/huggingface/datasets/json/default-60d1f0f90275ae1e/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-ff58b61d0d461ac4.arrow


 

Loading cached processed dataset at /root/.cache/huggingface/datasets/json/default-60d1f0f90275ae1e/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-259b966b550351dc.arrow


 

Loading cached processed dataset at /root/.cache/huggingface/datasets/json/default-60d1f0f90275ae1e/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-8f0ed2baf297a3db.arrow


 

Loading cached processed dataset at /root/.cache/huggingface/datasets/json/default-60d1f0f90275ae1e/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-845944d2885d6a34.arrow


 

Loading cached processed dataset at /root/.cache/huggingface/datasets/json/default-60d1f0f90275ae1e/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-8ec43ba6cf3d3eba.arrow


In [6]:
# Spot-check three rows (indices 0, 1000 and 10000) to inspect the derived
# label_ids / label_tags columns.
pd.DataFrame(dataset.select([0, 1000, 10000]))

Unnamed: 0,summary,title,label_ids,label_tags
0,We propose an architecture for VQA which utili...,Dual Recurrent Attention Units for Visual Ques...,"[0, 5, 7, 28, 152]","[cs.AI, cs.CL, cs.CV, cs.NE, stat.ML]"
1,"In a physical neural system, where storage and...","A Theory of Local Learning, the Learning Chann...","[22, 28, 152]","[cs.LG, cs.NE, stat.ML]"
2,One way to approach end-to-end autonomous driv...,Query-Efficient Imitation Learning for End-to-...,"[22, 0, 34]","[cs.LG, cs.AI, cs.RO]"


In [7]:
from datasets import DatasetDict

# Hold out 2048 rows, then split that hold-out into validation and test parts
# of 1024 rows each. Both splits use seed=0.
dataset = dataset.train_test_split(test_size=2048, seed=0)
# Naming caveat: the 'train' part of this second split becomes the validation
# set and its 'test' part becomes the test set.
dataset_val = dataset['test'].train_test_split(test_size=1024, seed=0)

dataset = DatasetDict({
    'train': dataset['train'],
    'val': dataset_val['train'],
    'test': dataset_val['test'],
})

# Display the resulting split sizes.
dataset

Loading cached split indices for dataset at /root/.cache/huggingface/datasets/json/default-60d1f0f90275ae1e/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-7ce5346705e1f437.arrow and /root/.cache/huggingface/datasets/json/default-60d1f0f90275ae1e/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-981e0a6e9da25ee7.arrow
Loading cached split indices for dataset at /root/.cache/huggingface/datasets/json/default-60d1f0f90275ae1e/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-1ab388509804381c.arrow and /root/.cache/huggingface/datasets/json/default-60d1f0f90275ae1e/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-eac731b57f161563.arrow


DatasetDict({
    train: Dataset({
        features: ['summary', 'title', 'label_ids', 'label_tags'],
        num_rows: 38952
    })
    val: Dataset({
        features: ['summary', 'title', 'label_ids', 'label_tags'],
        num_rows: 1024
    })
    test: Dataset({
        features: ['summary', 'title', 'label_ids', 'label_tags'],
        num_rows: 1024
    })
})

In [17]:
def get_collator(tokenizer, abstract_proba=0.5, num_labels=len(tag_to_label)):
    """Build a collate function that tokenizes rows and creates soft targets.

    Args:
        tokenizer: Callable applied to the list of texts of one batch.
        abstract_proba: Probability, drawn independently for every row, of
            appending the row's summary to its title.
        num_labels: Number of columns of the label matrix.

    Returns:
        A function ``collate_fn(rows)`` that returns the tokenizer output with
        an extra 'labels' float tensor of shape ``(len(rows), num_labels)``.
        Every id listed in a row's 'label_ids' receives the weight
        ``1 / len(label_ids)``; a row without label ids keeps an all-zero
        target.
    """
    def collate_fn(rows):
        # Decide per row whether the summary accompanies the title.
        take_abstracts = np.random.rand(len(rows)) < abstract_proba
        texts = [
            row['title'] + '[SEP]' + row['summary'] if take_abstract else row['title']
            for row, take_abstract in zip(rows, take_abstracts)
        ]
        processed = tokenizer(texts, truncation=True, return_tensors='pt', padding=True, max_length=512)
        labels = torch.zeros(size=(len(rows), num_labels), dtype=torch.float)
        for i, row in enumerate(rows):
            label_ids = row['label_ids']
            # Guard against rows whose tags were all filtered out: without it
            # the division below raises ZeroDivisionError.
            if label_ids:
                labels[i, label_ids] = 1 / len(label_ids)
        processed['labels'] = labels
        return processed
    return collate_fn

In [9]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification

# The tokenizer and the classifier are both initialised from the same
# pretrained 'microsoft/deberta-v3-base' checkpoint.
tokenizer = AutoTokenizer.from_pretrained('microsoft/deberta-v3-base')

# One output per taxonomy tag; id2label is the inverse of tag_to_label.
# NOTE(review): problem_type is deliberately left as None; the linked loss code
# presumably chooses its branch from the label tensor it receives — confirm
# against that source.
model = AutoModelForSequenceClassification.from_pretrained(
    'microsoft/deberta-v3-base',
    problem_type=None,  # https://github.com/huggingface/transformers/blob/v4.28.1/src/transformers/models/deberta_v2/modeling_deberta_v2.py#L1349
    num_labels=len(tag_to_label), id2label={v: k for k, v in tag_to_label.items()}, label2id=tag_to_label)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2ForSequenceClassification: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.classifier.weight', 'mask_predictions.LayerNorm.bias', 'mask_predictions.classifier.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.dense.weight']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a Be

In [10]:
from transformers import TrainingArguments, Trainer

# Fine-tuning configuration: a fixed budget of 5000 optimisation steps, with
# evaluation and checkpointing every 100 steps.
training_args = TrainingArguments(
    output_dir='checkpoints',
    learning_rate=2e-5,
    per_device_train_batch_size=24,
    per_device_eval_batch_size=24,
    weight_decay=0.01,
    warmup_ratio=0.02,
    logging_steps=100,
    overwrite_output_dir=True,
    seed=0,
    dataloader_num_workers=8,
    do_train=True,
    do_eval=True,
    # max_steps takes precedence over num_train_epochs (see the logged notice).
    max_steps=5000,
    save_strategy="steps",
    evaluation_strategy="steps",
    # Evaluation and saving share the same 100-step interval.
    eval_steps=100,
    save_steps=100,
    save_total_limit=2,
    lr_scheduler_type="linear",
    load_best_model_at_end=True,
    report_to="tensorboard",
    # The collator reads the raw 'title', 'summary' and 'label_ids' columns,
    # so presumably they must not be stripped from the dataset — confirm.
    remove_unused_columns=False,
)

# The collator is created with its default abstract_proba.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['val'],
    tokenizer=tokenizer,
    data_collator=get_collator(tokenizer),
)

max_steps is given, it will override any value given in num_train_epochs


In [11]:
# Run fine-tuning with the Trainer built from training_args.
trainer.train()

***** Running training *****
  Num examples = 38952
  Num Epochs = 4
  Instantaneous batch size per device = 24
  Total train batch size (w. parallel, distributed & accumulation) = 24
  Gradient Accumulation steps = 1
  Total optimization steps = 5000
  Number of trainable parameters = 184541339


Step,Training Loss,Validation Loss
100,4.2861,2.809958
200,2.3657,2.110714
300,2.0236,2.046348
400,2.0204,1.982979
500,1.9273,1.915667
600,1.9195,1.92761
700,1.8346,1.929402
800,1.8408,1.861055
900,1.8239,1.819358
1000,1.7571,1.798097


***** Running Evaluation *****
  Num examples = 1024
  Batch size = 24
Saving model checkpoint to checkpoints/checkpoint-100
Configuration saved in checkpoints/checkpoint-100/config.json
Model weights saved in checkpoints/checkpoint-100/pytorch_model.bin
tokenizer config file saved in checkpoints/checkpoint-100/tokenizer_config.json
Special tokens file saved in checkpoints/checkpoint-100/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 1024
  Batch size = 24
Saving model checkpoint to checkpoints/checkpoint-200
Configuration saved in checkpoints/checkpoint-200/config.json
Model weights saved in checkpoints/checkpoint-200/pytorch_model.bin
tokenizer config file saved in checkpoints/checkpoint-200/tokenizer_config.json
Special tokens file saved in checkpoints/checkpoint-200/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 1024
  Batch size = 24
Saving model checkpoint to checkpoints/checkpoint-300
Configuration saved in checkpoints/checkpoint

TrainOutput(global_step=5000, training_loss=1.7068539916992187, metrics={'train_runtime': 1373.8884, 'train_samples_per_second': 87.343, 'train_steps_per_second': 3.639, 'total_flos': 1.9803672136145664e+16, 'train_loss': 1.7068539916992187, 'epoch': 3.08})

In [12]:
!ls checkpoints/

checkpoint-4000  checkpoint-5000  runs


In [94]:
from torch.utils.data import DataLoader

def calc_metrics(model, dataset, abstract_proba, threshold=0.95):
    """Score a classifier with cumulative-probability label sets.

    For every example the classes are sorted by softmax probability. The
    predicted set holds the leading classes whose running probability sum
    stays within ``threshold``; the top class is always included.

    Args:
        model: Model called as ``model(**batch)`` and returning an object with
            a ``logits`` attribute.
        dataset: Rows understood by the collator from ``get_collator``.
        abstract_proba: Forwarded to ``get_collator``.
        threshold: Cumulative-probability cut-off for the predicted set.
            Defaults to 0.95.

    Returns:
        Dict with the mean recall and mean precision of the predicted sets
        (keys ``'Recall@<threshold>'`` and ``'Precision@<threshold>'``) and
        ``'Top-1 Accuracy'``. Examples without any true label are skipped,
        because recall is undefined for them.
    """
    dataloader = DataLoader(
        dataset, batch_size=16, shuffle=False,
        collate_fn=get_collator(tokenizer, abstract_proba=abstract_proba)
    )
    # Follow the model's own device instead of assuming CUDA is available.
    device = next(model.parameters()).device
    precisions, recalls, top1_accs = [], [], []
    with torch.no_grad():
        for batch in tqdm(dataloader):
            batch = batch.to(device)
            outputs = model(**batch)
            for labels, preds in zip(batch['labels'], outputs.logits.softmax(-1)):
                y = set(labels.nonzero().flatten().tolist())
                if not y:
                    # Nothing to recall; skipping avoids a division by zero.
                    continue
                top_probs, top_inds = preds.sort(descending=True)
                mask = top_probs.cumsum(0) <= threshold
                # Always keep the most probable class, even when it alone
                # exceeds the threshold.
                mask[0] = True
                a = set(top_inds[mask].tolist())
                hits = len(y & a)
                top1_accs.append(int(top_inds[0]) in y)
                recalls.append(hits / len(y))
                precisions.append(hits / len(a))
    return {f'Recall@{threshold}': np.mean(recalls),
            f'Precision@{threshold}': np.mean(precisions),
            'Top-1 Accuracy': np.mean(top1_accs)}

In [97]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification

# Restore the tokenizer and weights saved at step 4000, move the model to the
# GPU and switch it to evaluation mode.
tokenizer = AutoTokenizer.from_pretrained('checkpoints/checkpoint-4000/')
model = (
    AutoModelForSequenceClassification
    .from_pretrained('checkpoints/checkpoint-4000/')
    .to('cuda')
    .eval()
)

In [98]:
# Test-set metrics for titles only: with abstract_proba=0.0 the collator never
# appends the summary.
calc_metrics(model, dataset['test'], abstract_proba=0.0)

  0%|          | 0/64 [00:00<?, ?it/s]

{'Recall@0.95': 0.9289341517857141,
 'Precision@0.95': 0.28929539856301567,
 'Top-1 Accuracy': 0.791015625}

In [99]:
# Test-set metrics for title + summary: with abstract_proba=1.0 the collator
# always appends the summary.
calc_metrics(model, dataset['test'], abstract_proba=1.0)

  0%|          | 0/64 [00:00<?, ?it/s]

{'Recall@0.95': 0.9357700892857144,
 'Precision@0.95': 0.3308562290529852,
 'Top-1 Accuracy': 0.87109375}

In [100]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification

# Restore the tokenizer and weights saved at step 5000, move the model to the
# GPU and switch it to evaluation mode.
tokenizer = AutoTokenizer.from_pretrained('checkpoints/checkpoint-5000/')
model = (
    AutoModelForSequenceClassification
    .from_pretrained('checkpoints/checkpoint-5000/')
    .to('cuda')
    .eval()
)

In [101]:
# Test-set metrics for titles only: with abstract_proba=0.0 the collator never
# appends the summary.
calc_metrics(model, dataset['test'], abstract_proba=0.0)

  0%|          | 0/64 [00:00<?, ?it/s]

{'Recall@0.95': 0.9222516741071428,
 'Precision@0.95': 0.32720513773363014,
 'Top-1 Accuracy': 0.796875}

In [102]:
# Test-set metrics for title + summary: with abstract_proba=1.0 the collator
# always appends the summary.
calc_metrics(model, dataset['test'], abstract_proba=1.0)

  0%|          | 0/64 [00:00<?, ?it/s]

{'Recall@0.95': 0.932661365327381,
 'Precision@0.95': 0.3758523827747189,
 'Top-1 Accuracy': 0.87109375}

* С наличием abstract качество ожидаемо выше;
* Модели с 4k и 5k итераций очень близки по метрикам, но у 5k заметно выше Precision@0.95 (при чуть меньшем Recall@0.95 и практически том же Top-1 Accuracy) — на инференс возьмем ее.