desllre committed (verified)
Commit 1348d1c · 1 Parent(s): c91f861

Upload folder using huggingface_hub
README.md ADDED
@@ -0,0 +1,51 @@
+ ## Russian news detection
+ 
+ ### About
+ 
+ - Based on `cointegrated/rubert-tiny2`
+ - Fine-tuned on social-network posts and news texts from the [Taiga](https://tatianashavrina.github.io/taiga_site/) corpus
+ - Accuracy estimates on the validation set:
+ 
+ | Accuracy | Precision | Recall   | F1-score |
+ | -------- | --------- | -------- | -------- |
+ | 0.996342 | 0.999747  | 0.993717 | 0.996723 |
+ 
+ ### Getting started
+ 
+ ```python
+ from huggingface_hub import hf_hub_download
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
+ import torch
+ import pickle
+ 
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+ model_path = 'desllre/ru_news_detection'
+ 
+ # Download and load the fitted label encoder stored alongside the model
+ encoder_path = hf_hub_download(repo_id=model_path, filename="encoder.pkl")
+ with open(encoder_path, "rb") as f:
+     encoder = pickle.load(f)
+ 
+ tokenizer = AutoTokenizer.from_pretrained(model_path)
+ classifier = AutoModelForSequenceClassification.from_pretrained(model_path).to(device)
+ 
+ # Sample Russian news text to classify
+ text = 'Tesla дала добро на взлом ПО своих автомобилей\n\nКомпания изменила условия программы Bug Bounty, предусматривающей выплату вознаграждений за поиск уязвимостей. Теперь энтузиасты могут взламывать электрокары Tesla, не боясь отзыва гарантии. Более того, в соответствии с новой политикой компании, автопроизводитель будет перепрошивать автомобили, ПО которых вышло из строя в процессе экспериментов специалистов кибербезопасности.\n\nИзменения в политике компании Telsa очень тепло встретили представители индустрии.'
+ 
+ # Tokenize the input and run it through the classifier
+ tokenized = tokenizer(text, truncation=True, padding=True, return_tensors='pt')
+ tokenized = {key: value.to(device) for key, value in tokenized.items()}
+ with torch.no_grad():
+     output = classifier(**tokenized)
+ 
+ # Map the predicted class id back to its label via the encoder
+ predicted_class_id = torch.argmax(output.logits, dim=1).item()
+ label = encoder.inverse_transform([predicted_class_id])[0]
+ 
+ print(label)
+ ```
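For repeated use, the same steps can be wrapped into a small batch-prediction helper. The sketch below builds on the `tokenizer`, `classifier`, `encoder`, and `device` objects loaded in the snippet above; the `predict` name and the batching defaults are illustrative, not part of the repository:

```python
def predict(texts, batch_size=16):
    """Classify a list of texts and return their decoded labels (illustrative helper)."""
    labels = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        # Pad/truncate the batch so all sequences share one tensor shape
        tokenized = tokenizer(batch, padding=True, truncation=True, return_tensors='pt')
        tokenized = {key: value.to(device) for key, value in tokenized.items()}
        with torch.no_grad():
            logits = classifier(**tokenized).logits
        class_ids = torch.argmax(logits, dim=1).cpu().tolist()
        # inverse_transform maps numeric class ids back to their string labels
        labels.extend(encoder.inverse_transform(class_ids))
    return labels

print(predict([text]))
```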
config.json ADDED
@@ -0,0 +1,27 @@
+ {
+   "architectures": [
+     "BertForSequenceClassification"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "classifier_dropout": null,
+   "emb_size": 312,
+   "gradient_checkpointing": false,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 312,
+   "initializer_range": 0.02,
+   "intermediate_size": 600,
+   "layer_norm_eps": 1e-12,
+   "max_position_embeddings": 2048,
+   "model_type": "bert",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 3,
+   "pad_token_id": 0,
+   "position_embedding_type": "absolute",
+   "problem_type": "single_label_classification",
+   "torch_dtype": "float32",
+   "transformers_version": "4.51.3",
+   "type_vocab_size": 2,
+   "use_cache": true,
+   "vocab_size": 83828
+ }
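The config above corresponds to a compact 3-layer BERT classifier (hidden size 312, up to 2048 positions). If only these architecture fields are needed, they can be read without downloading the weights; a minimal sketch using the repo id from the README:

```python
from transformers import AutoConfig

# Fetch only config.json (no model weights) and inspect the architecture fields
config = AutoConfig.from_pretrained('desllre/ru_news_detection')
print(config.num_hidden_layers, config.hidden_size, config.max_position_embeddings)
print(config.architectures)  # ['BertForSequenceClassification']
```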
encoder.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c3efb12a29b5b3481414b87f99cebd129162d04824212bc193ee08d641919ff8
+ size 258
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:90c52166c0edb3e2de30c642eea39be43b14fe2990c63025561a781787814ef9
+ size 116784136
special_tokens_map.json ADDED
@@ -0,0 +1,37 @@
+ {
+   "cls_token": {
+     "content": "[CLS]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "mask_token": {
+     "content": "[MASK]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "[PAD]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "sep_token": {
+     "content": "[SEP]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "[UNK]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,65 @@
+ {
+   "added_tokens_decoder": {
+     "0": {
+       "content": "[PAD]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "[UNK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "[CLS]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "3": {
+       "content": "[SEP]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "4": {
+       "content": "[MASK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "clean_up_tokenization_spaces": false,
+   "cls_token": "[CLS]",
+   "do_basic_tokenize": true,
+   "do_lower_case": false,
+   "extra_special_tokens": {},
+   "mask_token": "[MASK]",
+   "max_length": 512,
+   "model_max_length": 2048,
+   "never_split": null,
+   "pad_to_multiple_of": null,
+   "pad_token": "[PAD]",
+   "pad_token_type_id": 0,
+   "padding_side": "right",
+   "sep_token": "[SEP]",
+   "stride": 0,
+   "strip_accents": null,
+   "tokenize_chinese_chars": true,
+   "tokenizer_class": "BertTokenizer",
+   "truncation_side": "right",
+   "truncation_strategy": "longest_first",
+   "unk_token": "[UNK]"
+ }
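The `added_tokens_decoder` block above pins the five special tokens to ids 0–4 and sets `model_max_length` to 2048. A quick sanity check of that mapping, reusing the `tokenizer` loaded in the README snippet (a sketch, not part of the repository):

```python
special_tokens = ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]']
# Should print [0, 1, 2, 3, 4] given the added_tokens_decoder mapping above
print(tokenizer.convert_tokens_to_ids(special_tokens))
# Should print 2048, per model_max_length in tokenizer_config.json
print(tokenizer.model_max_length)
```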
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f8e466f28d86902c4ea9cd76781236a119b0b5c5c5a05dac56f2e09b550e4b97
+ size 5368
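`training_args.bin` is the `transformers.TrainingArguments` object serialized by the `Trainer`. To inspect the hyperparameters that were used, it can usually be unpickled with `torch.load`; a minimal sketch, assuming compatible transformers/torch versions (on newer torch, `weights_only=False` is required for non-tensor pickles):

```python
import torch
from huggingface_hub import hf_hub_download

# Download and unpickle the saved TrainingArguments (not needed for inference)
args_path = hf_hub_download(repo_id='desllre/ru_news_detection', filename='training_args.bin')
training_args = torch.load(args_path, weights_only=False)
print(training_args.learning_rate, training_args.num_train_epochs)
```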
vocab.txt ADDED
The diff for this file is too large to render. See raw diff