Upload folder using huggingface_hub
- README.md +51 -0
- config.json +27 -0
- encoder.pkl +3 -0
- model.safetensors +3 -0
- special_tokens_map.json +37 -0
- tokenizer.json +0 -0
- tokenizer_config.json +65 -0
- training_args.bin +3 -0
- vocab.txt +0 -0
README.md
ADDED
@@ -0,0 +1,51 @@
+## Russian news detection
+
+### About
+
+- Based on `cointegrated/rubert-tiny2`
+- Fine-tuned on social-media texts and news texts from the [Taiga](https://tatianashavrina.github.io/taiga_site/) corpus
+- Metrics on the validation set:
+
+| Accuracy | Precision | Recall   | F1-score |
+| -------- | --------- | -------- | -------- |
+| 0.996342 | 0.999747  | 0.993717 | 0.996723 |
+
+### Getting started
+
+```python
+from huggingface_hub import hf_hub_download
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+import torch
+import pickle
+
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+model_path = 'desllre/ru_news_detection'
+
+# The label encoder ships with the model as a pickled file
+encoder_path = hf_hub_download(repo_id=model_path, filename="encoder.pkl")
+with open(encoder_path, "rb") as f:
+    encoder = pickle.load(f)
+
+tokenizer = AutoTokenizer.from_pretrained(model_path)
+classifier = AutoModelForSequenceClassification.from_pretrained(model_path).to(device)
+
+# Sample input: a Russian news item
+text = 'Tesla дала добро на взлом ПО своих автомобилей\n\nКомпания изменила условия программы Bug Bounty, предусматривающей выплату вознаграждений за поиск уязвимостей. Теперь энтузиасты могут взламывать электрокары Tesla, не боясь отзыва гарантии. Более того, в соответствии с новой политикой компании, автопроизводитель будет перепрошивать автомобили, ПО которых вышло из строя в процессе экспериментов специалистов кибербезопасности.\n\nИзменения в политике компании Telsa очень тепло встретили представители индустрии.'
+
+tokenized = tokenizer(text, truncation=True, max_length=512, return_tensors='pt')
+tokenized = {key: value.to(device) for key, value in tokenized.items()}
+with torch.no_grad():
+    output = classifier(**tokenized)
+
+# Map the predicted class id back to its string label
+predicted_class_id = torch.argmax(output.logits, dim=1).item()
+label = encoder.inverse_transform([predicted_class_id])[0]
+
+print(label)
+```
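The README snippet classifies one text at a time. For many texts, a padded batch is more efficient; below is a minimal sketch reusing the `tokenizer`, `classifier`, `encoder`, and `device` objects defined above (the sample strings are invented for illustration).

```python
# Batched variant of the README snippet; assumes `tokenizer`, `classifier`,
# `encoder`, and `device` are already defined as above.
texts = [
    'Курс рубля укрепился после заседания Центробанка',  # made-up news-like text
    'Привет! Пойдем вечером гулять в парк?',             # made-up chat-like text
]

batch = tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors='pt')
batch = {key: value.to(device) for key, value in batch.items()}

with torch.no_grad():
    logits = classifier(**batch).logits

predicted_ids = torch.argmax(logits, dim=1).tolist()
for text, label in zip(texts, encoder.inverse_transform(predicted_ids)):
    print(f'{label}: {text[:40]}')
```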
config.json
ADDED
@@ -0,0 +1,27 @@
+{
+  "architectures": [
+    "BertForSequenceClassification"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "classifier_dropout": null,
+  "emb_size": 312,
+  "gradient_checkpointing": false,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 312,
+  "initializer_range": 0.02,
+  "intermediate_size": 600,
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 2048,
+  "model_type": "bert",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 3,
+  "pad_token_id": 0,
+  "position_embedding_type": "absolute",
+  "problem_type": "single_label_classification",
+  "torch_dtype": "float32",
+  "transformers_version": "4.51.3",
+  "type_vocab_size": 2,
+  "use_cache": true,
+  "vocab_size": 83828
+}
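The config describes a compact rubert-tiny2-style encoder: 3 transformer layers, hidden size 312, 12 attention heads, and 2048 position embeddings over an 83,828-token vocabulary. To inspect the architecture without downloading the full weights, loading the config alone suffices; a minimal sketch, assuming the repo id from the README:

```python
from transformers import AutoConfig

# Fetches only config.json, not the ~116 MB model.safetensors
config = AutoConfig.from_pretrained('desllre/ru_news_detection')
print(config.num_hidden_layers)        # 3
print(config.hidden_size)              # 312
print(config.max_position_embeddings)  # 2048
print(config.vocab_size)               # 83828
```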
encoder.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c3efb12a29b5b3481414b87f99cebd129162d04824212bc193ee08d641919ff8
+size 258
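Like `model.safetensors` and `training_args.bin` below, this file is stored as a Git LFS pointer: the repository keeps only a spec version line, the SHA-256 `oid`, and the byte `size`, while the blob itself lives in LFS storage (`hf_hub_download` resolves the pointer transparently). The format is simple enough to parse by hand; a sketch with a hypothetical `parse_lfs_pointer` helper:

```python
# Hypothetical helper: split a Git LFS pointer into its three fields.
def parse_lfs_pointer(text: str) -> dict:
    fields = dict(line.split(' ', 1) for line in text.strip().splitlines())
    return {
        'version': fields['version'],
        'oid': fields['oid'].removeprefix('sha256:'),
        'size': int(fields['size']),
    }

pointer = (
    'version https://git-lfs.github.com/spec/v1\n'
    'oid sha256:c3efb12a29b5b3481414b87f99cebd129162d04824212bc193ee08d641919ff8\n'
    'size 258\n'
)
print(parse_lfs_pointer(pointer))  # {'version': ..., 'oid': '...', 'size': 258}
```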
model.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:90c52166c0edb3e2de30c642eea39be43b14fe2990c63025561a781787814ef9
+size 116784136
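The ~116 MB weights file can also be read directly with the `safetensors` library, bypassing `AutoModelForSequenceClassification`; a minimal sketch:

```python
from huggingface_hub import hf_hub_download
from safetensors.torch import load_file

path = hf_hub_download(repo_id='desllre/ru_news_detection', filename='model.safetensors')
state = load_file(path)  # dict of parameter name -> torch.Tensor

print(len(state), 'tensors')
for name in list(state)[:5]:  # peek at the first few parameter shapes
    print(name, tuple(state[name].shape))
```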
special_tokens_map.json
ADDED
@@ -0,0 +1,37 @@
+{
+  "cls_token": {
+    "content": "[CLS]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "mask_token": {
+    "content": "[MASK]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "content": "[SEP]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "[UNK]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
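These are the standard BERT special tokens; the tokenizer wraps every encoded sequence in `[CLS]` ... `[SEP]` automatically (ids 2 and 3 per `tokenizer_config.json` below). A quick sketch to see this, assuming the repo id from the README:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('desllre/ru_news_detection')
ids = tokenizer('пример текста')['input_ids']
print(tokenizer.convert_ids_to_tokens(ids))
# ['[CLS]', ..., '[SEP]'] — special tokens added automatically
```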
tokenizer.json
ADDED
The diff for this file is too large to render.
tokenizer_config.json
ADDED
@@ -0,0 +1,65 @@
+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "[UNK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "[CLS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "[SEP]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "4": {
+      "content": "[MASK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "clean_up_tokenization_spaces": false,
+  "cls_token": "[CLS]",
+  "do_basic_tokenize": true,
+  "do_lower_case": false,
+  "extra_special_tokens": {},
+  "mask_token": "[MASK]",
+  "max_length": 512,
+  "model_max_length": 2048,
+  "never_split": null,
+  "pad_to_multiple_of": null,
+  "pad_token": "[PAD]",
+  "pad_token_type_id": 0,
+  "padding_side": "right",
+  "sep_token": "[SEP]",
+  "stride": 0,
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "BertTokenizer",
+  "truncation_side": "right",
+  "truncation_strategy": "longest_first",
+  "unk_token": "[UNK]"
+}
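Two length settings coexist here: `model_max_length` is 2048, matching `max_position_embeddings` in `config.json`, while the stored `max_length` of 512 likely reflects the truncation length used during fine-tuning. Passing an explicit `max_length` when tokenizing, as the README snippet does, keeps inputs within both bounds; a sketch:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('desllre/ru_news_detection')
print(tokenizer.model_max_length)  # 2048

long_text = 'слово ' * 3000  # artificially over-long input
enc = tokenizer(long_text, truncation=True, max_length=512, return_tensors='pt')
print(enc['input_ids'].shape)  # torch.Size([1, 512]); truncation_side is "right"
```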
training_args.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f8e466f28d86902c4ea9cd76781236a119b0b5c5c5a05dac56f2e09b550e4b97
+size 5368
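`training_args.bin` is the pickled `TrainingArguments` object that the `transformers` `Trainer` writes next to its checkpoints; it records hyperparameters such as the learning rate and epoch count. It can be unpickled for inspection, though `torch.load(..., weights_only=False)` executes arbitrary pickle code, so only do this for repositories you trust. A sketch:

```python
import torch
from huggingface_hub import hf_hub_download

path = hf_hub_download(repo_id='desllre/ru_news_detection', filename='training_args.bin')
# Unpickles a transformers.TrainingArguments object (requires transformers installed)
args = torch.load(path, weights_only=False)
print(args.learning_rate, args.num_train_epochs, args.per_device_train_batch_size)
```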
vocab.txt
ADDED
The diff for this file is too large to render.