Spaces:

bumchik2
/

articles_classifier

Running

App Files Community

bumchik2 committed on Apr 4

Commit

b1eb861

1 Parent(s): 97c6f33

final

Browse files

Files changed (2) hide show

app.py +1 -1
notebooks/distilroberta_base_main.ipynb +49 -17

app.py CHANGED Viewed

@@ -59,7 +59,7 @@ def get_category_probs_dict(model, title: str, summary: str) -> Dict[str, float]
             current_index += 1
     index_to_category = {value: key for key, value in category_to_index.items()}
-    text = f'{title} $ {summary}'
     category_logits = model(**{key: torch.tensor(value).to(model.device).unsqueeze(0) for key, value in tokenize_function(text).items()}).logits
     sigmoid = torch.nn.Sigmoid()
     category_probs = sigmoid(category_logits.squeeze().cpu()).numpy()

             current_index += 1
     index_to_category = {value: key for key, value in category_to_index.items()}
+    text = f'{title} $ {summary or ""}'
     category_logits = model(**{key: torch.tensor(value).to(model.device).unsqueeze(0) for key, value in tokenize_function(text).items()}).logits
     sigmoid = torch.nn.Sigmoid()
     category_probs = sigmoid(category_logits.squeeze().cpu()).numpy()

notebooks/distilroberta_base_main.ipynb CHANGED Viewed

@@ -59,7 +59,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -68,7 +68,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -91,7 +91,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -100,7 +100,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
    "metadata": {},
    "outputs": [
     {
@@ -117,7 +117,7 @@
        " 'year': 2018}"
       ]
      },
-     "execution_count": 5,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -135,7 +135,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
    "metadata": {},
    "outputs": [
     {
@@ -215,7 +215,7 @@
        "4  cs.CG                           Computational Geometry  Computer Science"
       ]
      },
-     "execution_count": 6,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -228,7 +228,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -523,7 +523,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -988,13 +988,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
    "metadata": {},
    "outputs": [],
    "source": [
     "@torch.no_grad\n",
     "def get_category_probs_dict(model, title: str, summary: str) -> Dict[str, float]:\n",
-    "    text = f'{title} $ {summary}'\n",
     "    category_logits = model(**{key: torch.tensor(value).to(model.device).unsqueeze(0) for key, value in tokenize_function(text).items()}).logits\n",
     "    sigmoid = torch.nn.Sigmoid()\n",
     "    category_probs = sigmoid(category_logits.squeeze().cpu()).numpy()\n",
@@ -1007,7 +1007,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1071,7 +1071,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1086,7 +1086,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
    "metadata": {},
    "outputs": [
     {
@@ -1098,7 +1098,7 @@
        " 'Physics (0.07676041126251221)']"
       ]
      },
-     "execution_count": 15,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1118,7 +1118,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
    "metadata": {},
    "outputs": [
     {
@@ -1130,7 +1130,7 @@
        " 'Statistics (0.02984526939690113)']"
       ]
      },
-     "execution_count": 16,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1148,6 +1148,38 @@
     ")"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,

   },
   {
    "cell_type": "code",
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
   },
   {
    "cell_type": "code",
+   "execution_count": 12,
    "metadata": {},
    "outputs": [],
    "source": [
   },
   {
    "cell_type": "code",
+   "execution_count": 14,
    "metadata": {},
    "outputs": [],
    "source": [
   },
   {
    "cell_type": "code",
+   "execution_count": 15,
    "metadata": {},
    "outputs": [
     {
        " 'year': 2018}"
       ]
      },
+     "execution_count": 15,
      "metadata": {},
      "output_type": "execute_result"
     }
   },
   {
    "cell_type": "code",
+   "execution_count": 16,
    "metadata": {},
    "outputs": [
     {
        "4  cs.CG                           Computational Geometry  Computer Science"
       ]
      },
+     "execution_count": 16,
      "metadata": {},
      "output_type": "execute_result"
     }
   },
   {
    "cell_type": "code",
+   "execution_count": 17,
    "metadata": {},
    "outputs": [],
    "source": [
   },
   {
    "cell_type": "code",
+   "execution_count": 18,
    "metadata": {},
    "outputs": [],
    "source": [
   },
   {
    "cell_type": "code",
+   "execution_count": 22,
    "metadata": {},
    "outputs": [],
    "source": [
     "@torch.no_grad\n",
     "def get_category_probs_dict(model, title: str, summary: str) -> Dict[str, float]:\n",
+    "    text = f'{title} $ {summary or \"\"}'\n",
     "    category_logits = model(**{key: torch.tensor(value).to(model.device).unsqueeze(0) for key, value in tokenize_function(text).items()}).logits\n",
     "    sigmoid = torch.nn.Sigmoid()\n",
     "    category_probs = sigmoid(category_logits.squeeze().cpu()).numpy()\n",
   },
   {
    "cell_type": "code",
+   "execution_count": 21,
    "metadata": {},
    "outputs": [],
    "source": [
   },
   {
    "cell_type": "code",
+   "execution_count": 19,
    "metadata": {},
    "outputs": [],
    "source": [
   },
   {
    "cell_type": "code",
+   "execution_count": 23,
    "metadata": {},
    "outputs": [
     {
        " 'Physics (0.07676041126251221)']"
       ]
      },
+     "execution_count": 23,
      "metadata": {},
      "output_type": "execute_result"
     }
   },
   {
    "cell_type": "code",
+   "execution_count": 24,
    "metadata": {},
    "outputs": [
     {
        " 'Statistics (0.02984526939690113)']"
       ]
      },
+     "execution_count": 24,
      "metadata": {},
      "output_type": "execute_result"
     }
     ")"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['Quantitative Biology (0.45450547337532043)',\n",
+       " 'Computer Science (0.3519783318042755)',\n",
+       " 'Physics (0.07536326348781586)',\n",
+       " 'Statistics (0.06953499466180801)']"
+      ]
+     },
+     "execution_count": 26,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# правильный ответ Quantitative Biology\n",
+    "get_most_probable_keys(\n",
+    "    probs_dict=get_category_probs_dict(\n",
+    "        model=model,\n",
+    "        title='Simulating cell populations with explicit cell cycle length -- implications to cell cycle dependent tumour therapy',\n",
+    "        summary=''\n",
+    "    ),\n",
+    "    target_probability=0.95,\n",
+    "    print_probabilities=True\n",
+    ")"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,