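"""Streamlit app that classifies an article (title + abstract) into eight
subject categories using roberta-large-mnli with a custom classification head.

Expects the fine-tuned weights in ``model.pt`` next to this script."""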
import pandas as pd
import streamlit as st
import torch
import torch.nn as nn
from transformers import RobertaTokenizer, RobertaModel


# Cache the model so it is built and loaded only once per session.
# (On newer Streamlit releases, st.cache_resource replaces st.cache.)
@st.cache(suppress_st_warning=True)
def init_model():
    model = RobertaModel.from_pretrained("roberta-large-mnli")

    # Replace the default pooler with a small classification head that maps
    # the 1024-dim hidden states to scores for the 8 categories.
    model.pooler = nn.Sequential(
        nn.Linear(1024, 256),
        nn.LayerNorm(256),
        nn.ReLU(),
        nn.Linear(256, 8),
        nn.Sigmoid()
    )

    # Load the fine-tuned weights on CPU and switch to inference mode.
    model_path = "model.pt"
    model.load_state_dict(torch.load(model_path, map_location=torch.device("cpu")))
    model.eval()
    return model

cats = ["Computer Science", "Economics", "Electrical Engineering", 
        "Mathematics", "Physics", "Biology", "Finance", "Statistics"]

def predict(outputs):
    top = 0
    temp = 100000
    # Approximate prior over the categories: raw per-category counts pushed
    # through a very high-temperature softmax, used to rescale the raw scores.
    apr_probs = torch.nn.functional.softmax(
        torch.tensor([39253., 84., 220., 2263., 1214., 909., 66., 10661.]) / temp, dim=0)
    probs = nn.functional.softmax(outputs / apr_probs, dim=1).tolist()[0]

    top_cats = []
    top_probs = []

    first = True
    write_cs = False
    # Walk through categories from most to least probable and keep them
    # until the cumulative probability mass reaches 95%.
    for prob, cat in sorted(zip(probs, cats), reverse=True):
        if first:
            # Remember whether Computer Science came out on top.
            if cat == "Computer Science":
                write_cs = True
            first = False
        if top < 95:
            percent = prob * 100
            top += percent
            top_cats.append(cat)
            top_probs.append(str(round(percent, 1)))

    res = pd.DataFrame(top_probs, index=top_cats, columns=["Percent"])
    st.write(res)

    if write_cs:
        st.write("Today everything is connected with Computer Science")

# Tokenizer and fine-tuned model are created once at app start-up.
tokenizer = RobertaTokenizer.from_pretrained("roberta-large-mnli")
model = init_model()

st.title("Article classifier")
st.markdown("<img width=500px src='https://lionbridge.ai/wp-content/uploads/2020/09/2020-09-08_text-classification-tools-services.jpg' class='center'>", unsafe_allow_html=True)
st.markdown("### Title")

title = st.text_area("Enter title (required)", height=20)

st.markdown("### Abstract")

abstract = st.text_area("Enter abstract", height=200)

if not title:
    st.warning("Please fill in required fields")
else:
    try:
        st.markdown("### Result")
        # RoBERTa accepts at most 512 positions, so truncate longer inputs.
        encoded_input = tokenizer(title + ". " + abstract, return_tensors="pt", padding=True,
                                  max_length=512, truncation=True)
        with torch.no_grad():
            # The replaced pooler is applied token-wise; keep only the first
            # (<s>/CLS) position, which carries the 8 category scores.
            outputs = model(**encoded_input).pooler_output[:, 0, :]
            predict(outputs)
    except Exception:
        st.error("Something went wrong. Try different text or contact me. Telegram: @rrevoid")