Spaces:
Running
Running
adding app files
Browse files- app.py +95 -0
- arxiv_topics.csv +156 -0
- requirements.txt +3 -0
app.py
ADDED
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
from transformers import pipeline
|
3 |
+
import torch
|
4 |
+
from transformers import AutoModelForSequenceClassification
|
5 |
+
import pandas as pd
|
6 |
+
from typing import Dict
|
7 |
+
from transformers import DistilBertTokenizer
|
8 |
+
from typing import List
|
9 |
+
|
10 |
+
|
11 |
+
# Hugging Face checkpoint name passed to DistilBertTokenizer.from_pretrained in get_tokenizer().
USED_MODEL = "distilbert-base-cased"
|
12 |
+
|
13 |
+
@st.cache_resource  # cache the loaded model across Streamlit reruns
def load_model():
    """Load the fine-tuned multi-label tag classifier.

    The label mappings are derived from the row order of ``arxiv_topics.csv``:
    the tag on row ``i`` gets label index ``i``.

    Returns:
        The sequence-classification model, moved to CUDA when a GPU is
        available and to the CPU otherwise.
    """
    # Reading the CSV locally is very fast, so it is not cached separately
    # (adding a cache for it would be easy if ever needed).
    tags = pd.read_csv('arxiv_topics.csv')['tag'].tolist()
    tag_to_index = {tag: index for index, tag in enumerate(tags)}
    index_to_tag = {index: tag for tag, index in tag_to_index.items()}

    # Do not assume a GPU: unconditionally moving the model to 'cuda' raises
    # on CPU-only hosts.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    return AutoModelForSequenceClassification.from_pretrained(
        "bumchik2/train_distilbert-base-cased-tags-classification-simple",
        problem_type="multi_label_classification",
        num_labels=len(tag_to_index),
        id2label=index_to_tag,
        label2id=tag_to_index,
    ).to(device)
|
29 |
+
|
30 |
+
# Module-level model handle; load_model() is wrapped in st.cache_resource.
model = load_model()
|
31 |
+
|
32 |
+
|
33 |
+
@st.cache_resource()
def get_tokenizer():
    """Return the DistilBERT tokenizer for the ``USED_MODEL`` checkpoint."""
    tokenizer = DistilBertTokenizer.from_pretrained(USED_MODEL)
    return tokenizer
|
36 |
+
|
37 |
+
|
38 |
+
def tokenize_function(text):
    """Tokenize ``text`` with truncation and ``max_length`` padding.

    Returns whatever the tokenizer from ``get_tokenizer()`` produces for
    these settings.
    """
    return get_tokenizer()(text, truncation=True, padding="max_length")
|
41 |
+
|
42 |
+
|
43 |
+
@torch.no_grad()  # called form: the bare decorator is only accepted by recent PyTorch releases
def get_category_probs_dict(model, title: str, summary: str) -> Dict[str, float]:
    """Score an article against every top-level category.

    The title and summary are joined as ``'<title> $ <summary>'``, tokenized
    and fed to ``model``. Per-tag sigmoid scores are normalised to sum to 1
    and then summed per category, using the tag -> category mapping from
    ``arxiv_topics.csv`` (the tag on row ``i`` corresponds to logit ``i``).

    Args:
        model: Sequence-classification model exposing ``.device`` and
            returning an object with ``.logits``.
        title: Article title.
        summary: Article summary / abstract.

    Returns:
        Mapping from category name to its accumulated normalised score.
    """
    # Reading the CSV locally is very fast, so it is not cached
    # (adding a cache for it would be easy if ever needed).
    arxiv_topics_df = pd.read_csv('arxiv_topics.csv')
    categories = arxiv_topics_df['category'].tolist()

    text = f'{title} $ {summary}'
    # Add a batch dimension of 1 and move every input tensor to the model's device.
    model_inputs = {
        name: torch.tensor(values).unsqueeze(0).to(model.device)
        for name, values in tokenize_function(text).items()
    }
    tags_logits = model(**model_inputs).logits

    tags_probs = torch.sigmoid(tags_logits.squeeze().cpu()).numpy()
    # Independent sigmoid scores do not sum to 1; rescale so they can be
    # treated as a distribution over tags.
    tags_probs /= tags_probs.sum()

    category_probs_dict = dict.fromkeys(categories, 0.0)
    for index, category in enumerate(categories):
        category_probs_dict[category] += float(tags_probs[index])
    return category_probs_dict
|
63 |
+
|
64 |
+
|
65 |
+
def get_most_probable_keys(probs_dict: Dict[str, float], target_probability: float, print_probabilities: bool) -> List[str]:
    """Pick the highest-scoring keys until their total exceeds a target.

    Keys are visited in descending order of probability (ties broken by
    descending key). A key is added while the running total accumulated so
    far is still ``<= target_probability``, so the key that pushes the total
    past the target is included.

    Args:
        probs_dict: Mapping from key to its probability.
        target_probability: Cumulative probability to cover.
        print_probabilities: When true, each entry is rendered as
            ``'<key> (<probability>)'`` instead of the bare key.

    Returns:
        The selected keys (or formatted entries), most probable first.
        An empty ``probs_dict`` yields an empty list.
    """
    ranked = sorted(((prob, key) for key, prob in probs_dict.items()), reverse=True)

    answer = []
    cumulative = 0
    for prob, key in ranked:
        if cumulative > target_probability:
            break
        cumulative += prob
        answer.append(f'{key} ({prob})' if print_probabilities else key)
    return answer
|
80 |
+
|
81 |
+
|
82 |
+
# --- Streamlit UI: collect inputs, run the classifier, show the result ---
title = st.text_input("Article title", value="Enter title here...")
summary = st.text_input("Article summary", value="Enter summary here...")

need_to_print_probabilities = st.radio("Need to print probabilities: ", ('Yes', 'No'), index=0)
st.session_state['need_to_print_probabilities'] = need_to_print_probabilities

target_probability = st.slider("Select minimum probability sum", 0.0, 1.0, step=0.01, value=0.95)
# Store the slider value itself, not the literal string 'target_probability'.
st.session_state['target_probability'] = target_probability

# Both fields must be non-empty before the model is run.
if title and summary:
    category_probs_dict = get_category_probs_dict(model=model, title=title, summary=summary)
    result = get_most_probable_keys(
        probs_dict=category_probs_dict,
        target_probability=target_probability,
        print_probabilities=need_to_print_probabilities == 'Yes',
    )
    st.write(result)
|
arxiv_topics.csv
ADDED
@@ -0,0 +1,156 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
tag,topic,category
|
2 |
+
cs.AI,Artificial Intelligence,Computer Science
|
3 |
+
cs.AR,Hardware Architecture,Computer Science
|
4 |
+
cs.CC,Computational Complexity,Computer Science
|
5 |
+
cs.CE,"Computational Engineering, Finance, and Science",Computer Science
|
6 |
+
cs.CG,Computational Geometry,Computer Science
|
7 |
+
cs.CL,Computation and Language,Computer Science
|
8 |
+
cs.CR,Cryptography and Security,Computer Science
|
9 |
+
cs.CV,Computer Vision and Pattern Recognition,Computer Science
|
10 |
+
cs.CY,Computers and Society,Computer Science
|
11 |
+
cs.DB,Databases,Computer Science
|
12 |
+
cs.DC,"Distributed, Parallel, and Cluster Computing",Computer Science
|
13 |
+
cs.DL,Digital Libraries,Computer Science
|
14 |
+
cs.DM,Discrete Mathematics,Computer Science
|
15 |
+
cs.DS,Data Structures and Algorithms,Computer Science
|
16 |
+
cs.ET,Emerging Technologies,Computer Science
|
17 |
+
cs.FL,Formal Languages and Automata Theory,Computer Science
|
18 |
+
cs.GL,General Literature,Computer Science
|
19 |
+
cs.GR,Graphics,Computer Science
|
20 |
+
cs.GT,Computer Science and Game Theory,Computer Science
|
21 |
+
cs.HC,Human-Computer Interaction,Computer Science
|
22 |
+
cs.IR,Information Retrieval,Computer Science
|
23 |
+
cs.IT,Information Theory,Computer Science
|
24 |
+
cs.LG,Machine Learning,Computer Science
|
25 |
+
cs.LO,Logic in Computer Science,Computer Science
|
26 |
+
cs.MA,Multiagent Systems,Computer Science
|
27 |
+
cs.MM,Multimedia,Computer Science
|
28 |
+
cs.MS,Mathematical Software,Computer Science
|
29 |
+
cs.NA,Numerical Analysis,Computer Science
|
30 |
+
cs.NE,Neural and Evolutionary Computing,Computer Science
|
31 |
+
cs.NI,Networking and Internet Architecture,Computer Science
|
32 |
+
cs.OH,Other Computer Science,Computer Science
|
33 |
+
cs.OS,Operating Systems,Computer Science
|
34 |
+
cs.PF,Performance,Computer Science
|
35 |
+
cs.PL,Programming Languages,Computer Science
|
36 |
+
cs.RO,Robotics,Computer Science
|
37 |
+
cs.SC,Symbolic Computation,Computer Science
|
38 |
+
cs.SD,Sound,Computer Science
|
39 |
+
cs.SE,Software Engineering,Computer Science
|
40 |
+
cs.SI,Social and Information Networks,Computer Science
|
41 |
+
cs.SY,Systems and Control,Computer Science
|
42 |
+
econ.EM,Econometrics,Economics
|
43 |
+
econ.GN,General Economics,Economics
|
44 |
+
econ.TH,Theoretical Economics,Economics
|
45 |
+
eess.AS,Audio and Speech Processing,Electrical Engineering and Systems Science
|
46 |
+
eess.IV,Image and Video Processing,Electrical Engineering and Systems Science
|
47 |
+
eess.SP,Signal Processing,Electrical Engineering and Systems Science
|
48 |
+
eess.SY,Systems and Control,Electrical Engineering and Systems Science
|
49 |
+
math.AC,Commutative Algebra,Mathematics
|
50 |
+
math.AG,Algebraic Geometry,Mathematics
|
51 |
+
math.AP,Analysis of PDEs,Mathematics
|
52 |
+
math.AT,Algebraic Topology,Mathematics
|
53 |
+
math.CA,Classical Analysis and ODEs,Mathematics
|
54 |
+
math.CO,Combinatorics,Mathematics
|
55 |
+
math.CT,Category Theory,Mathematics
|
56 |
+
math.CV,Complex Variables,Mathematics
|
57 |
+
math.DG,Differential Geometry,Mathematics
|
58 |
+
math.DS,Dynamical Systems,Mathematics
|
59 |
+
math.FA,Functional Analysis,Mathematics
|
60 |
+
math.GM,General Mathematics,Mathematics
|
61 |
+
math.GN,General Topology,Mathematics
|
62 |
+
math.GR,Group Theory,Mathematics
|
63 |
+
math.GT,Geometric Topology,Mathematics
|
64 |
+
math.HO,History and Overview,Mathematics
|
65 |
+
math.IT,Information Theory,Mathematics
|
66 |
+
math.KT,K-Theory and Homology,Mathematics
|
67 |
+
math.LO,Logic,Mathematics
|
68 |
+
math.MG,Metric Geometry,Mathematics
|
69 |
+
math.MP,Mathematical Physics,Mathematics
|
70 |
+
math.NA,Numerical Analysis,Mathematics
|
71 |
+
math.NT,Number Theory,Mathematics
|
72 |
+
math.OA,Operator Algebras,Mathematics
|
73 |
+
math.OC,Optimization and Control,Mathematics
|
74 |
+
math.PR,Probability,Mathematics
|
75 |
+
math.QA,Quantum Algebra,Mathematics
|
76 |
+
math.RA,Rings and Algebras,Mathematics
|
77 |
+
math.RT,Representation Theory,Mathematics
|
78 |
+
math.SG,Symplectic Geometry,Mathematics
|
79 |
+
math.SP,Spectral Theory,Mathematics
|
80 |
+
math.ST,Statistics Theory,Mathematics
|
81 |
+
astro-ph.CO,Cosmology and Nongalactic Astrophysics,Physics
|
82 |
+
astro-ph.EP,Earth and Planetary Astrophysics,Physics
|
83 |
+
astro-ph.GA,Astrophysics of Galaxies,Physics
|
84 |
+
astro-ph.HE,High Energy Astrophysical Phenomena,Physics
|
85 |
+
astro-ph.IM,Instrumentation and Methods for Astrophysics,Physics
|
86 |
+
astro-ph.SR,Solar and Stellar Astrophysics,Physics
|
87 |
+
cond-mat.dis-nn,Disordered Systems and Neural Networks,Physics
|
88 |
+
cond-mat.mes-hall,Mesoscale and Nanoscale Physics,Physics
|
89 |
+
cond-mat.mtrl-sci,Materials Science,Physics
|
90 |
+
cond-mat.other,Other Condensed Matter,Physics
|
91 |
+
cond-mat.quant-gas,Quantum Gases,Physics
|
92 |
+
cond-mat.soft,Soft Condensed Matter,Physics
|
93 |
+
cond-mat.stat-mech,Statistical Mechanics,Physics
|
94 |
+
cond-mat.str-el,Strongly Correlated Electrons,Physics
|
95 |
+
cond-mat.supr-con,Superconductivity,Physics
|
96 |
+
gr-qc,General Relativity and Quantum Cosmology,Physics
|
97 |
+
hep-ex,High Energy Physics - Experiment,Physics
|
98 |
+
hep-lat,High Energy Physics - Lattice,Physics
|
99 |
+
hep-ph,High Energy Physics - Phenomenology,Physics
|
100 |
+
hep-th,High Energy Physics - Theory,Physics
|
101 |
+
math-ph,Mathematical Physics,Physics
|
102 |
+
nlin.AO,Adaptation and Self-Organizing Systems,Physics
|
103 |
+
nlin.CD,Chaotic Dynamics,Physics
|
104 |
+
nlin.CG,Cellular Automata and Lattice Gases,Physics
|
105 |
+
nlin.PS,Pattern Formation and Solitons,Physics
|
106 |
+
nlin.SI,Exactly Solvable and Integrable Systems,Physics
|
107 |
+
nucl-ex,Nuclear Experiment,Physics
|
108 |
+
nucl-th,Nuclear Theory,Physics
|
109 |
+
physics.acc-ph,Accelerator Physics,Physics
|
110 |
+
physics.ao-ph,Atmospheric and Oceanic Physics,Physics
|
111 |
+
physics.app-ph,Applied Physics,Physics
|
112 |
+
physics.atm-clus,Atomic and Molecular Clusters,Physics
|
113 |
+
physics.atom-ph,Atomic Physics,Physics
|
114 |
+
physics.bio-ph,Biological Physics,Physics
|
115 |
+
physics.chem-ph,Chemical Physics,Physics
|
116 |
+
physics.class-ph,Classical Physics,Physics
|
117 |
+
physics.comp-ph,Computational Physics,Physics
|
118 |
+
physics.data-an,"Data Analysis, Statistics and Probability",Physics
|
119 |
+
physics.ed-ph,Physics Education,Physics
|
120 |
+
physics.flu-dyn,Fluid Dynamics,Physics
|
121 |
+
physics.gen-ph,General Physics,Physics
|
122 |
+
physics.geo-ph,Geophysics,Physics
|
123 |
+
physics.hist-ph,History and Philosophy of Physics,Physics
|
124 |
+
physics.ins-det,Instrumentation and Detectors,Physics
|
125 |
+
physics.med-ph,Medical Physics,Physics
|
126 |
+
physics.optics,Optics,Physics
|
127 |
+
physics.plasm-ph,Plasma Physics,Physics
|
128 |
+
physics.pop-ph,Popular Physics,Physics
|
129 |
+
physics.soc-ph,Physics and Society,Physics
|
130 |
+
physics.space-ph,Space Physics,Physics
|
131 |
+
quant-ph,Quantum Physics,Physics
|
132 |
+
q-bio.BM,Biomolecules,Quantitative Biology
|
133 |
+
q-bio.CB,Cell Behavior,Quantitative Biology
|
134 |
+
q-bio.GN,Genomics,Quantitative Biology
|
135 |
+
q-bio.MN,Molecular Networks,Quantitative Biology
|
136 |
+
q-bio.NC,Neurons and Cognition,Quantitative Biology
|
137 |
+
q-bio.OT,Other Quantitative Biology,Quantitative Biology
|
138 |
+
q-bio.PE,Populations and Evolution,Quantitative Biology
|
139 |
+
q-bio.QM,Quantitative Methods,Quantitative Biology
|
140 |
+
q-bio.SC,Subcellular Processes,Quantitative Biology
|
141 |
+
q-bio.TO,Tissues and Organs,Quantitative Biology
|
142 |
+
q-fin.CP,Computational Finance,Quantitative Finance
|
143 |
+
q-fin.EC,Economics,Quantitative Finance
|
144 |
+
q-fin.GN,General Finance,Quantitative Finance
|
145 |
+
q-fin.MF,Mathematical Finance,Quantitative Finance
|
146 |
+
q-fin.PM,Portfolio Management,Quantitative Finance
|
147 |
+
q-fin.PR,Pricing of Securities,Quantitative Finance
|
148 |
+
q-fin.RM,Risk Management,Quantitative Finance
|
149 |
+
q-fin.ST,Statistical Finance,Quantitative Finance
|
150 |
+
q-fin.TR,Trading and Market Microstructure,Quantitative Finance
|
151 |
+
stat.AP,Applications,Statistics
|
152 |
+
stat.CO,Computation,Statistics
|
153 |
+
stat.ME,Methodology,Statistics
|
154 |
+
stat.ML,Machine Learning,Statistics
|
155 |
+
stat.OT,Other Statistics,Statistics
|
156 |
+
stat.TH,Statistics Theory,Statistics
|
requirements.txt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
transformers
|
2 |
+
torch
|
3 |
+
pandas
|