bumchik2 committed on
Commit
7e18220
·
1 Parent(s): 49e968d

adding app files

Browse files
Files changed (3) hide show
  1. app.py +95 -0
  2. arxiv_topics.csv +156 -0
  3. requirements.txt +3 -0
app.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from transformers import pipeline
3
+ import torch
4
+ from transformers import AutoModelForSequenceClassification
5
+ import pandas as pd
6
+ from typing import Dict
7
+ from transformers import DistilBertTokenizer
8
+ from typing import List
9
+
10
+
11
+ USED_MODEL = "distilbert-base-cased"
12
+
13
@st.cache_resource  # cache the loaded model across Streamlit reruns
def load_model():
    """Load the fine-tuned arXiv tag classifier and place it on a usable device.

    The label mappings are derived from ``arxiv_topics.csv``: every tag is
    mapped to its row position in that file.

    Returns:
        The ``AutoModelForSequenceClassification`` instance, moved to CUDA
        when it is available and to the CPU otherwise.
    """
    # Reading the CSV locally is very fast, so it is not cached separately
    # (although adding that would probably not be hard).
    arxiv_topics_df = pd.read_csv('arxiv_topics.csv')
    tag_to_index = {tag: index for index, tag in enumerate(arxiv_topics_df['tag'])}
    index_to_tag = {index: tag for tag, index in tag_to_index.items()}

    # A hard-coded 'cuda' device crashes on CPU-only hosts; fall back to CPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    return AutoModelForSequenceClassification.from_pretrained(
        "bumchik2/train_distilbert-base-cased-tags-classification-simple",
        problem_type="multi_label_classification",
        num_labels=len(tag_to_index),
        id2label=index_to_tag,
        label2id=tag_to_index,
    ).to(device)


model = load_model()
31
+
32
+
33
@st.cache_resource()
def get_tokenizer():
    """Return the DistilBERT tokenizer for ``USED_MODEL`` (cached by Streamlit)."""
    distilbert_tokenizer = DistilBertTokenizer.from_pretrained(USED_MODEL)
    return distilbert_tokenizer
36
+
37
+
38
def tokenize_function(text):
    """Tokenize ``text`` with the shared tokenizer.

    The text is padded to the tokenizer's maximum length and truncated when
    it is longer than that.
    """
    encode = get_tokenizer()
    encoded_text = encode(text, padding="max_length", truncation=True)
    return encoded_text
41
+
42
+
43
@torch.no_grad()  # called form: the bare decorator is not supported by every PyTorch release
def get_category_probs_dict(model, title: str, summary: str) -> Dict[str, float]:
    """Score an article and aggregate the tag probabilities per category.

    The title and summary are joined as ``'<title> $ <summary>'``, tokenized
    and passed through ``model``. The sigmoid of each tag logit is normalised
    so that all tag scores sum to 1, and the scores are then summed per
    category as listed in ``arxiv_topics.csv``.

    Args:
        model: Classifier that exposes ``.device`` and whose call result has
            a ``.logits`` attribute. Presumably it yields one logit per CSV
            row, in row order -- verify against the training setup.
        title: Article title.
        summary: Article summary.

    Returns:
        Mapping from category name to the summed normalised score of its tags.
    """
    # Reading the CSV locally is very fast, so it is not cached
    # (although adding that would probably not be hard).
    arxiv_topics_df = pd.read_csv('arxiv_topics.csv')
    categories = arxiv_topics_df['category'].tolist()

    text = f'{title} $ {summary}'
    # Add a leading batch dimension and move every input to the model's device.
    model_inputs = {
        key: torch.tensor(value).to(model.device).unsqueeze(0)
        for key, value in tokenize_function(text).items()
    }
    tags_logits = model(**model_inputs).logits

    tags_probs = torch.sigmoid(tags_logits.squeeze().cpu()).numpy()
    tags_probs /= tags_probs.sum()

    category_probs_dict = {category: 0.0 for category in set(categories)}
    # Row i of the CSV corresponds to output index i of the model.
    for index, category in enumerate(categories):
        category_probs_dict[category] += float(tags_probs[index])
    return category_probs_dict
63
+
64
+
65
def get_most_probable_keys(probs_dict: Dict[str, float], target_probability: float, print_probabilities: bool) -> List[str]:
    """Return the most probable keys whose cumulative probability exceeds a target.

    Keys are taken in order of decreasing probability (ties are broken by
    decreasing key). A key is added for as long as the probability accumulated
    so far is less than or equal to ``target_probability``, so a non-empty
    input always yields at least one key.

    Args:
        probs_dict: Mapping from key to its probability.
        target_probability: Cumulative probability to exceed.
        print_probabilities: When true, each entry is formatted as
            ``'<key> (<probability>)'`` instead of the bare key.

    Returns:
        The selected keys (or formatted entries), most probable first.
        An empty ``probs_dict`` yields an empty list.
    """
    ranked = sorted(((prob, key) for key, prob in probs_dict.items()), reverse=True)
    answer = []
    accumulated = 0
    for prob, key in ranked:
        if accumulated > target_probability:
            break
        accumulated += prob
        answer.append(f'{key} ({prob})' if print_probabilities else key)
    return answer
80
+
81
+
82
# --- Streamlit UI -----------------------------------------------------------
# The hint text is passed as a placeholder rather than as the initial value:
# with `value=...` both fields are non-empty on first load, so the guard
# below would always pass and the hint text itself would be classified.
title = st.text_input("Article title", placeholder="Enter title here...")
summary = st.text_input("Article summary", placeholder="Enter summary here...")

need_to_print_probabilities = st.radio("Need to print probabilities: ", ('Yes', 'No'), index=0)
st.session_state['need_to_print_probabilities'] = need_to_print_probabilities

target_probability = st.slider("Select minimum probability sum", 0.0, 1.0, step=0.01, value=0.95)
# Store the slider value itself, not the literal string 'target_probability'.
st.session_state['target_probability'] = target_probability


# Run the classifier only once the user has filled in both fields.
if title and summary:
    category_probs_dict = get_category_probs_dict(model=model, title=title, summary=summary)
    result = get_most_probable_keys(
        probs_dict=category_probs_dict,
        target_probability=target_probability,
        print_probabilities=need_to_print_probabilities == 'Yes',
    )
    st.write(result)
arxiv_topics.csv ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ tag,topic,category
2
+ cs.AI,Artificial Intelligence,Computer Science
3
+ cs.AR,Hardware Architecture,Computer Science
4
+ cs.CC,Computational Complexity,Computer Science
5
+ cs.CE,"Computational Engineering, Finance, and Science",Computer Science
6
+ cs.CG,Computational Geometry,Computer Science
7
+ cs.CL,Computation and Language,Computer Science
8
+ cs.CR,Cryptography and Security,Computer Science
9
+ cs.CV,Computer Vision and Pattern Recognition,Computer Science
10
+ cs.CY,Computers and Society,Computer Science
11
+ cs.DB,Databases,Computer Science
12
+ cs.DC,"Distributed, Parallel, and Cluster Computing",Computer Science
13
+ cs.DL,Digital Libraries,Computer Science
14
+ cs.DM,Discrete Mathematics,Computer Science
15
+ cs.DS,Data Structures and Algorithms,Computer Science
16
+ cs.ET,Emerging Technologies,Computer Science
17
+ cs.FL,Formal Languages and Automata Theory,Computer Science
18
+ cs.GL,General Literature,Computer Science
19
+ cs.GR,Graphics,Computer Science
20
+ cs.GT,Computer Science and Game Theory,Computer Science
21
+ cs.HC,Human-Computer Interaction,Computer Science
22
+ cs.IR,Information Retrieval,Computer Science
23
+ cs.IT,Information Theory,Computer Science
24
+ cs.LG,Machine Learning,Computer Science
25
+ cs.LO,Logic in Computer Science,Computer Science
26
+ cs.MA,Multiagent Systems,Computer Science
27
+ cs.MM,Multimedia,Computer Science
28
+ cs.MS,Mathematical Software,Computer Science
29
+ cs.NA,Numerical Analysis,Computer Science
30
+ cs.NE,Neural and Evolutionary Computing,Computer Science
31
+ cs.NI,Networking and Internet Architecture,Computer Science
32
+ cs.OH,Other Computer Science,Computer Science
33
+ cs.OS,Operating Systems,Computer Science
34
+ cs.PF,Performance,Computer Science
35
+ cs.PL,Programming Languages,Computer Science
36
+ cs.RO,Robotics,Computer Science
37
+ cs.SC,Symbolic Computation,Computer Science
38
+ cs.SD,Sound,Computer Science
39
+ cs.SE,Software Engineering,Computer Science
40
+ cs.SI,Social and Information Networks,Computer Science
41
+ cs.SY,Systems and Control,Computer Science
42
+ econ.EM,Econometrics,Economics
43
+ econ.GN,General Economics,Economics
44
+ econ.TH,Theoretical Economics,Economics
45
+ eess.AS,Audio and Speech Processing,Electrical Engineering and Systems Science
46
+ eess.IV,Image and Video Processing,Electrical Engineering and Systems Science
47
+ eess.SP,Signal Processing,Electrical Engineering and Systems Science
48
+ eess.SY,Systems and Control,Electrical Engineering and Systems Science
49
+ math.AC,Commutative Algebra,Mathematics
50
+ math.AG,Algebraic Geometry,Mathematics
51
+ math.AP,Analysis of PDEs,Mathematics
52
+ math.AT,Algebraic Topology,Mathematics
53
+ math.CA,Classical Analysis and ODEs,Mathematics
54
+ math.CO,Combinatorics,Mathematics
55
+ math.CT,Category Theory,Mathematics
56
+ math.CV,Complex Variables,Mathematics
57
+ math.DG,Differential Geometry,Mathematics
58
+ math.DS,Dynamical Systems,Mathematics
59
+ math.FA,Functional Analysis,Mathematics
60
+ math.GM,General Mathematics,Mathematics
61
+ math.GN,General Topology,Mathematics
62
+ math.GR,Group Theory,Mathematics
63
+ math.GT,Geometric Topology,Mathematics
64
+ math.HO,History and Overview,Mathematics
65
+ math.IT,Information Theory,Mathematics
66
+ math.KT,K-Theory and Homology,Mathematics
67
+ math.LO,Logic,Mathematics
68
+ math.MG,Metric Geometry,Mathematics
69
+ math.MP,Mathematical Physics,Mathematics
70
+ math.NA,Numerical Analysis,Mathematics
71
+ math.NT,Number Theory,Mathematics
72
+ math.OA,Operator Algebras,Mathematics
73
+ math.OC,Optimization and Control,Mathematics
74
+ math.PR,Probability,Mathematics
75
+ math.QA,Quantum Algebra,Mathematics
76
+ math.RA,Rings and Algebras,Mathematics
77
+ math.RT,Representation Theory,Mathematics
78
+ math.SG,Symplectic Geometry,Mathematics
79
+ math.SP,Spectral Theory,Mathematics
80
+ math.ST,Statistics Theory,Mathematics
81
+ astro-ph.CO,Cosmology and Nongalactic Astrophysics,Physics
82
+ astro-ph.EP,Earth and Planetary Astrophysics,Physics
83
+ astro-ph.GA,Astrophysics of Galaxies,Physics
84
+ astro-ph.HE,High Energy Astrophysical Phenomena,Physics
85
+ astro-ph.IM,Instrumentation and Methods for Astrophysics,Physics
86
+ astro-ph.SR,Solar and Stellar Astrophysics,Physics
87
+ cond-mat.dis-nn,Disordered Systems and Neural Networks,Physics
88
+ cond-mat.mes-hall,Mesoscale and Nanoscale Physics,Physics
89
+ cond-mat.mtrl-sci,Materials Science,Physics
90
+ cond-mat.other,Other Condensed Matter,Physics
91
+ cond-mat.quant-gas,Quantum Gases,Physics
92
+ cond-mat.soft,Soft Condensed Matter,Physics
93
+ cond-mat.stat-mech,Statistical Mechanics,Physics
94
+ cond-mat.str-el,Strongly Correlated Electrons,Physics
95
+ cond-mat.supr-con,Superconductivity,Physics
96
+ gr-qc,General Relativity and Quantum Cosmology,Physics
97
+ hep-ex,High Energy Physics - Experiment,Physics
98
+ hep-lat,High Energy Physics - Lattice,Physics
99
+ hep-ph,High Energy Physics - Phenomenology,Physics
100
+ hep-th,High Energy Physics - Theory,Physics
101
+ math-ph,Mathematical Physics,Physics
102
+ nlin.AO,Adaptation and Self-Organizing Systems,Physics
103
+ nlin.CD,Chaotic Dynamics,Physics
104
+ nlin.CG,Cellular Automata and Lattice Gases,Physics
105
+ nlin.PS,Pattern Formation and Solitons,Physics
106
+ nlin.SI,Exactly Solvable and Integrable Systems,Physics
107
+ nucl-ex,Nuclear Experiment,Physics
108
+ nucl-th,Nuclear Theory,Physics
109
+ physics.acc-ph,Accelerator Physics,Physics
110
+ physics.ao-ph,Atmospheric and Oceanic Physics,Physics
111
+ physics.app-ph,Applied Physics,Physics
112
+ physics.atm-clus,Atomic and Molecular Clusters,Physics
113
+ physics.atom-ph,Atomic Physics,Physics
114
+ physics.bio-ph,Biological Physics,Physics
115
+ physics.chem-ph,Chemical Physics,Physics
116
+ physics.class-ph,Classical Physics,Physics
117
+ physics.comp-ph,Computational Physics,Physics
118
+ physics.data-an,"Data Analysis, Statistics and Probability",Physics
119
+ physics.ed-ph,Physics Education,Physics
120
+ physics.flu-dyn,Fluid Dynamics,Physics
121
+ physics.gen-ph,General Physics,Physics
122
+ physics.geo-ph,Geophysics,Physics
123
+ physics.hist-ph,History and Philosophy of Physics,Physics
124
+ physics.ins-det,Instrumentation and Detectors,Physics
125
+ physics.med-ph,Medical Physics,Physics
126
+ physics.optics,Optics,Physics
127
+ physics.plasm-ph,Plasma Physics,Physics
128
+ physics.pop-ph,Popular Physics,Physics
129
+ physics.soc-ph,Physics and Society,Physics
130
+ physics.space-ph,Space Physics,Physics
131
+ quant-ph,Quantum Physics,Physics
132
+ q-bio.BM,Biomolecules,Quantitative Biology
133
+ q-bio.CB,Cell Behavior,Quantitative Biology
134
+ q-bio.GN,Genomics,Quantitative Biology
135
+ q-bio.MN,Molecular Networks,Quantitative Biology
136
+ q-bio.NC,Neurons and Cognition,Quantitative Biology
137
+ q-bio.OT,Other Quantitative Biology,Quantitative Biology
138
+ q-bio.PE,Populations and Evolution,Quantitative Biology
139
+ q-bio.QM,Quantitative Methods,Quantitative Biology
140
+ q-bio.SC,Subcellular Processes,Quantitative Biology
141
+ q-bio.TO,Tissues and Organs,Quantitative Biology
142
+ q-fin.CP,Computational Finance,Quantitative Finance
143
+ q-fin.EC,Economics,Quantitative Finance
144
+ q-fin.GN,General Finance,Quantitative Finance
145
+ q-fin.MF,Mathematical Finance,Quantitative Finance
146
+ q-fin.PM,Portfolio Management,Quantitative Finance
147
+ q-fin.PR,Pricing of Securities,Quantitative Finance
148
+ q-fin.RM,Risk Management,Quantitative Finance
149
+ q-fin.ST,Statistical Finance,Quantitative Finance
150
+ q-fin.TR,Trading and Market Microstructure,Quantitative Finance
151
+ stat.AP,Applications,Statistics
152
+ stat.CO,Computation,Statistics
153
+ stat.ME,Methodology,Statistics
154
+ stat.ML,Machine Learning,Statistics
155
+ stat.OT,Other Statistics,Statistics
156
+ stat.TH,Statistics Theory,Statistics
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ transformers
2
+ torch
3
+ pandas