1NEYRON1 committed on
Commit
6a4a9f2
·
1 Parent(s): 0c1b0b7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +180 -194
app.py CHANGED
@@ -1,147 +1,148 @@
1
  import streamlit as st
2
  from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
3
 
4
- id_to_cat = {0: 'Cryptography and Security',
5
- 1: 'Medical Physics',
6
- 2: 'Audio and Speech Processing',
7
- 3: 'Combinatorics',
8
- 4: 'Information Theory',
9
- 5: 'Quantum Physics',
10
- 6: 'Nuclear Theory',
11
- 7: 'Computers and Society',
12
- 8: 'Pattern Formation and Solitons',
13
- 9: 'General Finance',
14
- 10: 'Multiagent Systems',
15
- 11: 'Trading and Market Microstructure',
16
- 12: 'Mesoscale and Nanoscale Physics',
17
- 13: 'Instrumentation and Detectors',
18
- 14: 'Emerging Technologies',
19
- 15: 'Software Engineering',
20
- 16: 'Computational Physics',
21
- 17: 'Econometrics',
22
- 18: 'Materials Science',
23
- 19: 'Computer Vision and Pattern Recognition',
24
- 20: 'Differential Geometry',
25
- 21: 'General Literature',
26
- 22: 'Computation and Language',
27
- 23: 'Superconductivity',
28
- 24: 'Risk Management',
29
- 25: 'Other Condensed Matter',
30
- 26: 'Other Quantitative Biology',
31
- 27: 'High Energy Physics - Phenomenology',
32
- 28: 'Analysis of PDEs',
33
- 29: 'Earth and Planetary Astrophysics',
34
- 30: 'Optics',
35
- 31: 'Hardware Architecture',
36
- 32: 'Optimization and Control',
37
- 33: 'Methodology',
38
- 34: 'Number Theory',
39
- 35: 'General Topology',
40
- 36: 'Populations and Evolution',
41
- 37: 'Solar and Stellar Astrophysics',
42
- 38: 'Distributed, Parallel, and Cluster Computing',
43
- 39: 'Chaotic Dynamics',
44
- 40: 'History and Philosophy of Physics',
45
- 41: 'Computational Engineering, Finance, and Science',
46
- 42: 'Discrete Mathematics',
47
- 43: 'Statistical Mechanics',
48
- 44: 'Operating Systems',
49
- 45: 'Data Structures and Algorithms',
50
- 46: 'Geophysics',
51
- 47: 'Quantum Algebra',
52
- 48: 'Systems and Control',
53
- 49: 'Statistics Theory',
54
- 50: 'High Energy Physics - Theory',
55
- 51: 'Rings and Algebras',
56
- 52: 'Neural and Evolutionary Computing',
57
- 53: 'General Physics',
58
- 54: 'Computational Geometry',
59
- 55: 'Signal Processing',
60
- 56: 'Computational Finance',
61
- 57: 'History and Overview',
62
- 58: 'Space Physics',
63
- 59: 'Physics and Society',
64
- 60: 'Cosmology and Nongalactic Astrophysics',
65
- 61: 'Information Retrieval',
66
- 62: 'Symbolic Computation',
67
- 63: 'Statistical Finance',
68
- 64: 'Image and Video Processing',
69
- 65: 'Quantum Gases',
70
- 66: 'Artificial Intelligence',
71
- 67: 'Nuclear Experiment',
72
- 68: 'General Mathematics',
73
- 69: 'Complex Variables',
74
- 70: 'Logic in Computer Science',
75
- 71: 'Data Analysis, Statistics and Probability',
76
- 72: 'Fluid Dynamics',
77
- 73: 'Dynamical Systems',
78
- 74: 'High Energy Astrophysical Phenomena',
79
- 75: 'Programming Languages',
80
- 76: 'Mathematical Physics',
81
- 77: 'Logic',
82
- 78: 'Social and Information Networks',
83
- 79: 'Numerical Analysis',
84
- 80: 'Sound',
85
- 81: 'Chemical Physics',
86
- 82: 'Genomics',
87
- 83: 'Instrumentation and Methods for Astrophysics',
88
- 84: 'Applications',
89
- 85: 'Representation Theory',
90
- 86: 'Machine Learning',
91
- 87: 'Formal Languages and Automata Theory',
92
- 88: 'Quantitative Methods',
93
- 89: 'Atmospheric and Oceanic Physics',
94
- 90: 'Subcellular Processes',
95
- 91: 'Networking and Internet Architecture',
96
- 92: 'Functional Analysis',
97
- 93: 'Metric Geometry',
98
- 94: 'General Relativity and Quantum Cosmology',
99
- 95: 'Spectral Theory',
100
- 96: 'Graphics',
101
- 97: 'Adaptation and Self-Organizing Systems',
102
- 98: 'Economics',
103
- 99: 'Classical Analysis and ODEs',
104
- 100: 'Other Computer Science',
105
- 101: 'Geometric Topology',
106
- 102: 'Pricing of Securities',
107
- 103: 'High Energy Physics - Experiment',
108
- 104: 'Category Theory',
109
- 105: 'Human-Computer Interaction',
110
- 106: 'Biological Physics',
111
- 107: 'Popular Physics',
112
- 108: 'Probability',
113
- 109: 'Commutative Algebra',
114
- 110: 'Strongly Correlated Electrons',
115
- 111: 'Group Theory',
116
- 112: 'Computation',
117
- 113: 'Digital Libraries',
118
- 114: 'Classical Physics',
119
- 115: 'Neurons and Cognition',
120
- 116: 'Operator Algebras',
121
- 117: 'Tissues and Organs',
122
- 118: 'High Energy Physics - Lattice',
123
- 119: 'Robotics',
124
- 120: 'Portfolio Management',
125
- 121: 'Computational Complexity',
126
- 122: 'Soft Condensed Matter',
127
- 123: 'Mathematical Software',
128
- 124: 'Applied Physics',
129
- 125: 'Computer Science and Game Theory',
130
- 126: 'Multimedia',
131
- 127: 'Molecular Networks',
132
- 128: 'Disordered Systems and Neural Networks',
133
- 129: 'Other Statistics',
134
- 130: 'Cell Behavior',
135
- 131: 'Performance',
136
- 132: 'Biomolecules',
137
- 133: 'Astrophysics of Galaxies',
138
- 134: 'Databases',
139
- 135: 'Algebraic Topology',
140
- 136: 'Cellular Automata and Lattice Gases',
141
- 137: 'Algebraic Geometry'}
 
 
142
 
143
- # Загружаем модель (замените на вашу модель, если нужно)
144
- # @st.cache_resource
145
  def load_model():
146
  tokenizer = AutoTokenizer.from_pretrained('distilbert-base-cased')
147
  model = AutoModelForSequenceClassification.from_pretrained(
@@ -151,76 +152,61 @@ def load_model():
151
  )
152
  return model, tokenizer
153
 
154
- # Load model/tokenizer once and cache it
155
  try:
156
  model, tokenizer = load_model()
157
  except OSError as e:
158
- st.error(f"Ошибка загрузки модели: {e}. Убедитесь, что модель доступна или укажите другую.")
159
  st.stop()
160
 
161
-
162
  def classify_text(title, description):
163
- """
164
- Классифицирует текст и возвращает результаты в отсортированном виде.
165
- Args:
166
- title (str): Заголовок текста.
167
- description (str): Краткое описание текста.
168
- Returns:
169
- list: Отсортированный список результатов классификации.
170
- """
171
- text = f"{title} {description}" # Объединяем заголовок и описание
172
- topic_classifier = pipeline("text-classification", model=model, tokenizer=tokenizer, top_k = len(id_to_cat))
173
  try:
174
- results = topic_classifier(text)
 
175
  except Exception as e:
176
- st.error(f"Ошибка классификации: {e}")
177
  return []
178
 
179
- for i in results[0]:
180
- i['label'] = id_to_cat[int(i['label'].split('_')[1])]
181
-
182
- filtered_results = []
183
- for i in results[0]:
184
- filtered_results.append((i['label'], i['score']))
185
- return filtered_results
186
-
187
 
188
- # --- Интерфейс Streamlit ---
189
- st.title("Классификация статей 1")
 
190
 
191
- # Ввод данных
192
- title = st.text_input("Заголовок статьи")
193
- description = st.text_area("Краткое описание статьи", height=150)
194
- top = st.text_input("Top x%")
195
 
196
- # Кнопка "Классифицировать"
197
- if st.button("Классифицировать"):
198
  if not title and not description:
199
- st.warning("Пожалуйста, заполните хотя бы одно поле.")
200
  else:
201
- with st.spinner("Идет классификация..."): # Индикатор загрузки
 
 
 
 
 
 
 
 
 
 
202
  results = classify_text(title, description)
 
203
  if results:
204
-
205
- cumulative_prob = 0
206
- t = 0.95
207
- try:
208
- if float(top):
209
- if (float(top) >= 0) and (float(top) <= 1):
210
- t = float(top)
211
- elif (float(top) > 1):
212
- t = float(top) / 100
213
- except ValueError:
214
- t = 0.95
215
-
216
- st.subheader(f'Результаты классификации (top {int(min(t * 100, 100))}%):')
217
- for label, score in results:
218
- st.write(f"- **{label}**: {score:.4f}")
219
- cumulative_prob += score
220
- if cumulative_prob >= t:
221
- break
222
  else:
223
  st.info("Не удалось получить результаты классификации.")
224
-
225
- elif title or description: #небольшой костыль, чтобы при старте не было предупреждения
226
- st.warning("Пожалуйста, заполните хотя бы одно поле.")
 
1
  import streamlit as st
2
  from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
3
 
4
+ id_to_cat = {
5
+ 0: 'Cryptography and Security',
6
+ 1: 'Medical Physics',
7
+ 2: 'Audio and Speech Processing',
8
+ 3: 'Combinatorics',
9
+ 4: 'Information Theory',
10
+ 5: 'Quantum Physics',
11
+ 6: 'Nuclear Theory',
12
+ 7: 'Computers and Society',
13
+ 8: 'Pattern Formation and Solitons',
14
+ 9: 'General Finance',
15
+ 10: 'Multiagent Systems',
16
+ 11: 'Trading and Market Microstructure',
17
+ 12: 'Mesoscale and Nanoscale Physics',
18
+ 13: 'Instrumentation and Detectors',
19
+ 14: 'Emerging Technologies',
20
+ 15: 'Software Engineering',
21
+ 16: 'Computational Physics',
22
+ 17: 'Econometrics',
23
+ 18: 'Materials Science',
24
+ 19: 'Computer Vision and Pattern Recognition',
25
+ 20: 'Differential Geometry',
26
+ 21: 'General Literature',
27
+ 22: 'Computation and Language',
28
+ 23: 'Superconductivity',
29
+ 24: 'Risk Management',
30
+ 25: 'Other Condensed Matter',
31
+ 26: 'Other Quantitative Biology',
32
+ 27: 'High Energy Physics - Phenomenology',
33
+ 28: 'Analysis of PDEs',
34
+ 29: 'Earth and Planetary Astrophysics',
35
+ 30: 'Optics',
36
+ 31: 'Hardware Architecture',
37
+ 32: 'Optimization and Control',
38
+ 33: 'Methodology',
39
+ 34: 'Number Theory',
40
+ 35: 'General Topology',
41
+ 36: 'Populations and Evolution',
42
+ 37: 'Solar and Stellar Astrophysics',
43
+ 38: 'Distributed, Parallel, and Cluster Computing',
44
+ 39: 'Chaotic Dynamics',
45
+ 40: 'History and Philosophy of Physics',
46
+ 41: 'Computational Engineering, Finance, and Science',
47
+ 42: 'Discrete Mathematics',
48
+ 43: 'Statistical Mechanics',
49
+ 44: 'Operating Systems',
50
+ 45: 'Data Structures and Algorithms',
51
+ 46: 'Geophysics',
52
+ 47: 'Quantum Algebra',
53
+ 48: 'Systems and Control',
54
+ 49: 'Statistics Theory',
55
+ 50: 'High Energy Physics - Theory',
56
+ 51: 'Rings and Algebras',
57
+ 52: 'Neural and Evolutionary Computing',
58
+ 53: 'General Physics',
59
+ 54: 'Computational Geometry',
60
+ 55: 'Signal Processing',
61
+ 56: 'Computational Finance',
62
+ 57: 'History and Overview',
63
+ 58: 'Space Physics',
64
+ 59: 'Physics and Society',
65
+ 60: 'Cosmology and Nongalactic Astrophysics',
66
+ 61: 'Information Retrieval',
67
+ 62: 'Symbolic Computation',
68
+ 63: 'Statistical Finance',
69
+ 64: 'Image and Video Processing',
70
+ 65: 'Quantum Gases',
71
+ 66: 'Artificial Intelligence',
72
+ 67: 'Nuclear Experiment',
73
+ 68: 'General Mathematics',
74
+ 69: 'Complex Variables',
75
+ 70: 'Logic in Computer Science',
76
+ 71: 'Data Analysis, Statistics and Probability',
77
+ 72: 'Fluid Dynamics',
78
+ 73: 'Dynamical Systems',
79
+ 74: 'High Energy Astrophysical Phenomena',
80
+ 75: 'Programming Languages',
81
+ 76: 'Mathematical Physics',
82
+ 77: 'Logic',
83
+ 78: 'Social and Information Networks',
84
+ 79: 'Numerical Analysis',
85
+ 80: 'Sound',
86
+ 81: 'Chemical Physics',
87
+ 82: 'Genomics',
88
+ 83: 'Instrumentation and Methods for Astrophysics',
89
+ 84: 'Applications',
90
+ 85: 'Representation Theory',
91
+ 86: 'Machine Learning',
92
+ 87: 'Formal Languages and Automata Theory',
93
+ 88: 'Quantitative Methods',
94
+ 89: 'Atmospheric and Oceanic Physics',
95
+ 90: 'Subcellular Processes',
96
+ 91: 'Networking and Internet Architecture',
97
+ 92: 'Functional Analysis',
98
+ 93: 'Metric Geometry',
99
+ 94: 'General Relativity and Quantum Cosmology',
100
+ 95: 'Spectral Theory',
101
+ 96: 'Graphics',
102
+ 97: 'Adaptation and Self-Organizing Systems',
103
+ 98: 'Economics',
104
+ 99: 'Classical Analysis and ODEs',
105
+ 100: 'Other Computer Science',
106
+ 101: 'Geometric Topology',
107
+ 102: 'Pricing of Securities',
108
+ 103: 'High Energy Physics - Experiment',
109
+ 104: 'Category Theory',
110
+ 105: 'Human-Computer Interaction',
111
+ 106: 'Biological Physics',
112
+ 107: 'Popular Physics',
113
+ 108: 'Probability',
114
+ 109: 'Commutative Algebra',
115
+ 110: 'Strongly Correlated Electrons',
116
+ 111: 'Group Theory',
117
+ 112: 'Computation',
118
+ 113: 'Digital Libraries',
119
+ 114: 'Classical Physics',
120
+ 115: 'Neurons and Cognition',
121
+ 116: 'Operator Algebras',
122
+ 117: 'Tissues and Organs',
123
+ 118: 'High Energy Physics - Lattice',
124
+ 119: 'Robotics',
125
+ 120: 'Portfolio Management',
126
+ 121: 'Computational Complexity',
127
+ 122: 'Soft Condensed Matter',
128
+ 123: 'Mathematical Software',
129
+ 124: 'Applied Physics',
130
+ 125: 'Computer Science and Game Theory',
131
+ 126: 'Multimedia',
132
+ 127: 'Molecular Networks',
133
+ 128: 'Disordered Systems and Neural Networks',
134
+ 129: 'Other Statistics',
135
+ 130: 'Cell Behavior',
136
+ 131: 'Performance',
137
+ 132: 'Biomolecules',
138
+ 133: 'Astrophysics of Galaxies',
139
+ 134: 'Databases',
140
+ 135: 'Algebraic Topology',
141
+ 136: 'Cellular Automata and Lattice Gases',
142
+ 137: 'Algebraic Geometry'
143
+ }
144
 
145
+ @st.cache_resource
 
146
  def load_model():
147
  tokenizer = AutoTokenizer.from_pretrained('distilbert-base-cased')
148
  model = AutoModelForSequenceClassification.from_pretrained(
 
152
  )
153
  return model, tokenizer
154
 
 
155
  try:
156
  model, tokenizer = load_model()
157
  except OSError as e:
158
+ st.error(f"Ошибка при загрузке модели: {e}")
159
  st.stop()
160
 
 
161
def _category_name(raw_label):
    """Translate a pipeline label into a human-readable category name.

    The label is presumably of the form ``LABEL_<index>`` (the numeric part
    after the last underscore is looked up in ``id_to_cat``). A label that does
    not carry a known index is returned unchanged instead of raising.
    """
    _, _, suffix = str(raw_label).rpartition('_')
    try:
        return id_to_cat[int(suffix)]
    except (ValueError, KeyError):
        return str(raw_label)


def classify_text(title, description):
    """Classify an article by its title and short description.

    The stripped title and description are joined into one text and passed to
    a ``text-classification`` pipeline built from the module-level ``model``
    and ``tokenizer``, requesting a score for every category in ``id_to_cat``.

    Args:
        title: Article title; may be empty.
        description: Short article description; may be empty.

    Returns:
        A list of ``(category_name, score)`` tuples in the order the pipeline
        returned them (presumably descending by score, since the caller
        accumulates scores in this order -- confirm against the pipeline
        documentation). The list is empty when both inputs are blank or when
        classification fails; a failure is reported through ``st.error``.
    """
    # Strip each field and drop blank ones, so a missing field leaves no stray
    # space and whitespace-only input never reaches the model.
    stripped = [part.strip() for part in (title, description) if part]
    text = " ".join(part for part in stripped if part)
    if not text:
        return []

    try:
        classifier = pipeline(
            "text-classification",
            model=model,
            tokenizer=tokenizer,
            top_k=len(id_to_cat),
        )
        results = classifier(text)
    except Exception as e:
        st.error(f"Ошибка при классификации текста: {e}")
        return []

    # A single input may come back either as [[{...}, ...]] or as [{...}, ...];
    # accept both shapes.
    entries = results[0] if results and isinstance(results[0], list) else results
    return [(_category_name(entry['label']), entry['score']) for entry in entries]
 
 
 
175
 
176
def _parse_threshold(raw):
    """Convert the user's threshold text into a fraction in ``(0, 1]``.

    Values greater than 1 are read as percentages (``95`` -> ``0.95``); values
    up to and including 1 are read as fractions, so ``1`` means 100%. A decimal
    comma is accepted (``0,95``).

    Returns:
        The threshold as a float, or ``None`` when the text is not a number or
        falls outside ``(0, 1]`` after conversion (this also rejects NaN and
        infinity).
    """
    try:
        value = float(str(raw).strip().replace(',', '.'))
    except ValueError:
        return None
    if value > 1:
        value = value / 100
    return value if 0 < value <= 1 else None


# NOTE(review): Streamlit has historically required set_page_config to be the
# first Streamlit command executed; the model-loading code earlier in the file
# can call st.error before this line -- confirm against the deployed version.
st.set_page_config(page_title="Классификация статей", layout="wide")
st.title("🔬 Классификация научных статей")
st.markdown("Введите заголовок и краткое описание научной статьи, чтобы определить её тематические категории.")

# Input widgets.
title = st.text_input("📝 Заголовок статьи", placeholder="Например: Deep Learning for Image Recognition")
description = st.text_area("🧾 Краткое описание статьи", height=150, placeholder="Кратко опишите содержание статьи...")
top_percent = st.text_input("📊 Порог вероятности (например, 95 или 0.95 для top 95%)", value="95")

# Whitespace-only fields count as empty.
has_input = bool(title.strip() or description.strip())

if st.button("🚀 Классифицировать"):
    if not has_input:
        st.warning("Пожалуйста, введите заголовок или описание статьи.")
    else:
        t = _parse_threshold(top_percent)
        if t is None:
            st.warning("Некорректное значение для порога вероятности. Используем значение по умолчанию: 95%.")
            t = 0.95

        with st.spinner("🔍 Классификация..."):
            results = classify_text(title, description)

        if results:
            cumulative_prob = 0.0
            # The :g format avoids truncation artefacts such as
            # int(0.29 * 100) == 28.
            st.subheader(f"📚 Топ категорий (до {t * 100:g}% совокупной вероятности):")
            # List categories until their combined score reaches the threshold.
            for label, score in results:
                st.write(f"- **{label}**: {score:.4f}")
                cumulative_prob += score
                if cumulative_prob >= t:
                    break
        else:
            st.info("Не удалось получить результаты классификации.")
elif has_input:
    # Text was entered but the button has not been pressed on this run.
    st.warning("Нажмите кнопку 'Классифицировать', чтобы получить результат.")