1NEYRON1's picture
Update app.py
275db5d
raw
history blame
8.43 kB
import streamlit as st
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
# Maps the integer class index produced by the classifier to a human-readable
# category name. A name's position in the tuple below IS its class index
# (0-based), so entries must never be reordered or removed.
id_to_cat = dict(enumerate((
    # 0-9
    'Performance',
    'Molecular Networks',
    'Operating Systems',
    'High Energy Astrophysical Phenomena',
    'Computational Finance',
    'General Finance',
    'Astrophysics of Galaxies',
    'Portfolio Management',
    'Functional Analysis',
    'Quantitative Methods',
    # 10-19
    'Mathematical Software',
    'Computation',
    'Chemical Physics',
    'Information Theory',
    'Classical Physics',
    'Subcellular Processes',
    'Medical Physics',
    'Differential Geometry',
    'Biomolecules',
    'Metric Geometry',
    # 20-29
    'Cryptography and Security',
    'Instrumentation and Methods for Astrophysics',
    'General Mathematics',
    'Computational Complexity',
    'Soft Condensed Matter',
    'Analysis of PDEs',
    'Human-Computer Interaction',
    'Classical Analysis and ODEs',
    'Genomics',
    'Optimization and Control',
    # 30-39
    'Applied Physics',
    'Computational Engineering, Finance, and Science',
    'Quantum Algebra',
    'Other Condensed Matter',
    'Category Theory',
    'Popular Physics',
    'General Topology',
    'Algebraic Topology',
    'Trading and Market Microstructure',
    'Numerical Analysis',
    # 40-49
    'Applications',
    'Group Theory',
    'Cosmology and Nongalactic Astrophysics',
    'Mathematical Physics',
    'Econometrics',
    'Systems and Control',
    'Graphics',
    'Data Structures and Algorithms',
    'Operator Algebras',
    'Number Theory',
    # 50-59
    'Robotics',
    'Nuclear Theory',
    'Neural and Evolutionary Computing',
    'Multimedia',
    'Information Retrieval',
    'Image and Video Processing',
    'Rings and Algebras',
    'Instrumentation and Detectors',
    'Social and Information Networks',
    'High Energy Physics - Lattice',
    # 60-69
    'Emerging Technologies',
    'Strongly Correlated Electrons',
    'Representation Theory',
    'Space Physics',
    'Risk Management',
    'Disordered Systems and Neural Networks',
    'Databases',
    'Networking and Internet Architecture',
    'Computers and Society',
    'Hardware Architecture',
    # 70-79
    'Chaotic Dynamics',
    'Mesoscale and Nanoscale Physics',
    'Computational Geometry',
    'Commutative Algebra',
    'Statistics Theory',
    'General Literature',
    'Physics and Society',
    'Geophysics',
    'Economics',
    'Quantum Physics',
    # 80-89
    'Symbolic Computation',
    'Computational Physics',
    'Sound',
    'Multiagent Systems',
    'Signal Processing',
    'Adaptation and Self-Organizing Systems',
    'Other Computer Science',
    'Other Quantitative Biology',
    'Formal Languages and Automata Theory',
    'Populations and Evolution',
    # 90-99
    'Spectral Theory',
    'Pattern Formation and Solitons',
    'Methodology',
    'Biological Physics',
    'General Physics',
    'Logic in Computer Science',
    'Complex Variables',
    'Optics',
    'Discrete Mathematics',
    'History and Overview',
    # 100-109
    'Programming Languages',
    'Audio and Speech Processing',
    'Algebraic Geometry',
    'Neurons and Cognition',
    'High Energy Physics - Phenomenology',
    'History and Philosophy of Physics',
    'Earth and Planetary Astrophysics',
    'Pricing of Securities',
    'Distributed, Parallel, and Cluster Computing',
    'Tissues and Organs',
    # 110-119
    'Cellular Automata and Lattice Gases',
    'Statistical Finance',
    'Materials Science',
    'High Energy Physics - Theory',
    'Digital Libraries',
    'Other Statistics',
    'Superconductivity',
    'Cell Behavior',
    'General Relativity and Quantum Cosmology',
    'Dynamical Systems',
    # 120-129
    'Statistical Mechanics',
    'Fluid Dynamics',
    'Computer Science and Game Theory',
    'Logic',
    'Computer Vision and Pattern Recognition',
    'Solar and Stellar Astrophysics',
    'High Energy Physics - Experiment',
    'Software Engineering',
    'Combinatorics',
    'Data Analysis, Statistics and Probability',
    # 130-137
    'Machine Learning',
    'Probability',
    'Atmospheric and Oceanic Physics',
    'Geometric Topology',
    'Computation and Language',
    'Quantum Gases',
    'Nuclear Experiment',
    'Artificial Intelligence',
)))
# Load the tokenizer and the classifier (replace `model_name` with your own
# checkpoint path or model id if needed).
model_name = 'checkpoint'


@st.cache_resource
def _load_tokenizer_and_model(checkpoint, num_labels):
    """Load the tokenizer and the sequence-classification model.

    Streamlit re-executes this script on every widget interaction; the
    ``st.cache_resource`` decorator keeps the loaded objects alive between
    reruns so the weights are read only once per distinct argument set.

    Args:
        checkpoint (str): Path or model id passed to
            ``AutoModelForSequenceClassification.from_pretrained``.
        num_labels (int): Number of output classes of the classifier head.

    Returns:
        tuple: ``(tokenizer, model)``.

    Raises:
        OSError: Propagated from ``from_pretrained`` when the tokenizer or
            the checkpoint cannot be found or read.
    """
    loaded_tokenizer = AutoTokenizer.from_pretrained('distilbert-base-cased')
    loaded_model = AutoModelForSequenceClassification.from_pretrained(
        checkpoint,
        num_labels=num_labels,
        problem_type="multi_label_classification",
    )
    return loaded_tokenizer, loaded_model


try:
    tokenizer, model = _load_tokenizer_and_model(model_name, len(id_to_cat))
except OSError as e:
    st.error(f"Ошибка загрузки модели: {e}. Убедитесь, что модель доступна или укажите другую.")
    st.stop()  # Halt the app: nothing below can work without the model.
def classify_text(title, description, show_all=False, threshold=0.95):
    """Classify an article and return its ``(category, score)`` pairs.

    The title and description are joined with a single space and passed to a
    ``text-classification`` pipeline that is asked for a score for every
    category in ``id_to_cat``.

    Args:
        title (str): Article title.
        description (str): Short description (abstract) of the article.
        show_all (bool): If True, return every category the pipeline
            produced and ignore ``threshold``.
        threshold (float): When ``show_all`` is False, categories are taken
            in pipeline order until their accumulated score reaches this
            value; the category that crosses the threshold is included.

    Returns:
        list[tuple[str, float]]: ``(category name, score)`` pairs in the
        order returned by the pipeline (presumably descending score --
        confirm against the installed transformers version). An empty list
        is returned when the pipeline raises; the error is reported to the
        user through ``st.error``.
    """
    text = f"{title} {description}"  # Merge title and description into one input.

    try:
        # Building the pipeline is inside the handler as well, so a failure
        # there is reported in the UI instead of crashing the script.
        topic_classifier = pipeline(
            "text-classification",
            model=model,
            tokenizer=tokenizer,
            top_k=len(id_to_cat),
        )
        results = topic_classifier(text)
    except Exception as e:
        st.error(f"Ошибка классификации: {e}")
        return []

    # results[0] holds the per-category dicts for the single input text.
    # Labels are expected in the "<PREFIX>_<index>" form (e.g. "LABEL_12");
    # the numeric suffix is the key into id_to_cat.
    scored = [
        (id_to_cat[int(item['label'].split('_')[1])], item['score'])
        for item in results[0]
    ]

    if show_all:
        return scored

    selected = []
    cumulative_prob = 0.0
    for label, score in scored:
        selected.append((label, score))
        cumulative_prob += score
        if cumulative_prob >= threshold:
            break
    return selected
# --- Streamlit interface ---
st.title("Классификация статей")

# Input fields
title = st.text_input("Заголовок статьи")
description = st.text_area("Краткое описание статьи", height=150)

# A Streamlit button is True only during the rerun triggered by its own
# click. The thresholded results are therefore kept in session state, so the
# "show all" button below is still rendered (and can be observed as clicked)
# on the rerun where the "classify" button is already back to False.
_RESULTS_KEY = "classification_results"

# "Classify" button
if st.button("Классифицировать"):
    if not title or not description:
        # NOTE(review): this condition requires BOTH fields, while the message
        # asks for "at least one field" -- confirm which one is intended.
        st.session_state.pop(_RESULTS_KEY, None)
        st.warning("Пожалуйста, заполните хотя бы одно поле.")
    else:
        with st.spinner("Идет классификация..."):  # Loading indicator
            st.session_state[_RESULTS_KEY] = {
                "inputs": (title, description),
                "results": classify_text(title, description),
            }
elif bool(title) != bool(description):
    # Warn only when exactly one field is filled: nothing is shown on first
    # load, and nothing is shown once both fields are complete.
    st.warning("Пожалуйста, заполните все поля.")

saved = st.session_state.get(_RESULTS_KEY)
# Stored results are shown only while they still match the current inputs.
if saved is not None and saved["inputs"] == (title, description):
    results = saved["results"]
    if results:
        st.subheader("Результаты классификации (с ограничением по вероятности):")
        for label, score in results:
            st.write(f"- **{label}**: {score:.4f}")

        # "Show all" button
        if st.button("Показать все категории"):
            all_results = classify_text(title, description, show_all=True)
            st.subheader("Полные результаты классификации:")
            for label, score in all_results:
                st.write(f"- **{label}**: {score:.4f}")
    else:
        st.info("Не удалось получить результаты классификации.")