|
import streamlit as st |
|
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification |
|
|
|
# Maps the integer class id to its human-readable category name.
# Ids are assigned by position in the tuple below (0-based), so the order of
# the entries must not change.
id_to_cat = dict(enumerate((
    # 0-9
    'Performance',
    'Molecular Networks',
    'Operating Systems',
    'High Energy Astrophysical Phenomena',
    'Computational Finance',
    'General Finance',
    'Astrophysics of Galaxies',
    'Portfolio Management',
    'Functional Analysis',
    'Quantitative Methods',
    # 10-19
    'Mathematical Software',
    'Computation',
    'Chemical Physics',
    'Information Theory',
    'Classical Physics',
    'Subcellular Processes',
    'Medical Physics',
    'Differential Geometry',
    'Biomolecules',
    'Metric Geometry',
    # 20-29
    'Cryptography and Security',
    'Instrumentation and Methods for Astrophysics',
    'General Mathematics',
    'Computational Complexity',
    'Soft Condensed Matter',
    'Analysis of PDEs',
    'Human-Computer Interaction',
    'Classical Analysis and ODEs',
    'Genomics',
    'Optimization and Control',
    # 30-39
    'Applied Physics',
    'Computational Engineering, Finance, and Science',
    'Quantum Algebra',
    'Other Condensed Matter',
    'Category Theory',
    'Popular Physics',
    'General Topology',
    'Algebraic Topology',
    'Trading and Market Microstructure',
    'Numerical Analysis',
    # 40-49
    'Applications',
    'Group Theory',
    'Cosmology and Nongalactic Astrophysics',
    'Mathematical Physics',
    'Econometrics',
    'Systems and Control',
    'Graphics',
    'Data Structures and Algorithms',
    'Operator Algebras',
    'Number Theory',
    # 50-59
    'Robotics',
    'Nuclear Theory',
    'Neural and Evolutionary Computing',
    'Multimedia',
    'Information Retrieval',
    'Image and Video Processing',
    'Rings and Algebras',
    'Instrumentation and Detectors',
    'Social and Information Networks',
    'High Energy Physics - Lattice',
    # 60-69
    'Emerging Technologies',
    'Strongly Correlated Electrons',
    'Representation Theory',
    'Space Physics',
    'Risk Management',
    'Disordered Systems and Neural Networks',
    'Databases',
    'Networking and Internet Architecture',
    'Computers and Society',
    'Hardware Architecture',
    # 70-79
    'Chaotic Dynamics',
    'Mesoscale and Nanoscale Physics',
    'Computational Geometry',
    'Commutative Algebra',
    'Statistics Theory',
    'General Literature',
    'Physics and Society',
    'Geophysics',
    'Economics',
    'Quantum Physics',
    # 80-89
    'Symbolic Computation',
    'Computational Physics',
    'Sound',
    'Multiagent Systems',
    'Signal Processing',
    'Adaptation and Self-Organizing Systems',
    'Other Computer Science',
    'Other Quantitative Biology',
    'Formal Languages and Automata Theory',
    'Populations and Evolution',
    # 90-99
    'Spectral Theory',
    'Pattern Formation and Solitons',
    'Methodology',
    'Biological Physics',
    'General Physics',
    'Logic in Computer Science',
    'Complex Variables',
    'Optics',
    'Discrete Mathematics',
    'History and Overview',
    # 100-109
    'Programming Languages',
    'Audio and Speech Processing',
    'Algebraic Geometry',
    'Neurons and Cognition',
    'High Energy Physics - Phenomenology',
    'History and Philosophy of Physics',
    'Earth and Planetary Astrophysics',
    'Pricing of Securities',
    'Distributed, Parallel, and Cluster Computing',
    'Tissues and Organs',
    # 110-119
    'Cellular Automata and Lattice Gases',
    'Statistical Finance',
    'Materials Science',
    'High Energy Physics - Theory',
    'Digital Libraries',
    'Other Statistics',
    'Superconductivity',
    'Cell Behavior',
    'General Relativity and Quantum Cosmology',
    'Dynamical Systems',
    # 120-129
    'Statistical Mechanics',
    'Fluid Dynamics',
    'Computer Science and Game Theory',
    'Logic',
    'Computer Vision and Pattern Recognition',
    'Solar and Stellar Astrophysics',
    'High Energy Physics - Experiment',
    'Software Engineering',
    'Combinatorics',
    'Data Analysis, Statistics and Probability',
    # 130-137
    'Machine Learning',
    'Probability',
    'Atmospheric and Oceanic Physics',
    'Geometric Topology',
    'Computation and Language',
    'Quantum Gases',
    'Nuclear Experiment',
    'Artificial Intelligence',
)))
|
|
|
|
|
# Path or hub id of the fine-tuned sequence-classification checkpoint.
model_name = 'checkpoint'

# Streamlit re-executes this script on every user interaction, which would
# reload the model each time. Use st.cache_resource where the installed
# Streamlit provides it; otherwise fall back to loading on every run.
_cache_resource = getattr(st, "cache_resource", lambda func: func)


@_cache_resource
def _load_tokenizer_and_model(checkpoint, num_labels):
    """Load the tokenizer and the multi-label classification model.

    Args:
        checkpoint (str): Path or hub id passed to
            ``AutoModelForSequenceClassification.from_pretrained``.
        num_labels (int): Number of output classes of the classifier head.

    Returns:
        tuple: ``(tokenizer, model)``.

    Raises:
        OSError: Propagated from ``from_pretrained`` when the files cannot be
            loaded.
    """
    # The tokenizer comes from the base model name rather than the checkpoint.
    loaded_tokenizer = AutoTokenizer.from_pretrained('distilbert-base-cased')
    loaded_model = AutoModelForSequenceClassification.from_pretrained(
        checkpoint,
        num_labels=num_labels,
        problem_type="multi_label_classification",
    )
    return loaded_tokenizer, loaded_model


try:
    tokenizer, model = _load_tokenizer_and_model(model_name, len(id_to_cat))
except OSError as e:
    # Without a model the page cannot do anything useful: report and halt.
    st.error(f"Ошибка загрузки модели: {e}. Убедитесь, что модель доступна или укажите другую.")
    st.stop()
|
|
|
|
|
def classify_text(title, description, show_all=False, threshold=0.95):
    """Classify an article and return ``(category, score)`` pairs.

    The title and description are joined with a single space and passed to a
    ``text-classification`` pipeline built from the module-level ``model`` and
    ``tokenizer``, requesting a score for every entry of ``id_to_cat``.

    Args:
        title (str): Article title.
        description (str): Short description of the article.
        show_all (bool): If true, return every scored category and ignore
            ``threshold``.
        threshold (float): Cumulative score at which the result list is cut
            off when ``show_all`` is false. The entry that reaches the
            threshold is included.

    Returns:
        list: ``(category_name, score)`` tuples in the order produced by the
        pipeline (presumably descending score; confirm against the installed
        transformers version). An empty list is returned if the pipeline call
        raised; the error is reported through ``st.error``.
    """
    text = f"{title} {description}"
    topic_classifier = pipeline(
        "text-classification",
        model=model,
        tokenizer=tokenizer,
        top_k=len(id_to_cat),
    )

    try:
        results = topic_classifier(text)
    except Exception as e:
        # UI boundary: surface the failure to the user instead of crashing.
        st.error(f"Ошибка классификации: {e}")
        return []

    # The original code reads results[0], i.e. a nested list for one input.
    # Also accept a flat list of dicts in case the installed version returns
    # that shape (defensive; TODO confirm).
    if results and isinstance(results[0], list):
        predictions = results[0]
    else:
        predictions = results

    # Labels are expected as "<prefix>_<id>" (e.g. "LABEL_12"); translate the
    # numeric id into the category name without mutating the pipeline output.
    scored = [
        (id_to_cat[int(item['label'].split('_')[1])], item['score'])
        for item in predictions
    ]

    if show_all:
        return scored

    # Keep leading entries until their summed score reaches the threshold.
    selected = []
    cumulative_prob = 0
    for label, score in scored:
        selected.append((label, score))
        cumulative_prob += score
        if cumulative_prob >= threshold:
            break
    return selected
|
|
|
|
|
|
|
# --- Streamlit page ---------------------------------------------------------
st.title("Классификация статей")

title = st.text_input("Заголовок статьи")
description = st.text_area("Краткое описание статьи", height=150)

# Session-state slot holding the inputs and results of the last successful
# classification. A button is only True during the rerun triggered by its own
# click, so results must be kept in session state for the "show all" button
# (which triggers another rerun) to have something to work with.
_LAST_RUN_KEY = "last_classification"

if st.button("Классифицировать"):
    if not title or not description:
        # Both fields are required; drop stale results from an earlier run.
        st.session_state.pop(_LAST_RUN_KEY, None)
        st.warning("Пожалуйста, заполните все поля.")
    else:
        with st.spinner("Идет классификация..."):
            results = classify_text(title, description)
        if results:
            st.session_state[_LAST_RUN_KEY] = {
                "title": title,
                "description": description,
                "results": results,
            }
        else:
            st.session_state.pop(_LAST_RUN_KEY, None)
            st.info("Не удалось получить результаты классификации.")
elif bool(title) != bool(description):
    # Exactly one of the two fields is filled in.
    st.warning("Пожалуйста, заполните все поля.")

last_run = st.session_state.get(_LAST_RUN_KEY)
if last_run:
    st.subheader("Результаты классификации (с ограничением по вероятности):")
    for label, score in last_run["results"]:
        st.write(f"- **{label}**: {score:.4f}")

    if st.button("Показать все категории"):
        # Re-classify the text that produced the results shown above, not
        # whatever is currently typed into the inputs.
        all_results = classify_text(
            last_run["title"], last_run["description"], show_all=True
        )
        st.subheader("Полные результаты классификации:")
        for label, score in all_results:
            st.write(f"- **{label}**: {score:.4f}")