1NEYRON1's picture
Update app.py
0d397bb
raw
history blame
7.93 kB
import streamlit as st
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
# Maps the classifier's integer label id to a human-readable arXiv category
# name. The position of a name in the tuple below IS its label id, so entries
# must never be reordered or removed; new names may only be appended.
id_to_cat = dict(enumerate((
    # 0-9
    'Cryptography and Security',
    'Medical Physics',
    'Audio and Speech Processing',
    'Combinatorics',
    'Information Theory',
    'Quantum Physics',
    'Nuclear Theory',
    'Computers and Society',
    'Pattern Formation and Solitons',
    'General Finance',
    # 10-19
    'Multiagent Systems',
    'Trading and Market Microstructure',
    'Mesoscale and Nanoscale Physics',
    'Instrumentation and Detectors',
    'Emerging Technologies',
    'Software Engineering',
    'Computational Physics',
    'Econometrics',
    'Materials Science',
    'Computer Vision and Pattern Recognition',
    # 20-29
    'Differential Geometry',
    'General Literature',
    'Computation and Language',
    'Superconductivity',
    'Risk Management',
    'Other Condensed Matter',
    'Other Quantitative Biology',
    'High Energy Physics - Phenomenology',
    'Analysis of PDEs',
    'Earth and Planetary Astrophysics',
    # 30-39
    'Optics',
    'Hardware Architecture',
    'Optimization and Control',
    'Methodology',
    'Number Theory',
    'General Topology',
    'Populations and Evolution',
    'Solar and Stellar Astrophysics',
    'Distributed, Parallel, and Cluster Computing',
    'Chaotic Dynamics',
    # 40-49
    'History and Philosophy of Physics',
    'Computational Engineering, Finance, and Science',
    'Discrete Mathematics',
    'Statistical Mechanics',
    'Operating Systems',
    'Data Structures and Algorithms',
    'Geophysics',
    'Quantum Algebra',
    'Systems and Control',
    'Statistics Theory',
    # 50-59
    'High Energy Physics - Theory',
    'Rings and Algebras',
    'Neural and Evolutionary Computing',
    'General Physics',
    'Computational Geometry',
    'Signal Processing',
    'Computational Finance',
    'History and Overview',
    'Space Physics',
    'Physics and Society',
    # 60-69
    'Cosmology and Nongalactic Astrophysics',
    'Information Retrieval',
    'Symbolic Computation',
    'Statistical Finance',
    'Image and Video Processing',
    'Quantum Gases',
    'Artificial Intelligence',
    'Nuclear Experiment',
    'General Mathematics',
    'Complex Variables',
    # 70-79
    'Logic in Computer Science',
    'Data Analysis, Statistics and Probability',
    'Fluid Dynamics',
    'Dynamical Systems',
    'High Energy Astrophysical Phenomena',
    'Programming Languages',
    'Mathematical Physics',
    'Logic',
    'Social and Information Networks',
    'Numerical Analysis',
    # 80-89
    'Sound',
    'Chemical Physics',
    'Genomics',
    'Instrumentation and Methods for Astrophysics',
    'Applications',
    'Representation Theory',
    'Machine Learning',
    'Formal Languages and Automata Theory',
    'Quantitative Methods',
    'Atmospheric and Oceanic Physics',
    # 90-99
    'Subcellular Processes',
    'Networking and Internet Architecture',
    'Functional Analysis',
    'Metric Geometry',
    'General Relativity and Quantum Cosmology',
    'Spectral Theory',
    'Graphics',
    'Adaptation and Self-Organizing Systems',
    'Economics',
    'Classical Analysis and ODEs',
    # 100-109
    'Other Computer Science',
    'Geometric Topology',
    'Pricing of Securities',
    'High Energy Physics - Experiment',
    'Category Theory',
    'Human-Computer Interaction',
    'Biological Physics',
    'Popular Physics',
    'Probability',
    'Commutative Algebra',
    # 110-119
    'Strongly Correlated Electrons',
    'Group Theory',
    'Computation',
    'Digital Libraries',
    'Classical Physics',
    'Neurons and Cognition',
    'Operator Algebras',
    'Tissues and Organs',
    'High Energy Physics - Lattice',
    'Robotics',
    # 120-129
    'Portfolio Management',
    'Computational Complexity',
    'Soft Condensed Matter',
    'Mathematical Software',
    'Applied Physics',
    'Computer Science and Game Theory',
    'Multimedia',
    'Molecular Networks',
    'Disordered Systems and Neural Networks',
    'Other Statistics',
    # 130-137
    'Cell Behavior',
    'Performance',
    'Biomolecules',
    'Astrophysics of Galaxies',
    'Databases',
    'Algebraic Topology',
    'Cellular Automata and Lattice Gases',
    'Algebraic Geometry',
)))
@st.cache_resource
def load_model():
    """Load the tokenizer and the sequence-classification model.

    The tokenizer comes from the 'distilbert-base-cased' identifier and the
    model from the 'checkpoint' path. The model is configured for multi-label
    classification with one output per entry in ``id_to_cat``. The function is
    wrapped in ``st.cache_resource``.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    text_tokenizer = AutoTokenizer.from_pretrained('distilbert-base-cased')
    model_options = {
        "num_labels": len(id_to_cat),
        "problem_type": "multi_label_classification",
    }
    classifier_model = AutoModelForSequenceClassification.from_pretrained(
        'checkpoint', **model_options
    )
    return classifier_model, text_tokenizer
# Load the model and tokenizer at script start. If loading raises an OSError,
# show the message in the page and halt this script run instead of continuing
# without a model.
try:
    model, tokenizer = load_model()
except OSError as load_error:
    st.error(f"Ошибка при загрузке модели: {load_error}")
    st.stop()
@st.cache_resource
def _get_classifier():
    """Build the text-classification pipeline once instead of on every call."""
    return pipeline(
        "text-classification",
        model=model,
        tokenizer=tokenizer,
        top_k=len(id_to_cat),
    )


def classify_text(title, description):
    """Classify an article and return its categories with normalized scores.

    Args:
        title: Article title.
        description: Article abstract / short description.

    Returns:
        A list of ``(category_name, share)`` tuples sorted by descending
        share, where the shares sum to 1. An empty list is returned when the
        input is blank, when classification fails (the error is shown via
        ``st.error``), or when all scores are zero.
    """
    text = f"{title.strip()} {description.strip()}"
    if not text.strip():
        # Nothing meaningful to classify.
        return []

    try:
        classifier = _get_classifier()
        # Truncate so inputs longer than the model's maximum sequence length
        # do not make the forward pass fail.
        raw = classifier(text, truncation=True)
        # NOTE(review): for a single string the pipeline output appears to be
        # wrapped in an outer list in some transformers versions and flat in
        # others; accept both shapes.
        entries = raw[0] if raw and isinstance(raw[0], list) else raw
        # Labels are expected in the default 'LABEL_<id>' form; parsing stays
        # inside the try so an unexpected label is reported, not a crash.
        scored = [
            (id_to_cat[int(entry['label'].split('_')[1])], entry['score'])
            for entry in entries
        ]
    except Exception as e:  # UI boundary: report the problem and keep the app alive.
        st.error(f"Ошибка при классификации текста: {e}")
        return []

    total = sum(score for _, score in scored)
    if total <= 0:
        # Avoid dividing by zero when every score is zero.
        return []

    # The caller accumulates shares from the top, so guarantee descending order.
    scored.sort(key=lambda item: item[1], reverse=True)
    return [(label, score / total) for label, score in scored]
def _parse_threshold(raw):
    """Parse the threshold text into a fraction in (0, 1], or None if invalid.

    Values greater than 1 are treated as percentages (95 -> 0.95). A decimal
    comma is accepted as well as a decimal point.
    """
    try:
        value = float(raw.strip().replace(",", "."))
    except ValueError:
        return None
    if value > 1:
        value /= 100
    # The chained comparison is False for NaN, so NaN is rejected too.
    return value if 0 < value <= 1 else None


def _take_until_threshold(results, threshold):
    """Return the leading results up to and including the one that reaches threshold."""
    selected = []
    cumulative = 0.0
    for label, score in results:
        selected.append((label, score))
        cumulative += score
        if cumulative >= threshold:
            break
    return selected


# ---- Page layout and inputs ----
st.title("🔬 Классификация научных статей")
st.markdown("Введите заголовок и краткое описание научной статьи, чтобы определить её тематические категории.")
title = st.text_input("📝 Заголовок статьи", placeholder="Например: Deep Learning for Image Recognition")
description = st.text_area("🧾 Краткое описание статьи", height=150, placeholder="Кратко опишите содержание статьи...")
top_percent = st.text_input("📊 Порог вероятности (например, 95 или 0.95 для top 95%)", value="95")

# ---- Classification on button press ----
if st.button("🚀 Классифицировать"):
    # Whitespace-only input counts as empty.
    if not title.strip() and not description.strip():
        st.warning("Пожалуйста, введите заголовок или описание статьи.")
    else:
        t = _parse_threshold(top_percent)
        if t is None:
            st.warning("Некорректное значение для порога вероятности. Используем значение по умолчанию: 95%.")
            t = 0.95
        with st.spinner("🔍 Классификация..."):
            results = classify_text(title, description)
        if results:
            # ':g' avoids float truncation artefacts such as 0.29 * 100 -> 28.
            st.subheader(f"📚 Топ категорий (до {t * 100:g}% совокупной вероятности):")
            for label, score in _take_until_threshold(results, t):
                st.write(f"- **{label}**: {score*100:.4f}%")
        else:
            st.info("Не удалось получить результаты классификации.")
elif title or description:
    # Text was entered but the button has not been pressed on this run.
    st.warning("Нажмите кнопку 'Классифицировать', чтобы получить результат.")