File size: 7,930 Bytes
5f01a56 0a215d5 b4c0a34 6a4a9f2 72216f4 6a4a9f2 c4aa0f7 72216f4 c4aa0f7 72216f4 c4aa0f7 b4c0a34 6a4a9f2 c4aa0f7 e786d48 92010d3 6a4a9f2 b4c0a34 6a4a9f2 b4c0a34 6a4a9f2 b4c0a34 e7531a2 6a4a9f2 e7531a2 e786d48 6a4a9f2 b4c0a34 6a4a9f2 b4c0a34 6a4a9f2 92458f0 6a4a9f2 b4c0a34 6a4a9f2 c3b06d1 6a4a9f2 c3b06d1 6a4a9f2 0d397bb 6a4a9f2 c3b06d1 6a4a9f2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 |
import streamlit as st
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
# Maps the classifier's integer label ids (0..137) to human-readable category
# names. The ids are contiguous; ``len(id_to_cat)`` is used elsewhere as the
# number of labels, so entries must not be removed or renumbered casually.
id_to_cat = {
    0: 'Cryptography and Security',
    1: 'Medical Physics',
    2: 'Audio and Speech Processing',
    3: 'Combinatorics',
    4: 'Information Theory',
    5: 'Quantum Physics',
    6: 'Nuclear Theory',
    7: 'Computers and Society',
    8: 'Pattern Formation and Solitons',
    9: 'General Finance',
    10: 'Multiagent Systems',
    11: 'Trading and Market Microstructure',
    12: 'Mesoscale and Nanoscale Physics',
    13: 'Instrumentation and Detectors',
    14: 'Emerging Technologies',
    15: 'Software Engineering',
    16: 'Computational Physics',
    17: 'Econometrics',
    18: 'Materials Science',
    19: 'Computer Vision and Pattern Recognition',
    20: 'Differential Geometry',
    21: 'General Literature',
    22: 'Computation and Language',
    23: 'Superconductivity',
    24: 'Risk Management',
    25: 'Other Condensed Matter',
    26: 'Other Quantitative Biology',
    27: 'High Energy Physics - Phenomenology',
    28: 'Analysis of PDEs',
    29: 'Earth and Planetary Astrophysics',
    30: 'Optics',
    31: 'Hardware Architecture',
    32: 'Optimization and Control',
    33: 'Methodology',
    34: 'Number Theory',
    35: 'General Topology',
    36: 'Populations and Evolution',
    37: 'Solar and Stellar Astrophysics',
    38: 'Distributed, Parallel, and Cluster Computing',
    39: 'Chaotic Dynamics',
    40: 'History and Philosophy of Physics',
    41: 'Computational Engineering, Finance, and Science',
    42: 'Discrete Mathematics',
    43: 'Statistical Mechanics',
    44: 'Operating Systems',
    45: 'Data Structures and Algorithms',
    46: 'Geophysics',
    47: 'Quantum Algebra',
    48: 'Systems and Control',
    49: 'Statistics Theory',
    50: 'High Energy Physics - Theory',
    51: 'Rings and Algebras',
    52: 'Neural and Evolutionary Computing',
    53: 'General Physics',
    54: 'Computational Geometry',
    55: 'Signal Processing',
    56: 'Computational Finance',
    57: 'History and Overview',
    58: 'Space Physics',
    59: 'Physics and Society',
    60: 'Cosmology and Nongalactic Astrophysics',
    61: 'Information Retrieval',
    62: 'Symbolic Computation',
    63: 'Statistical Finance',
    64: 'Image and Video Processing',
    65: 'Quantum Gases',
    66: 'Artificial Intelligence',
    67: 'Nuclear Experiment',
    68: 'General Mathematics',
    69: 'Complex Variables',
    70: 'Logic in Computer Science',
    71: 'Data Analysis, Statistics and Probability',
    72: 'Fluid Dynamics',
    73: 'Dynamical Systems',
    74: 'High Energy Astrophysical Phenomena',
    75: 'Programming Languages',
    76: 'Mathematical Physics',
    77: 'Logic',
    78: 'Social and Information Networks',
    79: 'Numerical Analysis',
    80: 'Sound',
    81: 'Chemical Physics',
    82: 'Genomics',
    83: 'Instrumentation and Methods for Astrophysics',
    84: 'Applications',
    85: 'Representation Theory',
    86: 'Machine Learning',
    87: 'Formal Languages and Automata Theory',
    88: 'Quantitative Methods',
    89: 'Atmospheric and Oceanic Physics',
    90: 'Subcellular Processes',
    91: 'Networking and Internet Architecture',
    92: 'Functional Analysis',
    93: 'Metric Geometry',
    94: 'General Relativity and Quantum Cosmology',
    95: 'Spectral Theory',
    96: 'Graphics',
    97: 'Adaptation and Self-Organizing Systems',
    98: 'Economics',
    99: 'Classical Analysis and ODEs',
    100: 'Other Computer Science',
    101: 'Geometric Topology',
    102: 'Pricing of Securities',
    103: 'High Energy Physics - Experiment',
    104: 'Category Theory',
    105: 'Human-Computer Interaction',
    106: 'Biological Physics',
    107: 'Popular Physics',
    108: 'Probability',
    109: 'Commutative Algebra',
    110: 'Strongly Correlated Electrons',
    111: 'Group Theory',
    112: 'Computation',
    113: 'Digital Libraries',
    114: 'Classical Physics',
    115: 'Neurons and Cognition',
    116: 'Operator Algebras',
    117: 'Tissues and Organs',
    118: 'High Energy Physics - Lattice',
    119: 'Robotics',
    120: 'Portfolio Management',
    121: 'Computational Complexity',
    122: 'Soft Condensed Matter',
    123: 'Mathematical Software',
    124: 'Applied Physics',
    125: 'Computer Science and Game Theory',
    126: 'Multimedia',
    127: 'Molecular Networks',
    128: 'Disordered Systems and Neural Networks',
    129: 'Other Statistics',
    130: 'Cell Behavior',
    131: 'Performance',
    132: 'Biomolecules',
    133: 'Astrophysics of Galaxies',
    134: 'Databases',
    135: 'Algebraic Topology',
    136: 'Cellular Automata and Lattice Gases',
    137: 'Algebraic Geometry',
}
@st.cache_resource
def load_model():
    """Load the tokenizer and the fine-tuned sequence classifier.

    The tokenizer comes from ``'distilbert-base-cased'`` and the model weights
    from ``'checkpoint'`` (resolved by ``from_pretrained``). The model is
    configured with one output per entry of ``id_to_cat`` and the
    ``multi_label_classification`` problem type. The function is wrapped in
    ``st.cache_resource`` so the loaded objects can be reused by Streamlit.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    text_tokenizer = AutoTokenizer.from_pretrained('distilbert-base-cased')

    model_options = {
        "num_labels": len(id_to_cat),
        "problem_type": "multi_label_classification",
    }
    classifier_model = AutoModelForSequenceClassification.from_pretrained(
        'checkpoint',
        **model_options,
    )
    return classifier_model, text_tokenizer
# Load the model and tokenizer at start-up. If loading raises OSError, show
# the error in the UI and halt this script run instead of continuing without
# a model.
try:
    model, tokenizer = load_model()
except OSError as load_error:
    st.error(f"Ошибка при загрузке модели: {load_error}")
    st.stop()
def classify_text(title: str, description: str):
    """Score an article against every category in ``id_to_cat``.

    The title and description are stripped and joined with a single space,
    then run through a ``text-classification`` pipeline built from the
    module-level ``model`` and ``tokenizer``. The raw scores are rescaled so
    they sum to 1.

    Args:
        title: Article title; may be empty.
        description: Article abstract or short description; may be empty.

    Returns:
        A list of ``(category_name, normalized_score)`` tuples sorted by score
        in descending order. An empty list is returned when both inputs are
        blank, when classification fails (the error is shown via
        ``st.error``), or when the scores cannot be normalized.
    """
    # Join only the non-empty parts so a missing field leaves no stray space,
    # and skip the model entirely when there is nothing to classify.
    cleaned_parts = [part.strip() for part in (title, description)]
    text = " ".join(part for part in cleaned_parts if part)
    if not text:
        return []

    try:
        classifier = pipeline(
            "text-classification",
            model=model,
            tokenizer=tokenizer,
            top_k=len(id_to_cat),
        )
        # Without truncation, inputs longer than the tokenizer's maximum
        # length fail inside the model instead of being classified.
        results = classifier(text, truncation=True)
    except Exception as e:
        st.error(f"Ошибка при классификации текста: {e}")
        return []

    # For a single string the pipeline wraps the per-label dicts in an outer
    # list; also accept a flat list of dicts in case the shape differs.
    if results and isinstance(results[0], list):
        entries = results[0]
    else:
        entries = results

    # Labels are expected in the default "LABEL_<id>" form.
    res = [
        (id_to_cat[int(entry['label'].split('_')[1])], entry['score'])
        for entry in entries
    ]
    total = sum(score for _, score in res)
    if total <= 0:
        # Nothing to normalize (no entries or all-zero scores); avoid a
        # division by zero.
        return []

    # Callers accumulate scores from the top, so make the ordering explicit.
    res.sort(key=lambda pair: pair[1], reverse=True)
    return [(label, score / total) for label, score in res]
def _parse_threshold(raw):
    """Convert the user's threshold text into a fraction in (0, 1].

    Accepts plain numbers ("95", "0.95"), comma decimals ("0,95") and a
    trailing percent sign ("95%"). A value above 1, or any value written with
    a percent sign, is interpreted as a percentage.

    Args:
        raw: Text typed into the threshold field.

    Returns:
        The threshold as a float in (0, 1], or ``None`` if the text is not a
        usable number.
    """
    cleaned = raw.strip().replace(",", ".")
    is_percent = cleaned.endswith("%")
    if is_percent:
        cleaned = cleaned[:-1].strip()
    try:
        value = float(cleaned)
    except ValueError:
        return None
    if is_percent or value > 1:
        value /= 100
    # The chained comparison is False for NaN as well, so "nan" is rejected.
    return value if 0 < value <= 1 else None


# --- Page layout -----------------------------------------------------------
st.title("🔬 Классификация научных статей")
st.markdown("Введите заголовок и краткое описание научной статьи, чтобы определить её тематические категории.")
title = st.text_input("📝 Заголовок статьи", placeholder="Например: Deep Learning for Image Recognition")
description = st.text_area("🧾 Краткое описание статьи", height=150, placeholder="Кратко опишите содержание статьи...")
top_percent = st.text_input("📊 Порог вероятности (например, 95 или 0.95 для top 95%)", value="95")

# --- Classification on demand ---------------------------------------------
if st.button("🚀 Классифицировать"):
    # Treat whitespace-only fields as empty so blank input is never classified.
    if not title.strip() and not description.strip():
        st.warning("Пожалуйста, введите заголовок или описание статьи.")
    else:
        t = _parse_threshold(top_percent)
        if t is None:
            st.warning("Некорректное значение для порога вероятности. Используем значение по умолчанию: 95%.")
            t = 0.95
        with st.spinner("🔍 Классификация..."):
            results = classify_text(title, description)
        if results:
            cumulative_prob = 0.0
            # ":g" avoids the truncation that int(t * 100) causes on float
            # error (0.29 * 100 == 28.999999999999996 would display as 28).
            st.subheader(f"📚 Топ категорий (до {t * 100:g}% совокупной вероятности):")
            # List categories from the top until their combined share
            # reaches the requested threshold.
            for label, score in results:
                st.write(f"- **{label}**: {score*100:.4f}%")
                cumulative_prob += score
                if cumulative_prob >= t:
                    break
        else:
            st.info("Не удалось получить результаты классификации.")
elif title or description:
    st.warning("Нажмите кнопку 'Классифицировать', чтобы получить результат.")