File size: 7,930 Bytes
5f01a56
0a215d5
b4c0a34
6a4a9f2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72216f4
6a4a9f2
c4aa0f7
72216f4
 
c4aa0f7
72216f4
 
 
c4aa0f7
 
 
 
b4c0a34
6a4a9f2
c4aa0f7
e786d48
92010d3
6a4a9f2
b4c0a34
6a4a9f2
 
b4c0a34
6a4a9f2
b4c0a34
 
e7531a2
6a4a9f2
 
 
e7531a2
 
e786d48
6a4a9f2
 
b4c0a34
6a4a9f2
 
 
b4c0a34
6a4a9f2
92458f0
6a4a9f2
b4c0a34
6a4a9f2
 
 
 
 
 
 
 
 
 
 
c3b06d1
6a4a9f2
c3b06d1
6a4a9f2
 
 
0d397bb
6a4a9f2
 
 
c3b06d1
 
6a4a9f2
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
import streamlit as st
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification

# Lookup table from integer label id to a human-readable subject-category
# name.  The names are listed in id order, so the position of a name in the
# tuple below is the id it is stored under (ids run from 0 to 137).
# ``load_model`` passes ``len(id_to_cat)`` as ``num_labels`` and
# ``classify_text`` indexes this table with the numeric suffix of the model's
# labels, so the order of the entries must not be changed.
id_to_cat = dict(enumerate((
    # ids 0-9
    'Cryptography and Security',
    'Medical Physics',
    'Audio and Speech Processing',
    'Combinatorics',
    'Information Theory',
    'Quantum Physics',
    'Nuclear Theory',
    'Computers and Society',
    'Pattern Formation and Solitons',
    'General Finance',
    # ids 10-19
    'Multiagent Systems',
    'Trading and Market Microstructure',
    'Mesoscale and Nanoscale Physics',
    'Instrumentation and Detectors',
    'Emerging Technologies',
    'Software Engineering',
    'Computational Physics',
    'Econometrics',
    'Materials Science',
    'Computer Vision and Pattern Recognition',
    # ids 20-29
    'Differential Geometry',
    'General Literature',
    'Computation and Language',
    'Superconductivity',
    'Risk Management',
    'Other Condensed Matter',
    'Other Quantitative Biology',
    'High Energy Physics - Phenomenology',
    'Analysis of PDEs',
    'Earth and Planetary Astrophysics',
    # ids 30-39
    'Optics',
    'Hardware Architecture',
    'Optimization and Control',
    'Methodology',
    'Number Theory',
    'General Topology',
    'Populations and Evolution',
    'Solar and Stellar Astrophysics',
    'Distributed, Parallel, and Cluster Computing',
    'Chaotic Dynamics',
    # ids 40-49
    'History and Philosophy of Physics',
    'Computational Engineering, Finance, and Science',
    'Discrete Mathematics',
    'Statistical Mechanics',
    'Operating Systems',
    'Data Structures and Algorithms',
    'Geophysics',
    'Quantum Algebra',
    'Systems and Control',
    'Statistics Theory',
    # ids 50-59
    'High Energy Physics - Theory',
    'Rings and Algebras',
    'Neural and Evolutionary Computing',
    'General Physics',
    'Computational Geometry',
    'Signal Processing',
    'Computational Finance',
    'History and Overview',
    'Space Physics',
    'Physics and Society',
    # ids 60-69
    'Cosmology and Nongalactic Astrophysics',
    'Information Retrieval',
    'Symbolic Computation',
    'Statistical Finance',
    'Image and Video Processing',
    'Quantum Gases',
    'Artificial Intelligence',
    'Nuclear Experiment',
    'General Mathematics',
    'Complex Variables',
    # ids 70-79
    'Logic in Computer Science',
    'Data Analysis, Statistics and Probability',
    'Fluid Dynamics',
    'Dynamical Systems',
    'High Energy Astrophysical Phenomena',
    'Programming Languages',
    'Mathematical Physics',
    'Logic',
    'Social and Information Networks',
    'Numerical Analysis',
    # ids 80-89
    'Sound',
    'Chemical Physics',
    'Genomics',
    'Instrumentation and Methods for Astrophysics',
    'Applications',
    'Representation Theory',
    'Machine Learning',
    'Formal Languages and Automata Theory',
    'Quantitative Methods',
    'Atmospheric and Oceanic Physics',
    # ids 90-99
    'Subcellular Processes',
    'Networking and Internet Architecture',
    'Functional Analysis',
    'Metric Geometry',
    'General Relativity and Quantum Cosmology',
    'Spectral Theory',
    'Graphics',
    'Adaptation and Self-Organizing Systems',
    'Economics',
    'Classical Analysis and ODEs',
    # ids 100-109
    'Other Computer Science',
    'Geometric Topology',
    'Pricing of Securities',
    'High Energy Physics - Experiment',
    'Category Theory',
    'Human-Computer Interaction',
    'Biological Physics',
    'Popular Physics',
    'Probability',
    'Commutative Algebra',
    # ids 110-119
    'Strongly Correlated Electrons',
    'Group Theory',
    'Computation',
    'Digital Libraries',
    'Classical Physics',
    'Neurons and Cognition',
    'Operator Algebras',
    'Tissues and Organs',
    'High Energy Physics - Lattice',
    'Robotics',
    # ids 120-129
    'Portfolio Management',
    'Computational Complexity',
    'Soft Condensed Matter',
    'Mathematical Software',
    'Applied Physics',
    'Computer Science and Game Theory',
    'Multimedia',
    'Molecular Networks',
    'Disordered Systems and Neural Networks',
    'Other Statistics',
    # ids 130-137
    'Cell Behavior',
    'Performance',
    'Biomolecules',
    'Astrophysics of Galaxies',
    'Databases',
    'Algebraic Topology',
    'Cellular Automata and Lattice Gases',
    'Algebraic Geometry',
)))

@st.cache_resource
def load_model():
    """Load the sequence classifier and the tokenizer used with it.

    The tokenizer is fetched by the ``distilbert-base-cased`` identifier and
    the classification model is read from ``checkpoint``.  The model is
    configured with one label per ``id_to_cat`` entry and the
    ``multi_label_classification`` problem type.  The function is wrapped in
    ``st.cache_resource``.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    text_tokenizer = AutoTokenizer.from_pretrained('distilbert-base-cased')

    model_options = {
        'num_labels': len(id_to_cat),
        'problem_type': "multi_label_classification",
    }
    classifier_model = AutoModelForSequenceClassification.from_pretrained(
        'checkpoint', **model_options
    )

    return classifier_model, text_tokenizer

# Load the model and tokenizer at script start.  If loading raises OSError,
# show the message on the page and stop this script run; otherwise bind the
# pair to the module-level names used by ``classify_text``.
try:
    loaded_pair = load_model()
except OSError as load_error:
    st.error(f"Ошибка при загрузке модели: {load_error}")
    st.stop()
else:
    model, tokenizer = loaded_pair

def _category_name(label):
    """Translate a pipeline label into a category name.

    Labels of the form ``<prefix>_<id>`` (for example ``LABEL_12``) are looked
    up in ``id_to_cat`` by their numeric suffix.  Any other label, or an id
    that is not in the table, is returned unchanged instead of raising.
    """
    _, _, suffix = label.rpartition('_')
    if suffix.isdecimal():
        return id_to_cat.get(int(suffix), label)
    return label


def classify_text(title, description):
    """Score every category for an article and return normalized shares.

    The stripped title and description are joined into one string and passed
    to a ``text-classification`` pipeline built from the module-level
    ``model`` and ``tokenizer``, requesting a score for every category.

    Args:
        title: Article title; may be empty.
        description: Short description of the article; may be empty.

    Returns:
        A list of ``(category_name, share)`` tuples in descending order of
        share, where each share is the pipeline score divided by the sum of
        all scores.  An empty list is returned when there is no text to
        classify, when the pipeline raises (the error is reported with
        ``st.error``), or when no positive scores come back.
    """
    # Join only the non-empty parts so a missing title does not leave a
    # stray leading space, and bail out early when nothing was entered.
    parts = (title.strip(), description.strip())
    text = " ".join(part for part in parts if part)
    if not text:
        return []

    try:
        classifier = pipeline("text-classification", model=model, tokenizer=tokenizer, top_k=len(id_to_cat))
        # truncation=True clips over-long input to the maximum length the
        # tokenizer/model accepts instead of letting the forward pass fail.
        results = classifier(text, truncation=True)
    except Exception as e:
        # UI boundary: report the failure on the page rather than crash.
        st.error(f"Ошибка при классификации текста: {e}")
        return []

    # For a single input string the pipeline may return either a flat list of
    # {'label', 'score'} dicts or that list wrapped in an outer list; accept
    # both shapes.
    entries = results[0] if results and isinstance(results[0], list) else results

    scored = [(_category_name(entry['label']), entry['score']) for entry in entries]
    total = sum(score for _, score in scored)
    if total <= 0:
        # Nothing to normalize by; avoid a division by zero.
        return []

    # The caller accumulates shares from the top, so make the descending
    # order explicit instead of relying on the pipeline's ordering.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return [(label, score / total) for label, score in scored]

# Page header and a one-line instruction for the user.
st.title("🔬 Классификация научных статей")
st.markdown(
    "Введите заголовок и краткое описание научной статьи, чтобы определить её тематические категории."
)

# Free-text inputs: article title, article description, and the cumulative
# probability threshold (kept as text here and parsed when the button is
# pressed).
title = st.text_input(
    "📝 Заголовок статьи",
    placeholder="Например: Deep Learning for Image Recognition",
)
description = st.text_area(
    "🧾 Краткое описание статьи",
    height=150,
    placeholder="Кратко опишите содержание статьи...",
)
top_percent = st.text_input(
    "📊 Порог вероятности (например, 95 или 0.95 для top 95%)",
    value="95",
)

def _parse_threshold(raw):
    """Convert the threshold text into a fraction in the range (0, 1].

    Accepted forms are a fraction (``"0.95"``), a percentage (``"95"`` or
    ``"95%"``) and either of those written with a decimal comma
    (``"0,95"``).  A value greater than 1, or one followed by ``%``, is read
    as a percentage and divided by 100.

    Raises:
        ValueError: If the text is not a number or the resulting fraction is
            outside (0, 1] (this also rejects NaN and infinities).
    """
    cleaned = raw.strip().replace(',', '.')
    is_percent = cleaned.endswith('%')
    if is_percent:
        cleaned = cleaned[:-1].strip()
    value = float(cleaned)
    if is_percent or value > 1:
        value /= 100
    if not 0 < value <= 1:
        raise ValueError(f"threshold out of range: {raw!r}")
    return value


# Run a classification when the button was pressed in this script run.
if st.button("🚀 Классифицировать"):
    # Whitespace-only fields count as empty.
    if not (title.strip() or description.strip()):
        st.warning("Пожалуйста, введите заголовок или описание статьи.")
    else:
        try:
            t = _parse_threshold(top_percent)
        except ValueError:
            st.warning("Некорректное значение для порога вероятности. Используем значение по умолчанию: 95%.")
            t = 0.95

        with st.spinner("🔍 Классификация..."):
            results = classify_text(title, description)

            if results:
                cumulative_prob = 0.0
                # ":g" formatting is used instead of int(t*100), which
                # truncates float error (0.29 * 100 == 28.999... -> "28").
                st.subheader(f"📚 Топ категорий (до {t * 100:g}% совокупной вероятности):")
                # Print categories from the top until their accumulated share
                # reaches the threshold; assumes classify_text returns them in
                # descending order of share.
                for label, score in results:
                    st.write(f"- **{label}**: {score*100:.4f}%")
                    cumulative_prob += score
                    if cumulative_prob >= t:
                        break
            else:
                st.info("Не удалось получить результаты классификации.")
elif title or description:
    # Text was entered but the button has not been pressed in this run.
    st.warning("Нажмите кнопку 'Классифицировать', чтобы получить результат.")