"""Streamlit app that predicts arXiv subject categories for a paper.

The user enters a title and/or a short abstract; a fine-tuned sequence
classification checkpoint scores every category, and the app lists the
most probable categories until their cumulative (normalised) probability
reaches a user-chosen threshold.
"""

import streamlit as st
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification

# Maps the numeric class index used by the checkpoint to a readable name.
id_to_cat = {
    0: 'Cryptography and Security',
    1: 'Medical Physics',
    2: 'Audio and Speech Processing',
    3: 'Combinatorics',
    4: 'Information Theory',
    5: 'Quantum Physics',
    6: 'Nuclear Theory',
    7: 'Computers and Society',
    8: 'Pattern Formation and Solitons',
    9: 'General Finance',
    10: 'Multiagent Systems',
    11: 'Trading and Market Microstructure',
    12: 'Mesoscale and Nanoscale Physics',
    13: 'Instrumentation and Detectors',
    14: 'Emerging Technologies',
    15: 'Software Engineering',
    16: 'Computational Physics',
    17: 'Econometrics',
    18: 'Materials Science',
    19: 'Computer Vision and Pattern Recognition',
    20: 'Differential Geometry',
    21: 'General Literature',
    22: 'Computation and Language',
    23: 'Superconductivity',
    24: 'Risk Management',
    25: 'Other Condensed Matter',
    26: 'Other Quantitative Biology',
    27: 'High Energy Physics - Phenomenology',
    28: 'Analysis of PDEs',
    29: 'Earth and Planetary Astrophysics',
    30: 'Optics',
    31: 'Hardware Architecture',
    32: 'Optimization and Control',
    33: 'Methodology',
    34: 'Number Theory',
    35: 'General Topology',
    36: 'Populations and Evolution',
    37: 'Solar and Stellar Astrophysics',
    38: 'Distributed, Parallel, and Cluster Computing',
    39: 'Chaotic Dynamics',
    40: 'History and Philosophy of Physics',
    41: 'Computational Engineering, Finance, and Science',
    42: 'Discrete Mathematics',
    43: 'Statistical Mechanics',
    44: 'Operating Systems',
    45: 'Data Structures and Algorithms',
    46: 'Geophysics',
    47: 'Quantum Algebra',
    48: 'Systems and Control',
    49: 'Statistics Theory',
    50: 'High Energy Physics - Theory',
    51: 'Rings and Algebras',
    52: 'Neural and Evolutionary Computing',
    53: 'General Physics',
    54: 'Computational Geometry',
    55: 'Signal Processing',
    56: 'Computational Finance',
    57: 'History and Overview',
    58: 'Space Physics',
    59: 'Physics and Society',
    60: 'Cosmology and Nongalactic Astrophysics',
    61: 'Information Retrieval',
    62: 'Symbolic Computation',
    63: 'Statistical Finance',
    64: 'Image and Video Processing',
    65: 'Quantum Gases',
    66: 'Artificial Intelligence',
    67: 'Nuclear Experiment',
    68: 'General Mathematics',
    69: 'Complex Variables',
    70: 'Logic in Computer Science',
    71: 'Data Analysis, Statistics and Probability',
    72: 'Fluid Dynamics',
    73: 'Dynamical Systems',
    74: 'High Energy Astrophysical Phenomena',
    75: 'Programming Languages',
    76: 'Mathematical Physics',
    77: 'Logic',
    78: 'Social and Information Networks',
    79: 'Numerical Analysis',
    80: 'Sound',
    81: 'Chemical Physics',
    82: 'Genomics',
    83: 'Instrumentation and Methods for Astrophysics',
    84: 'Applications',
    85: 'Representation Theory',
    86: 'Machine Learning',
    87: 'Formal Languages and Automata Theory',
    88: 'Quantitative Methods',
    89: 'Atmospheric and Oceanic Physics',
    90: 'Subcellular Processes',
    91: 'Networking and Internet Architecture',
    92: 'Functional Analysis',
    93: 'Metric Geometry',
    94: 'General Relativity and Quantum Cosmology',
    95: 'Spectral Theory',
    96: 'Graphics',
    97: 'Adaptation and Self-Organizing Systems',
    98: 'Economics',
    99: 'Classical Analysis and ODEs',
    100: 'Other Computer Science',
    101: 'Geometric Topology',
    102: 'Pricing of Securities',
    103: 'High Energy Physics - Experiment',
    104: 'Category Theory',
    105: 'Human-Computer Interaction',
    106: 'Biological Physics',
    107: 'Popular Physics',
    108: 'Probability',
    109: 'Commutative Algebra',
    110: 'Strongly Correlated Electrons',
    111: 'Group Theory',
    112: 'Computation',
    113: 'Digital Libraries',
    114: 'Classical Physics',
    115: 'Neurons and Cognition',
    116: 'Operator Algebras',
    117: 'Tissues and Organs',
    118: 'High Energy Physics - Lattice',
    119: 'Robotics',
    120: 'Portfolio Management',
    121: 'Computational Complexity',
    122: 'Soft Condensed Matter',
    123: 'Mathematical Software',
    124: 'Applied Physics',
    125: 'Computer Science and Game Theory',
    126: 'Multimedia',
    127: 'Molecular Networks',
    128: 'Disordered Systems and Neural Networks',
    129: 'Other Statistics',
    130: 'Cell Behavior',
    131: 'Performance',
    132: 'Biomolecules',
    133: 'Astrophysics of Galaxies',
    134: 'Databases',
    135: 'Algebraic Topology',
    136: 'Cellular Automata and Lattice Gases',
    137: 'Algebraic Geometry',
}

# Fallback cumulative-probability threshold used when the user's input
# cannot be parsed; the warning text shown to the user mentions 95%.
DEFAULT_THRESHOLD = 0.95


@st.cache_resource
def load_model():
    """Load the fine-tuned classifier and its tokenizer once per process.

    Returns:
        A ``(model, tokenizer)`` tuple. The model is read from the local
        ``checkpoint`` directory and configured with one output per entry
        of ``id_to_cat``.

    Raises:
        OSError: propagated from ``from_pretrained`` when the tokenizer or
            the checkpoint cannot be found or read.
    """
    tokenizer = AutoTokenizer.from_pretrained('distilbert-base-cased')
    model = AutoModelForSequenceClassification.from_pretrained(
        'checkpoint',
        num_labels=len(id_to_cat),
        problem_type="multi_label_classification",
    )
    return model, tokenizer


@st.cache_resource
def _load_classifier():
    """Build the text-classification pipeline once and reuse it.

    Constructing a pipeline on every button press is wasted work, so the
    instance is cached alongside the model. ``truncation=True`` makes the
    tokenizer cut inputs that exceed the model's maximum sequence length
    instead of letting the forward pass fail on long abstracts.
    """
    cached_model, cached_tokenizer = load_model()
    return pipeline(
        "text-classification",
        model=cached_model,
        tokenizer=cached_tokenizer,
        top_k=len(id_to_cat),
        truncation=True,
    )


try:
    model, tokenizer = load_model()
except OSError as e:
    st.error(f"Ошибка при загрузке модели: {e}")
    st.stop()


def classify_text(title: str, description: str):
    """Score every category for a paper and normalise the scores.

    Args:
        title: Paper title; may be empty.
        description: Short abstract; may be empty.

    Returns:
        A list of ``(category_name, probability)`` tuples in the order
        produced by the pipeline, with probabilities rescaled so that they
        sum to 1. Returns an empty list when both inputs are blank, when
        the pipeline raises (the error is shown via ``st.error``), or when
        all scores are zero.
    """
    # Join only the non-empty parts so a missing title/abstract does not
    # leave stray leading or trailing whitespace in the model input.
    text = " ".join(part for part in (title.strip(), description.strip()) if part)
    if not text:
        return []

    try:
        classifier = _load_classifier()
        results = classifier(text)
    except Exception as e:
        st.error(f"Ошибка при классификации текста: {e}")
        return []

    # Labels come back as "LABEL_<index>"; the numeric suffix indexes
    # id_to_cat. results[0] holds the per-label dicts for the single input.
    res = [
        (id_to_cat[int(entry['label'].split('_')[1])], entry['score'])
        for entry in results[0]
    ]
    total = sum(score for _, score in res)
    if not total:
        # Nothing to normalise by; avoid ZeroDivisionError.
        return []
    return [(label, score / total) for label, score in res]


def _parse_threshold(raw: str):
    """Convert the user's threshold text to a fraction in ``(0, 1]``.

    Values greater than 1 are interpreted as percentages (``95`` -> 0.95).

    Returns:
        The threshold as a float, or ``None`` when ``raw`` is not a number
        or falls outside the accepted range (including NaN and infinity).
    """
    try:
        value = float(raw)
    except ValueError:
        return None
    if value > 1:
        value /= 100
    # NaN fails both comparisons and infinity fails the upper bound.
    return value if 0 < value <= 1 else None


st.title("🔬 Классификация научных статей")
st.markdown("Введите заголовок и краткое описание научной статьи, чтобы определить её тематические категории.")

title = st.text_input("📝 Заголовок статьи", placeholder="Например: Deep Learning for Image Recognition")
description = st.text_area("🧾 Краткое описание статьи", height=150, placeholder="Кратко опишите содержание статьи...")
top_percent = st.text_input(
    "📊 Порог суммарной вероятности для предсказанных категорий (например, 95 или 0.95 для top 95%)",
    value="95",
)

if st.button("🚀 Классифицировать"):
    # Whitespace-only input carries no information for the model, so it is
    # treated the same as an empty field.
    if not title.strip() and not description.strip():
        st.warning("Пожалуйста, введите заголовок или описание статьи.")
    else:
        threshold = _parse_threshold(top_percent)
        if threshold is None:
            st.warning("Некорректное значение для порога вероятности. Используем значение по умолчанию: 95%.")
            threshold = DEFAULT_THRESHOLD

        with st.spinner("🔍 Классификация..."):
            results = classify_text(title, description)

        if results:
            cumulative_prob = 0.0
            # ':g' avoids the truncation of int(): 0.57 * 100 evaluates to
            # 56.99999999999999, which int() would display as 56.
            st.subheader(f"📚 Топ категорий (до {threshold * 100:g}% совокупной вероятности):")
            for label, score in results:
                st.write(f"- **{label}**: {score*100:.2f}%")
                cumulative_prob += score
                if cumulative_prob >= threshold:
                    break
        else:
            st.info("Не удалось получить результаты классификации.")
elif title or description:
    st.warning("Нажмите кнопку 'Классифицировать', чтобы получить результат.")