Spaces:

1NEYRON1
/

Topic_classification_for_scientific_articles

Running

Topic_classification_for_scientific_articles

File size: 7,930 Bytes

import streamlit as st
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification

# Integer label id -> human-readable category name.
# The position of each name in the tuple below IS its label id (0-based), so
# entries must never be reordered, inserted in the middle, or removed.
id_to_cat = dict(enumerate((
    # 0-9
    'Cryptography and Security',
    'Medical Physics',
    'Audio and Speech Processing',
    'Combinatorics',
    'Information Theory',
    'Quantum Physics',
    'Nuclear Theory',
    'Computers and Society',
    'Pattern Formation and Solitons',
    'General Finance',
    # 10-19
    'Multiagent Systems',
    'Trading and Market Microstructure',
    'Mesoscale and Nanoscale Physics',
    'Instrumentation and Detectors',
    'Emerging Technologies',
    'Software Engineering',
    'Computational Physics',
    'Econometrics',
    'Materials Science',
    'Computer Vision and Pattern Recognition',
    # 20-29
    'Differential Geometry',
    'General Literature',
    'Computation and Language',
    'Superconductivity',
    'Risk Management',
    'Other Condensed Matter',
    'Other Quantitative Biology',
    'High Energy Physics - Phenomenology',
    'Analysis of PDEs',
    'Earth and Planetary Astrophysics',
    # 30-39
    'Optics',
    'Hardware Architecture',
    'Optimization and Control',
    'Methodology',
    'Number Theory',
    'General Topology',
    'Populations and Evolution',
    'Solar and Stellar Astrophysics',
    'Distributed, Parallel, and Cluster Computing',
    'Chaotic Dynamics',
    # 40-49
    'History and Philosophy of Physics',
    'Computational Engineering, Finance, and Science',
    'Discrete Mathematics',
    'Statistical Mechanics',
    'Operating Systems',
    'Data Structures and Algorithms',
    'Geophysics',
    'Quantum Algebra',
    'Systems and Control',
    'Statistics Theory',
    # 50-59
    'High Energy Physics - Theory',
    'Rings and Algebras',
    'Neural and Evolutionary Computing',
    'General Physics',
    'Computational Geometry',
    'Signal Processing',
    'Computational Finance',
    'History and Overview',
    'Space Physics',
    'Physics and Society',
    # 60-69
    'Cosmology and Nongalactic Astrophysics',
    'Information Retrieval',
    'Symbolic Computation',
    'Statistical Finance',
    'Image and Video Processing',
    'Quantum Gases',
    'Artificial Intelligence',
    'Nuclear Experiment',
    'General Mathematics',
    'Complex Variables',
    # 70-79
    'Logic in Computer Science',
    'Data Analysis, Statistics and Probability',
    'Fluid Dynamics',
    'Dynamical Systems',
    'High Energy Astrophysical Phenomena',
    'Programming Languages',
    'Mathematical Physics',
    'Logic',
    'Social and Information Networks',
    'Numerical Analysis',
    # 80-89
    'Sound',
    'Chemical Physics',
    'Genomics',
    'Instrumentation and Methods for Astrophysics',
    'Applications',
    'Representation Theory',
    'Machine Learning',
    'Formal Languages and Automata Theory',
    'Quantitative Methods',
    'Atmospheric and Oceanic Physics',
    # 90-99
    'Subcellular Processes',
    'Networking and Internet Architecture',
    'Functional Analysis',
    'Metric Geometry',
    'General Relativity and Quantum Cosmology',
    'Spectral Theory',
    'Graphics',
    'Adaptation and Self-Organizing Systems',
    'Economics',
    'Classical Analysis and ODEs',
    # 100-109
    'Other Computer Science',
    'Geometric Topology',
    'Pricing of Securities',
    'High Energy Physics - Experiment',
    'Category Theory',
    'Human-Computer Interaction',
    'Biological Physics',
    'Popular Physics',
    'Probability',
    'Commutative Algebra',
    # 110-119
    'Strongly Correlated Electrons',
    'Group Theory',
    'Computation',
    'Digital Libraries',
    'Classical Physics',
    'Neurons and Cognition',
    'Operator Algebras',
    'Tissues and Organs',
    'High Energy Physics - Lattice',
    'Robotics',
    # 120-129
    'Portfolio Management',
    'Computational Complexity',
    'Soft Condensed Matter',
    'Mathematical Software',
    'Applied Physics',
    'Computer Science and Game Theory',
    'Multimedia',
    'Molecular Networks',
    'Disordered Systems and Neural Networks',
    'Other Statistics',
    # 130-137
    'Cell Behavior',
    'Performance',
    'Biomolecules',
    'Astrophysics of Galaxies',
    'Databases',
    'Algebraic Topology',
    'Cellular Automata and Lattice Gases',
    'Algebraic Geometry',
)))

@st.cache_resource
def load_model():
    """Build the sequence-classification model and its tokenizer.

    The tokenizer is created from ``'distilbert-base-cased'`` and the model
    from ``'checkpoint'``, configured with one label per ``id_to_cat`` entry
    and the multi-label problem type. The ``st.cache_resource`` decorator
    lets Streamlit reuse the returned objects across script reruns.

    Returns:
        tuple: ``(model, tokenizer)``.
    """
    tokenizer = AutoTokenizer.from_pretrained('distilbert-base-cased')

    model_options = {
        'num_labels': len(id_to_cat),
        'problem_type': "multi_label_classification",
    }
    model = AutoModelForSequenceClassification.from_pretrained(
        'checkpoint', **model_options
    )
    return model, tokenizer

# Load the model and tokenizer before the UI is built. If loading raises an
# OSError, show the message on the page and stop this script run.
try:
    model, tokenizer = load_model()
except OSError as load_error:
    st.error(f"Ошибка при загрузке модели: {load_error}")
    st.stop()

def _category_for_label(label):
    """Translate a pipeline label such as ``LABEL_12`` into a category name."""
    try:
        return id_to_cat[int(label.split('_')[1])]
    except (IndexError, ValueError, KeyError):
        # The label is not in the ``<prefix>_<id>`` form or the id is unknown:
        # show the raw label rather than crash the page.
        return label


def classify_text(title, description):
    """Classify an article by its title and description.

    Args:
        title: Article title; surrounding whitespace is ignored.
        description: Article abstract/summary; surrounding whitespace is
            ignored.

    Returns:
        list[tuple[str, float]]: ``(category, share)`` pairs ordered by
        descending share, where the shares are the pipeline scores rescaled
        to sum to 1. An empty list is returned when both inputs are blank,
        when the pipeline fails (the error is reported via ``st.error``), or
        when the scores do not add up to a positive number.
    """
    # Join only the non-empty parts so that a missing field leaves no stray
    # whitespace, and skip the model entirely when there is nothing to read.
    text = " ".join(part for part in (title.strip(), description.strip()) if part)
    if not text:
        return []

    try:
        classifier = pipeline("text-classification", model=model, tokenizer=tokenizer, top_k=len(id_to_cat))
        # truncation=True: over-long inputs are cut to the model's maximum
        # sequence length instead of making the forward pass raise.
        results = classifier(text, truncation=True)
    except Exception as e:
        st.error(f"Ошибка при классификации текста: {e}")
        return []

    # For a single input the pipeline returns either [[{...}, ...]] or
    # [{...}, ...] depending on the transformers version / call form.
    entries = results[0] if results and isinstance(results[0], list) else results

    res = sorted(
        ((_category_for_label(entry['label']), entry['score']) for entry in entries),
        key=lambda pair: pair[1],
        reverse=True,  # callers accumulate from the most likely category
    )
    total = sum(score for _, score in res)
    if total <= 0:
        # Nothing to normalise by; avoid a ZeroDivisionError.
        return []
    return [(label, score / total) for label, score in res]

# ---- Page layout: header and input widgets ----
st.title("🔬 Классификация научных статей")
st.markdown("Введите заголовок и краткое описание научной статьи, чтобы определить её тематические категории.")

title = st.text_input("📝 Заголовок статьи", placeholder="Например: Deep Learning for Image Recognition")
description = st.text_area("🧾 Краткое описание статьи", height=150, placeholder="Кратко опишите содержание статьи...")
top_percent = st.text_input("📊 Порог вероятности (например, 95 или 0.95 для top 95%)", value="95")

if st.button("🚀 Классифицировать"):
    # Whitespace-only fields count as empty.
    if not title.strip() and not description.strip():
        st.warning("Пожалуйста, введите заголовок или описание статьи.")
    else:
        # Parse the cumulative-probability threshold. Values above 1 are read
        # as percentages; anything outside (0, 1] (including NaN/inf or
        # non-numeric text) falls back to the 95% default.
        try:
            t = float(top_percent)
            if t > 1:
                t = t / 100
            if not (0 < t <= 1):
                raise ValueError()
        except ValueError:
            st.warning("Некорректное значение для порога вероятности. Используем значение по умолчанию: 95%.")
            t = 0.95

        with st.spinner("🔍 Классификация..."):
            results = classify_text(title, description)

            if results:
                cumulative_prob = 0.0
                # The 'g' format is used instead of int(): int() truncates
                # float error (0.57 * 100 == 56.99999999999999 -> 56) and
                # drops fractional thresholds such as 99.5.
                st.subheader(f"📚 Топ категорий (до {t * 100:g}% совокупной вероятности):")
                # List categories until their combined share reaches the
                # threshold; the category that crosses it is still shown.
                for label, score in results:
                    st.write(f"- **{label}**: {score*100:.4f}%")
                    cumulative_prob += score
                    if cumulative_prob >= t:
                        break
            else:
                st.info("Не удалось получить результаты классификации.")
elif title or description:
    # Text has been entered but the button was not pressed on this rerun.
    st.warning("Нажмите кнопку 'Классифицировать', чтобы получить результат.")