# Source: Hugging Face Space "1NEYRON1/Topic_classification_for_scientific_articles"
# Space status: Running
# File size: 8,434 bytes

import streamlit as st
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification

# Maps the classifier's integer class index to a human-readable category name.
# The number of entries is also used as num_labels when the model is loaded,
# and the mapping is used to translate pipeline labels into category names.
# NOTE(review): the names look like arXiv subject categories, and the index
# order presumably has to match the label encoding used when the checkpoint
# was trained — confirm before reordering or editing entries.
id_to_cat = {0: 'Performance',
 1: 'Molecular Networks',
 2: 'Operating Systems',
 3: 'High Energy Astrophysical Phenomena',
 4: 'Computational Finance',
 5: 'General Finance',
 6: 'Astrophysics of Galaxies',
 7: 'Portfolio Management',
 8: 'Functional Analysis',
 9: 'Quantitative Methods',
 10: 'Mathematical Software',
 11: 'Computation',
 12: 'Chemical Physics',
 13: 'Information Theory',
 14: 'Classical Physics',
 15: 'Subcellular Processes',
 16: 'Medical Physics',
 17: 'Differential Geometry',
 18: 'Biomolecules',
 19: 'Metric Geometry',
 20: 'Cryptography and Security',
 21: 'Instrumentation and Methods for Astrophysics',
 22: 'General Mathematics',
 23: 'Computational Complexity',
 24: 'Soft Condensed Matter',
 25: 'Analysis of PDEs',
 26: 'Human-Computer Interaction',
 27: 'Classical Analysis and ODEs',
 28: 'Genomics',
 29: 'Optimization and Control',
 30: 'Applied Physics',
 31: 'Computational Engineering, Finance, and Science',
 32: 'Quantum Algebra',
 33: 'Other Condensed Matter',
 34: 'Category Theory',
 35: 'Popular Physics',
 36: 'General Topology',
 37: 'Algebraic Topology',
 38: 'Trading and Market Microstructure',
 39: 'Numerical Analysis',
 40: 'Applications',
 41: 'Group Theory',
 42: 'Cosmology and Nongalactic Astrophysics',
 43: 'Mathematical Physics',
 44: 'Econometrics',
 45: 'Systems and Control',
 46: 'Graphics',
 47: 'Data Structures and Algorithms',
 48: 'Operator Algebras',
 49: 'Number Theory',
 50: 'Robotics',
 51: 'Nuclear Theory',
 52: 'Neural and Evolutionary Computing',
 53: 'Multimedia',
 54: 'Information Retrieval',
 55: 'Image and Video Processing',
 56: 'Rings and Algebras',
 57: 'Instrumentation and Detectors',
 58: 'Social and Information Networks',
 59: 'High Energy Physics - Lattice',
 60: 'Emerging Technologies',
 61: 'Strongly Correlated Electrons',
 62: 'Representation Theory',
 63: 'Space Physics',
 64: 'Risk Management',
 65: 'Disordered Systems and Neural Networks',
 66: 'Databases',
 67: 'Networking and Internet Architecture',
 68: 'Computers and Society',
 69: 'Hardware Architecture',
 70: 'Chaotic Dynamics',
 71: 'Mesoscale and Nanoscale Physics',
 72: 'Computational Geometry',
 73: 'Commutative Algebra',
 74: 'Statistics Theory',
 75: 'General Literature',
 76: 'Physics and Society',
 77: 'Geophysics',
 78: 'Economics',
 79: 'Quantum Physics',
 80: 'Symbolic Computation',
 81: 'Computational Physics',
 82: 'Sound',
 83: 'Multiagent Systems',
 84: 'Signal Processing',
 85: 'Adaptation and Self-Organizing Systems',
 86: 'Other Computer Science',
 87: 'Other Quantitative Biology',
 88: 'Formal Languages and Automata Theory',
 89: 'Populations and Evolution',
 90: 'Spectral Theory',
 91: 'Pattern Formation and Solitons',
 92: 'Methodology',
 93: 'Biological Physics',
 94: 'General Physics',
 95: 'Logic in Computer Science',
 96: 'Complex Variables',
 97: 'Optics',
 98: 'Discrete Mathematics',
 99: 'History and Overview',
 100: 'Programming Languages',
 101: 'Audio and Speech Processing',
 102: 'Algebraic Geometry',
 103: 'Neurons and Cognition',
 104: 'High Energy Physics - Phenomenology',
 105: 'History and Philosophy of Physics',
 106: 'Earth and Planetary Astrophysics',
 107: 'Pricing of Securities',
 108: 'Distributed, Parallel, and Cluster Computing',
 109: 'Tissues and Organs',
 110: 'Cellular Automata and Lattice Gases',
 111: 'Statistical Finance',
 112: 'Materials Science',
 113: 'High Energy Physics - Theory',
 114: 'Digital Libraries',
 115: 'Other Statistics',
 116: 'Superconductivity',
 117: 'Cell Behavior',
 118: 'General Relativity and Quantum Cosmology',
 119: 'Dynamical Systems',
 120: 'Statistical Mechanics',
 121: 'Fluid Dynamics',
 122: 'Computer Science and Game Theory',
 123: 'Logic',
 124: 'Computer Vision and Pattern Recognition',
 125: 'Solar and Stellar Astrophysics',
 126: 'High Energy Physics - Experiment',
 127: 'Software Engineering',
 128: 'Combinatorics',
 129: 'Data Analysis, Statistics and Probability',
 130: 'Machine Learning',
 131: 'Probability',
 132: 'Atmospheric and Oceanic Physics',
 133: 'Geometric Topology',
 134: 'Computation and Language',
 135: 'Quantum Gases',
 136: 'Nuclear Experiment',
 137: 'Artificial Intelligence'}

# Load the classifier (point model_name at your own checkpoint if needed).
model_name = 'checkpoint'


@st.cache_resource
def _load_model_and_tokenizer(checkpoint, num_labels):
    """Load the tokenizer and the sequence-classification model once per process.

    Streamlit re-executes the whole script on every widget interaction; caching
    the loaded objects avoids re-reading the weights on each rerun.

    Args:
        checkpoint (str): Model name or path passed to ``from_pretrained``.
        num_labels (int): Number of output classes of the classification head.

    Returns:
        tuple: ``(tokenizer, model)``.

    Raises:
        OSError: Propagated from ``from_pretrained`` when loading fails.
    """
    loaded_tokenizer = AutoTokenizer.from_pretrained('distilbert-base-cased')
    loaded_model = AutoModelForSequenceClassification.from_pretrained(
        checkpoint,
        num_labels=num_labels,
        problem_type="multi_label_classification"
    )
    return loaded_tokenizer, loaded_model


try:
    tokenizer, model = _load_model_and_tokenizer(model_name, len(id_to_cat))
except OSError as e:
    st.error(f"Ошибка загрузки модели: {e}. Убедитесь, что модель доступна или укажите другую.")
    st.stop()  # Halt the app: nothing below can work without a model


def _category_name(raw_label):
    """Translate a pipeline label such as ``'LABEL_12'`` into a category name.

    Labels that do not end in ``_<number>``, or whose number is not a key of
    ``id_to_cat``, are returned unchanged instead of raising.
    """
    index = raw_label.rpartition('_')[2]
    if index.isdigit():
        return id_to_cat.get(int(index), raw_label)
    return raw_label


def classify_text(title, description, show_all=False, threshold=0.95):
    """
    Classify an article and return its categories with their scores.

    Args:
        title (str): Article title.
        description (str): Short description of the article.
        show_all (bool): If True, return every category regardless of the threshold.
        threshold (float): Cumulative score at which the result list is cut off
            (ignored when show_all is True).

    Returns:
        list: ``(category name, score)`` tuples in the order produced by the
            pipeline; an empty list if classification failed (the error is
            reported through ``st.error``).

    NOTE(review): the cut-off sums scores as if they were a probability
    distribution. The model is loaded with
    ``problem_type="multi_label_classification"``, which presumably yields
    independent per-label scores that need not sum to 1 — confirm that the
    threshold semantics are what is intended.
    """
    text = f"{title} {description}"  # The model receives title and description as one string
    topic_classifier = pipeline("text-classification", model=model, tokenizer=tokenizer, top_k=len(id_to_cat))
    try:
        results = topic_classifier(text)
    except Exception as e:
        st.error(f"Ошибка классификации: {e}")
        return []

    # For a single input string the pipeline output is expected to be nested
    # one level ([[{...}, ...]]); accept a flat list of dicts as well.
    predictions = results[0] if results and isinstance(results[0], list) else results

    # Build fresh tuples rather than mutating the pipeline's output in place.
    ranked = [(_category_name(item['label']), item['score']) for item in predictions]
    if show_all:
        return ranked

    # Keep categories until their accumulated score reaches the threshold; the
    # entry that crosses the threshold is included.
    cumulative_prob = 0.0
    filtered_results = []
    for label, score in ranked:
        filtered_results.append((label, score))
        cumulative_prob += score
        if cumulative_prob >= threshold:
            break
    return filtered_results


# --- Streamlit interface ---
st.title("Классификация статей")

# Input fields
title = st.text_input("Заголовок статьи")
description = st.text_area("Краткое описание статьи", height=150)

# Streamlit reruns the whole script on every interaction, and a button is True
# only during the rerun triggered by its own click. The last classification is
# therefore kept in session state so that the "show all" button, which causes
# another rerun, still has something to work with.
_LAST_RUN_KEY = "last_classification"

# "Classify" button
if st.button("Классифицировать"):
    if not title or not description:
        # Both fields are required; drop any stale results from a previous run.
        if _LAST_RUN_KEY in st.session_state:
            del st.session_state[_LAST_RUN_KEY]
        st.warning("Пожалуйста, заполните все поля.")
    else:
        with st.spinner("Идет классификация..."):  # Progress indicator
            st.session_state[_LAST_RUN_KEY] = {
                "title": title,
                "description": description,
                "results": classify_text(title, description),
            }
elif bool(title) != bool(description):
    # Warn only when exactly one field is filled: no warning on the initial
    # empty page and none when both fields are already filled.
    st.warning("Пожалуйста, заполните все поля.")

last_run = st.session_state.get(_LAST_RUN_KEY)
if last_run is not None:
    if last_run["results"]:
        st.subheader("Результаты классификации (с ограничением по вероятности):")
        for label, score in last_run["results"]:
            st.write(f"- **{label}**: {score:.4f}")

        # "Show all" button: classifies the same text that produced the results
        # above, even if the input fields have been edited since.
        if st.button("Показать все категории"):
            with st.spinner("Идет классификация..."):
                all_results = classify_text(
                    last_run["title"], last_run["description"], show_all=True
                )
            st.subheader("Полные результаты классификации:")
            for label, score in all_results:
                st.write(f"- **{label}**: {score:.4f}")
    else:
        st.info("Не удалось получить результаты классификации.")