Spaces:

1NEYRON1
/

Topic_classification_for_scientific_articles

Sleeping

App Files Files Community

1NEYRON1 committed on Apr 6

Commit

6a4a9f2

1 Parent(s): 0c1b0b7

Update app.py

Browse files

Files changed (1) hide show

app.py +180 -194

app.py CHANGED Viewed

@@ -1,147 +1,148 @@
 import streamlit as st
 from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
-id_to_cat = {0: 'Cryptography and Security',
-             1: 'Medical Physics',
-             2: 'Audio and Speech Processing',
-             3: 'Combinatorics',
-             4: 'Information Theory',
-             5: 'Quantum Physics',
-             6: 'Nuclear Theory',
-             7: 'Computers and Society',
-             8: 'Pattern Formation and Solitons',
-             9: 'General Finance',
-             10: 'Multiagent Systems',
-             11: 'Trading and Market Microstructure',
-             12: 'Mesoscale and Nanoscale Physics',
-             13: 'Instrumentation and Detectors',
-             14: 'Emerging Technologies',
-             15: 'Software Engineering',
-             16: 'Computational Physics',
-             17: 'Econometrics',
-             18: 'Materials Science',
-             19: 'Computer Vision and Pattern Recognition',
-             20: 'Differential Geometry',
-             21: 'General Literature',
-             22: 'Computation and Language',
-             23: 'Superconductivity',
-             24: 'Risk Management',
-             25: 'Other Condensed Matter',
-             26: 'Other Quantitative Biology',
-             27: 'High Energy Physics - Phenomenology',
-             28: 'Analysis of PDEs',
-             29: 'Earth and Planetary Astrophysics',
-             30: 'Optics',
-             31: 'Hardware Architecture',
-             32: 'Optimization and Control',
-             33: 'Methodology',
-             34: 'Number Theory',
-             35: 'General Topology',
-             36: 'Populations and Evolution',
-             37: 'Solar and Stellar Astrophysics',
-             38: 'Distributed, Parallel, and Cluster Computing',
-             39: 'Chaotic Dynamics',
-             40: 'History and Philosophy of Physics',
-             41: 'Computational Engineering, Finance, and Science',
-             42: 'Discrete Mathematics',
-             43: 'Statistical Mechanics',
-             44: 'Operating Systems',
-             45: 'Data Structures and Algorithms',
-             46: 'Geophysics',
-             47: 'Quantum Algebra',
-             48: 'Systems and Control',
-             49: 'Statistics Theory',
-             50: 'High Energy Physics - Theory',
-             51: 'Rings and Algebras',
-             52: 'Neural and Evolutionary Computing',
-             53: 'General Physics',
-             54: 'Computational Geometry',
-             55: 'Signal Processing',
-             56: 'Computational Finance',
-             57: 'History and Overview',
-             58: 'Space Physics',
-             59: 'Physics and Society',
-             60: 'Cosmology and Nongalactic Astrophysics',
-             61: 'Information Retrieval',
-             62: 'Symbolic Computation',
-             63: 'Statistical Finance',
-             64: 'Image and Video Processing',
-             65: 'Quantum Gases',
-             66: 'Artificial Intelligence',
-             67: 'Nuclear Experiment',
-             68: 'General Mathematics',
-             69: 'Complex Variables',
-             70: 'Logic in Computer Science',
-             71: 'Data Analysis, Statistics and Probability',
-             72: 'Fluid Dynamics',
-             73: 'Dynamical Systems',
-             74: 'High Energy Astrophysical Phenomena',
-             75: 'Programming Languages',
-             76: 'Mathematical Physics',
-             77: 'Logic',
-             78: 'Social and Information Networks',
-             79: 'Numerical Analysis',
-             80: 'Sound',
-             81: 'Chemical Physics',
-             82: 'Genomics',
-             83: 'Instrumentation and Methods for Astrophysics',
-             84: 'Applications',
-             85: 'Representation Theory',
-             86: 'Machine Learning',
-             87: 'Formal Languages and Automata Theory',
-             88: 'Quantitative Methods',
-             89: 'Atmospheric and Oceanic Physics',
-             90: 'Subcellular Processes',
-             91: 'Networking and Internet Architecture',
-             92: 'Functional Analysis',
-             93: 'Metric Geometry',
-             94: 'General Relativity and Quantum Cosmology',
-             95: 'Spectral Theory',
-             96: 'Graphics',
-             97: 'Adaptation and Self-Organizing Systems',
-             98: 'Economics',
-             99: 'Classical Analysis and ODEs',
-             100: 'Other Computer Science',
-             101: 'Geometric Topology',
-             102: 'Pricing of Securities',
-             103: 'High Energy Physics - Experiment',
-             104: 'Category Theory',
-             105: 'Human-Computer Interaction',
-             106: 'Biological Physics',
-             107: 'Popular Physics',
-             108: 'Probability',
-             109: 'Commutative Algebra',
-             110: 'Strongly Correlated Electrons',
-             111: 'Group Theory',
-             112: 'Computation',
-             113: 'Digital Libraries',
-             114: 'Classical Physics',
-             115: 'Neurons and Cognition',
-             116: 'Operator Algebras',
-             117: 'Tissues and Organs',
-             118: 'High Energy Physics - Lattice',
-             119: 'Robotics',
-             120: 'Portfolio Management',
-             121: 'Computational Complexity',
-             122: 'Soft Condensed Matter',
-             123: 'Mathematical Software',
-             124: 'Applied Physics',
-             125: 'Computer Science and Game Theory',
-             126: 'Multimedia',
-             127: 'Molecular Networks',
-             128: 'Disordered Systems and Neural Networks',
-             129: 'Other Statistics',
-             130: 'Cell Behavior',
-             131: 'Performance',
-             132: 'Biomolecules',
-             133: 'Astrophysics of Galaxies',
-             134: 'Databases',
-             135: 'Algebraic Topology',
-             136: 'Cellular Automata and Lattice Gases',
-             137: 'Algebraic Geometry'}
-# Загружаем модель (замените на вашу модель, если нужно)
-# @st.cache_resource
 def load_model():
     tokenizer = AutoTokenizer.from_pretrained('distilbert-base-cased')
     model = AutoModelForSequenceClassification.from_pretrained(
@@ -151,76 +152,61 @@ def load_model():
     )
     return model, tokenizer
-# Load model/tokenizer once and cache it
 try:
     model, tokenizer = load_model()
 except OSError as e:
-    st.error(f"Ошибка загрузки модели: {e}. Убедитесь, что модель доступна или укажите другую.")
     st.stop()
 def classify_text(title, description):
-    """
-    Классифицирует текст и возвращает результаты в отсортированном виде.
-    Args:
-        title (str): Заголовок текста.
-        description (str): Краткое описание текста.
-    Returns:
-        list: Отсортированный список результатов классификации.
-    """
-    text = f"{title} {description}"  # Объединяем заголовок и описание
-    topic_classifier = pipeline("text-classification", model=model, tokenizer=tokenizer, top_k = len(id_to_cat))
     try:
-        results = topic_classifier(text)
     except Exception as e:
-        st.error(f"Ошибка классификации: {e}")
         return []
-    for i in results[0]:
-        i['label'] = id_to_cat[int(i['label'].split('_')[1])]
-    filtered_results = []
-    for i in results[0]:
-        filtered_results.append((i['label'], i['score']))
-    return filtered_results
-# --- Интерфейс Streamlit ---
-st.title("Классификация статей 1")
-# Ввод данных
-title = st.text_input("Заголовок статьи")
-description = st.text_area("Краткое описание статьи", height=150)
-top = st.text_input("Top x%")
-# Кнопка "Классифицировать"
-if st.button("Классифицировать"):
     if not title and not description:
-        st.warning("Пожалуйста, заполните хотя бы одно поле.")
     else:
-        with st.spinner("Идет классификация..."):  # Индикатор загрузки
             results = classify_text(title, description)
             if results:
-              cumulative_prob = 0
-              t = 0.95
-              try:
-                  if float(top):
-                      if (float(top) >= 0) and (float(top) <= 1):
-                          t = float(top)
-                      elif (float(top) > 1):
-                          t = float(top) / 100
-              except ValueError:
-                  t = 0.95
-              st.subheader(f'Результаты классификации (top {int(min(t * 100, 100))}%):')
-              for label, score in results:
-                  st.write(f"- **{label}**: {score:.4f}")
-                  cumulative_prob += score
-                  if cumulative_prob >= t:
-                      break
             else:
                 st.info("Не удалось получить результаты классификации.")
-elif title or description: #небольшой костыль, чтобы при старте не было предупреждения
-    st.warning("Пожалуйста, заполните хотя бы одно поле.")

 import streamlit as st
 from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
+id_to_cat = {
+    0: 'Cryptography and Security',
+    1: 'Medical Physics',
+    2: 'Audio and Speech Processing',
+    3: 'Combinatorics',
+    4: 'Information Theory',
+    5: 'Quantum Physics',
+    6: 'Nuclear Theory',
+    7: 'Computers and Society',
+    8: 'Pattern Formation and Solitons',
+    9: 'General Finance',
+    10: 'Multiagent Systems',
+    11: 'Trading and Market Microstructure',
+    12: 'Mesoscale and Nanoscale Physics',
+    13: 'Instrumentation and Detectors',
+    14: 'Emerging Technologies',
+    15: 'Software Engineering',
+    16: 'Computational Physics',
+    17: 'Econometrics',
+    18: 'Materials Science',
+    19: 'Computer Vision and Pattern Recognition',
+    20: 'Differential Geometry',
+    21: 'General Literature',
+    22: 'Computation and Language',
+    23: 'Superconductivity',
+    24: 'Risk Management',
+    25: 'Other Condensed Matter',
+    26: 'Other Quantitative Biology',
+    27: 'High Energy Physics - Phenomenology',
+    28: 'Analysis of PDEs',
+    29: 'Earth and Planetary Astrophysics',
+    30: 'Optics',
+    31: 'Hardware Architecture',
+    32: 'Optimization and Control',
+    33: 'Methodology',
+    34: 'Number Theory',
+    35: 'General Topology',
+    36: 'Populations and Evolution',
+    37: 'Solar and Stellar Astrophysics',
+    38: 'Distributed, Parallel, and Cluster Computing',
+    39: 'Chaotic Dynamics',
+    40: 'History and Philosophy of Physics',
+    41: 'Computational Engineering, Finance, and Science',
+    42: 'Discrete Mathematics',
+    43: 'Statistical Mechanics',
+    44: 'Operating Systems',
+    45: 'Data Structures and Algorithms',
+    46: 'Geophysics',
+    47: 'Quantum Algebra',
+    48: 'Systems and Control',
+    49: 'Statistics Theory',
+    50: 'High Energy Physics - Theory',
+    51: 'Rings and Algebras',
+    52: 'Neural and Evolutionary Computing',
+    53: 'General Physics',
+    54: 'Computational Geometry',
+    55: 'Signal Processing',
+    56: 'Computational Finance',
+    57: 'History and Overview',
+    58: 'Space Physics',
+    59: 'Physics and Society',
+    60: 'Cosmology and Nongalactic Astrophysics',
+    61: 'Information Retrieval',
+    62: 'Symbolic Computation',
+    63: 'Statistical Finance',
+    64: 'Image and Video Processing',
+    65: 'Quantum Gases',
+    66: 'Artificial Intelligence',
+    67: 'Nuclear Experiment',
+    68: 'General Mathematics',
+    69: 'Complex Variables',
+    70: 'Logic in Computer Science',
+    71: 'Data Analysis, Statistics and Probability',
+    72: 'Fluid Dynamics',
+    73: 'Dynamical Systems',
+    74: 'High Energy Astrophysical Phenomena',
+    75: 'Programming Languages',
+    76: 'Mathematical Physics',
+    77: 'Logic',
+    78: 'Social and Information Networks',
+    79: 'Numerical Analysis',
+    80: 'Sound',
+    81: 'Chemical Physics',
+    82: 'Genomics',
+    83: 'Instrumentation and Methods for Astrophysics',
+    84: 'Applications',
+    85: 'Representation Theory',
+    86: 'Machine Learning',
+    87: 'Formal Languages and Automata Theory',
+    88: 'Quantitative Methods',
+    89: 'Atmospheric and Oceanic Physics',
+    90: 'Subcellular Processes',
+    91: 'Networking and Internet Architecture',
+    92: 'Functional Analysis',
+    93: 'Metric Geometry',
+    94: 'General Relativity and Quantum Cosmology',
+    95: 'Spectral Theory',
+    96: 'Graphics',
+    97: 'Adaptation and Self-Organizing Systems',
+    98: 'Economics',
+    99: 'Classical Analysis and ODEs',
+    100: 'Other Computer Science',
+    101: 'Geometric Topology',
+    102: 'Pricing of Securities',
+    103: 'High Energy Physics - Experiment',
+    104: 'Category Theory',
+    105: 'Human-Computer Interaction',
+    106: 'Biological Physics',
+    107: 'Popular Physics',
+    108: 'Probability',
+    109: 'Commutative Algebra',
+    110: 'Strongly Correlated Electrons',
+    111: 'Group Theory',
+    112: 'Computation',
+    113: 'Digital Libraries',
+    114: 'Classical Physics',
+    115: 'Neurons and Cognition',
+    116: 'Operator Algebras',
+    117: 'Tissues and Organs',
+    118: 'High Energy Physics - Lattice',
+    119: 'Robotics',
+    120: 'Portfolio Management',
+    121: 'Computational Complexity',
+    122: 'Soft Condensed Matter',
+    123: 'Mathematical Software',
+    124: 'Applied Physics',
+    125: 'Computer Science and Game Theory',
+    126: 'Multimedia',
+    127: 'Molecular Networks',
+    128: 'Disordered Systems and Neural Networks',
+    129: 'Other Statistics',
+    130: 'Cell Behavior',
+    131: 'Performance',
+    132: 'Biomolecules',
+    133: 'Astrophysics of Galaxies',
+    134: 'Databases',
+    135: 'Algebraic Topology',
+    136: 'Cellular Automata and Lattice Gases',
+    137: 'Algebraic Geometry'
+}
+@st.cache_resource
 def load_model():
     tokenizer = AutoTokenizer.from_pretrained('distilbert-base-cased')
     model = AutoModelForSequenceClassification.from_pretrained(
     )
     return model, tokenizer
 try:
     model, tokenizer = load_model()
 except OSError as e:
+    st.error(f"Ошибка при загрузке модели: {e}")
     st.stop()
 def classify_text(title, description):
+    text = f"{title.strip()} {description.strip()}"
     try:
+        classifier = pipeline("text-classification", model=model, tokenizer=tokenizer, top_k=len(id_to_cat))
+        results = classifier(text)
     except Exception as e:
+        st.error(f"Ошибка при классификации текста: {e}")
         return []
+    readable_results = [
+        (id_to_cat[int(entry['label'].split('_')[1])], entry['score'])
+        for entry in results[0]
+    ]
+    return readable_results
+st.set_page_config(page_title="Классификация статей", layout="wide")
+st.title("🔬 Классификация научных статей")
+st.markdown("Введите заголовок и краткое описание научной статьи, чтобы определить её тематические категории.")
+title = st.text_input("📝 Заголовок статьи", placeholder="Например: Deep Learning for Image Recognition")
+description = st.text_area("🧾 Краткое описание статьи", height=150, placeholder="Кратко опишите содержание статьи...")
+top_percent = st.text_input("📊 Порог вероятности (например, 95 или 0.95 для top 95%)", value="95")
+if st.button("🚀 Классифицировать"):
     if not title and not description:
+        st.warning("Пожалуйста, введите заголовок или описание статьи.")
     else:
+        try:
+            t = float(top_percent)
+            if t > 1:
+                t = t / 100
+            if not (0 < t <= 1):
+                raise ValueError()
+        except ValueError:
+            st.warning("Некорректное значение для порога вероятности. Используем значение по умолчанию: 95%.")
+            t = 0.95
+        with st.spinner("🔍 Классификация..."):
             results = classify_text(title, description)
             if results:
+                cumulative_prob = 0.0
+                st.subheader(f"📚 Топ категорий (до {int(t*100)}% совокупной вероятности):")
+                for label, score in results:
+                    st.write(f"- **{label}**: {score:.4f}")
+                    cumulative_prob += score
+                    if cumulative_prob >= t:
+                        break
             else:
                 st.info("Не удалось получить результаты классификации.")
+elif title or description:
+    st.warning("Нажмите кнопку 'Классифицировать', чтобы получить результат.")