import streamlit as st

from .streamlit_utils import (
    make_multiselect,
    make_selectbox,
    make_text_area,
    make_text_input,
    make_radio,
)

# Number of questions in this section, used by the completion counter below.
N_FIELDS = 7


def results_page():
    # Make sure the nested card_dict entries exist before the widgets write to them.
    st.session_state.card_dict["results"] = st.session_state.card_dict.get(
        "results", {}
    )
    with st.expander("Previous Results", expanded=False):
        key_pref = ["results", "results"]
        st.session_state.card_dict["results"]["results"] = st.session_state.card_dict[
            "results"
        ].get("results", {})
        make_multiselect(
            label="What metrics are typically used for this task?",
            key_list=key_pref + ["metrics"],
            options=[
                "BERT-Score",
                "BLEU",
                "BLEURT",
                "ChrF",
                "Entailment",
                "FeQA",
                "METEOR",
                "MoverScore",
                "QAGS",
                "ROUGE",
                "WER",
                "Other: Other Metrics",
            ],
            help="Select all metrics that are typically used when evaluating models for this task.",
        )
        make_text_area(
            label="Describe the metrics and evaluation methodology that the dataset creators used when introducing this task.",
            key_list=key_pref + ["original-evaluation"],
            help="If the generation task was not evaluated when this dataset was introduced, write N/A.",
        )
        make_radio(
            label="Are previous results available?",
            options=["no", "yes"],
            key_list=key_pref + ["has-previous-results"],
            help="Have papers evaluated models on this task? If no, write N/A for the following three questions.",
        )
        make_text_area(
            label="What evaluation approaches have others used?",
            key_list=key_pref + ["modern-evaluation"],
            help="If the modern evaluation strategy diverges from the original one, describe how models are now being evaluated.",
        )
        make_text_area(
            label="What are the previous results?",
            key_list=key_pref + ["previous-results"],
            help="List the source and performance metrics for models evaluated on this dataset.",
        )
        make_text_area(
            label="Definitions",
            key_list=key_pref + ["definitions"],
            help="If the evaluation strategies in the previous questions go beyond the list of metrics above, add descriptions and/or definitions for each metric.",
        )
        make_text_area(
            label="What aspect of model ability can be measured with this dataset?",
            key_list=key_pref + ["model-abilities"],
            help="What abilities should a model that performs well on this task exhibit (e.g., reasoning capability, morphological inflection)?",
        )


def results_summary():
    # Count every answered field across all sub-sections of "results".
    total_filled = sum(
        [len(dct) for dct in st.session_state.card_dict.get("results", {}).values()]
    )
    with st.expander(
        f"Previous Results Completion - {total_filled} of {N_FIELDS}", expanded=False
    ):
        completion_markdown = ""
        completion_markdown += (
            f"- **Overall completion:**\n - {total_filled} of {N_FIELDS} fields\n"
        )
        completion_markdown += (
            "- **Sub-section - Previous Results:**\n"
            f" - {len(st.session_state.card_dict.get('results', {}).get('results', {}))} of {N_FIELDS} fields\n"
        )
        st.markdown(completion_markdown)
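

# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original module): assuming this
# file sits in a package next to `streamlit_utils` and is imported from the
# app's entry point, the two helpers could be wired up roughly as below. The
# `results` module name and the bare `card_dict` initialization are
# assumptions for the sketch, not something this file guarantees.
#
#     import streamlit as st
#     from .results import results_page, results_summary
#
#     if "card_dict" not in st.session_state:
#         st.session_state.card_dict = {}
#     results_page()      # renders the "Previous Results" form fields
#     results_summary()   # renders the "N of 7 fields" completion expander
# ---------------------------------------------------------------------------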