from pathlib import Path
from leaderboard_tab import (
create_leaderboard_tab,
search_leaderboard,
update_columns_to_show,
)
from utils import load_json_results
# Constants
RETRIEVAL_ABOUT_SECTION = """
## About Retrieval Evaluation
The retrieval evaluation assesses a model's ability to find and retrieve relevant information from a large corpus of Arabic text. Models are evaluated on:
### Web Search Dataset Metrics
- **MRR (Mean Reciprocal Rank)**: Measures the ranking quality by focusing on the position of the first relevant result
- **nDCG (Normalized Discounted Cumulative Gain)**: Evaluates the ranking quality considering all relevant results
- **Recall@5**: Measures the proportion of relevant documents found in the top 5 results
- **Overall Score**: Combined score calculated as the average of MRR, nDCG, and Recall@5
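
As a toy illustration of how these per-query metrics combine (a sketch, not the exact evaluation harness used here):

```python
import math

def mrr(rels):  # rels: binary relevance of ranked results, e.g. [0, 1, 0]
    return next((1.0 / i for i, r in enumerate(rels, 1) if r), 0.0)

def ndcg(rels):
    dcg = sum(r / math.log2(i + 1) for i, r in enumerate(rels, 1))
    idcg = sum(r / math.log2(i + 1) for i, r in enumerate(sorted(rels, reverse=True), 1))
    return dcg / idcg if idcg else 0.0

def recall_at_5(rels, n_relevant):
    return sum(rels[:5]) / n_relevant

rels = [1, 0, 1, 0, 0]  # relevance of the top-5 retrieved documents
overall = (mrr(rels) + ndcg(rels) + recall_at_5(rels, n_relevant=2)) / 3
```

Per-query values are then averaged over the whole dataset.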
### Model Requirements
- Must support Arabic text embeddings
- Should handle input sequences of at least 512 tokens
- Must work with `sentence-transformers` library
### Evaluation Process
1. Models process Arabic web search queries
2. Retrieved documents are evaluated using:
- MRR for first relevant result positioning
- nDCG for overall ranking quality
- Recall@5 for top results accuracy
3. Metrics are averaged to calculate the overall score
4. Models are ranked based on their overall performance
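
Schematically, steps 3 and 4 roll up like this (illustrative only, assuming a per-query metrics table):

```python
import pandas as pd

per_query = pd.DataFrame(
    {"MRR": [1.0, 0.5], "nDCG": [0.92, 0.63], "Recall@5": [1.0, 0.5]}
)
dataset_metrics = per_query.mean()      # step 3: average each metric over queries
overall_score = dataset_metrics.mean()  # combine into the overall score
```

Models are then sorted by this overall score to produce the leaderboard ranking.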
### How to Prepare Your Model
- Ensure your model is publicly available on the Hugging Face Hub (we don't support private model evaluations yet)
- Model should output fixed-dimension embeddings for text
- Support batch processing for efficient evaluation (this is the default if you use `sentence-transformers`)
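
A quick way to sanity-check these requirements (the model id below is a placeholder):

```python
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("your-org/your-arabic-embedder")  # any public Hub id
embeddings = model.encode(
    ["استعلام تجريبي", "وثيقة تجريبية"],  # a sample Arabic query and document
    batch_size=32,  # encode() batches inputs by default
)
print(embeddings.shape)  # (2, embedding_dimension) — fixed-dimension output
```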
"""
# Global variables
retrieval_df = None
def load_retrieval_leaderboard():
"""Load and prepare the retrieval leaderboard data"""
global retrieval_df
# Prepare retrieval dataframe
dataframe_path = Path(__file__).parent / "results" / "retrieval_results.json"
retrieval_df = load_json_results(
dataframe_path, True, "Average Score", drop_cols=["Revision", "Task"]
)
    # Prepend a 1-based Rank column following the row order returned by load_json_results
    retrieval_df.insert(0, "Rank", range(1, 1 + len(retrieval_df)))
return retrieval_df
def retrieval_search_leaderboard(model_name, columns_to_show):
"""Search function for retrieval leaderboard"""
return search_leaderboard(retrieval_df, model_name, columns_to_show)
def update_retrieval_columns_to_show(columns_to_show):
"""Update displayed columns for retrieval leaderboard"""
return update_columns_to_show(retrieval_df, columns_to_show)
def create_retrieval_tab():
"""Create the complete retrieval leaderboard tab"""
global retrieval_df
# Load data if not already loaded
if retrieval_df is None:
retrieval_df = load_retrieval_leaderboard()
# Define default columns to show
default_columns = [
"Rank",
"Model",
"Average Score",
"Model Size (MB)",
"Context Length",
"Embedding Dimension",
"Web Search Dataset",
"Islamic Knowledge Dataset",
]
# Create and return the tab
return create_leaderboard_tab(
df=retrieval_df,
initial_columns_to_show=default_columns,
search_function=retrieval_search_leaderboard,
update_function=update_retrieval_columns_to_show,
about_section=RETRIEVAL_ABOUT_SECTION,
task_type="Retriever",
)
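

# A minimal usage sketch (assumption: create_leaderboard_tab builds its Gradio
# components inside the caller's active Blocks context, as the tab modules in
# this Space appear to do). Run this file directly to preview the tab alone.
if __name__ == "__main__":
    import gradio as gr

    with gr.Blocks() as demo:
        create_retrieval_tab()
    demo.launch()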