from pathlib import Path

from leaderboard_tab import (
    create_leaderboard_tab,
    search_leaderboard,
    update_columns_to_show,
)
from utils import load_json_results

# Constants
RETRIEVAL_ABOUT_SECTION = """
## About Retrieval Evaluation

The retrieval evaluation assesses a model's ability to find and retrieve relevant information from a large corpus of Arabic text. Models are evaluated on:

### Web Search Dataset Metrics
- **MRR (Mean Reciprocal Rank)**: Measures the ranking quality by focusing on the position of the first relevant result
- **nDCG (Normalized Discounted Cumulative Gain)**: Evaluates the ranking quality considering all relevant results
- **Recall@5**: Measures the proportion of relevant documents found in the top 5 results
- **Overall Score**: Combined score calculated as the average of MRR, nDCG, and Recall@5
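
As a toy illustration of how these combine (hypothetical ranks for three queries, assuming one relevant document per query; real nDCG needs the full ranked list):

```python
# Positions (1-based) of the first relevant document for three queries.
first_relevant = [1, 3, 2]

# MRR: mean of reciprocal ranks of the first relevant result.
mrr = sum(1 / p for p in first_relevant) / len(first_relevant)  # ~0.611

# Recall@5 (simplified to one relevant doc per query): share of queries
# whose relevant document lands in the top 5 results.
recall_at_5 = sum(p <= 5 for p in first_relevant) / len(first_relevant)  # 1.0

# Overall Score: the average of MRR, nDCG, and Recall@5.
ndcg = 0.8  # placeholder value; nDCG is computed from the full ranking
overall = (mrr + ndcg + recall_at_5) / 3
```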

### Model Requirements
- Must support Arabic text embeddings
- Should handle queries of at least 512 tokens
- Must work with the `sentence-transformers` library
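
A quick compatibility check (the model ID below is a placeholder, not a real checkpoint):

```python
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("your-org/your-arabic-embedding-model")  # placeholder ID
print(model.max_seq_length)  # should be >= 512 for this benchmark
embedding = model.encode("ما هي عاصمة المغرب؟")  # must handle Arabic input
```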

### Evaluation Process
1. Models process Arabic web search queries
2. Retrieved documents are evaluated using:
   - MRR for first relevant result positioning
   - nDCG for overall ranking quality
   - Recall@5 for top results accuracy
3. Metrics are averaged to calculate the overall score
4. Models are ranked based on their overall performance
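
The retrieval step looks roughly like this (a sketch using `sentence-transformers` utilities, not the exact evaluation harness; the model ID is a placeholder):

```python
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("your-org/your-arabic-embedding-model")  # placeholder ID
query_emb = model.encode("استعلام بحث عربي", convert_to_tensor=True)
doc_embs = model.encode(["وثيقة أولى", "وثيقة ثانية", "وثيقة ثالثة"], convert_to_tensor=True)

# Rank documents by cosine similarity to the query, highest first;
# MRR, nDCG, and Recall@5 are then computed from this ranking.
scores = util.cos_sim(query_emb, doc_embs)[0]
ranking = scores.argsort(descending=True)
```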

### How to Prepare Your Model
- Ensure your model is publicly available on the Hugging Face Hub (we don't support private model evaluations yet)
- Make sure your model outputs fixed-dimension embeddings for text
- Support batch processing for efficient evaluation (this is the default if you use `sentence-transformers`)
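
A quick pre-submission sanity check (again with a placeholder model ID):

```python
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("your-org/your-arabic-embedding-model")  # placeholder ID
print(model.get_sentence_embedding_dimension())  # fixed output dimension
batch = model.encode(["نص أول", "نص ثان"], batch_size=32)  # batching works out of the box
print(batch.shape)  # (2, dim): one fixed-size vector per input
```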
"""

# Global variables
retrieval_df = None


def load_retrieval_leaderboard():
    """Load and prepare the retrieval leaderboard data"""
    global retrieval_df

    # Prepare retrieval dataframe
    dataframe_path = Path(__file__).parent / "results" / "retrieval_results.json"
    retrieval_df = load_json_results(
        dataframe_path, True, "Average Score", drop_cols=["Revision", "Task"]
    )
    retrieval_df.insert(0, "Rank", range(1, 1 + len(retrieval_df)))

    return retrieval_df


def retrieval_search_leaderboard(model_name, columns_to_show):
    """Search function for retrieval leaderboard"""
    return search_leaderboard(retrieval_df, model_name, columns_to_show)


def update_retrieval_columns_to_show(columns_to_show):
    """Update displayed columns for retrieval leaderboard"""
    return update_columns_to_show(retrieval_df, columns_to_show)


def create_retrieval_tab():
    """Create the complete retrieval leaderboard tab"""
    global retrieval_df

    # Load data if not already loaded
    if retrieval_df is None:
        retrieval_df = load_retrieval_leaderboard()

    # Define default columns to show
    default_columns = [
        "Rank",
        "Model",
        "Average Score",
        "Model Size (MB)",
        "Context Length",
        "Embedding Dimension",
        "Web Search Dataset",
        "Islamic Knowledge Dataset",
    ]

    # Create and return the tab
    return create_leaderboard_tab(
        df=retrieval_df,
        initial_columns_to_show=default_columns,
        search_function=retrieval_search_leaderboard,
        update_function=update_retrieval_columns_to_show,
        about_section=RETRIEVAL_ABOUT_SECTION,
        task_type="Retriever",
    )
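

# A minimal launch sketch (assumption: create_retrieval_tab builds its Gradio
# components inside the active Blocks context; the real app entry point may differ):
if __name__ == "__main__":
    import gradio as gr

    with gr.Blocks() as demo:
        create_retrieval_tab()
    demo.launch()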