File size: 10,420 Bytes
6a95078
 
 
 
93b22a6
6a95078
93b22a6
 
 
483d70f
0e67577
 
93b22a6
 
 
 
 
 
6a95078
 
9aca0a7
 
 
 
 
 
 
 
5736b44
 
 
 
 
795f399
5736b44
 
1db82f6
5736b44
1db82f6
5736b44
 
 
1db82f6
5736b44
 
 
 
 
 
 
 
 
 
 
 
 
 
c9a1b4a
 
 
5736b44
 
 
 
 
 
 
202b027
5736b44
 
86b0c2d
5736b44
86b0c2d
5736b44
 
86b0c2d
5736b44
86b0c2d
5736b44
 
86b0c2d
5736b44
86b0c2d
 
5736b44
86b0c2d
5736b44
86b0c2d
5736b44
 
86b0c2d
5736b44
86b0c2d
 
5736b44
86b0c2d
5736b44
 
86b0c2d
5736b44
86b0c2d
5736b44
86b0c2d
5736b44
 
 
 
86b0c2d
5736b44
 
 
 
 
 
 
 
8c12e10
 
 
edb7a23
 
 
8c12e10
 
edb7a23
8c12e10
 
 
42c2c6f
93b22a6
 
 
 
dba1c4d
93b22a6
 
 
 
 
 
 
 
 
6a95078
 
 
82d7596
 
 
 
100843f
82d7596
 
202b027
 
 
82d7596
 
 
 
 
 
 
6a95078
93b22a6
4c3ce7a
 
 
 
 
 
 
 
 
 
 
019699e
 
 
 
 
 
4c3ce7a
6a95078
 
 
93b22a6
 
 
 
5736b44
202b027
5736b44
202b027
 
6a95078
 
 
 
 
93b22a6
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
<!DOCTYPE html>
<!-- lang declares the page language so screen readers and translation tools handle the text correctly. -->
<html lang="en">
<head>
  <meta charset="utf-8">
  <meta name="description" content="JQL: Judging Quality across Languages - A pipeline for multilingual data filtering.">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <title>JQL: Judging Quality across Languages</title>
  <!-- Third-party stylesheets: web fonts, Bulma layout classes, Font Awesome and Academicons icon fonts. -->
  <link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro" rel="stylesheet">
  <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bulma@0.9.4/css/bulma.min.css">
  <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.15.4/css/all.min.css">
  <link rel="stylesheet"
        href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
  <style>
    /* Page-local overrides layered on top of Bulma. */
    body { font-family: 'Noto Sans', sans-serif; }
    /* Recolours the top banner (the section carrying "hero is-primary"). */
    .hero.is-primary { background-color: #f9d5e5; }
    /* Keeps any image placed inside a subtitle within its container width. */
    .subtitle img { max-width: 100%; height: auto; }
    .section-title { margin-top: 2em; }
  </style>
</head>
<body>
<body>
<!-- Top banner: project name as the page's main heading, followed by a one-line tagline. -->
<section class="hero is-primary">
  <div class="hero-body">
    <div class="container has-text-centered">
      <h1 class="title is-1">
        🦊 JQL: Judging Quality across Languages
      </h1>
      <p class="subtitle is-5">
        Scalable and lightweight multilingual data filtering with LLM-based annotators
      </p>
    </div>
  </div>
</section>
<!--
  Publication header: paper title, author list, affiliations and resource buttons.
  The paper title is an h2 because the banner above already provides the page's single h1;
  the Bulma "title is-1" classes keep its visual size unchanged.
-->
<section class="hero">
  <div class="hero-body">
    <div class="container is-max-desktop">
      <div class="columns is-centered">
        <div class="column has-text-centered">
          <h2 class="title is-1 publication-title">Judging Quality Across Languages: A Multilingual Approach to Pretraining Data Filtering with Language Models</h2>

          <!-- Authors; superscript numbers refer to the affiliation list below. -->
          <div class="is-size-5 publication-authors">
            <span class="author-block">Mehdi Ali<sup>1,2</sup>†,</span>
            <span class="author-block">Manuel Brack<sup>3,5</sup>†,</span>
            <span class="author-block">Max Lübbering<sup>1,2</sup>†,</span>
            <span class="author-block">Elias Wendt<sup>5</sup>†,</span>
            <span class="author-block">Abbas Goher Khan<sup>1</sup>†,</span>
            <span class="author-block">Richard Rutmann<sup>1,2</sup>,</span>
            <span class="author-block">Alex Jude<sup>2</sup>,</span>
            <span class="author-block">Maurice Kraus<sup>5</sup>,</span>
            <span class="author-block">Alexander Arno Weber<sup>1,2</sup>,</span>
            <span class="author-block">Felix Stollenwerk<sup>6</sup>,</span>
            <span class="author-block">David Kaczér<sup>1</sup>,</span>
            <span class="author-block">Florian Mai<sup>1</sup>,</span>
            <span class="author-block">Lucie Flek<sup>1</sup>,</span>
            <span class="author-block">Rafet Sifa<sup>1,2</sup>,</span>
            <span class="author-block">Nicolas Flores-Herr<sup>2</sup>,</span>
            <span class="author-block">Joachim Köhler<sup>1,2</sup>,</span>
            <span class="author-block">Patrick Schramowski<sup>3,4,5</sup>,</span>
            <span class="author-block">Michael Fromm<sup>1,2</sup>,</span>
            <span class="author-block">Kristian Kersting<sup>3,4,5</sup></span>
          </div>

          <!-- Affiliations. -->
          <div class="is-size-5 publication-authors">
            <span class="author-block"><sup>1</sup>Lamarr Institute,</span>
            <span class="author-block"><sup>2</sup>Fraunhofer IAIS,</span>
            <span class="author-block"><sup>3</sup>DFKI SAINT,</span>
            <span class="author-block"><sup>4</sup>Hessian AI,</span>
            <span class="author-block"><sup>5</sup>Computer Science Department, TU Darmstadt,</span>
            <span class="author-block"><sup>6</sup>AI Sweden</span>
          </div>

          <!--
            Resource buttons. Each button is a self-contained span > a pair; every anchor and
            span is closed explicitly so that links are never nested inside one another.
            Icons are decorative (the visible label names the link), hence aria-hidden.
          -->
          <div class="column has-text-centered">
            <!-- Paper link. -->
            <span class="link-block">
              <a class="external-link button is-normal is-rounded is-dark"
                 href="https://arxiv.org/abs/2505.22232" target="_blank" rel="noopener">
                <span class="icon">
                  <i class="ai ai-arxiv" aria-hidden="true"></i>
                </span>
                <span>arXiv</span>
              </a>
            </span>
            <!-- Code link. -->
            <span class="link-block">
              <a class="external-link button is-normal is-rounded is-dark"
                 href="https://github.com/JQL-AI/JQL-Annotation-Pipeline/" target="_blank" rel="noopener">
                <span class="icon">
                  <i class="fab fa-github" aria-hidden="true"></i>
                </span>
                <span>Code</span>
              </a>
            </span>
            <!-- Dataset link 1: human annotations. -->
            <span class="link-block">
              <a class="external-link button is-normal is-rounded is-dark"
                 href="https://huggingface.co/datasets/Jackal-AI/JQL-Human-Edu-Annotations" target="_blank" rel="noopener">
                <span class="icon">
                  <i class="far fa-images" aria-hidden="true"></i>
                </span>
                <span>Human Annotations</span>
              </a>
            </span>
            <!-- Dataset link 2: LLM annotations. -->
            <span class="link-block">
              <a class="external-link button is-normal is-rounded is-dark"
                 href="https://huggingface.co/datasets/Jackal-AI/JQL-LLM-Edu-Annotations" target="_blank" rel="noopener">
                <span class="icon">
                  <i class="far fa-images" aria-hidden="true"></i>
                </span>
                <span>LLM Annotations</span>
              </a>
            </span>
            <!-- Model link: lightweight annotator heads. -->
            <span class="link-block">
              <a class="external-link button is-normal is-rounded is-dark"
                 href="https://huggingface.co/Jackal-AI/JQL-Edu-Heads" target="_blank" rel="noopener">
                <span class="icon">
                  <i class="far fa-images" aria-hidden="true"></i>
                </span>
                <span>Lightweight Annotator</span>
              </a>
            </span>
          </div>
        </div>
      </div>
    </div>
  </div>
</section>
<!-- Introductory summary of what JQL is and what it achieves. -->
<section class="section">
  <div class="container content">
    <p>
      High-quality multilingual data is crucial for training effective large language models (LLMs).
      <strong>JQL (Judging Quality across Languages)</strong>
      is a scalable and lightweight multilingual data filtering approach that distills the judgment
      capabilities of strong multilingual LLMs into efficient cross-lingual annotators.
    </p>
    <p>
      Overall, JQL improves data quality, retains more tokens, and generalizes to unseen languages.
      It outperforms heuristic baselines and enables cost-efficient multilingual pretraining data curation at scale.
    </p>
  </div>
</section>
  
<!-- Pipeline overview: one figure followed by the four pipeline steps as an ordered list. -->
<section class="section">
  <div class="container content">
    <h2 class="title is-3">🧩 Main Pipeline Steps</h2>

    <figure>
      <img src="https://cdn-uploads.huggingface.co/production/uploads/64bfc4d55ce3d382c05c0f9a/1zPQcwqt9Li_gCvd04_2_.png"
           alt="JQL Pipeline Overview">
      <figcaption><em>Figure 1: Overview of the JQL pipeline</em></figcaption>
    </figure>

    <ol>
      <li>
        <strong>📋 Ground Truth Creation:</strong>
        Human annotators label monolingual documents based on a structured instruction prompt. These documents are translated into all target languages to create a multilingual gold-standard dataset. (See Figure 1)
      </li>
      <li>
        <strong>🤖 LLM-as-a-Judge Selection &amp; Data Annotation:</strong>
        Strong multilingual LLMs (e.g., Gemma, Mistral, LLaMA) are evaluated against the ground truth, and top-performing models are used to produce synthetic annotations. (See Figure 1)
      </li>
      <li>
        <strong>🪶 Lightweight Annotator Training:</strong>
        Train compact regression heads on frozen multilingual embeddings to create efficient, high-throughput annotators. (See Figure 1)
      </li>
      <li>
        <strong>🚀 Scalable Data Filtering:</strong>
        Use trained annotators to filter large-scale pretraining corpora using quantile thresholds. (See Figure 1)
      </li>
    </ol>
  </div>
</section>

<!-- Headline results: accuracy, downstream training outcomes (nested list) and annotation speed. -->
<section class="section">
  <div class="container content">
    <h2 class="title is-3">📊 Results</h2>

    <ul>
      <li>
        <strong>✔️ Accuracy:</strong>
        Good correlation with human ground truth
      </li>
      <li>
        <strong>📈 Downstream LLM Training:</strong>
        <ul>
          <li>Benchmark performance improvement over FineWeb2</li>
          <li>Higher document retention vs. FineWeb2 heuristic filter</li>
          <li>Effective dynamic threshold strategies: Trade-off document quality for quantity</li>
        </ul>
      </li>
      <li>
        <strong>⚡ Annotation Speed:</strong>
        ~11,000 docs/min (A100 GPU, avg. 690 tokens)
      </li>
    </ul>
  </div>
</section>

<!--
  Released artifacts. Sub-lists are nested inside the list item they belong to:
  a ul may only contain li children, so a ul cannot sit directly inside another ul.
-->
<section class="section">
  <div class="container content">
    <h2 class="title is-3">📁 Available Artifacts</h2>
    <ul>
      <li><a href="https://huggingface.co/datasets/Jackal-AI/JQL-Human-Edu-Annotations" target="_blank" rel="noopener">📄 Ground truth annotations in 35 languages</a></li>
      <li><a href="https://huggingface.co/datasets/Jackal-AI/JQL-LLM-Edu-Annotations" target="_blank" rel="noopener">🧠 Synthetic LLM-annotated dataset (14M+ documents)</a></li>
      <li><a href="https://huggingface.co/Jackal-AI/JQL-Edu-Heads" target="_blank" rel="noopener">🪶 Lightweight annotation models</a>:
        <ul>
          <li>JQL-Gemma</li>
          <li>JQL-Mistral</li>
          <li>JQL-Llama</li>
        </ul>
      </li>
      <li>🛠️ Training &amp; inference scripts
        <ul>
          <li><a href="https://huggingface.co/Jackal-AI/JQL-Edu-Heads" target="_blank" rel="noopener">Web Corpus Annotation</a></li>
          <li>More coming soon</li>
        </ul>
      </li>
      <li>🗄️ Large-scale dataset coming soon</li>
    </ul>
  </div>
</section>

<!-- Citation: BibTeX entry for copy-and-paste; the text inside pre/code is emitted verbatim. -->
<section class="section">
  <div class="container content">
    <h2 class="title is-3">📜 Citation</h2>
    <p>If you use JQL, the annotations, or the pretrained annotators, please cite the paper:</p>
    <pre><code>@article{ali2024jql,
  title={Judging Quality Across Languages: A Multilingual Approach to Pretraining Data Filtering with Language Models},
  author={Ali, Mehdi and Brack, Manuel and Lübbering, Max and Wendt, Elias and Khan, Abbas Goher and Rutmann, Richard and Jude, Alex and Kraus, Maurice and Weber, Alexander Arno and Stollenwerk, Felix and Kaczér, David and Mai, Florian and Flek, Lucie and Sifa, Rafet and Flores-Herr, Nicolas and Köhler, Joachim and Schramowski, Patrick and Fromm, Michael and Kersting, Kristian},
  journal={arXiv preprint arXiv:2505.22232},
  year={2025}
}</code></pre>
  </div>
</section>

</body>
</html>