sumuks HF Staff committed on
Commit
e66fa6d
·
verified ·
1 Parent(s): d63968d

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +756 -0
app.py ADDED
@@ -0,0 +1,756 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Enhanced web document annotation tool with modern UI."""
3
+ import hashlib
4
+ import json
5
+ import os
6
+ import uuid
7
+ from collections import defaultdict, deque
8
+ from dataclasses import dataclass, field
9
+ from datetime import datetime, timezone
10
+ from pathlib import Path
11
+ from random import randint, randrange
12
+
13
+ import gradio as gr
14
+ from datasets import Dataset, load_dataset
15
+ from loguru import logger
16
+
17
+
18
def doc_hash(url: str, text: str) -> str:
    """Return the hex SHA-256 digest identifying a document.

    The digest is taken over ``url`` immediately followed by ``text`` (no
    separator), encoded with the default UTF-8 codec.
    """
    payload = "{}{}".format(url, text).encode()
    digest = hashlib.sha256(payload)
    return digest.hexdigest()
20
+
21
+
22
def filterfunc(x: dict) -> bool:
    """Decide whether a dataset row is worth showing to annotators.

    A row is rejected when its text has fewer than 100 whitespace-separated
    words, or when any primary/secondary label of ``document_type_v1`` or
    ``document_type_v2`` under ``eai_taxonomy`` is in the excluded set.

    Fields that are missing *or* explicitly ``None`` are treated as absent, so
    a row with a null ``text`` or a null taxonomy level is handled instead of
    raising ``AttributeError``.

    Args:
        x: One dataset row.

    Returns:
        ``True`` to keep the row, ``False`` to drop it.
    """
    text = x.get("text") or ""
    if len(text.split()) < 100:
        # very short content usually means it's not a high quality document
        return False

    excluded = {
        "Promotional/Advertisement",
        "Machine-Generated",
        "Images/Videos/Audio",
        "Truncated",
        "Spam/Ads",
        "Product Page",
        "Content Listing",
    }

    taxonomy = x.get("eai_taxonomy") or {}
    for version in ("document_type_v1", "document_type_v2"):
        levels = taxonomy.get(version) or {}
        for level in ("primary", "secondary"):
            label = (levels.get(level) or {}).get("label")
            if label and label in excluded:
                return False
    return True
37
+
38
+
39
+ class DocLoader:
40
+ __slots__ = ("queue", "k", "counts", "processed", "total_docs", "_dataset")
41
+
42
+ def __init__(self, processed: set[str], k: int = 20):
43
+ self.queue = deque()
44
+ self.k = k
45
+ self.counts = defaultdict(int)
46
+ self.processed = processed
47
+ self.total_docs = 0
48
+ self._dataset = None
49
+ self._load()
50
+
51
+ def _load(self):
52
+ ds = load_dataset("sumuks/essential-web-v1.0-sample-10M", split="train")
53
+ logger.info(f"Loaded {len(ds)} documents")
54
+
55
+ ds = ds.filter(filterfunc)
56
+ logger.info(f"Filtered to {len(ds)} documents")
57
+
58
+ self._dataset = {}
59
+ for idx, doc in enumerate(ds):
60
+ doc_key = doc.get("id", idx)
61
+ doc_with_key = dict(doc)
62
+ doc_with_key["_dataset_key"] = doc_key
63
+ self._dataset[doc_key] = doc_with_key
64
+
65
+ for doc_id, doc in self._dataset.items():
66
+ url = doc.get("metadata", {}).get("url", doc.get("url", ""))
67
+ h = doc_hash(url, doc.get("text", ""))
68
+
69
+ if h in self.processed:
70
+ continue
71
+
72
+ if cat := doc.get("eai_taxonomy", {}).get("document_type_v2", {}).get("primary", {}).get("label"):
73
+ min_count = min(self.counts.values(), default=0)
74
+ if self.counts[cat] <= min_count or randrange(self.k) == 0:
75
+ self.queue.append(doc)
76
+ self.counts[cat] += 1
77
+
78
+ self.total_docs = len(self.queue)
79
+ logger.info(f"Loaded {self.total_docs} documents")
80
+
81
+ def next(self) -> dict | None:
82
+ return self.queue.popleft() if self.queue else None
83
+
84
+ def get_by_id(self, doc_id: str | int) -> dict | None:
85
+ # Handle both string and int lookups
86
+ result = self._dataset.get(doc_id)
87
+ if result is None and isinstance(doc_id, str) and doc_id.isdigit():
88
+ result = self._dataset.get(int(doc_id))
89
+ elif result is None and isinstance(doc_id, int):
90
+ result = self._dataset.get(str(doc_id))
91
+ return result
92
+
93
+ @property
94
+ def remaining(self) -> int:
95
+ return len(self.queue)
96
+
97
+
98
+ @dataclass(slots=True)
99
+ class AnnotationStore:
100
+ path: Path
101
+ session_id: str = field(default_factory=lambda: str(uuid.uuid4()))
102
+ buffer: list[dict] = field(default_factory=list)
103
+ threshold: int = field(default_factory=lambda: randint(20, 30))
104
+ processed: set[str] = field(default_factory=set)
105
+ annotations: list[dict] = field(default_factory=list)
106
+ session_stats: dict = field(default_factory=lambda: {
107
+ "total": 0,
108
+ "selected": 0,
109
+ "discarded": 0,
110
+ "start_time": datetime.now(timezone.utc),
111
+ "decisions": []
112
+ })
113
+
114
+ def __post_init__(self):
115
+ self.path.parent.mkdir(parents=True, exist_ok=True)
116
+ if self.path.exists():
117
+ for line in self.path.read_text().splitlines():
118
+ if rec := self._parse_line(line):
119
+ self.processed.add(rec["hash"])
120
+ self.annotations.append(rec)
121
+ # Initialize session stats for loaded annotations
122
+ if "decision" in rec:
123
+ decision = rec["decision"]
124
+ if decision not in self.session_stats:
125
+ self.session_stats[decision] = 0
126
+
127
+ def _parse_line(self, line: str) -> dict | None:
128
+ try:
129
+ return json.loads(line)
130
+ except:
131
+ return None
132
+
133
+ def add(self, doc_hash: str, decision: str, doc_id: str | int):
134
+ if doc_hash in self.processed:
135
+ return
136
+
137
+ rec = {
138
+ "hash": doc_hash,
139
+ "decision": decision,
140
+ "session": self.session_id,
141
+ "id": doc_id,
142
+ "timestamp": datetime.now(timezone.utc).isoformat(),
143
+ }
144
+
145
+ self.path.open("a").write(json.dumps(rec) + "\n")
146
+ self.processed.add(doc_hash)
147
+ self.buffer.append(rec)
148
+ self.annotations.append(rec)
149
+
150
+ self.session_stats["total"] += 1
151
+ if decision not in self.session_stats:
152
+ self.session_stats[decision] = 0
153
+ self.session_stats[decision] += 1
154
+ self.session_stats["decisions"].append((datetime.now(timezone.utc), decision))
155
+
156
+ if len(self.buffer) >= self.threshold:
157
+ self.flush()
158
+
159
+ def flush(self):
160
+ if not self.buffer or not (token := os.getenv("HF_TOKEN")):
161
+ self.buffer.clear()
162
+ self.threshold = randint(20, 30)
163
+ return
164
+
165
+ try:
166
+ Dataset.from_list(self.buffer).push_to_hub(
167
+ "yourbench/essential-web-annotations",
168
+ token=token
169
+ )
170
+ logger.info(f"Pushed {len(self.buffer)} annotations")
171
+ self.buffer.clear()
172
+ except Exception as e:
173
+ logger.error(f"Push failed: {e}")
174
+ finally:
175
+ self.threshold = randint(20, 30)
176
+
177
+ def get_rate(self) -> float:
178
+ if not self.session_stats["decisions"]:
179
+ return 0.0
180
+ elapsed = (datetime.now(timezone.utc) - self.session_stats["start_time"]).total_seconds()
181
+ return (self.session_stats["total"] / elapsed * 3600) if elapsed > 0 else 0.0
182
+
183
+ def get_filtered(self, decision: str | None = None) -> list[dict]:
184
+ if decision is None or decision == "all":
185
+ return self.annotations
186
+ return [a for a in self.annotations if a.get("decision") == decision]
187
+
188
+
189
# Maximum number of documents annotated before the session is declared done.
SESSION_LIMIT = 50

# Shared application state, created once at import time.
store = AnnotationStore(Path("data/annotations.jsonl"))
loader = DocLoader(store.processed)  # skips documents the store already knows
current = loader.next()  # document currently shown on the Annotate tab


# Viewer state: the filtered records, the cursor into them, and the filter name
viewer_state = dict(annotations=[], index=0, filter="all")
202
+
203
+
204
def format_stats() -> str:
    """Render the session counters and annotation rate as an HTML card row."""
    stats = store.session_stats
    hourly_rate = store.get_rate()

    # (displayed value, caption) for each card, in display order.
    cards = (
        (f"{stats['total']}", "Total Annotated"),
        (f"{stats.get('selected', 0)}", "Selected"),
        (f"{stats.get('discarded', 0)}", "Discarded"),
        (f"{hourly_rate:.0f}/hr", "Annotation Rate"),
    )

    # The leading and trailing empty entries reproduce the newline that
    # opens and closes the markup.
    lines = ["", '<div class="stats-container">']
    for value, caption in cards:
        lines += [
            '<div class="stat-item">',
            f'<div class="stat-value">{value}</div>',
            f'<div class="stat-label">{caption}</div>',
            "</div>",
        ]
    lines += ["</div>", ""]
    return "\n".join(lines)
231
+
232
+
233
def format_progress() -> tuple[str, float]:
    """Render the session progress bar.

    Returns:
        A pair ``(html, fraction)`` where ``fraction`` is the share of
        ``SESSION_LIMIT`` annotated so far in this session.
    """
    done = store.session_stats["total"]
    limit = SESSION_LIMIT

    if limit > 0:
        progress = done / limit
    else:
        progress = 0
    percentage = progress * 100

    # The leading and trailing empty entries reproduce the newline that
    # opens and closes the markup.
    markup = "\n".join([
        "",
        '<div class="progress-container">',
        '<div class="progress-header">',
        '<span class="progress-title">Session Progress</span>',
        f'<span class="progress-numbers">{done:,} / {limit:,}</span>',
        "</div>",
        '<div class="progress-bar-bg">',
        f'<div class="progress-bar-fill" style="width: {percentage:.1f}%"></div>',
        "</div>",
        f'<div class="progress-percentage">{percentage:.1f}% Complete</div>',
        "</div>",
        "",
    ])
    return markup, progress
255
+
256
+
257
+ def format_document_info(doc: dict, annotation: dict | None = None) -> str:
258
+ if not doc:
259
+ return ""
260
+
261
+ meta = doc.get("metadata", {})
262
+ url = meta.get("url", doc.get("url", ""))
263
+ domain = url.split('/')[2] if url and '/' in url else "Unknown"
264
+
265
+ cat = doc.get("eai_taxonomy", {}).get("document_type_v2", {}).get("primary", {}).get("label", "Uncategorized")
266
+
267
+ word_count = len(doc.get("text", "").split())
268
+
269
+ annotation_info = ""
270
+ if annotation:
271
+ timestamp = datetime.fromisoformat(annotation["timestamp"].replace("Z", "+00:00"))
272
+ decision_color = "#667eea" if annotation["decision"] == "selected" else "#f5576c"
273
+ annotation_info = f"""
274
+ <div class="annotation-info" style="border-left: 4px solid {decision_color};">
275
+ <span class="annotation-decision" style="color: {decision_color};">
276
+ {"βœ…" if annotation["decision"] == "selected" else "❌"} {annotation["decision"].title()}
277
+ </span>
278
+ <span class="annotation-time">πŸ“… {timestamp.strftime("%Y-%m-%d %H:%M:%S")}</span>
279
+ </div>
280
+ """
281
+
282
+ return f"""
283
+ <div class="doc-info">
284
+ {annotation_info}
285
+ <div class="doc-meta">
286
+ <span class="doc-domain">πŸ“Œ {domain}</span>
287
+ <span class="doc-category">🏷️ {cat}</span>
288
+ <span class="doc-words">πŸ“ {word_count:,} words</span>
289
+ </div>
290
+ <a href="{url}" target="_blank" class="doc-url">{url}</a>
291
+ </div>
292
+ """
293
+
294
+
295
def choose(decision: str):
    """Record ``decision`` for the current document and advance to the next.

    Returns the seven output values for the Annotate tab: document info HTML,
    document text, updates for the two buttons, stats HTML, progress HTML and
    the progress fraction. When there is nothing (more) to annotate, or the
    session limit has been reached, the values from ``done_state()`` are
    returned instead.
    """
    global current

    if current:
        # Persist the decision for the document that was on screen.
        source_url = current.get("metadata", {}).get("url", current.get("url", ""))
        digest = doc_hash(source_url, current.get("text", ""))
        key = current.get("_dataset_key", current.get("id", ""))
        store.add(digest, decision, key)

        # Only fetch another document while the session still has room.
        if store.session_stats["total"] < SESSION_LIMIT:
            current = loader.next()
            if current:
                bar_html, bar_fraction = format_progress()
                return (
                    format_document_info(current),
                    current.get("text", ""),
                    gr.update(interactive=True),
                    gr.update(interactive=True),
                    format_stats(),
                    bar_html,
                    bar_fraction,
                )

    return done_state()
325
+
326
+
327
def done_state():
    """Return the Annotate-tab outputs for the "nothing left to do" screen.

    The banner distinguishes between reaching ``SESSION_LIMIT`` and running
    out of documents. Both action buttons are disabled and the progress
    fraction is reported as ``1.0``.
    """
    progress_html, _ = format_progress()

    if store.session_stats["total"] >= SESSION_LIMIT:
        message = "🎉 Session Complete!"
        description = f"Great job! You've completed your session of {SESSION_LIMIT} documents."
    else:
        message = "🎉 All documents annotated!"
        description = "Great job! You've completed all available documents."

    return (
        f"<div class='done-message'>{message}</div>",
        description,
        gr.update(interactive=False),
        gr.update(interactive=False),
        format_stats(),
        progress_html,
        1.0,
    )
346
+
347
+
348
def update_viewer_filter(filter_value: str):
    """Apply a decision filter to the viewer and show its first record."""
    matches = store.get_filtered(filter_value)

    # Reset the cursor: the old index is meaningless for the new selection.
    viewer_state.update(filter=filter_value, index=0, annotations=matches)

    # Log for debugging
    logger.info(f"Filter: {filter_value}, Found {len(viewer_state['annotations'])} annotations")

    return update_viewer_display()
359
+
360
+
361
def navigate_viewer(direction: int):
    """Move the viewer cursor by ``direction`` steps, wrapping at both ends."""
    records = viewer_state["annotations"]
    if records:
        # Modulo keeps the cursor inside the list in either direction.
        viewer_state["index"] = (viewer_state["index"] + direction) % len(records)
    return update_viewer_display()
367
+
368
+
369
def update_viewer_display():
    """Build the Viewer-tab outputs for the record under the cursor.

    Returns five values: info HTML, document text, the "position / total"
    counter, and updates for the Previous and Next buttons (enabled only when
    a record exists on that side of the cursor).
    """
    records = viewer_state["annotations"]
    if not records:
        disabled = gr.update(interactive=False)
        return (
            "<div class='viewer-empty'>No annotations to display</div>",
            "",
            "0 / 0",
            disabled,
            gr.update(interactive=False),
        )

    idx = viewer_state["index"]
    annotation = records[idx]
    doc = loader.get_by_id(annotation["id"])

    if doc:
        info_html = format_document_info(doc, annotation)
        body = doc.get("text", "")
    else:
        # The record refers to a document that is not in the loaded dataset;
        # show the raw record instead.
        logger.warning(f"Document not found for ID: {annotation['id']} (type: {type(annotation['id'])})")
        info_html = "<div class='viewer-error'>Document not found in dataset</div>"
        body = f"Annotation details: {json.dumps(annotation, indent=2)}"

    last = len(viewer_state["annotations"]) - 1
    return (
        info_html,
        body,
        f"{idx + 1} / {len(viewer_state['annotations'])}",
        gr.update(interactive=idx > 0),
        gr.update(interactive=idx < last),
    )
401
+
402
+
403
def build() -> gr.Blocks:
    """Assemble the Gradio UI: an "Annotate" tab and a "Viewer" tab.

    The Annotate tab shows the current document with Select/Discard buttons
    (also reachable through the ``1`` / ``2`` keys); the Viewer tab pages
    through recorded annotations with an optional decision filter.

    Returns:
        The configured ``gr.Blocks`` app, not yet launched.
    """
    # NOTE(review): the nesting of the layout context managers below was
    # reconstructed from statement grouping - confirm against the deployed UI.
    css = """
    .stats-container {
        display: flex;
        gap: 15px;
        margin: 10px 0;
        flex-wrap: nowrap;
        justify-content: space-between;
    }
    .stat-item {
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        border-radius: 12px;
        padding: 15px;
        flex: 1;
        min-width: 100px;
        text-align: center;
        box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
        transition: transform 0.2s;
    }
    .stat-item:hover {
        transform: translateY(-2px);
    }
    .stat-value {
        font-size: 24px;
        font-weight: bold;
        color: white;
        margin-bottom: 3px;
    }
    .stat-label {
        font-size: 12px;
        color: rgba(255, 255, 255, 0.9);
    }
    .progress-container {
        background: #f8f9fa;
        border-radius: 12px;
        padding: 15px;
        margin: 10px 0;
    }
    .progress-header {
        display: flex;
        justify-content: space-between;
        margin-bottom: 10px;
        font-weight: 600;
    }
    .progress-bar-bg {
        background: #e9ecef;
        height: 20px;
        border-radius: 10px;
        overflow: hidden;
        margin-bottom: 10px;
    }
    .progress-bar-fill {
        background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
        height: 100%;
        transition: width 0.3s ease;
    }
    .progress-percentage {
        text-align: center;
        color: #6c757d;
        font-size: 14px;
    }
    .doc-info {
        background: #f8f9fa;
        border-radius: 12px;
        padding: 15px;
        margin-bottom: 10px;
    }
    .doc-meta {
        display: flex;
        gap: 20px;
        margin-bottom: 10px;
        flex-wrap: wrap;
    }
    .doc-meta span {
        font-size: 14px;
        color: #495057;
    }
    .doc-url {
        font-size: 14px;
        color: #667eea;
        text-decoration: none;
        word-break: break-all;
    }
    .doc-url:hover {
        text-decoration: underline;
    }
    .done-message {
        font-size: 32px;
        text-align: center;
        padding: 40px;
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        color: white;
        border-radius: 12px;
        font-weight: bold;
    }
    .annotation-info {
        display: flex;
        justify-content: space-between;
        margin-bottom: 10px;
        padding-left: 10px;
    }
    .annotation-decision {
        font-weight: 600;
    }
    .annotation-time {
        color: #6c757d;
        font-size: 12px;
    }
    .viewer-empty, .viewer-error {
        text-align: center;
        padding: 40px;
        color: #6c757d;
        font-size: 18px;
    }
    .viewer-nav {
        display: flex;
        justify-content: center;
        align-items: center;
        gap: 20px;
        margin: 10px 0;
    }
    .viewer-counter {
        font-weight: 600;
        color: #495057;
    }
    #select {
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        border: none;
        font-size: 18px;
        padding: 12px 24px;
    }
    #discard {
        background: linear-gradient(135deg, #f093fb 0%, #f5576c 100%);
        border: none;
        font-size: 18px;
        padding: 12px 24px;
    }
    .dark .stat-item {
        background: linear-gradient(135deg, #434343 0%, #000000 100%);
    }
    .dark .progress-container, .dark .doc-info {
        background: #1a1a1a;
    }
    .dark .progress-bar-bg {
        background: #2a2a2a;
    }
    @keyframes pulse {
        0% { transform: scale(1); }
        50% { transform: scale(1.05); }
        100% { transform: scale(1); }
    }
    """

    # Injected into <head>: "1" clicks Select and "2" clicks Discard on key
    # release (ignored while a form control has focus); the extra
    # keydown/keyup listeners only animate the button press.
    shortcut_js = """
    <script>
    function handleKeyboardShortcuts(e) {
        var target = e.target || e.srcElement;
        switch (target.tagName.toLowerCase()) {
            case "input":
            case "textarea":
            case "select":
            case "button":
                return;
            default:
                if (e.code === "Digit1" || e.key === "1") {
                    var selectBtn = document.getElementById("select");
                    if (selectBtn && !selectBtn.disabled) {
                        selectBtn.click();
                        e.preventDefault();
                    }
                }
                else if (e.code === "Digit2" || e.key === "2") {
                    var discardBtn = document.getElementById("discard");
                    if (discardBtn && !discardBtn.disabled) {
                        discardBtn.click();
                        e.preventDefault();
                    }
                }
        }
    }

    document.addEventListener('keyup', handleKeyboardShortcuts, false);

    document.addEventListener('keydown', function(e) {
        if ((e.code === "Digit1" || e.key === "1") && document.getElementById("select") && !document.getElementById("select").disabled) {
            document.getElementById("select").style.transform = "scale(0.95)";
        }
        if ((e.code === "Digit2" || e.key === "2") && document.getElementById("discard") && !document.getElementById("discard").disabled) {
            document.getElementById("discard").style.transform = "scale(0.95)";
        }
    });

    document.addEventListener('keyup', function(e) {
        if (e.code === "Digit1" || e.key === "1") {
            var btn = document.getElementById("select");
            if (btn) btn.style.transform = "scale(1)";
        }
        if (e.code === "Digit2" || e.key === "2") {
            var btn = document.getElementById("discard");
            if (btn) btn.style.transform = "scale(1)";
        }
    });
    </script>
    """

    with gr.Blocks(
        title="Essential Web Annotation",
        theme=gr.themes.Default(),
        css=css,
        head=shortcut_js,
    ) as demo:
        gr.Markdown("# 🚀 Essential Web Annotation Tool")

        with gr.Tabs():
            with gr.Tab("Annotate"):
                # Kept flush-left so no line can be read as an indented
                # Markdown code block.
                gr.Markdown("""
## 📋 Document Quality Assessment

Your task is to evaluate documents for **high-quality, valuable content** that provides generalizable information.

### ✅ **Select High-Quality Documents:**
Examples include:
- **Technical blogs** with detailed explanations
- **Scientific papers** and research articles
- **Information-rich discussions** with insights
- **Educational content** with actionable knowledge
- **Professional documentation** and guides

### ❌ **Discard Low-Quality Documents:**
- Content with minimal informational value

### 🎯 **Quick Assessment Tips:**
- High-quality documents are usually immediately recognizable to a human.
- Use the **Viewer** tab to browse examples of selected documents
- Trust your judgment on content value and depth

### ⌨️ **Keyboard Shortcuts:**
| Key | Action |
|-----|--------|
| **`1`** | ✅ Select document |
| **`2`** | ❌ Discard document |
""")

                progress_html, progress_num = format_progress()

                progress_display = gr.HTML(progress_html)
                stats_display = gr.HTML(format_stats())

                if current:
                    doc_info_html = format_document_info(current)
                    text_val = current.get("text", "")
                else:
                    doc_info_html = "<div class='doc-info'>No documents loaded.</div>"
                    text_val = ""

                doc_info = gr.HTML(doc_info_html)

                with gr.Column(variant="panel"):
                    text_display = gr.Textbox(
                        text_val,
                        label="📄 Document Content",
                        lines=20,
                        interactive=False,
                        show_copy_button=True,
                    )

                with gr.Row():
                    # elem_id values are what the keyboard-shortcut script
                    # and the CSS above look up.
                    btn_sel = gr.Button(
                        "✅ Select (1)",
                        elem_id="select",
                        variant="primary",
                        interactive=bool(current),
                        size="lg",
                    )
                    btn_dis = gr.Button(
                        "❌ Discard (2)",
                        elem_id="discard",
                        variant="stop",
                        interactive=bool(current),
                        size="lg",
                    )

                # Hidden component that receives the numeric progress fraction.
                progress_bar = gr.Number(value=progress_num, visible=False)

                outputs = [doc_info, text_display, btn_sel, btn_dis, stats_display, progress_display, progress_bar]

                btn_sel.click(lambda: choose("selected"), outputs=outputs)
                btn_dis.click(lambda: choose("discarded"), outputs=outputs)

            with gr.Tab("Viewer"):
                gr.Markdown("### 📚 Browse Annotated Documents")

                with gr.Row():
                    filter_dropdown = gr.Radio(
                        choices=["all", "selected", "discarded"],
                        value="all",
                        label="Filter",
                        interactive=True,
                    )

                viewer_info = gr.HTML()

                with gr.Column(variant="panel"):
                    viewer_text = gr.Textbox(
                        label="📄 Document Content",
                        lines=20,
                        interactive=False,
                        show_copy_button=True,
                    )

                with gr.Row():
                    prev_btn = gr.Button("← Previous", size="lg")
                    viewer_counter = gr.HTML("<div class='viewer-counter'>0 / 0</div>")
                    next_btn = gr.Button("Next →", size="lg")

                # Initialize viewer
                filter_dropdown.change(
                    update_viewer_filter,
                    inputs=[filter_dropdown],
                    outputs=[viewer_info, viewer_text, viewer_counter, prev_btn, next_btn],
                )

                prev_btn.click(
                    lambda: navigate_viewer(-1),
                    outputs=[viewer_info, viewer_text, viewer_counter, prev_btn, next_btn],
                )

                next_btn.click(
                    lambda: navigate_viewer(1),
                    outputs=[viewer_info, viewer_text, viewer_counter, prev_btn, next_btn],
                )

        # Load initial viewer state
        demo.load(
            lambda: update_viewer_filter("all"),
            outputs=[viewer_info, viewer_text, viewer_counter, prev_btn, next_btn],
        )

        # NOTE(review): a <script> delivered through gr.HTML may not be
        # executed by the browser - confirm the pulse animation fires.
        gr.HTML("""
        <script>
        const observer = new MutationObserver(() => {
            document.querySelectorAll('.stat-item').forEach(item => {
                item.style.animation = 'pulse 0.3s ease-out';
            });
        });
        observer.observe(document.body, { childList: true, subtree: true });
        </script>
        """)

    return demo
753
+
754
+
755
# Script entry point: build the UI and start the Gradio server.
if __name__ == "__main__":
    demo = build()
    demo.launch()