File size: 3,349 Bytes
53c0cc8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ceffe7d
 
 
 
 
 
53c0cc8
 
 
 
 
 
 
 
 
ceffe7d
53c0cc8
 
 
 
 
ceffe7d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
# app.py – Gradio Space wrapper for modular_graph_and_candidates

from __future__ import annotations

import hashlib
import json
import shutil
import subprocess
import tempfile
from datetime import datetime, timedelta, timezone
from functools import lru_cache
from pathlib import Path

import gradio as gr

# —— your refactored helpers ——
#   • build_graph_json(transformers_dir: Path, threshold: float, multimodal: bool, sim_method: str) -> dict
#   • generate_html(graph: dict) -> str (returns full <html>… string)
from modular_graph_and_candidates import build_graph_json, generate_html

HF_MAIN_REPO = "https://github.com/huggingface/transformers"


@lru_cache(maxsize=4)
def clone_or_cache(repo_url: str) -> Path:
    """Clone *repo_url* at most **once per 24 h**.

    The checkout is cached under ``/tmp/repo_<sha1-prefix>``. A hidden
    ``.cloned_at`` file stores the UTC ISO timestamp of the last clone; if
    that stamp is < 24 h old the existing checkout is reused, otherwise the
    directory is wiped and cloned afresh.

    The cache directory is keyed by a *content* hash (SHA-1 of the URL), not
    the builtin ``hash()``: ``hash(str)`` is randomized per interpreter
    process (PYTHONHASHSEED), which would give a different path every session
    and silently defeat cross-session reuse.

    NOTE(review): ``lru_cache`` short-circuits the freshness check for repeat
    calls within one long-lived process — the 24 h policy applies across
    processes / cache evictions, not within a single warm process.
    """
    digest = hashlib.sha1(repo_url.encode("utf-8")).hexdigest()[:16]
    cache_dir = Path(tempfile.gettempdir()) / f"repo_{digest}"
    stamp = cache_dir / ".cloned_at"

    if cache_dir.exists():
        if stamp.exists():
            try:
                last = datetime.fromisoformat(stamp.read_text().strip())
                # Stamps written by older versions (utcnow()) are naive;
                # interpret them as UTC instead of forcing a re-clone.
                if last.tzinfo is None:
                    last = last.replace(tzinfo=timezone.utc)
                if datetime.now(timezone.utc) - last < timedelta(days=1):
                    return cache_dir  # fresh enough
            except ValueError:
                pass  # malformed stamp → fall through to re-clone
        # Stale, stamp-less (e.g. an interrupted clone), or malformed cache:
        # remove the directory entirely so `git clone` gets an empty target.
        # (The original only wiped when the stamp existed, so a half-finished
        # clone left a non-empty dir that made every later clone fail.)
        shutil.rmtree(cache_dir, ignore_errors=True)

    subprocess.check_call(["git", "clone", "--depth", "1", repo_url, str(cache_dir)])
    stamp.write_text(datetime.now(timezone.utc).isoformat())
    return cache_dir


def run(repo_url: str, threshold: float, multimodal: bool, sim_method: str):
    """Gradio callback: clone the repo, build the similarity graph, render it.

    Parameters mirror the UI controls: *repo_url* (text box), *threshold*
    (slider), *multimodal* (checkbox), *sim_method* (radio,
    ``"jaccard"`` or ``"embedding"``).

    Returns a ``(html, json_path)`` tuple: the full HTML page for the
    ``gr.HTML`` output and the path of a JSON dump for the ``gr.File``
    download output.
    """
    repo_path = clone_or_cache(repo_url)

    graph = build_graph_json(
        transformers_dir=repo_path,
        threshold=threshold,
        multimodal=multimodal,
        sim_method=sim_method,
    )

    html = generate_html(graph)

    # Persist the graph JSON so the user can download it. NamedTemporaryFile
    # with delete=False replaces the deprecated, race-prone tempfile.mktemp():
    # the file is created atomically with a guaranteed-unique name instead of
    # merely *predicting* an unused path.
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=".json", delete=False, encoding="utf-8"
    ) as fh:
        json.dump(graph, fh)
        json_path = fh.name

    return html, json_path



# Stretch the rendered graph iframe to fill most of the viewport.
CUSTOM_CSS = """
#graph_html iframe {height:85vh !important; width:100% !important; border:none;}
"""

with gr.Blocks(css=CUSTOM_CSS) as demo:
    gr.Markdown("## 🔍 Modular‑candidate explorer for 🤗 Transformers")

    # All input controls share a single row.
    with gr.Row():
        repo_url_box     = gr.Text(value=HF_MAIN_REPO, label="Repo / fork URL")
        threshold_slider = gr.Slider(0.50, 0.95, value=0.78, step=0.01, label="Similarity ≥")
        multimodal_chk   = gr.Checkbox(label="Only multimodal models")
        metric_radio     = gr.Radio(["jaccard", "embedding"], value="jaccard", label="Similarity metric")
        build_btn        = gr.Button("Build graph")

    # Outputs: the interactive HTML graph plus a downloadable JSON dump.
    graph_view    = gr.HTML(elem_id="graph_html", sanitize=False, show_label=False)
    download_file = gr.File(label="Download graph.json")

    build_btn.click(
        run,
        [repo_url_box, threshold_slider, multimodal_chk, metric_radio],
        [graph_view, download_file],
    )

if __name__ == "__main__":
    demo.launch()