Spaces:

Shuu12121
/

CodeSearch-ModernBERT-Owl-Demo

Sleeping

App Files Community

Shuu12121 committed on Apr 16

Commit

182358f

verified ·

1 Parent(s): c0b2459

Update app.py

Browse files

Files changed (1) hide show

app.py +22 -26

app.py CHANGED Viewed

@@ -11,51 +11,47 @@ model = SentenceTransformer("Shuu12121/CodeSearch-ModernBERT-Owl")
 model.eval()
 # --- Load CodeSearchNet dataset (test split only) ---
-dataset_all = load_dataset("code_search_net", split="test", trust_remote_code=True)
-lang_filter = ["python", "java", "javascript", "ruby", "go", "php"]
-# --- UI for language choice ---
-def get_random_query(lang: str, seed: int = 42):
-    subset = dataset_all.filter(lambda x: x["language"] == lang)
     random.seed(seed)
-    idx = random.randint(0, len(subset) - 1)
-    sample = subset[idx]
-    return sample["func_code_string"] or "", sample["func_documentation_string"] or ""
 @GPU
-def code_search_demo(lang: str, seed: int):
-    code_str, doc_str = get_random_query(lang, seed)
     query_emb = model.encode(doc_str, convert_to_tensor=True)
-    # ランダムに取得した同一言語の10件の関数とドキュメントを比較対象として選択
-    candidates = dataset_all.filter(lambda x: x["language"] == lang).shuffle(seed=seed).select(range(10))
-    candidate_texts = [c["func_code_string"] or "" for c in candidates]
-    candidate_embeddings = model.encode(candidate_texts, convert_to_tensor=True)
-    # 類似度計算
     cos_scores = util.cos_sim(query_emb, candidate_embeddings)[0]
-    results = sorted(zip(candidate_texts, cos_scores), key=lambda x: x[1], reverse=True)
-    # 結果フォーマット（ランキング付き）
-    output = f"### 🔍 Query Docstring (Language: {lang})\n\n" + doc_str + "\n\n"
     output += "## 🏆 Top Matches:\n"
     medals = ["🥇", "🥈", "🥉"] + [f"#{i+1}" for i in range(3, len(results))]
     for i, (code, score) in enumerate(results):
         label = medals[i] if i < len(medals) else f"#{i+1}"
-        output += f"\n**{label}** - Similarity: {score.item():.4f}\\n\\n```\\n{code.strip()[:1000]}\\n```\\n"
     return output
-# --- Gradio Interface ---
 demo = gr.Interface(
     fn=code_search_demo,
-    inputs=[
-        gr.Dropdown(["python", "java", "javascript", "ruby", "go", "php"], label="Language", value="python"),
-        gr.Slider(0, 100000, value=42, step=1, label="Random Seed")
-    ],
     outputs=gr.Markdown(label="Search Result"),
     title="🔎 CodeSearch-ModernBERT-Owl Demo",
-    description="コードドキュメントから関数検索を行うデモ（CodeSearchNet + CodeModernBERT-Owl）"
 )
 if __name__ == "__main__":
-    demo.launch()

 model.eval()
 # --- Load CodeSearchNet dataset (test split only) ---
+dataset = load_dataset("code_x_glue_tc_nl_code_search_adv", trust_remote_code=True)
+# --- Query & Candidate Generator ---
+def get_random_query(seed: int = 42):
     random.seed(seed)
+    idx = random.randint(0, len(dataset) - 1)
+    sample = dataset[idx]
+    return sample["code"], sample["docstring"]
 @GPU
+def code_search_demo(seed: int):
+    code_str, doc_str = get_random_query(seed)
     query_emb = model.encode(doc_str, convert_to_tensor=True)
+    # ランダムに10件取得
+    candidates = dataset.shuffle(seed=seed).select(range(10))
+    candidate_codes = [c["code"] for c in candidates]
+    candidate_embeddings = model.encode(candidate_codes, convert_to_tensor=True)
+    # 類似度スコア算出
     cos_scores = util.cos_sim(query_emb, candidate_embeddings)[0]
+    results = sorted(zip(candidate_codes, cos_scores), key=lambda x: x[1], reverse=True)
+    # 結果出力
+    output = f"### 🔍 Query Docstring\n\n{doc_str}\n\n"
     output += "## 🏆 Top Matches:\n"
     medals = ["🥇", "🥈", "🥉"] + [f"#{i+1}" for i in range(3, len(results))]
     for i, (code, score) in enumerate(results):
         label = medals[i] if i < len(medals) else f"#{i+1}"
+        output += f"\n**{label}** - Similarity: {score.item():.4f}\n\n```python\n{code.strip()[:1000]}\n```\n"
     return output
+# --- Gradio UI ---
 demo = gr.Interface(
     fn=code_search_demo,
+    inputs=gr.Slider(0, 100000, value=42, step=1, label="Random Seed"),
     outputs=gr.Markdown(label="Search Result"),
     title="🔎 CodeSearch-ModernBERT-Owl Demo",
+    description="docstring から類似 Python 関数を検索（CodeXGlue + ModernBERT-Owl）"
 )
 if __name__ == "__main__":
+    demo.launch()