File size: 4,402 Bytes
3fffb69 cadb900 3fffb69 cadb900 3fffb69 cadb900 3fffb69 cadb900 3fffb69 cadb900 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 |
# -*- coding: utf-8 -*-
"""
Martinez-Gil, J. (2025). Augmenting the Interpretability of GraphCodeBERT for Code Similarity Tasks.
International Journal of Software Engineering and Knowledge Engineering, 35(05), 657–678.
"""
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from transformers import RobertaTokenizer, RobertaModel
import torch
import gradio as gr
from io import BytesIO
from PIL import Image
# Load GraphCodeBERT from Hugging Face (with cache)
tokenizer = RobertaTokenizer.from_pretrained("microsoft/graphcodebert-base", cache_dir="models/")
model = RobertaModel.from_pretrained("microsoft/graphcodebert-base", cache_dir="models/")
# Define sorting algorithms
sorting_algorithms = {
"Bubble_Sort": """
def bubble_sort(arr):
n = len(arr)
for i in range(n):
for j in range(0, n-i-1):
if arr[j] > arr[j+1]:
arr[j], arr[j+1] = arr[j+1], arr[j]
return arr
""",
"Selection_Sort": """
def selection_sort(arr):
for i in range(len(arr)):
min_idx = i
for j in range(i+1, len(arr)):
if arr[j] < arr[min_idx]:
min_idx = j
arr[i], arr[min_idx] = arr[min_idx], arr[i]
return arr
""",
"Insertion_Sort": """
def insertion_sort(arr):
for i in range(1, len(arr)):
key = arr[i]
j = i-1
while j >= 0 and key < arr[j]:
arr[j + 1] = arr[j]
j -= 1
arr[j + 1] = key
return arr
""",
"Merge_Sort": """
def merge_sort(arr):
if len(arr) > 1:
mid = len(arr) // 2
L = arr[:mid]
R = arr[mid:]
merge_sort(L)
merge_sort(R)
i = j = k = 0
while i < len(L) and j < len(R):
if L[i] < R[j]:
arr[k] = L[i]
i += 1
else:
arr[k] = R[j]
j += 1
k += 1
while i < len(L):
arr[k] = L[i]
i += 1
k += 1
while j < len(R):
arr[k] = R[j]
j += 1
k += 1
return arr
""",
"Quick_Sort": """
def partition(arr, low, high):
i = (low - 1)
pivot = arr[high]
for j in range(low, high):
if arr[j] <= pivot:
i += 1
arr[i], arr[j] = arr[j], arr[i]
arr[i+1], arr[high] = arr[high], arr[i+1]
return (i + 1)
def quick_sort(arr, low, high):
if low < high:
pi = partition(arr, low, high)
quick_sort(arr, low, pi - 1)
quick_sort(arr, pi + 1, high)
return arr
"""
}
# Get token embeddings for a code snippet
def get_token_embeddings(code):
inputs = tokenizer(code, return_tensors="pt", max_length=512, truncation=True, padding=True)
with torch.no_grad():
outputs = model(**inputs)
token_embeddings = outputs.last_hidden_state.squeeze(0).cpu().numpy()
tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'].squeeze())
return token_embeddings, tokens
# Plot comparison between two algorithms
def compare_algorithms(algo1_name, algo2_name):
code1 = sorting_algorithms[algo1_name]
code2 = sorting_algorithms[algo2_name]
emb1, tokens1 = get_token_embeddings(code1)
emb2, tokens2 = get_token_embeddings(code2)
combined = np.concatenate([emb1, emb2], axis=0)
pca = PCA(n_components=2)
coords = pca.fit_transform(combined)
plt.figure(figsize=(6, 5), dpi=150)
plt.scatter(coords[:len(tokens1), 0], coords[:len(tokens1), 1], color='red', label=algo1_name, s=20)
plt.scatter(coords[len(tokens1):, 0], coords[len(tokens1):, 1], color='blue', label=algo2_name, s=20)
plt.legend()
plt.xticks([]); plt.yticks([]); plt.grid(False)
buf = BytesIO()
plt.savefig(buf, format='png', bbox_inches='tight')
plt.close()
buf.seek(0)
return Image.open(buf)
# Gradio interface
interface = gr.Interface(
fn=compare_algorithms,
inputs=[
gr.Dropdown(choices=list(sorting_algorithms.keys()), label="Algorithm 1"),
gr.Dropdown(choices=list(sorting_algorithms.keys()), label="Algorithm 2")
],
outputs=gr.Image(type="pil", label="Token Embedding PCA"),
title="GraphCodeBERT Token Embedding Comparison",
description="Visual comparison of token-level embeddings from GraphCodeBERT for classical sorting algorithms."
)
if __name__ == "__main__":
interface.launch()
|