import streamlit as st
import torch
import torch.nn.functional as F
import re
from transformers import AutoTokenizer, AutoModel
import warnings
import zipfile
import os

# Set up page config
st.set_page_config(
    page_title="Java Code Clone Detector (IJaDataset 2.1)",
    page_icon="🔍",
    layout="wide"
)
# Suppress warnings
warnings.filterwarnings("ignore")
# Constants
MODEL_NAME = "microsoft/codebert-base"
MAX_LENGTH = 512
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
DATASET_PATH = "archive (1).zip" # Update this path if needed
# Initialize models with caching
@st.cache_resource
def load_models():
    try:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        model = AutoModel.from_pretrained(MODEL_NAME).to(DEVICE)
        return tokenizer, model
    except Exception as e:
        st.error(f"Failed to load models: {str(e)}")
        return None, None
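
# Note: st.cache_resource keeps the tokenizer and model alive across Streamlit
# reruns, so the CodeBERT checkpoint (~500 MB) is downloaded and loaded once
# per process instead of on every widget interaction.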
@st.cache_resource
def load_dataset():
    try:
        # Extract dataset if needed
        if not os.path.exists("Subject_CloneTypes_Directories"):
            with zipfile.ZipFile(DATASET_PATH, 'r') as zip_ref:
                zip_ref.extractall(".")
        # Load sample pairs (modify this based on your dataset structure)
        clone_pairs = []
        base_path = "Subject_CloneTypes_Directories"
        # Example: load one pair from each clone type
        for clone_type in ["Clone_Type1", "Clone_Type2", "Clone_Type3 - ST"]:
            type_path = os.path.join(base_path, clone_type)
            if os.path.exists(type_path):
                for root, _, files in os.walk(type_path):
                    # Take the first two files in a directory as a pair
                    if len(files) >= 2:
                        with open(os.path.join(root, files[0]), 'r', encoding='utf-8') as f1:
                            code1 = f1.read()
                        with open(os.path.join(root, files[1]), 'r', encoding='utf-8') as f2:
                            code2 = f2.read()
                        clone_pairs.append({
                            "type": clone_type,
                            "code1": code1,
                            "code2": code2
                        })
                        break  # Just take one pair per type for demo
        return clone_pairs[:10]  # Return first 10 pairs for demo
    except Exception as e:
        st.error(f"Error loading dataset: {str(e)}")
        return []
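
# Assumption: the archive extracts to Subject_CloneTypes_Directories/<type>/...
# with at least two Java files per leaf directory. Adjust load_dataset() above
# if your copy of IJaDataset 2.1 is organized differently.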
tokenizer, code_model = load_models()
dataset_pairs = load_dataset()
# Normalization function
def normalize_code(code):
    try:
        code = re.sub(r'//.*', '', code)  # Remove single-line comments
        code = re.sub(r'/\*.*?\*/', '', code, flags=re.DOTALL)  # Remove multi-line comments
        code = re.sub(r'\s+', ' ', code).strip()  # Collapse whitespace
        return code
    except Exception:
        return code
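
# Illustrative example: normalize_code("int x = 1;  // counter") returns
# "int x = 1;" -- comments are stripped and whitespace collapsed. Caveat: the
# regexes will also strip "//" sequences inside string literals (e.g. URLs).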
# Embedding generation
def get_embedding(code):
    try:
        code = normalize_code(code)
        inputs = tokenizer(
            code,
            return_tensors="pt",
            truncation=True,
            max_length=MAX_LENGTH,
            padding='max_length'
        ).to(DEVICE)
        with torch.no_grad():
            outputs = code_model(**inputs)
        return outputs.last_hidden_state.mean(dim=1)  # Mean-pooled embedding
    except Exception as e:
        st.error(f"Error processing code: {str(e)}")
        return None
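
# Note: mean pooling over last_hidden_state is one simple way to get a fixed-size
# vector per snippet; a common alternative (not used here) is the [CLS] token
# embedding, outputs.last_hidden_state[:, 0, :].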
# Comparison function
def compare_code(code1, code2):
    if not code1 or not code2:
        return None
    with st.spinner('Analyzing code...'):
        emb1 = get_embedding(code1)
        emb2 = get_embedding(code2)
        if emb1 is None or emb2 is None:
            return None
        with torch.no_grad():
            similarity = F.cosine_similarity(emb1, emb2).item()
        return similarity
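
# F.cosine_similarity returns a value in [-1, 1]; .item() converts the
# single-element tensor to a Python float. Scores near 1 indicate nearly
# identical embeddings, i.e. likely clones.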
# UI Elements
st.title("🔍 Java Code Clone Detector (IJaDataset 2.1)")
st.markdown("""
Compare Java code snippets from IJaDataset 2.1 using CodeBERT embeddings.
""")
# Dataset selector
selected_pair = None
if dataset_pairs:
    pair_options = {f"{i+1}: {pair['type']}": pair for i, pair in enumerate(dataset_pairs)}
    selected_option = st.selectbox("Select a preloaded example pair:", list(pair_options.keys()))
    selected_pair = pair_options[selected_option]
# Layout
col1, col2 = st.columns(2)
with col1:
    code1 = st.text_area(
        "First Java Code",
        height=300,
        value=selected_pair["code1"] if selected_pair else "",
        help="Enter the first Java code snippet"
    )
with col2:
    code2 = st.text_area(
        "Second Java Code",
        height=300,
        value=selected_pair["code2"] if selected_pair else "",
        help="Enter the second Java code snippet"
    )
# Threshold slider
threshold = st.slider(
    "Clone Detection Threshold",
    min_value=0.5,
    max_value=1.0,
    value=0.85,
    step=0.01,
    help="Adjust the similarity threshold for clone detection"
)
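
# The 0.85 default is a heuristic starting point rather than a calibrated value:
# Type-1 (textually identical) clones tend to score near 1.0 after normalization,
# while Type-3 clones with added/removed statements may need a lower threshold.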
# Compare button
if st.button("Compare Code", type="primary"):
    if tokenizer is None or code_model is None:
        st.error("Models failed to load. Please check the logs.")
    else:
        similarity = compare_code(code1, code2)
        if similarity is not None:
            # Display results
            st.subheader("Results")
            # Progress bar for visualization (clamped to [0, 1] because
            # cosine similarity can be negative and st.progress rejects it)
            st.progress(min(max(similarity, 0.0), 1.0))
            # Metrics columns
            col1, col2, col3 = st.columns(3)
            with col1:
                st.metric("Similarity Score", f"{similarity:.3f}")
            with col2:
                st.metric("Threshold", f"{threshold:.3f}")
            with col3:
                is_clone = similarity >= threshold
                st.metric(
                    "Clone Detection",
                    "✅ Clone" if is_clone else "❌ Not a Clone",
                    delta=f"{similarity - threshold:+.3f}"
                )
            # Show normalized code for debugging
            with st.expander("Show normalized code"):
                tab1, tab2 = st.tabs(["First Code", "Second Code"])
                with tab1:
                    st.code(normalize_code(code1), language="java")
                with tab2:
                    st.code(normalize_code(code2), language="java")
# Footer
st.markdown("---")
st.markdown("""
**Dataset Information**:
- Using IJaDataset 2.1 from Kaggle
- Contains 100K Java files with clone annotations
- Clone types: Type-1, Type-2, and Type-3 clones
""") |