# LLMSniffer / backend_model.py
import re

import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel

# Run on GPU when available, otherwise fall back to CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
def remove_java_comments(code):
    # Remove single-line comments (// ...)
    code = re.sub(r'//.*', '', code)
    # Remove multi-line comments (/* ... */)
    code = re.sub(r'/\*.*?\*/', '', code, flags=re.DOTALL)
    return code
def remove_python_comments(code):
    # Remove single-line comments (# ...)
    code = re.sub(r'#.*', '', code)
    # Remove triple-quoted blocks (""" ... """ or ''' ... ''')
    code = re.sub(r'""".*?"""', '', code, flags=re.DOTALL)
    code = re.sub(r"'''.*?'''", '', code, flags=re.DOTALL)
    return code
# CodeBERT encoder with a binary classification head
class CodeBERTBinaryClassifier(nn.Module):
    def __init__(self, encoder_model, hidden_size=256, num_layers=2):
        # hidden_size and num_layers are kept for API compatibility but are not used below
        super(CodeBERTBinaryClassifier, self).__init__()
        self.encoder = encoder_model
        self.classifier = nn.Sequential(
            nn.Dropout(0.3),                                  # Dropout with 30%
            nn.Linear(self.encoder.config.hidden_size, 128),  # Hidden layer with 128 units
            nn.BatchNorm1d(128),                              # Batch normalization for the hidden layer
            nn.ReLU(),                                        # ReLU activation for the hidden layer
            nn.Dropout(0.3),                                  # Dropout with 30%
            nn.Linear(128, 1)                                 # Output layer with a single logit
        )

    def forward(self, input_ids, attention_mask):
        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.last_hidden_state[:, 0, :]  # [CLS] token representation
        # detach() blocks gradients into the encoder, so only the classifier head is updated
        logits = self.classifier(cls_output.detach()).squeeze(-1)  # Squeeze to a single binary logit
        return logits, cls_output
def infer_single_sample(code_text, model, tokenizer, language='java'):
    # Ensure the model is in evaluation mode
    model.eval()

    # Strip comments from the code (same preprocessing as during training)
    if language == 'python':
        code_text = remove_python_comments(code_text)
    else:
        code_text = remove_java_comments(code_text)

    # Tokenize the input
    inputs = tokenizer.encode_plus(
        code_text,
        padding='max_length',
        max_length=512,
        truncation=True,
        return_tensors='pt'
    )

    # Move inputs to the selected device
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    # Disable gradient computation for inference
    with torch.no_grad():
        # Get model prediction
        logits, _ = model(input_ids, attention_mask)

    # Apply sigmoid to turn the logit into a probability
    probability = torch.sigmoid(logits).cpu().item()

    # Classify based on a 0.5 threshold
    predicted_label = 1 if probability > 0.5 else 0

    return {
        'probability': probability,
        'predicted_label': predicted_label,
        'interpretation': 'GPT-generated' if predicted_label == 0 else 'Human-written'
    }
def load_model_and_tokenizer(model_architecture, model_path):
    tokenizer = AutoTokenizer.from_pretrained(model_architecture)
    base_model = AutoModel.from_pretrained(model_architecture)
    model = CodeBERTBinaryClassifier(base_model)

    # Load the fine-tuned weights onto whichever device is available
    map_location = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.load_state_dict(torch.load(model_path, map_location=map_location))
    model = model.to(map_location)
    return model, tokenizer
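

# Example usage: a minimal sketch of wiring load_model_and_tokenizer and
# infer_single_sample together. The encoder name and checkpoint path below are
# placeholders/assumptions, not part of this module; substitute the actual
# architecture and fine-tuned weights used by the app.
if __name__ == "__main__":
    model_architecture = "microsoft/codebert-base"  # assumed encoder; replace as needed
    model_path = "model.pt"                         # hypothetical checkpoint path
    model, tokenizer = load_model_and_tokenizer(model_architecture, model_path)

    sample_code = """
    public int add(int a, int b) {
        // adds two numbers
        return a + b;
    }
    """
    result = infer_single_sample(sample_code, model, tokenizer, language='java')
    print(result)  # {'probability': ..., 'predicted_label': ..., 'interpretation': ...}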