# LLMSniffer / backend_model.py
import re

import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel

# Run on GPU when available, otherwise fall back to CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
def remove_java_comments(code):
    # Remove single-line comments (// ...)
    code = re.sub(r'//.*', '', code)
    # Remove multi-line comments (/* ... */)
    code = re.sub(r'/\*.*?\*/', '', code, flags=re.DOTALL)
    return code
def remove_python_comments(code):
    # Remove single-line comments (# ...)
    code = re.sub(r'#.*', '', code)
    # Remove triple-quoted blocks (""" ... """ or ''' ... ''')
    code = re.sub(r'""".*?"""', '', code, flags=re.DOTALL)
    code = re.sub(r"'''.*?'''", '', code, flags=re.DOTALL)
    return code
# CodeBERT encoder with a binary classification head
class CodeBERTBinaryClassifier(nn.Module):
    def __init__(self, encoder_model, hidden_size=256, num_layers=2):
        # hidden_size and num_layers are kept for API compatibility but are not used below
        super(CodeBERTBinaryClassifier, self).__init__()
        self.encoder = encoder_model
        self.classifier = nn.Sequential(
            nn.Dropout(0.3),                                  # Dropout with 30%
            nn.Linear(self.encoder.config.hidden_size, 128),  # Hidden layer with 128 units
            nn.BatchNorm1d(128),                              # Batch normalization for the hidden layer
            nn.ReLU(),                                        # ReLU activation for the hidden layer
            nn.Dropout(0.3),                                  # Dropout with 30%
            nn.Linear(128, 1)                                 # Output layer with a single logit
        )

    def forward(self, input_ids, attention_mask):
        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.last_hidden_state[:, 0, :]  # [CLS] token representation
        # detach() blocks gradients into the encoder, so only the classifier head is updated
        logits = self.classifier(cls_output.detach()).squeeze(-1)  # Squeeze to a single binary logit
        return logits, cls_output
def infer_single_sample(code_text, model, tokenizer, language='java'):
    # Ensure the model is in evaluation mode
    model.eval()

    # Strip comments from the code (same preprocessing as during training)
    if language == 'python':
        code_text = remove_python_comments(code_text)
    else:
        code_text = remove_java_comments(code_text)

    # Tokenize the input
    inputs = tokenizer.encode_plus(
        code_text,
        padding='max_length',
        max_length=512,
        truncation=True,
        return_tensors='pt'
    )

    # Move inputs to the selected device
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    # Disable gradient computation for inference
    with torch.no_grad():
        # Get model prediction
        logits, _ = model(input_ids, attention_mask)

    # Apply sigmoid to turn the logit into a probability
    probability = torch.sigmoid(logits).cpu().item()

    # Classify based on a 0.5 threshold
    predicted_label = 1 if probability > 0.5 else 0

    return {
        'probability': probability,
        'predicted_label': predicted_label,
        'interpretation': 'GPT-generated' if predicted_label == 0 else 'Human-written'
    }
def load_model_and_tokenizer(model_architecture, model_path):
    tokenizer = AutoTokenizer.from_pretrained(model_architecture)
    base_model = AutoModel.from_pretrained(model_architecture)
    model = CodeBERTBinaryClassifier(base_model)

    # Load the fine-tuned weights onto whichever device is available
    map_location = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.load_state_dict(torch.load(model_path, map_location=map_location))
    model = model.to(map_location)
    return model, tokenizer
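

# Example usage: a minimal sketch of wiring load_model_and_tokenizer and
# infer_single_sample together. The encoder name and checkpoint path below are
# placeholders/assumptions, not part of this module; substitute the actual
# architecture and fine-tuned weights used by the app.
if __name__ == "__main__":
    model_architecture = "microsoft/codebert-base"  # assumed encoder; replace as needed
    model_path = "model.pt"                         # hypothetical checkpoint path
    model, tokenizer = load_model_and_tokenizer(model_architecture, model_path)

    sample_code = """
    public int add(int a, int b) {
        // adds two numbers
        return a + b;
    }
    """
    result = infer_single_sample(sample_code, model, tokenizer, language='java')
    print(result)  # {'probability': ..., 'predicted_label': ..., 'interpretation': ...}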