# detect-edits-in-ai-generated-text / human_text_detect.py
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM
import logging
import numpy as np
import pickle
from src.DetectLM import DetectLM
from src.PerplexityEvaluator import PerplexityEvaluator
from src.PrepareArticles import PrepareArticles
from src.fit_survival_function import fit_per_length_survival_function
from glob import glob
import spacy
import re
import os
logging.basicConfig(level=logging.INFO)
def read_all_csv_files(pattern):
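    """Concatenate all CSV files matching the glob pattern into a single DataFrame."""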
df = pd.DataFrame()
print(pattern)
for f in glob(pattern):
df = pd.concat([df, pd.read_csv(f)])
return df

def get_survival_function(df, G=101):
    """
    Returns a survival function for every sentence length in tokens.

    Args:
        :df:  data frame with columns 'response' (or 'logloss') and 'length'
        :G:   number of interpolation points

    Return:
        bivariate function (length, response) -> (0, 1)
    """
    assert not df.empty
    value_name = "response" if "response" in df.columns else "logloss"
    df1 = df[~df[value_name].isna()]
    ll = df1['length']
    xx1 = df1[value_name]
    return fit_per_length_survival_function(ll, xx1, log_space=True, G=G)

def mark_edits_remove_tags(chunks, tag="edit"):
    """Strip <tag>...</tag> markers from chunk texts and flag which chunks contained an edit."""
    text_chunks = chunks['text']
    edits = []
    for i, text in enumerate(text_chunks):
        chunk_text = re.findall(rf"<{tag}>(.+)</{tag}>", text)
        if len(chunk_text) > 0:
            chunks['text'][i] = chunk_text[0]
            chunks['length'][i] -= 2  # discount the opening/closing tag tokens
            edits.append(True)
        else:
            edits.append(False)

    return chunks, edits

def get_null_data(model_name, topic):
    """Load the pre-computed null data for (model_name, topic), or None if unavailable."""
    data = None
    try:
        with open(f'nullData/{model_name}_{topic}.pkl', 'rb') as file:
            data = pickle.load(file)
    except Exception:
        pass
    return data

def get_threshold_obj(model_name, topic):
    """Look up the calibrated detection threshold for (model_name, topic), or None if unavailable."""
    threshold = None
    try:
        with open('threshold_obj.pkl', 'rb') as file:
            threshold_obj = pickle.load(file)
        threshold = threshold_obj[model_name][topic]
    except Exception:
        pass
    return threshold
def detect_human_text(model_name, topic, text):
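    """
    Score `text` against the null data for (model_name, topic).

    Returns (results / threshold) - 1, where `results` is the HC statistic of the
    text and `threshold` is the calibrated detection threshold, together with the
    per-sentence detection results.
    """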
# Get null data
print('Get null data')
df_null = get_null_data(model_name, topic)
if 'num' in df_null.columns:
df_null = df_null[df_null.num > 1]
# Get survival function
print('Get survival function')
pval_functions = get_survival_function(df_null, G=43)
min_tokens_per_sentence = 10
max_tokens_per_sentence = 100

    # Diagnostics: verify the Hugging Face cache directory exists and is writable
    cache_dir = "/cache/huggingface"
    print(f"Cache directory exists: {os.path.exists(cache_dir)}")
    print(f"Cache directory is writable: {os.access(cache_dir, os.W_OK)}")

    # List contents of the directory before the model is loaded
    print("Contents of cache directory before loading model:")
    os.system(f"ls -lah {cache_dir}")

    # Init model
    print('Init model')
    lm_name = 'gpt2-xl' if model_name == 'GPT2XL' else 'microsoft/phi-2'
    tokenizer = AutoTokenizer.from_pretrained(lm_name, cache_dir=cache_dir)
    model = AutoModelForCausalLM.from_pretrained(lm_name, cache_dir=cache_dir)

    # Diagnostics: inspect the cache after loading and report the relevant environment variables
    print("Contents of cache directory after loading model:")
    os.system(f"ls -lah {cache_dir}")
    print(f"Current HF_HOME: {os.getenv('HF_HOME')}")
    print(f"Current TRANSFORMERS_CACHE: {os.getenv('TRANSFORMERS_CACHE')}")

    # Check where the tokenizer and model are actually written (save_pretrained persists them to cache_dir)
    print(f"Tokenizer saved at: {tokenizer.save_pretrained(cache_dir)}")
    print(f"Model saved at: {model.save_pretrained(cache_dir)}")
print('Init PerplexityEvaluator')
sentence_detector = PerplexityEvaluator(model, tokenizer)
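
    # Pick the best available device: Apple MPS, then CUDA, then CPU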
if torch.backends.mps.is_available():
device = 'mps'
elif torch.cuda.is_available():
device = 'cuda'
else:
device = 'cpu'
print(f'device {device}')
model.to(device)
print('Init DetectLM')
    detector = DetectLM(sentence_detector, pval_functions,
                        min_len=min_tokens_per_sentence,
                        max_len=max_tokens_per_sentence,
                        length_limit_policy='truncate',
                        HC_type='stbl',
                        ignore_first_sentence=False
                        )
# Convert text to object
print('Analyze text')
article_obj = get_article_obj(text)
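    # Split the article into sentence chunks within the configured token-length bounds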
parser = PrepareArticles(article_obj, min_tokens=min_tokens_per_sentence, max_tokens=max_tokens_per_sentence)
chunks = parser(combined=False)
    # Go over the whole document; `res` keeps the result of the last chunk processed
    for i in range(len(chunks['text'])):
        print(chunks['text'][i])
        res = detector(chunks['text'][i], chunks['context'][i], dashboard=None)
        # print(f"HC = {res['HC']}")
        # print(f"Fisher = {res['fisher']}")
        # print(f"Fisher (chisquared pvalue) = {res['fisher_pvalue']}")

    results = res['HC']
    threshold = get_threshold_obj(model_name, topic)
    print(f"threshold: {threshold}, results: {results}")

    # Report the HC score relative to the calibrated threshold, plus per-sentence results
    return (results / threshold) - 1, res['sentences']

# Convert article text into the object structure expected by PrepareArticles
def get_article_obj(text):
    # Init article object
    article_obj = {
        'sub_titles': [{
            'sentences': []
        }]
    }

    nlp = spacy.load("en_core_web_sm")  # Load the spaCy English pipeline
    for line in text.split('\n'):
        doc = nlp(line)  # Run sentence segmentation on the line
        sentences = [sent.text for sent in doc.sents if len(sent) >= 10]  # Keep sentences of at least 10 tokens
        for sentence in sentences:
            sentence = re.sub(r' +', ' ', sentence)  # Collapse repeated spaces
            article_obj['sub_titles'][0]['sentences'].append({'sentence': sentence})

    return article_obj
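

# Minimal usage sketch (an illustration, not part of the original app flow): it assumes the
# null-data pickles under nullData/, threshold_obj.pkl, and the en_core_web_sm spaCy model
# are available locally, and that the chosen model/topic pair has calibrated data. The topic
# value below is a placeholder; replace it with one actually shipped with this Space.
if __name__ == "__main__":
    sample_text = (
        "This is the first sentence of a short sample article used for a quick sanity check. "
        "This is the second sentence, written to be long enough to pass the minimum token filter."
    )
    model_name = 'GPT2XL'  # one of the two models handled above ('GPT2XL' or anything else -> phi-2)
    topic = 'wiki'         # placeholder topic name: adjust to available null data and thresholds
    score, sentences = detect_human_text(model_name, topic, sample_text)
    print(f"Relative HC score: {score}")
    print(sentences)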