Spaces:
Runtime error
Runtime error
aliasgerovs
committed on
Commit
·
79b97e2
1
Parent(s):
1797f70
Updated
Browse files
- app.py +8 -2
- predictors.py +55 -0
- requirements.txt +3 -1
- utils.py +20 -5
app.py
CHANGED
|
@@ -2,6 +2,7 @@ import gradio as gr
|
|
| 2 |
import numpy as np
|
| 3 |
from datetime import date
|
| 4 |
from predictors import predict_bc_scores, predict_mc_scores
|
|
|
|
| 5 |
from analysis import depth_analysis
|
| 6 |
from predictors import predict_quillbot
|
| 7 |
from plagiarism import plagiarism_check, build_date
|
|
@@ -112,6 +113,12 @@ with gr.Blocks() as demo:
|
|
| 112 |
char_count = gr.Textbox(label="Minumum Character Limit Check")
|
| 113 |
input_text.change(fn=len_validator, inputs=input_text, outputs=char_count)
|
| 114 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 115 |
with gr.Row():
|
| 116 |
models = gr.Dropdown(
|
| 117 |
model_list,
|
|
@@ -310,6 +317,5 @@ with gr.Blocks() as demo:
|
|
| 310 |
date_from = ""
|
| 311 |
date_to = ""
|
| 312 |
|
| 313 |
-
|
| 314 |
if __name__ == "__main__":
|
| 315 |
-
demo.launch(share=True, server_name="0.0.0.0", auth=("polygraf-admin", "test@aisd"))
|
|
|
|
| 2 |
import numpy as np
|
| 3 |
from datetime import date
|
| 4 |
from predictors import predict_bc_scores, predict_mc_scores
|
| 5 |
+
from predictors import update, correct_text, split_text
|
| 6 |
from analysis import depth_analysis
|
| 7 |
from predictors import predict_quillbot
|
| 8 |
from plagiarism import plagiarism_check, build_date
|
|
|
|
| 113 |
char_count = gr.Textbox(label="Minumum Character Limit Check")
|
| 114 |
input_text.change(fn=len_validator, inputs=input_text, outputs=char_count)
|
| 115 |
|
| 116 |
+
with gr.Row():
|
| 117 |
+
btn = gr.Button("Bias Buster")
|
| 118 |
+
out = gr.Textbox(label="Bias Corrected Full Input", interactive=False)
|
| 119 |
+
corrections_output = gr.Textbox(label="Bias Corrections", interactive=False)
|
| 120 |
+
btn.click(fn=update, inputs=input_text, outputs=[out, corrections_output])
|
| 121 |
+
|
| 122 |
with gr.Row():
|
| 123 |
models = gr.Dropdown(
|
| 124 |
model_list,
|
|
|
|
| 317 |
date_from = ""
|
| 318 |
date_to = ""
|
| 319 |
|
|
|
|
| 320 |
if __name__ == "__main__":
    import os

    # NOTE(review): the login credentials used to be hard-coded only. They can
    # now be overridden through the environment; the previous values remain as
    # the fallback so existing deployments keep working. The fallback password
    # is still in source control and should be rotated and removed.
    auth_user = os.environ.get("GRADIO_AUTH_USER", "polygraf-admin")
    auth_password = os.environ.get("GRADIO_AUTH_PASSWORD", "test@aisd")

    # Serve on all interfaces on port 80 with a public share link, behind
    # basic username/password authentication.
    demo.launch(
        share=True,
        server_name="0.0.0.0",
        server_port=80,
        auth=(auth_user, auth_password),
    )
|
predictors.py
CHANGED
|
@@ -21,6 +21,15 @@ import os
|
|
| 21 |
from utils import *
|
| 22 |
import joblib
|
| 23 |
from optimum.bettertransformer import BetterTransformer
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
|
| 25 |
with open("config.yaml", "r") as file:
|
| 26 |
params = yaml.safe_load(file)
|
|
@@ -37,6 +46,8 @@ mc_label_map = params["MC_OUTPUT_LABELS"]
|
|
| 37 |
text_1on1_label_map = params["1ON1_OUTPUT_LABELS"]
|
| 38 |
mc_token_size = int(params["MC_TOKEN_SIZE"])
|
| 39 |
bc_token_size = int(params["BC_TOKEN_SIZE"])
|
|
|
|
|
|
|
| 40 |
text_bc_tokenizer = AutoTokenizer.from_pretrained(text_bc_model_path)
|
| 41 |
text_bc_model = AutoModelForSequenceClassification.from_pretrained(
|
| 42 |
text_bc_model_path
|
|
@@ -57,6 +68,21 @@ for model_name, model in zip(mc_label_map, text_1on1_models):
|
|
| 57 |
model
|
| 58 |
).to(device)
|
| 59 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
# proxy models for explainability
|
| 61 |
mini_bc_model_name = "polygraf-ai/bc-model-bert-mini"
|
| 62 |
bc_tokenizer_mini = AutoTokenizer.from_pretrained(mini_bc_model_name)
|
|
@@ -79,6 +105,34 @@ quillbot_model = BetterTransformer.transform(quillbot_model)
|
|
| 79 |
iso_reg = joblib.load("isotonic_regression_model.joblib")
|
| 80 |
|
| 81 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
def split_text_allow_complete_sentences_nltk(
|
| 83 |
text,
|
| 84 |
max_length=256,
|
|
@@ -252,6 +306,7 @@ def predict_bc_scores(input):
|
|
| 252 |
human_score = 1 - ai_score
|
| 253 |
bc_score = {"AI": ai_score, "HUMAN": human_score}
|
| 254 |
print(f"Calibration BC scores: AI: {ai_score}, HUMAN: {human_score}")
|
|
|
|
| 255 |
return bc_score
|
| 256 |
|
| 257 |
|
|
|
|
| 21 |
from utils import *
|
| 22 |
import joblib
|
| 23 |
from optimum.bettertransformer import BetterTransformer
|
| 24 |
+
import gc
|
| 25 |
+
from cleantext import clean
|
| 26 |
+
import gradio as gr
|
| 27 |
+
from tqdm.auto import tqdm
|
| 28 |
+
from transformers import pipeline
|
| 29 |
+
from transformers import AutoModelForSequenceClassification, AutoTokenizer
|
| 30 |
+
import nltk
|
| 31 |
+
from nltk.tokenize import sent_tokenize
|
| 32 |
+
from optimum.pipelines import pipeline
|
| 33 |
|
| 34 |
with open("config.yaml", "r") as file:
|
| 35 |
params = yaml.safe_load(file)
|
|
|
|
| 46 |
text_1on1_label_map = params["1ON1_OUTPUT_LABELS"]
|
| 47 |
mc_token_size = int(params["MC_TOKEN_SIZE"])
|
| 48 |
bc_token_size = int(params["BC_TOKEN_SIZE"])
|
| 49 |
+
bias_checker_model_name = params['BIAS_CHECKER_MODEL_PATH']
|
| 50 |
+
bias_corrector_model_name = params['BIAS_CORRECTOR_MODEL_PATH']
|
| 51 |
text_bc_tokenizer = AutoTokenizer.from_pretrained(text_bc_model_path)
|
| 52 |
text_bc_model = AutoModelForSequenceClassification.from_pretrained(
|
| 53 |
text_bc_model_path
|
|
|
|
| 68 |
model
|
| 69 |
).to(device)
|
| 70 |
|
| 71 |
+
|
| 72 |
+
# Bias checker: a sequence-classification model wrapped with BetterTransformer.
bias_model_checker = AutoModelForSequenceClassification.from_pretrained(bias_checker_model_name)
tokenizer = AutoTokenizer.from_pretrained(bias_checker_model_name)
bias_model_checker = BetterTransformer.transform(bias_model_checker, keep_original_model=False)

# The module-level name `pipeline` is rebound by the later
# `from optimum.pipelines import pipeline` import. The checker is an in-memory
# PyTorch model, so build its pipeline with the transformers factory
# explicitly. NOTE(review): confirm optimum's default accelerator rejects
# in-memory PyTorch models as assumed here.
from transformers import pipeline as _transformers_pipeline

# Pass the bias checker model itself. The previous `model=model` picked up the
# leftover loop variable from the one-vs-one model loading loop above, so the
# pipeline was built around the wrong model.
bias_checker = _transformers_pipeline(
    "text-classification",
    model=bias_model_checker,
    tokenizer=tokenizer,
)
gc.collect()

# Bias corrector: text2text generation through optimum's pipeline factory,
# requesting the "ort" accelerator.
bias_corrector = pipeline(
    "text2text-generation",
    model=bias_corrector_model_name,
    accelerator="ort",
)
|
| 85 |
+
|
| 86 |
# proxy models for explainability
|
| 87 |
mini_bc_model_name = "polygraf-ai/bc-model-bert-mini"
|
| 88 |
bc_tokenizer_mini = AutoTokenizer.from_pretrained(mini_bc_model_name)
|
|
|
|
| 105 |
iso_reg = joblib.load("isotonic_regression_model.joblib")
|
| 106 |
|
| 107 |
|
| 108 |
+
def split_text(text: str) -> list:
    """Split *text* into sentences, each wrapped in its own one-item list.

    Sentence boundaries come from ``sent_tokenize``; the result is a list of
    single-element lists, in the order the sentences appear.
    """
    batches = []
    for sentence in sent_tokenize(text):
        batches.append([sentence])
    return batches
|
| 111 |
+
|
| 112 |
+
def correct_text(text: str, bias_checker, bias_corrector, separator: str = " ") -> tuple:
    """Rewrite the parts of *text* that the checker does not clear.

    The text is split with ``split_text``; each batch is joined with a single
    space and passed to ``bias_checker``. A batch is kept verbatim only when
    the first result carries the label ``"LABEL_1"`` with a score that is not
    below 0.9. Otherwise it is replaced by the ``"generated_text"`` of the
    first ``bias_corrector`` result.

    Returns:
        A ``(corrected_text, corrections)`` tuple: all pieces joined with
        *separator*, and a list of ``(original, corrected)`` pairs for every
        batch that was rewritten.
    """
    batches = split_text(text)
    pieces = []
    corrections = []
    for batch in tqdm(batches, total=len(batches), desc="correcting text.."):
        raw_text = " ".join(batch)
        top = bias_checker(raw_text)[0]
        # Any label other than LABEL_1, or a LABEL_1 verdict scored below 0.9,
        # sends the batch to the corrector.
        if top["label"] != "LABEL_1" or top["score"] < 0.9:
            rewritten = bias_corrector(raw_text)[0]["generated_text"]
            pieces.append(rewritten)
            corrections.append((raw_text, rewritten))
            continue
        pieces.append(raw_text)
    return separator.join(pieces), corrections
|
| 128 |
+
|
| 129 |
+
def update(text: str):
    """Clean *text*, run bias correction, and format the results for display.

    The input is passed through ``clean`` with ``lower=False`` and then
    through ``correct_text`` using the module-level ``bias_checker`` and
    ``bias_corrector``.

    Returns:
        A ``(corrected_text, corrections_display)`` tuple, where the second
        item lists every original/corrected pair, separated by blank lines.
    """
    cleaned = clean(text, lower=False)
    corrected_text, corrections = correct_text(cleaned, bias_checker, bias_corrector)
    entries = []
    for orig, corr in corrections:
        entries.append(f"Original: {orig}\nCorrected: {corr}")
    corrections_display = "\n\n".join(entries)
    return corrected_text, corrections_display
|
| 134 |
+
|
| 135 |
+
|
| 136 |
def split_text_allow_complete_sentences_nltk(
|
| 137 |
text,
|
| 138 |
max_length=256,
|
|
|
|
| 306 |
human_score = 1 - ai_score
|
| 307 |
bc_score = {"AI": ai_score, "HUMAN": human_score}
|
| 308 |
print(f"Calibration BC scores: AI: {ai_score}, HUMAN: {human_score}")
|
| 309 |
+
print(f"Input Text: {cleaned_text_bc}")
|
| 310 |
return bc_score
|
| 311 |
|
| 312 |
|
requirements.txt
CHANGED
|
@@ -26,4 +26,6 @@ Unidecode
|
|
| 26 |
python-dotenv
|
| 27 |
lime
|
| 28 |
joblib
|
| 29 |
-
optimum
|
|
|
|
|
|
|
|
|
| 26 |
python-dotenv
|
| 27 |
lime
|
| 28 |
joblib
|
| 29 |
+
optimum
|
| 30 |
+
clean-text
|
| 31 |
+
optimum[onnxruntime]
|
utils.py
CHANGED
|
@@ -31,13 +31,28 @@ def remove_accents(input_str):
|
|
| 31 |
|
| 32 |
|
| 33 |
def remove_special_characters(text):
|
| 34 |
-
text =
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
return text
|
| 39 |
|
| 40 |
-
|
| 41 |
def remove_special_characters_2(text):
|
| 42 |
pattern = r"[^a-zA-Z0-9 ]+"
|
| 43 |
text = re.sub(pattern, "", text)
|
|
|
|
| 31 |
|
| 32 |
|
| 33 |
# Compiled once at import time. The ranges are limited to emoji/pictograph
# blocks. The previous catch-all range U+24C2..U+1F251 spanned most of the
# Basic Multilingual Plane and therefore also deleted CJK, Hangul and other
# non-Latin letters.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F700-\U0001F77F"  # alchemical symbols
    "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
    "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
    "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
    "\U0001FA00-\U0001FA6F"  # Chess Symbols
    "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
    "\U0001F1E6-\U0001F1FF"  # regional indicators (flags)
    "\U0001F200-\U0001F251"  # enclosed ideographic supplement
    "\u2600-\u26FF"  # miscellaneous symbols
    "\u2702-\u27B0"  # Dingbats
    "\u24C2"  # circled M
    "]+",
    flags=re.UNICODE,
)


def remove_special_characters(text):
    """Strip URLs, emoji, hashtags and stray symbols, then tidy spacing.

    Steps, in order:
      1. remove ``http(s)://`` and ``www.`` URLs;
      2. remove emoji and pictographs;
      3. remove ``#hashtag`` tokens;
      4. drop every character that is not a word character, whitespace, or
         one of ``. , ! ? ' " ( ) * + - / : ;``;
      5. remove whitespace before ``. , ! ? ;`` and ensure a space after them;
      6. collapse whitespace runs to one space and trim both ends.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string (possibly empty).
    """
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = _EMOJI_PATTERN.sub('', text)
    text = re.sub(r'#\w+', '', text)
    # The original class contained ")-;", which regex reads as the RANGE from
    # ")" to ";" (i.e. ) * + , - . / 0-9 : ;). The same characters are listed
    # explicitly here so the kept set is unchanged but no longer accidental.
    text = re.sub(r'[^\w\s.,!?\'"()*+\-/:;]', '', text)
    text = re.sub(r'\s+([.,!?;])', r'\1', text)
    text = re.sub(r'([.,!?;])(\S)', r'\1 \2', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text
|
| 55 |
|
|
|
|
| 56 |
def remove_special_characters_2(text):
|
| 57 |
pattern = r"[^a-zA-Z0-9 ]+"
|
| 58 |
text = re.sub(pattern, "", text)
|