taha092 committed
Commit 31d085e · verified · 1 Parent(s): fd8896a

Update app.py

Files changed (1):
  1. app.py +112 -46
app.py CHANGED
@@ -9,13 +9,17 @@ import random
 import re
 
 # ----------------------
-# Paraphrasing Model Setup (Pegasus)
+# Paraphrasing Model Setup (Pegasus + T5)
 # ----------------------
-PARAPHRASE_MODEL_NAME = "tuner007/pegasus_paraphrase"
-paraphrase_tokenizer = AutoTokenizer.from_pretrained(PARAPHRASE_MODEL_NAME)
-paraphrase_model = AutoModelForSeq2SeqLM.from_pretrained(PARAPHRASE_MODEL_NAME)
+PEGASUS_MODEL_NAME = "tuner007/pegasus_paraphrase"
+T5_MODEL_NAME = "Vamsi/T5_Paraphrase_Paws"
+pegasus_tokenizer = AutoTokenizer.from_pretrained(PEGASUS_MODEL_NAME)
+pegasus_model = AutoModelForSeq2SeqLM.from_pretrained(PEGASUS_MODEL_NAME)
+t5_tokenizer = AutoTokenizer.from_pretrained(T5_MODEL_NAME)
+t5_model = AutoModelForSeq2SeqLM.from_pretrained(T5_MODEL_NAME)
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-paraphrase_model = paraphrase_model.to(device)
+pegasus_model = pegasus_model.to(device)
+t5_model = t5_model.to(device)
 
 # ----------------------
 # Semantic Similarity Model
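Note: this hunk relies on imports already at the top of app.py that the diff does not show. A minimal sketch of the presumed import block (an assumption, not part of the commit):

    # Presumed imports at the top of app.py (not shown in this diff).
    import torch
    from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

Loading both checkpoints eagerly at import time increases startup time and memory over the previous single-Pegasus setup.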
@@ -31,7 +35,7 @@ ai_detector = pipeline("text-classification", model=AI_DETECTOR_MODEL, device=0
 # ----------------------
 # Prompt Variations for Humanization
 # ----------------------
-PROMPT_VARIANTS = [
+PEGASUS_PROMPTS = [
     "Paraphrase this naturally:",
     "Rewrite as if explaining to a friend:",
     "Make this sound like a real conversation:",
@@ -41,6 +45,12 @@ PROMPT_VARIANTS = [
     "Rewrite in a friendly, informal tone:",
     "Paraphrase in a way a student would say it:",
 ]
+T5_PROMPTS = [
+    "Paraphrase the following text in a formal, academic tone:",
+    "Paraphrase the following text in a casual, conversational tone:",
+    "Paraphrase the following text in a friendly, approachable tone:",
+    "Paraphrase the following text to bypass AI detectors and sound as human as possible:",
+]
 
 # ----------------------
 # Sentence Splitter
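Note on the new T5_PROMPTS: to the best of my knowledge, the Vamsi/T5_Paraphrase_Paws model card recommends a fixed "paraphrase: ... </s>" input format, so free-form instructions like those above are likely treated as more text to paraphrase rather than obeyed as commands. For comparison, a sketch of the documented format (the sample sentence is illustrative):

    # Input format suggested on the Vamsi/T5_Paraphrase_Paws model card
    # (an assumption based on that card, not on this commit).
    sentence = "This is something which I cannot understand at all."
    t5_input = "paraphrase: " + sentence + " </s>"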
@@ -50,7 +60,7 @@ def split_sentences(text):
     return [s for s in sentences if s]
 
 # ----------------------
-# Light Post-Processing
+# Aggressive Post-Processing
 # ----------------------
 def postprocess_text(text):
     contractions = {
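For context, the diff shows only the closing line of split_sentences. A plausible reconstruction of the whole helper (the regex is an assumption; the actual pattern in app.py is outside this diff):

    import re

    # Hypothetical body, inferred from the visible return line.
    def split_sentences(text):
        sentences = re.split(r'(?<=[.!?])\s+', text.strip())
        return [s for s in sentences if s]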
@@ -64,40 +74,99 @@ def postprocess_text(text):
         "at the end of the day", "to be honest", "as a matter of fact", "for what it's worth",
         "in a nutshell", "the bottom line is", "all things considered"
     ]
+    transitions = [
+        "Interestingly,", "In fact,", "To be clear,", "As a result,", "For example,", "On the other hand,", "In other words,"
+    ]
     if random.random() < 0.3:
         text += " " + random.choice(idioms) + "."
+    if random.random() < 0.3:
+        text = random.choice(transitions) + " " + text
+    # Randomly lower-case a word to mimic human error
+    if random.random() < 0.2:
+        words = text.split()
+        if len(words) > 3:
+            idx = random.randint(1, len(words)-2)
+            words[idx] = words[idx].lower()
+            text = ' '.join(words)
     return text
 
 # ----------------------
-# Sentence-level Paraphrasing with Prompt Variation
+# Multi-Model, Multi-Pass Paraphrasing
 # ----------------------
-def paraphrase_sentence(sentence, tone):
-    prompt = random.choice(PROMPT_VARIANTS)
-    if tone != "Stealth":
-        prompt = f"{prompt} ({tone} tone):"
+def pegasus_paraphrase(sentence):
+    prompt = random.choice(PEGASUS_PROMPTS)
     full_prompt = f"{prompt} {sentence}"
-    batch = paraphrase_tokenizer([full_prompt], truncation=True, padding='longest', max_length=60, return_tensors="pt").to(device)
-    outputs = paraphrase_model.generate(
+    batch = pegasus_tokenizer([full_prompt], truncation=True, padding='longest', max_length=60, return_tensors="pt").to(device)
+    outputs = pegasus_model.generate(
         **batch,
         max_length=60,
         num_beams=5,
         num_return_sequences=1,
         temperature=1.0
     )
-    tgt_text = paraphrase_tokenizer.batch_decode(outputs, skip_special_tokens=True)
+    tgt_text = pegasus_tokenizer.batch_decode(outputs, skip_special_tokens=True)
     return tgt_text[0] if tgt_text else sentence
 
+def t5_paraphrase(sentence):
+    prompt = random.choice(T5_PROMPTS) + " " + sentence
+    input_ids = t5_tokenizer.encode(prompt, return_tensors="pt", max_length=256, truncation=True).to(device)
+    outputs = t5_model.generate(
+        input_ids,
+        do_sample=True,
+        top_k=120,
+        top_p=0.95,
+        temperature=0.7,
+        repetition_penalty=1.2,
+        max_length=256,
+        num_return_sequences=1
+    )
+    paraphrased = t5_tokenizer.decode(outputs[0], skip_special_tokens=True)
+    return paraphrased
+
 # ----------------------
-# Main Paraphrasing Function
+# Feedback Loop with AI Detector
 # ----------------------
-def paraphrase(text, tone):
+def check_ai_score(text):
+    try:
+        result = ai_detector(text)
+        for r in result:
+            if r['label'] in ['LABEL_1', 'Fake']:
+                return r['score'], None
+            elif r['label'] in ['LABEL_0', 'Real']:
+                return 1.0 - r['score'], None
+        return 0.5, None
+    except Exception as e:
+        return None, f"AI detection error: {str(e)}"
+
+# ----------------------
+# Main Humanizer Pipeline
+# ----------------------
+def humanize_pipeline(text, tone, max_feedback_loops=2):
     sentences = split_sentences(text)
     paraphrased = []
     for sent in sentences:
-        rewritten = paraphrase_sentence(sent, tone)
-        paraphrased.append(rewritten)
+        # First pass: Pegasus
+        peg = pegasus_paraphrase(sent)
+        # Second pass: T5
+        t5 = t5_paraphrase(peg)
+        paraphrased.append(t5)
     joined = ' '.join(paraphrased)
-    return postprocess_text(joined)
+    processed = postprocess_text(joined)
+    # Feedback loop: if still flagged as AI, re-paraphrase flagged sentences
+    for _ in range(max_feedback_loops):
+        ai_prob, _ = check_ai_score(processed)
+        if ai_prob is not None and ai_prob < 0.5:
+            break  # Considered human
+        # Re-paraphrase all sentences again
+        sentences = split_sentences(processed)
+        paraphrased = []
+        for sent in sentences:
+            peg = pegasus_paraphrase(sent)
+            t5 = t5_paraphrase(peg)
+            paraphrased.append(t5)
+        joined = ' '.join(paraphrased)
+        processed = postprocess_text(joined)
+    return processed
 
 # ----------------------
 # Semantic Similarity Function
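Two notes on this hunk. First, humanize_pipeline still accepts tone but never uses it; the old Stealth-tone branch disappeared along with paraphrase_sentence. Second, temperature=1.0 in pegasus_paraphrase has no effect, since generate() ignores temperature under pure beam search (do_sample defaults to False). A minimal usage sketch of the new two-pass flow (the sample text is illustrative):

    # Each feedback-loop iteration reruns Pegasus and T5 over every
    # sentence, so the worst case is (1 + max_feedback_loops) full passes.
    sample = "Artificial intelligence is transforming how software is built."
    result = humanize_pipeline(sample, tone="Stealth", max_feedback_loops=2)
    print(result)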
@@ -108,22 +177,6 @@ def semantic_similarity(text1, text2):
     sim = util.pytorch_cos_sim(emb1, emb2).item()
     return sim
 
-# ----------------------
-# Local AI Detection Function
-# ----------------------
-def check_ai_score(text):
-    try:
-        result = ai_detector(text)
-        for r in result:
-            # LABEL_1 = AI, LABEL_0 = Human
-            if r['label'] in ['LABEL_1', 'Fake']:
-                return r['score'], None
-            elif r['label'] in ['LABEL_0', 'Real']:
-                return 1.0 - r['score'], None
-        return 0.5, None  # fallback
-    except Exception as e:
-        return None, f"AI detection error: {str(e)}"
-
 # ----------------------
 # Humanization Score & Rating
 # ----------------------
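Note: check_ai_score is moved, not removed; it now sits above humanize_pipeline (earlier hunk) so the feedback loop can call it. For readers of the moved code, a sketch of how it folds the detector's two label conventions into a single P(AI) value (the output below is hypothetical; real labels depend on AI_DETECTOR_MODEL):

    # Hypothetical detector output; real labels depend on AI_DETECTOR_MODEL.
    result = [{"label": "Real", "score": 0.91}]
    # check_ai_score maps both conventions onto one probability:
    #   'LABEL_1' / 'Fake' -> P(AI) = score
    #   'LABEL_0' / 'Real' -> P(AI) = 1.0 - score  (here: 0.09)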
@@ -149,22 +202,35 @@ def process(text, tone):
     if pre_ai_prob is None:
         return "", f"AI Detection Error: {pre_err}", 0.0, "", 0.0, ""
     try:
-        paraphrased = paraphrase(text, tone)
+        # Generate 3 versions for user choice
+        outputs = [humanize_pipeline(text, tone) for _ in range(3)]
     except Exception as e:
         return f"[Paraphrasing error: {str(e)}]", "", 0.0, "", 0.0, ""
-    post_ai_prob, post_err = check_ai_score(paraphrased)
-    if post_ai_prob is None:
-        return paraphrased, f"AI Detection Error: {post_err}", 0.0, "", 0.0, ""
-    sim = semantic_similarity(text, paraphrased)
-    score = humanization_score(sim, post_ai_prob)
-    rating = humanization_rating(score)
-    ai_score_str = f"Pre: {100*(1-pre_ai_prob):.1f}% human | Post: {100*(1-post_ai_prob):.1f}% human"
+    # Pick the most human-like version (lowest ai_prob)
+    best = None
+    best_score = -1
+    best_ai_prob = 1.0
+    for out in outputs:
+        post_ai_prob, _ = check_ai_score(out)
+        sim = semantic_similarity(text, out)
+        score = humanization_score(sim, post_ai_prob if post_ai_prob is not None else 1.0)
+        if post_ai_prob is not None and post_ai_prob < best_ai_prob:
+            best = out
+            best_score = score
+            best_ai_prob = post_ai_prob
+    if best is None:
+        best = outputs[0]
+        best_score = 0.0
+        best_ai_prob = 1.0
+    sim = semantic_similarity(text, best)
+    rating = humanization_rating(best_score)
+    ai_score_str = f"Pre: {100*(1-pre_ai_prob):.1f}% human | Post: {100*(1-best_ai_prob):.1f}% human"
     return (
-        paraphrased,
+        best,
         ai_score_str,
         sim,
         rating,
-        score * 100,
+        best_score * 100,
         ""
     )
 
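Finally, process() returns a 6-tuple, which suggests six output components in a Gradio interface further down in app.py (outside this diff). A hypothetical wiring, with labels and tone choices invented for illustration ("Stealth" is the only tone value visible anywhere in the diff):

    import gradio as gr

    # Hypothetical UI wiring for process(); app.py's real interface is
    # not part of this diff, so every label and choice here is a guess.
    demo = gr.Interface(
        fn=process,
        inputs=[
            gr.Textbox(label="Input text"),
            gr.Dropdown(["Stealth", "Formal", "Casual"], label="Tone"),
        ],
        outputs=[
            gr.Textbox(label="Humanized text"),             # best
            gr.Textbox(label="AI detection (pre/post)"),    # ai_score_str
            gr.Number(label="Semantic similarity"),         # sim
            gr.Textbox(label="Rating"),                     # rating
            gr.Number(label="Humanization score (0-100)"),  # best_score * 100
            gr.Textbox(label="Error"),                      # ""
        ],
    )
    demo.launch()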