Commit f3d44f4
Parent(s): f177806

Further changed the model
app.py CHANGED
@@ -1,30 +1,34 @@
# app.py: AI Detection and Plagiarism Check API

-
import os
import re
import torch
import logging
import tempfile
import numpy as np
-from fastapi import FastAPI, UploadFile, File, HTTPException
from fastapi.responses import JSONResponse
-from transformers import AutoTokenizer, AutoModelForSequenceClassification,
from sentence_transformers import SentenceTransformer
from PyPDF2 import PdfReader
from sklearn.metrics.pairwise import cosine_similarity

# Configuration
-
-
-
-
-
-
-
-
-
-OVERLAP = 1

app = FastAPI(title="Essay Analyzer", version="2.0.0")

@@ -39,32 +43,65 @@ logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Global models
-
-
-
-
-

def load_models():
-
-
    try:
-        logger.info("Initializing

-        # Load
-
-
-
-            num_labels=2,
-            trust_remote_code=True
-        ).eval()

-
-
-

        # Load embedding model
-        embedder = SentenceTransformer(SENTENCE_MODEL)

        logger.info("All models initialized successfully")
        return True
@@ -73,135 +110,341 @@ def load_models():
        logger.error(f"Model initialization failed: {str(e)}")
        return False

-def
-    """Enhanced PDF text extraction"""
    try:
-        with tempfile.NamedTemporaryFile() as tmp:
            tmp.write(file.file.read())
-            tmp.
-
-
-
-
-
-
    except Exception as e:
        logger.error(f"PDF processing error: {str(e)}")
-        raise HTTPException(500, "PDF processing failed")

def calculate_perplexity(text: str) -> float:
-    """
    try:
-
-
-
-
-
-
-
-
-
-
    except Exception as e:
-        logger.error(f"Perplexity error: {str(e)}")
        return 100.0

def analyze_text(text: str) -> dict:
-    """
    try:
-
-
            raise HTTPException(400, "Text too short for accurate analysis")

-        #
-
-
-
-
-
-
-
-
-        for chunk in chunks:
-            inputs = detection_tokenizer(
-                chunk,
-                padding=True,
-                truncation=True,
-                max_length=MAX_SEQ_LENGTH,
-                return_tensors="pt"
-            )
-            with torch.no_grad():
-                outputs = detection_model(**inputs)
-                probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
-                ai_confidences.append(probs[0][1].item())  # AI class probability

-        #
        perplexity = calculate_perplexity(text)
-        perplexity_score = max(0, min(1, (perplexity - 20) / 60))  # 20-80 → 0-1

-        #
-
-
-

-        #
-
-
-        np.fill_diagonal(similarity, 0)
-        plagiarism_score = (similarity > PLAGIARISM_THRESHOLD).mean() * 100

        return {
-            "human_written": round(100 - ai_percentage, 2),
            "ai_generated": round(ai_percentage, 2),
            "plagiarism_risk": round(plagiarism_score, 2),
            "perplexity": round(perplexity, 2),
-            "
        }

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Analysis error: {str(e)}")
-        raise HTTPException(500, "Analysis failed")

@app.on_event("startup")
async def startup():
-    logger.info("Starting
    if not load_models():
        logger.error("Service initialization failed")
        raise RuntimeError("Failed to initialize models")

@app.post("/analyze")
-async def analyze(file: UploadFile = File(...)):
    try:
        if not file.filename.lower().endswith(".pdf"):
            raise HTTPException(400, "Only PDF files accepted")

-
-

    except HTTPException as he:
        raise he
    except Exception as e:
        logger.error(f"Unexpected error: {str(e)}")
-        raise HTTPException(500, "Internal server error")

@app.get("/health")
async def health():
    return {
        "status": "operational",
-        "
-
-
    }

@app.get("/")
async def root():
    return {
        "service": "Essay Analyzer",
        "version": "2.0.0",
-        "
-            "/analyze": "POST - Analyze PDF
            "/health": "GET - Service status"
        }
    }

# app.py: AI Detection and Plagiarism Check API

import os
import re
import torch
import logging
import tempfile
import numpy as np
+from fastapi import FastAPI, UploadFile, File, HTTPException, Form, BackgroundTasks
from fastapi.responses import JSONResponse
+from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from sentence_transformers import SentenceTransformer
from PyPDF2 import PdfReader
from sklearn.metrics.pairwise import cosine_similarity
+import nltk
+import scipy.stats
+from typing import List, Optional
+
+# Download NLTK data
+nltk.download('punkt', quiet=True)

# Configuration
+PRIMARY_DETECTOR = "roberta-base-openai-detector"  # More reliable base model
+SECONDARY_DETECTOR = "Hello-SimpleAI/chatgpt-detector-roberta"  # Current model as backup
+TERTIARY_DETECTOR = "mitchelldehaven/roberta-base-openai-detector-balanced"  # Balanced detector
+PERPLEXITY_MODEL = "gpt2-medium"  # Larger model for better perplexity estimation
+SENTENCE_MODEL = "sentence-transformers/all-mpnet-base-v2"  # Upgraded sentence embeddings
+BATCH_SIZE = 8
+MAX_TEXT_LENGTH = 10000  # Increased for better analysis
+CHUNK_SIZE = 5  # Sentences per chunk
+OVERLAP = 2  # Increased overlap for better continuity

app = FastAPI(title="Essay Analyzer", version="2.0.0")

...
logger = logging.getLogger(__name__)

# Global models
+models = {
+    "primary": None,
+    "secondary": None,
+    "tertiary": None,
+    "perplexity": None,
+    "embedder": None
+}

def load_models():
+    """Load and initialize all models with optimized settings"""
    try:
+        logger.info("Initializing ensemble models...")
+
+        # Primary detector
+        models["primary"] = pipeline(
+            "text-classification",
+            model=PRIMARY_DETECTOR,
+            tokenizer=PRIMARY_DETECTOR,
+            device=0 if torch.cuda.is_available() else -1,
+            top_k=None  # Return all classes
+        )
+
+        # Secondary detector
+        models["secondary"] = pipeline(
+            "text-classification",
+            model=SECONDARY_DETECTOR,
+            tokenizer=SECONDARY_DETECTOR,
+            device=0 if torch.cuda.is_available() else -1,
+            top_k=None
+        )
+
+        # Tertiary detector
+        models["tertiary"] = pipeline(
+            "text-classification",
+            model=TERTIARY_DETECTOR,
+            tokenizer=TERTIARY_DETECTOR,
+            device=0 if torch.cuda.is_available() else -1,
+            top_k=None
+        )

+        # Load perplexity model with FP16 optimization if available
+        from transformers import GPT2LMHeadModel, GPT2TokenizerFast
+        perp_tokenizer = GPT2TokenizerFast.from_pretrained(PERPLEXITY_MODEL)
+        perp_model = GPT2LMHeadModel.from_pretrained(PERPLEXITY_MODEL)

+        if torch.cuda.is_available():
+            perp_model = perp_model.half().cuda()  # Use FP16 on GPU
+        else:
+            perp_model = perp_model.eval()
+
+        models["perplexity"] = {
+            "model": perp_model,
+            "tokenizer": perp_tokenizer
+        }

        # Load embedding model
+        models["embedder"] = SentenceTransformer(SENTENCE_MODEL)
+        if torch.cuda.is_available():
+            models["embedder"].to(torch.device('cuda'))

        logger.info("All models initialized successfully")
        return True

...

        logger.error(f"Model initialization failed: {str(e)}")
        return False

+def extract_text_from_pdf(file: UploadFile) -> str:
+    """Enhanced PDF text extraction with error handling"""
    try:
+        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
            tmp.write(file.file.read())
+            tmp_path = tmp.name
+
+        reader = PdfReader(tmp_path)
+        text = []
+        for page in reader.pages:
+            page_text = page.extract_text() or ""
+            # Clean up text formatting
+            page_text = re.sub(r'\s+', ' ', page_text)
+            text.append(page_text.strip())
+
+        os.unlink(tmp_path)  # Clean up temp file
+        complete_text = " ".join(text).strip()
+
+        # Remove excessive whitespace and normalize
+        complete_text = re.sub(r'\s+', ' ', complete_text)
+
+        return complete_text[:MAX_TEXT_LENGTH]
    except Exception as e:
        logger.error(f"PDF processing error: {str(e)}")
+        raise HTTPException(500, "PDF processing failed: " + str(e))
+
+def get_segmented_texts(text: str) -> List[str]:
+    """Create multiple segmentations for robust analysis"""
+    sentences = nltk.sent_tokenize(text)
+
+    # Create segments of different sizes for analysis
+    segments = []
+
+    # Full text (if under limit)
+    if len(text) <= 1024:
+        segments.append(text)
+
+    # Regular chunks with overlap
+    for i in range(0, len(sentences), CHUNK_SIZE - OVERLAP):
+        chunk = ' '.join(sentences[i:i+CHUNK_SIZE])
+        if len(chunk) >= 100:  # Minimum meaningful length
+            segments.append(chunk)
+
+    # Paragraph-based segments (using double newlines as separators)
+    paragraphs = re.split(r'\n\s*\n', text)
+    for para in paragraphs:
+        clean_para = para.strip()
+        if len(clean_para) >= 200:  # Longer paragraph threshold
+            segments.append(clean_para)
+
+    return segments

def calculate_perplexity(text: str) -> float:
+    """Advanced perplexity calculation with sliding window"""
+    perp_model = models["perplexity"]["model"]
+    perp_tokenizer = models["perplexity"]["tokenizer"]
+
    try:
+        # Break into smaller chunks for accurate perplexity
+        sentences = nltk.sent_tokenize(text)
+        if not sentences:
+            return 100.0
+
+        # Process in sliding windows of 5 sentences
+        window_size = 5
+        stride = 2
+        perplexities = []
+
+        for i in range(0, max(1, len(sentences) - window_size + 1), stride):
+            window_text = " ".join(sentences[i:i+window_size])
+            if len(window_text) < 10:
+                continue
+
+            encodings = perp_tokenizer(window_text, return_tensors="pt", truncation=True, max_length=512)
+            if torch.cuda.is_available():
+                encodings = {k: v.cuda() for k, v in encodings.items()}
+
+            with torch.no_grad():
+                outputs = perp_model(**encodings, labels=encodings["input_ids"])
+                neg_log_likelihood = outputs.loss
+                perplexity = torch.exp(neg_log_likelihood).item()
+                perplexities.append(perplexity)
+
+        # Filter out extreme outliers
+        if perplexities:
+            filtered_perps = [p for p in perplexities if p < 1000]  # Remove extreme values
+            if filtered_perps:
+                return np.median(filtered_perps)  # Median is more robust than mean
+
+        return 100.0  # Default fallback
    except Exception as e:
+        logger.error(f"Perplexity calculation error: {str(e)}")
        return 100.0

+def detect_linguistic_patterns(text: str) -> dict:
+    """Detect linguistic patterns that differentiate human vs AI text"""
+    try:
+        sentences = nltk.sent_tokenize(text)
+        words = re.findall(r'\b\w+\b', text.lower())
+
+        # Analyze sentence length distribution (AI often has more uniform length)
+        sent_lengths = [len(re.findall(r'\b\w+\b', s)) for s in sentences]
+        sent_length_std = np.std(sent_lengths) if sent_lengths else 0
+
+        # Analyze lexical diversity (type-token ratio)
+        unique_words = len(set(words))
+        total_words = len(words)
+        lexical_diversity = unique_words / total_words if total_words > 0 else 0
+
+        # Sentence starter variety (AI often has repetitive starters)
+        starters = [s.split()[0].lower() if s.split() else "" for s in sentences]
+        starter_ratio = len(set(starters)) / len(starters) if starters else 0
+
+        # Paragraph length analysis
+        paragraphs = re.split(r'\n\s*\n', text)
+        para_lengths = [len(p.split()) for p in paragraphs if p.strip()]
+        para_length_std = np.std(para_lengths) if para_lengths else 0
+
+        return {
+            "sentence_length_std": sent_length_std,
+            "lexical_diversity": lexical_diversity,
+            "starter_diversity": starter_ratio,
+            "paragraph_length_std": para_length_std
+        }
+    except Exception as e:
+        logger.error(f"Linguistic pattern detection error: {str(e)}")
+        return {
+            "sentence_length_std": 0,
+            "lexical_diversity": 0,
+            "starter_diversity": 0,
+            "paragraph_length_std": 0
+        }
+
def analyze_text(text: str) -> dict:
+    """Enhanced analysis pipeline using ensemble methods"""
    try:
+        # Initial text preprocessing
+        text = re.sub(r'\s+', ' ', text).strip()
+        if len(text) < 150:
            raise HTTPException(400, "Text too short for accurate analysis")

+        # Get text segments for analysis
+        segments = get_segmented_texts(text)
+        if not segments:
+            raise HTTPException(400, "Could not extract meaningful text segments")
+
+        # AI Detection using ensemble approach
+        primary_scores = []
+        secondary_scores = []
+        tertiary_scores = []

+        # Process each segment with both models
+        for segment in segments:
+            # Skip segments that are too short
+            if len(segment) < 100:
+                continue
+
+            # Primary model prediction
+            primary_result = models["primary"](segment)
+            ai_score = next((item["score"] for item in primary_result[0] if item["label"] == "fake"), 0.5)
+            primary_scores.append(ai_score)
+
+            # Secondary model prediction
+            secondary_result = models["secondary"](segment)
+            ai_score = next((item["score"] for item in secondary_result[0] if item["label"] == "AI-generated"), 0.5)
+            secondary_scores.append(ai_score)
+
+            # Tertiary model prediction
+            tertiary_result = models["tertiary"](segment)
+            ai_score = next((item["score"] for item in tertiary_result[0] if item["label"] in ["fake", "AI-generated"]), 0.5)
+            tertiary_scores.append(ai_score)
+
+        # Calculate perplexity score
        perplexity = calculate_perplexity(text)

+        # Normalized perplexity score (lower perplexity = more likely AI-generated)
+        # GPT-2 typically shows perplexity of 20-60 for AI text and 50-120 for human text
+        perplexity_score = max(0, min(1, (120 - perplexity) / 100))
+
+        # Linguistic feature analysis
+        linguistic_features = detect_linguistic_patterns(text)
+
+        # Calculate linguistic pattern score (higher = more likely AI)
+        # AI text tends to have lower std devs and higher uniformity
+        linguistic_score = 0.0
+        if linguistic_features["sentence_length_std"] < 3.5:
+            linguistic_score += 0.2
+        if linguistic_features["lexical_diversity"] < 0.6:
+            linguistic_score += 0.2
+        if linguistic_features["starter_diversity"] < 0.7:
+            linguistic_score += 0.2
+        if linguistic_features["paragraph_length_std"] < 20:
+            linguistic_score += 0.2
+
+        # Ensemble scoring with weighted average
+        # Balance the different models based on empirical performance
+        weights = {
+            "primary": 0.35,
+            "secondary": 0.25,
+            "tertiary": 0.15,
+            "perplexity": 0.15,
+            "linguistic": 0.10
+        }
+
+        # Use percentiles to get more stable scores from each model
+        primary_confidence = np.percentile(primary_scores, 75) if primary_scores else 0.5
+        secondary_confidence = np.percentile(secondary_scores, 75) if secondary_scores else 0.5
+        tertiary_confidence = np.percentile(tertiary_scores, 75) if tertiary_scores else 0.5
+
+        # Calculate final weighted score
+        final_score = (
+            primary_confidence * weights["primary"] +
+            secondary_confidence * weights["secondary"] +
+            tertiary_confidence * weights["tertiary"] +
+            perplexity_score * weights["perplexity"] +
+            linguistic_score * weights["linguistic"]
+        )

+        # Convert to percentage with calibration factor
+        # Apply sigmoid curve to get smoother probability distribution
+        ai_percentage = 100 / (1 + np.exp(-10 * (final_score - 0.5)))

+        # Optional plagiarism check
+        sentences = nltk.sent_tokenize(text)
+        if len(sentences) > 5:
+            embeddings = models["embedder"].encode(sentences, batch_size=BATCH_SIZE)
+            similarity = cosine_similarity(embeddings)
+            np.fill_diagonal(similarity, 0)
+            plagiarism_score = (similarity > 0.85).mean() * 100  # Higher threshold for accuracy
+        else:
+            plagiarism_score = 0
+
+        # Return just the AI percentage for the simplified API
+        return {
+            "ai_generated": round(ai_percentage, 2)
+        }
+
+        # Alternatively, return detailed analytics for debugging
+        """
        return {
            "ai_generated": round(ai_percentage, 2),
+            "human_written": round(100 - ai_percentage, 2),
            "plagiarism_risk": round(plagiarism_score, 2),
            "perplexity": round(perplexity, 2),
+            "model_confidences": {
+                "primary": round(primary_confidence * 100, 2),
+                "secondary": round(secondary_confidence * 100, 2),
+                "tertiary": round(tertiary_confidence * 100, 2)
+            },
+            "linguistic_analysis": {
+                "sentence_length_variation": round(linguistic_features["sentence_length_std"], 2),
+                "lexical_diversity": round(linguistic_features["lexical_diversity"], 2),
+                "sentence_starter_variety": round(linguistic_features["starter_diversity"], 2)
+            },
+            "segments_analyzed": len(segments)
        }
+        """

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Analysis error: {str(e)}")
+        raise HTTPException(500, f"Analysis failed: {str(e)}")

@app.on_event("startup")
async def startup():
+    logger.info("Starting enhanced AI detection service...")
    if not load_models():
        logger.error("Service initialization failed")
        raise RuntimeError("Failed to initialize models")

@app.post("/analyze")
+async def analyze(file: UploadFile = File(...), background_tasks: BackgroundTasks = None):
+    """Analyze uploaded PDF for AI content detection"""
    try:
+        # Validate file type
        if not file.filename.lower().endswith(".pdf"):
            raise HTTPException(400, "Only PDF files accepted")

+        # Extract and analyze text
+        text = extract_text_from_pdf(file)
+        if not text or len(text) < 100:
+            return JSONResponse({
+                "ai_generated": 0,
+                "error": "Insufficient text extracted from PDF"
+            })
+
+        # Run analysis
+        results = analyze_text(text)
+        return JSONResponse(results)

    except HTTPException as he:
        raise he
    except Exception as e:
        logger.error(f"Unexpected error: {str(e)}")
+        raise HTTPException(500, f"Internal server error: {str(e)}")
+
+@app.post("/analyze/text")
+async def analyze_text_endpoint(text: str = Form(...)):
+    """Analyze raw text for AI content detection"""
+    try:
+        if not text or len(text) < 100:
+            raise HTTPException(400, "Text too short for analysis (minimum 100 characters)")
+
+        results = analyze_text(text)
+        return JSONResponse(results)
+
+    except HTTPException as he:
+        raise he
+    except Exception as e:
+        logger.error(f"Unexpected text analysis error: {str(e)}")
+        raise HTTPException(500, f"Analysis error: {str(e)}")

@app.get("/health")
async def health():
+    """Service health check endpoint"""
    return {
        "status": "operational",
+        "models": {
+            "primary": PRIMARY_DETECTOR,
+            "secondary": SECONDARY_DETECTOR,
+            "tertiary": TERTIARY_DETECTOR
+        },
+        "version": "3.0.0"
    }

@app.get("/")
async def root():
+    """API documentation endpoint"""
    return {
        "service": "Essay Analyzer",
        "version": "2.0.0",
+        "endpoints": {
+            "/analyze": "POST - Analyze PDF for AI detection",
+            "/analyze/text": "POST - Analyze raw text for AI detection",
            "/health": "GET - Service status"
        }
    }
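
To see how the new ensemble scoring behaves end to end, here is a minimal standalone sketch of the same weighted-average-plus-sigmoid calibration that analyze_text performs; the per-model confidences below are made-up illustrative inputs, not outputs from this commit:

import numpy as np

# Hypothetical confidences for one essay (illustrative values only).
primary_confidence = 0.82    # roberta-base-openai-detector
secondary_confidence = 0.74  # Hello-SimpleAI/chatgpt-detector-roberta
tertiary_confidence = 0.61   # balanced detector
perplexity = 43.0            # GPT-2-medium perplexity of the text
linguistic_score = 0.4       # two of the four uniformity checks fired

weights = {"primary": 0.35, "secondary": 0.25, "tertiary": 0.15,
           "perplexity": 0.15, "linguistic": 0.10}

# Same normalization as analyze_text: low perplexity leans toward "AI".
perplexity_score = max(0, min(1, (120 - perplexity) / 100))

final_score = (primary_confidence * weights["primary"]
               + secondary_confidence * weights["secondary"]
               + tertiary_confidence * weights["tertiary"]
               + perplexity_score * weights["perplexity"]
               + linguistic_score * weights["linguistic"])

# Sigmoid calibration from the commit: steepness 10, centered at 0.5.
ai_percentage = 100 / (1 + np.exp(-10 * (final_score - 0.5)))
print(round(final_score, 3), round(ai_percentage, 2))  # 0.719 -> 89.93

The steepness factor of 10 makes the curve saturate quickly: a weighted score of 0.6 already maps to roughly 73% "AI-generated", and 0.7 to roughly 88%.

Assuming the app is served the usual FastAPI way (for example uvicorn app:app --port 8000; the host and port are assumptions, not part of the commit), the new /analyze/text endpoint takes a form field, so a quick client check might look like:

import requests  # third-party HTTP client, assumed to be installed

# Needs at least 100 characters or the endpoint raises HTTP 400.
essay = "This essay discusses the role of perplexity in AI text detection. " * 3

# Hypothetical local deployment; adjust the URL to your setup.
resp = requests.post("http://localhost:8000/analyze/text", data={"text": essay})
print(resp.status_code, resp.json())  # e.g. 200 {"ai_generated": 42.13}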