Essay-Grader committed on
Commit f3d44f4 · 1 Parent(s): f177806

Further changed the model

Files changed (1):
  1. app.py +346 -103

app.py CHANGED
@@ -1,30 +1,34 @@
  # app.py: AI Detection and Plagiarism Check API

-
  import os
  import re
  import torch
  import logging
  import tempfile
  import numpy as np
- from fastapi import FastAPI, UploadFile, File, HTTPException
  from fastapi.responses import JSONResponse
- from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForCausalLM
  from sentence_transformers import SentenceTransformer
  from PyPDF2 import PdfReader
  from sklearn.metrics.pairwise import cosine_similarity

  # Configuration
- DETECTION_MODEL = "Hello-SimpleAI/chatgpt-detector-roberta"
- PERPLEXITY_MODEL = "gpt2"
- SENTENCE_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
- AI_THRESHOLD = 0.55
- PLAGIARISM_THRESHOLD = 0.75
- MAX_SEQ_LENGTH = 256  # Reduced for better chunk processing
- BATCH_SIZE = 4
- MAX_TEXT_LENGTH = 4096
- CHUNK_SIZE = 3  # Sentences per chunk
- OVERLAP = 1

  app = FastAPI(title="Essay Analyzer", version="2.0.0")

@@ -39,32 +43,65 @@ logging.basicConfig(level=logging.INFO)
  logger = logging.getLogger(__name__)

  # Global models
- detection_model = None
- perplexity_model = None
- detection_tokenizer = None
- perp_tokenizer = None
- embedder = None

  def load_models():
-     global detection_model, perplexity_model, detection_tokenizer, perp_tokenizer, embedder
-
      try:
-         logger.info("Initializing optimized models...")

-         # Load detection model
-         detection_tokenizer = AutoTokenizer.from_pretrained(DETECTION_MODEL)
-         detection_model = AutoModelForSequenceClassification.from_pretrained(
-             DETECTION_MODEL,
-             num_labels=2,
-             trust_remote_code=True
-         ).eval()

-         # Load perplexity model
-         perp_tokenizer = AutoTokenizer.from_pretrained(PERPLEXITY_MODEL)
-         perplexity_model = AutoModelForCausalLM.from_pretrained(PERPLEXITY_MODEL).eval()

          # Load embedding model
-         embedder = SentenceTransformer(SENTENCE_MODEL)

          logger.info("All models initialized successfully")
          return True
@@ -73,135 +110,341 @@ def load_models():
          logger.error(f"Model initialization failed: {str(e)}")
          return False

- def process_pdf(file: UploadFile) -> str:
-     """Enhanced PDF text extraction"""
      try:
-         with tempfile.NamedTemporaryFile() as tmp:
              tmp.write(file.file.read())
-             tmp.seek(0)
-             reader = PdfReader(tmp.name)
-             text = []
-             for page in reader.pages:
-                 page_text = page.extract_text() or ""
-                 text.append(page_text.strip())
-             return " ".join(text)[:MAX_TEXT_LENGTH].strip()
      except Exception as e:
          logger.error(f"PDF processing error: {str(e)}")
-         raise HTTPException(500, "PDF processing failed")

  def calculate_perplexity(text: str) -> float:
-     """Robust perplexity calculation"""
      try:
-         inputs = perp_tokenizer(
-             text,
-             return_tensors="pt",
-             max_length=MAX_SEQ_LENGTH,
-             truncation=True,
-             padding=True
-         )
-         with torch.no_grad():
-             outputs = perplexity_model(**inputs, labels=inputs["input_ids"])
-             return torch.exp(outputs.loss).item()
      except Exception as e:
-         logger.error(f"Perplexity error: {str(e)}")
          return 100.0

  def analyze_text(text: str) -> dict:
-     """Chunk-based analysis pipeline"""
      try:
-         text = re.sub(r'\s+', ' ', text).strip()[:MAX_TEXT_LENGTH]
-         if len(text) < 500:
              raise HTTPException(400, "Text too short for accurate analysis")

-         # Split text into overlapping chunks
-         sentences = re.split(r'(?<=[.!?])\s+', text)
-         chunks = []
-         for i in range(0, len(sentences), CHUNK_SIZE - OVERLAP):
-             chunk = ' '.join(sentences[i:i+CHUNK_SIZE])
-             chunks.append(chunk)
-
-         # AI Detection
-         ai_confidences = []
-         for chunk in chunks:
-             inputs = detection_tokenizer(
-                 chunk,
-                 padding=True,
-                 truncation=True,
-                 max_length=MAX_SEQ_LENGTH,
-                 return_tensors="pt"
-             )
-             with torch.no_grad():
-                 outputs = detection_model(**inputs)
-                 probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
-                 ai_confidences.append(probs[0][1].item())  # AI class probability

-         # Perplexity analysis
          perplexity = calculate_perplexity(text)
-         perplexity_score = max(0, min(1, (perplexity - 20) / 60))  # 20-80 → 0-1

-         # Combined scoring
-         model_confidence = np.percentile(ai_confidences, 75)  # Use 75th percentile
-         final_score = (model_confidence * 0.8) + (perplexity_score * 0.2)
-         ai_percentage = min(100, max(0, final_score * 125))  # Amplify scores

-         # Plagiarism check
-         embeddings = embedder.encode(sentences, batch_size=BATCH_SIZE)
-         similarity = cosine_similarity(embeddings)
-         np.fill_diagonal(similarity, 0)
-         plagiarism_score = (similarity > PLAGIARISM_THRESHOLD).mean() * 100

          return {
-             "human_written": round(100 - ai_percentage, 2),
              "ai_generated": round(ai_percentage, 2),
              "plagiarism_risk": round(plagiarism_score, 2),
              "perplexity": round(perplexity, 2),
-             "chunks_analyzed": len(chunks)
          }

      except HTTPException:
          raise
      except Exception as e:
          logger.error(f"Analysis error: {str(e)}")
-         raise HTTPException(500, "Analysis failed")

  @app.on_event("startup")
  async def startup():
-     logger.info("Starting optimized service...")
      if not load_models():
          logger.error("Service initialization failed")
          raise RuntimeError("Failed to initialize models")

  @app.post("/analyze")
- async def analyze(file: UploadFile = File(...)):
      try:
          if not file.filename.lower().endswith(".pdf"):
              raise HTTPException(400, "Only PDF files accepted")

-         text = process_pdf(file)
-         return JSONResponse(analyze_text(text))

      except HTTPException as he:
          raise he
      except Exception as e:
          logger.error(f"Unexpected error: {str(e)}")
-         raise HTTPException(500, "Internal server error")

  @app.get("/health")
  async def health():
      return {
          "status": "operational",
-         "model": DETECTION_MODEL,
-         "chunk_size": CHUNK_SIZE,
-         "max_text_length": MAX_TEXT_LENGTH
      }

  @app.get("/")
  async def root():
      return {
          "service": "Essay Analyzer",
          "version": "2.0.0",
-         "documentation": {
-             "/analyze": "POST - Analyze PDF (AI detection + plagiarism check)",
              "/health": "GET - Service status"
          }
      }
  # app.py: AI Detection and Plagiarism Check API

  import os
  import re
  import torch
  import logging
  import tempfile
  import numpy as np
+ from fastapi import FastAPI, UploadFile, File, HTTPException, Form, BackgroundTasks
  from fastapi.responses import JSONResponse
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
  from sentence_transformers import SentenceTransformer
  from PyPDF2 import PdfReader
  from sklearn.metrics.pairwise import cosine_similarity
+ import nltk
+ import scipy.stats
+ from typing import List, Optional
+
+ # Download NLTK data
+ nltk.download('punkt', quiet=True)

  # Configuration
+ PRIMARY_DETECTOR = "roberta-base-openai-detector"  # More reliable base model
+ SECONDARY_DETECTOR = "Hello-SimpleAI/chatgpt-detector-roberta"  # Current model as backup
+ TERTIARY_DETECTOR = "mitchelldehaven/roberta-base-openai-detector-balanced"  # Balanced detector
+ PERPLEXITY_MODEL = "gpt2-medium"  # Larger model for better perplexity estimation
+ SENTENCE_MODEL = "sentence-transformers/all-mpnet-base-v2"  # Upgraded sentence embeddings
+ BATCH_SIZE = 8
+ MAX_TEXT_LENGTH = 10000  # Increased for better analysis
+ CHUNK_SIZE = 5  # Sentences per chunk
+ OVERLAP = 2  # Increased overlap for better continuity

  app = FastAPI(title="Essay Analyzer", version="2.0.0")

  logger = logging.getLogger(__name__)

  # Global models
+ models = {
+     "primary": None,
+     "secondary": None,
+     "tertiary": None,
+     "perplexity": None,
+     "embedder": None
+ }

  def load_models():
+     """Load and initialize all models with optimized settings"""
      try:
+         logger.info("Initializing ensemble models...")
+
+         # Primary detector
+         models["primary"] = pipeline(
+             "text-classification",
+             model=PRIMARY_DETECTOR,
+             tokenizer=PRIMARY_DETECTOR,
+             device=0 if torch.cuda.is_available() else -1,
+             top_k=None  # Return all classes
+         )
+
+         # Secondary detector
+         models["secondary"] = pipeline(
+             "text-classification",
+             model=SECONDARY_DETECTOR,
+             tokenizer=SECONDARY_DETECTOR,
+             device=0 if torch.cuda.is_available() else -1,
+             top_k=None
+         )
+
+         # Tertiary detector
+         models["tertiary"] = pipeline(
+             "text-classification",
+             model=TERTIARY_DETECTOR,
+             tokenizer=TERTIARY_DETECTOR,
+             device=0 if torch.cuda.is_available() else -1,
+             top_k=None
+         )

+         # Load perplexity model with FP16 optimization if available
+         from transformers import GPT2LMHeadModel, GPT2TokenizerFast
+         perp_tokenizer = GPT2TokenizerFast.from_pretrained(PERPLEXITY_MODEL)
+         perp_model = GPT2LMHeadModel.from_pretrained(PERPLEXITY_MODEL)

+         if torch.cuda.is_available():
+             perp_model = perp_model.half().cuda()  # Use FP16 on GPU
+         else:
+             perp_model = perp_model.eval()
+
+         models["perplexity"] = {
+             "model": perp_model,
+             "tokenizer": perp_tokenizer
+         }

          # Load embedding model
+         models["embedder"] = SentenceTransformer(SENTENCE_MODEL)
+         if torch.cuda.is_available():
+             models["embedder"].to(torch.device('cuda'))

          logger.info("All models initialized successfully")
          return True

          logger.error(f"Model initialization failed: {str(e)}")
          return False

+ def extract_text_from_pdf(file: UploadFile) -> str:
+     """Enhanced PDF text extraction with error handling"""
      try:
+         with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
              tmp.write(file.file.read())
+             tmp_path = tmp.name
+
+         reader = PdfReader(tmp_path)
+         text = []
+         for page in reader.pages:
+             page_text = page.extract_text() or ""
+             # Clean up text formatting
+             page_text = re.sub(r'\s+', ' ', page_text)
+             text.append(page_text.strip())
+
+         os.unlink(tmp_path)  # Clean up temp file
+         complete_text = " ".join(text).strip()
+
+         # Remove excessive whitespace and normalize
+         complete_text = re.sub(r'\s+', ' ', complete_text)
+
+         return complete_text[:MAX_TEXT_LENGTH]
      except Exception as e:
          logger.error(f"PDF processing error: {str(e)}")
+         raise HTTPException(500, "PDF processing failed: " + str(e))
+
+ def get_segmented_texts(text: str) -> List[str]:
+     """Create multiple segmentations for robust analysis"""
+     sentences = nltk.sent_tokenize(text)
+
+     # Create segments of different sizes for analysis
+     segments = []
+
+     # Full text (if under limit)
+     if len(text) <= 1024:
+         segments.append(text)
+
+     # Regular chunks with overlap
+     for i in range(0, len(sentences), CHUNK_SIZE - OVERLAP):
+         chunk = ' '.join(sentences[i:i+CHUNK_SIZE])
+         if len(chunk) >= 100:  # Minimum meaningful length
+             segments.append(chunk)
+
+     # Paragraph-based segments (using double newlines as separators)
+     paragraphs = re.split(r'\n\s*\n', text)
+     for para in paragraphs:
+         clean_para = para.strip()
+         if len(clean_para) >= 200:  # Longer paragraph threshold
+             segments.append(clean_para)
+
+     return segments
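Note: the chunking loop above advances by CHUNK_SIZE - OVERLAP sentences per step, so with this commit's CHUNK_SIZE = 5 and OVERLAP = 2 consecutive chunks share two sentences. A minimal standalone sketch of that arithmetic on a toy sentence list (not part of the commit):

# Stride = CHUNK_SIZE - OVERLAP = 3, so windows start at 0, 3, 6, 9, ...
sentences = [f"s{i}" for i in range(10)]
CHUNK_SIZE, OVERLAP = 5, 2
chunks = [sentences[i:i + CHUNK_SIZE]
          for i in range(0, len(sentences), CHUNK_SIZE - OVERLAP)]
print(chunks)
# [['s0'..'s4'], ['s3'..'s7'], ['s6'..'s9'], ['s9']]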
 
  def calculate_perplexity(text: str) -> float:
+     """Advanced perplexity calculation with sliding window"""
+     perp_model = models["perplexity"]["model"]
+     perp_tokenizer = models["perplexity"]["tokenizer"]
+
      try:
+         # Break into smaller chunks for accurate perplexity
+         sentences = nltk.sent_tokenize(text)
+         if not sentences:
+             return 100.0
+
+         # Process in sliding windows of 5 sentences
+         window_size = 5
+         stride = 2
+         perplexities = []
+
+         for i in range(0, max(1, len(sentences) - window_size + 1), stride):
+             window_text = " ".join(sentences[i:i+window_size])
+             if len(window_text) < 10:
+                 continue
+
+             encodings = perp_tokenizer(window_text, return_tensors="pt", truncation=True, max_length=512)
+             if torch.cuda.is_available():
+                 encodings = {k: v.cuda() for k, v in encodings.items()}
+
+             with torch.no_grad():
+                 outputs = perp_model(**encodings, labels=encodings["input_ids"])
+                 neg_log_likelihood = outputs.loss
+                 perplexity = torch.exp(neg_log_likelihood).item()
+                 perplexities.append(perplexity)
+
+         # Filter out extreme outliers
+         if perplexities:
+             filtered_perps = [p for p in perplexities if p < 1000]  # Remove extreme values
+             if filtered_perps:
+                 return np.median(filtered_perps)  # Median is more robust than mean
+
+         return 100.0  # Default fallback
      except Exception as e:
+         logger.error(f"Perplexity calculation error: {str(e)}")
          return 100.0
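Note: each window's perplexity is exp of the model's cross-entropy loss on that window; the function then drops values above 1000 and reports the median across windows. A small sketch of just that aggregation step, with hypothetical per-window values:

import numpy as np

# Hypothetical per-window perplexities, including one extreme outlier
window_perplexities = [32.0, 41.5, 38.2, 2400.0]
filtered = [p for p in window_perplexities if p < 1000]   # drop the outlier
print(np.median(filtered))   # 38.2, the value calculate_perplexity would return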
 
+ def detect_linguistic_patterns(text: str) -> dict:
+     """Detect linguistic patterns that differentiate human vs AI text"""
+     try:
+         sentences = nltk.sent_tokenize(text)
+         words = re.findall(r'\b\w+\b', text.lower())
+
+         # Analyze sentence length distribution (AI often has more uniform length)
+         sent_lengths = [len(re.findall(r'\b\w+\b', s)) for s in sentences]
+         sent_length_std = np.std(sent_lengths) if sent_lengths else 0
+
+         # Analyze lexical diversity (type-token ratio)
+         unique_words = len(set(words))
+         total_words = len(words)
+         lexical_diversity = unique_words / total_words if total_words > 0 else 0
+
+         # Sentence starter variety (AI often has repetitive starters)
+         starters = [s.split()[0].lower() if s.split() else "" for s in sentences]
+         starter_ratio = len(set(starters)) / len(starters) if starters else 0
+
+         # Paragraph length analysis
+         paragraphs = re.split(r'\n\s*\n', text)
+         para_lengths = [len(p.split()) for p in paragraphs if p.strip()]
+         para_length_std = np.std(para_lengths) if para_lengths else 0
+
+         return {
+             "sentence_length_std": sent_length_std,
+             "lexical_diversity": lexical_diversity,
+             "starter_diversity": starter_ratio,
+             "paragraph_length_std": para_length_std
+         }
+     except Exception as e:
+         logger.error(f"Linguistic pattern detection error: {str(e)}")
+         return {
+             "sentence_length_std": 0,
+             "lexical_diversity": 0,
+             "starter_diversity": 0,
+             "paragraph_length_std": 0
+         }
+
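Note: of the four features above, the type-token ratio is the easiest to sanity-check by hand. A standalone sketch on a toy string (not part of the commit):

import re

text = "the cat sat on the mat and the dog sat too"
words = re.findall(r'\b\w+\b', text.lower())        # 11 tokens
lexical_diversity = len(set(words)) / len(words)    # 8 types / 11 tokens
print(round(lexical_diversity, 2))                  # 0.73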
  def analyze_text(text: str) -> dict:
+     """Enhanced analysis pipeline using ensemble methods"""
      try:
+         # Initial text preprocessing
+         text = re.sub(r'\s+', ' ', text).strip()
+         if len(text) < 150:
              raise HTTPException(400, "Text too short for accurate analysis")

+         # Get text segments for analysis
+         segments = get_segmented_texts(text)
+         if not segments:
+             raise HTTPException(400, "Could not extract meaningful text segments")
+
+         # AI Detection using ensemble approach
+         primary_scores = []
+         secondary_scores = []
+         tertiary_scores = []

+         # Process each segment with all three models
+         for segment in segments:
+             # Skip segments that are too short
+             if len(segment) < 100:
+                 continue
+
+             # Primary model prediction
+             primary_result = models["primary"](segment)
+             ai_score = next((item["score"] for item in primary_result[0] if item["label"] == "fake"), 0.5)
+             primary_scores.append(ai_score)
+
+             # Secondary model prediction
+             secondary_result = models["secondary"](segment)
+             ai_score = next((item["score"] for item in secondary_result[0] if item["label"] == "AI-generated"), 0.5)
+             secondary_scores.append(ai_score)
+
+             # Tertiary model prediction
+             tertiary_result = models["tertiary"](segment)
+             ai_score = next((item["score"] for item in tertiary_result[0] if item["label"] in ["fake", "AI-generated"]), 0.5)
+             tertiary_scores.append(ai_score)
+
+         # Calculate perplexity score
          perplexity = calculate_perplexity(text)

+         # Normalized perplexity score (lower perplexity = more likely AI-generated)
+         # GPT-2 typically shows perplexity of 20-60 for AI text and 50-120 for human text
+         perplexity_score = max(0, min(1, (120 - perplexity) / 100))
+
+         # Linguistic feature analysis
+         linguistic_features = detect_linguistic_patterns(text)
+
+         # Calculate linguistic pattern score (higher = more likely AI)
+         # AI text tends to have lower std devs and higher uniformity
+         linguistic_score = 0.0
+         if linguistic_features["sentence_length_std"] < 3.5:
+             linguistic_score += 0.2
+         if linguistic_features["lexical_diversity"] < 0.6:
+             linguistic_score += 0.2
+         if linguistic_features["starter_diversity"] < 0.7:
+             linguistic_score += 0.2
+         if linguistic_features["paragraph_length_std"] < 20:
+             linguistic_score += 0.2
+
+         # Ensemble scoring with weighted average
+         # Balance the different models based on empirical performance
+         weights = {
+             "primary": 0.35,
+             "secondary": 0.25,
+             "tertiary": 0.15,
+             "perplexity": 0.15,
+             "linguistic": 0.10
+         }
+
+         # Use percentiles to get more stable scores from each model
+         primary_confidence = np.percentile(primary_scores, 75) if primary_scores else 0.5
+         secondary_confidence = np.percentile(secondary_scores, 75) if secondary_scores else 0.5
+         tertiary_confidence = np.percentile(tertiary_scores, 75) if tertiary_scores else 0.5
+
+         # Calculate final weighted score
+         final_score = (
+             primary_confidence * weights["primary"] +
+             secondary_confidence * weights["secondary"] +
+             tertiary_confidence * weights["tertiary"] +
+             perplexity_score * weights["perplexity"] +
+             linguistic_score * weights["linguistic"]
+         )

+         # Convert to percentage with calibration factor
+         # Apply sigmoid curve to get smoother probability distribution
+         ai_percentage = 100 / (1 + np.exp(-10 * (final_score - 0.5)))

+         # Optional plagiarism check
+         sentences = nltk.sent_tokenize(text)
+         if len(sentences) > 5:
+             embeddings = models["embedder"].encode(sentences, batch_size=BATCH_SIZE)
+             similarity = cosine_similarity(embeddings)
+             np.fill_diagonal(similarity, 0)
+             plagiarism_score = (similarity > 0.85).mean() * 100  # Higher threshold for accuracy
+         else:
+             plagiarism_score = 0
+
+         # Return just the AI percentage for the simplified API
+         return {
+             "ai_generated": round(ai_percentage, 2)
+         }
+
+         # Alternatively, return detailed analytics for debugging
+         """
          return {
              "ai_generated": round(ai_percentage, 2),
+             "human_written": round(100 - ai_percentage, 2),
              "plagiarism_risk": round(plagiarism_score, 2),
              "perplexity": round(perplexity, 2),
+             "model_confidences": {
+                 "primary": round(primary_confidence * 100, 2),
+                 "secondary": round(secondary_confidence * 100, 2),
+                 "tertiary": round(tertiary_confidence * 100, 2)
+             },
+             "linguistic_analysis": {
+                 "sentence_length_variation": round(linguistic_features["sentence_length_std"], 2),
+                 "lexical_diversity": round(linguistic_features["lexical_diversity"], 2),
+                 "sentence_starter_variety": round(linguistic_features["starter_diversity"], 2)
+             },
+             "segments_analyzed": len(segments)
          }
+         """

      except HTTPException:
          raise
      except Exception as e:
          logger.error(f"Analysis error: {str(e)}")
+         raise HTTPException(500, f"Analysis failed: {str(e)}")
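Note: a worked example of the ensemble scoring above, using hypothetical per-component confidences to show how the weighted sum feeds the sigmoid calibration 100 / (1 + exp(-10 * (x - 0.5))):

import numpy as np

weights = {"primary": 0.35, "secondary": 0.25, "tertiary": 0.15,
           "perplexity": 0.15, "linguistic": 0.10}
scores = {"primary": 0.80, "secondary": 0.70, "tertiary": 0.60,
          "perplexity": 0.65, "linguistic": 0.40}    # hypothetical values

final_score = sum(scores[k] * weights[k] for k in weights)     # 0.6825
ai_percentage = 100 / (1 + np.exp(-10 * (final_score - 0.5)))  # ~86.1
print(round(ai_percentage, 2))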
 
  @app.on_event("startup")
  async def startup():
+     logger.info("Starting enhanced AI detection service...")
      if not load_models():
          logger.error("Service initialization failed")
          raise RuntimeError("Failed to initialize models")

  @app.post("/analyze")
+ async def analyze(file: UploadFile = File(...), background_tasks: BackgroundTasks = None):
+     """Analyze uploaded PDF for AI content detection"""
      try:
+         # Validate file type
          if not file.filename.lower().endswith(".pdf"):
              raise HTTPException(400, "Only PDF files accepted")

+         # Extract and analyze text
+         text = extract_text_from_pdf(file)
+         if not text or len(text) < 100:
+             return JSONResponse({
+                 "ai_generated": 0,
+                 "error": "Insufficient text extracted from PDF"
+             })
+
+         # Run analysis
+         results = analyze_text(text)
+         return JSONResponse(results)

      except HTTPException as he:
          raise he
      except Exception as e:
          logger.error(f"Unexpected error: {str(e)}")
+         raise HTTPException(500, f"Internal server error: {str(e)}")
+
+ @app.post("/analyze/text")
+ async def analyze_text_endpoint(text: str = Form(...)):
+     """Analyze raw text for AI content detection"""
+     try:
+         if not text or len(text) < 100:
+             raise HTTPException(400, "Text too short for analysis (minimum 100 characters)")
+
+         results = analyze_text(text)
+         return JSONResponse(results)
+
+     except HTTPException as he:
+         raise he
+     except Exception as e:
+         logger.error(f"Unexpected text analysis error: {str(e)}")
+         raise HTTPException(500, f"Analysis error: {str(e)}")
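Note: a hypothetical client for the two POST endpoints, assuming the service runs locally on port 8000 (host and filename are placeholders, not part of the commit):

import requests

# Raw text goes in the "text" form field expected by /analyze/text;
# analyze_text itself rejects anything under 150 characters with a 400.
r = requests.post("http://localhost:8000/analyze/text",
                  data={"text": "An essay of at least 150 characters ..."})
print(r.json())   # e.g. {"ai_generated": 86.12}

# PDFs go to /analyze as a multipart upload
with open("essay.pdf", "rb") as f:
    r = requests.post("http://localhost:8000/analyze",
                      files={"file": ("essay.pdf", f, "application/pdf")})
print(r.json())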
 
  @app.get("/health")
  async def health():
+     """Service health check endpoint"""
      return {
          "status": "operational",
+         "models": {
+             "primary": PRIMARY_DETECTOR,
+             "secondary": SECONDARY_DETECTOR,
+             "tertiary": TERTIARY_DETECTOR
+         },
+         "version": "3.0.0"
      }

  @app.get("/")
  async def root():
+     """API documentation endpoint"""
      return {
          "service": "Essay Analyzer",
          "version": "2.0.0",
+         "endpoints": {
+             "/analyze": "POST - Analyze PDF for AI detection",
+             "/analyze/text": "POST - Analyze raw text for AI detection",
              "/health": "GET - Service status"
          }
      }
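Note: the plagiarism check added in analyze_text is intra-document self-similarity: sentences are embedded, the diagonal of the cosine-similarity matrix is zeroed, and the score is the fraction of sentence pairs above 0.85. A standalone sketch using the commit's SENTENCE_MODEL (the example sentences are invented):

import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

embedder = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
sentences = ["The economy grew rapidly last year.",
             "Last year, the economy grew very rapidly.",
             "Meanwhile, turtles are reptiles."]
sim = cosine_similarity(embedder.encode(sentences))
np.fill_diagonal(sim, 0)                        # ignore self-matches
plagiarism_score = (sim > 0.85).mean() * 100    # % of near-duplicate pairs
print(round(plagiarism_score, 2))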