Essay-Grader committed on
Commit f4e8889 · 1 Parent(s): 554a307

Done fixing

Files changed (4)
  1. convert_model.py +0 -18
  2. main.py +3 -203
  3. model_quantizer.py +0 -23
  4. requirements.txt +0 -13
convert_model.py DELETED
@@ -1,18 +0,0 @@
- # # convert_model.py
-
- # from optimum.onnxruntime import ORTModelForSequenceClassification
-
- # # Convert and optimize model
- # model = ORTModelForSequenceClassification.from_pretrained(
- #     "Essay-Grader/roberta-ai-detector-20250401_232702",
- #     export=True,
- #     provider="CPUExecutionProvider"
- # )
-
- # # Save optimized model
- # model.save_pretrained(
- #     "./optimized_model",
- #     file_name="model_optimized.onnx"
- # )
-
-
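For reference, the deleted script was entirely commented out; uncommented, it is the standard Optimum ONNX export flow. A runnable sketch under that assumption (requires `optimum[onnxruntime]`; the `file_name` argument to `save_pretrained` is kept exactly as in the deleted script, though newer Optimum releases may handle file naming differently):

    # Reconstruction of the deleted convert_model.py as runnable code.
    # Assumes: pip install "optimum[onnxruntime]"
    from optimum.onnxruntime import ORTModelForSequenceClassification

    # export=True converts the PyTorch checkpoint to ONNX at load time
    model = ORTModelForSequenceClassification.from_pretrained(
        "Essay-Grader/roberta-ai-detector-20250401_232702",
        export=True,
        provider="CPUExecutionProvider",
    )

    # Persist the exported graph for later inference-only loads;
    # file_name is carried over from the deleted script
    model.save_pretrained("./optimized_model", file_name="model_optimized.onnx")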
main.py CHANGED
@@ -212,208 +212,8 @@ async def detect_ai_and_plagiarism(file: UploadFile = File(...), background_task
          raise HTTPException(500, f"Processing failed: {str(e)}")
 
  # Health check endpoint for debugging
- @app.get("/health")
- async def health_check():
-     return {"status": "healthy", "python_version": "3.11"}
-
-
-
-
- # # main.py: Optimized AI Detection and Plagiarism Check API
-
- # import os
- # from pathlib import Path
- # import logging
- # from typing import List, Tuple
- # import re
- # import time
-
- # # Configure cache directories first
- # os.environ["TRANSFORMERS_CACHE"] = "/tmp/.cache/huggingface"
- # os.environ["HF_HOME"] = "/tmp/.cache/huggingface"
- # os.environ["NLTK_DATA"] = "/tmp/.cache/nltk"
-
- # # Create cache directories
- # Path("/tmp/.cache/huggingface").mkdir(parents=True, exist_ok=True)
- # Path("/tmp/.cache/nltk").mkdir(parents=True, exist_ok=True)
-
- # # Import remaining dependencies
- # from fastapi import FastAPI, UploadFile, File, HTTPException
- # from fastapi.middleware.cors import CORSMiddleware
- # from transformers import pipeline, AutoTokenizer, AutoModel
- # import fitz  # PyMuPDF
- # import torch
- # import numpy as np
- # import nltk
- # from nltk.tokenize import sent_tokenize
- # from sklearn.metrics.pairwise import cosine_similarity
-
- # # Configure logging
- # logging.basicConfig(level=logging.INFO)
- # logger = logging.getLogger(__name__)
-
- # # Initialize NLTK data
- # try:
- #     nltk.data.path.append("/tmp/.cache/nltk")
- #     nltk.data.find('tokenizers/punkt')
- #     logger.info("NLTK punkt tokenizer available")
- # except LookupError:
- #     logger.info("Downloading NLTK punkt tokenizer...")
- #     nltk.download('punkt', download_dir="/tmp/.cache/nltk")
- #     nltk.data.path.append("/tmp/.cache/nltk")
-
- # app = FastAPI()
-
- # app.add_middleware(
- #     CORSMiddleware,
- #     allow_origins=["*"],
- #     allow_methods=["POST"],
- #     allow_headers=["*"],
- # )
-
- # # Configuration - Optimized for speed and accuracy
- # # https://huggingface.co/Essay-Grader/roberta-ai-detector-20250401_232702
- # # roberta-base-openai-detector
-
- # MODEL_NAME = "Essay-Grader/roberta-ai-detector-20250401_232702"  # More accurate model
- # EMBEDDING_MODEL = "sentence-transformers/paraphrase-MiniLM-L3-v2"  # Faster model
- # DEVICE = 0 if torch.cuda.is_available() else -1
- # CHUNK_SIZE = 768  # Increased chunk size for better performance
- # MIN_TEXT_LENGTH = 150
- # MAX_TEXT_LENGTH = 8000  # Reduced for faster processing
- # PLAGIARISM_THRESHOLD = 0.78  # Adjusted threshold
- # MAX_SENTENCES = 50  # Limit sentences for plagiarism check
- # TIMEOUT = 25  # Seconds before timeout
-
- # # Health check endpoint
  # @app.get("/health")
- # def health_check():
- #     return {"status": "healthy"}
+ # async def health_check():
+ #     return {"status": "healthy", "python_version": "3.11"}
-
- # # Load models at startup
- # try:
- #     logger.info("Loading optimized AI detection model...")
- #     ai_detector = pipeline(
- #         "text-classification",
- #         model=MODEL_NAME,
- #         device=DEVICE,
- #         truncation=True,
- #         max_length=CHUNK_SIZE,
- #         top_k=1  # Only return top prediction
- #     )
-
- #     logger.info("Loading optimized embedding model...")
- #     tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL)
- #     embed_model = AutoModel.from_pretrained(EMBEDDING_MODEL).to(DEVICE if DEVICE != -1 else "cpu")
- #     embed_model.eval()  # Set to evaluation mode
 
- #     logger.info("All models loaded successfully")
- # except Exception as e:
- #     logger.error(f"Model loading failed: {str(e)}", exc_info=True)
- #     raise RuntimeError(f"Failed to initialize models: {str(e)}")
-
- # def extract_text(pdf_bytes: bytes) -> str:
- #     """Optimized PDF text extraction with timeout check."""
- #     start_time = time.time()
- #     try:
- #         with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
- #             text = []
- #             for page in doc:
- #                 if time.time() - start_time > TIMEOUT/2:  # Half timeout for extraction
- #                     raise TimeoutError("PDF extraction taking too long")
- #                 text.append(page.get_text().strip())
- #             full_text = "\n".join(text).strip()
-
- #         if len(full_text) < MIN_TEXT_LENGTH:
- #             raise ValueError(f"Text too short (min {MIN_TEXT_LENGTH} chars required)")
- #         if len(full_text) > MAX_TEXT_LENGTH:
- #             full_text = full_text[:MAX_TEXT_LENGTH]
- #         return re.sub(r'\s+', ' ', full_text)
- #     except Exception as e:
- #         logger.error(f"PDF processing error: {str(e)}")
- #         raise HTTPException(400, "Invalid PDF content")
-
- # def compute_embeddings(sentences: List[str]) -> np.ndarray:
- #     """Optimized embedding computation with batch processing."""
- #     inputs = tokenizer(
- #         sentences,
- #         padding=True,
- #         truncation=True,
- #         return_tensors="pt",
- #         max_length=128  # Reduced max length for speed
- #     ).to(embed_model.device)
-
- #     with torch.no_grad():
- #         model_output = embed_model(**inputs)
-
- #     # Simplified mean pooling
- #     attention_mask = inputs['attention_mask']
- #     token_embeddings = model_output[0]
- #     input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
- #     return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
-
- # def check_internal_plagiarism(text: str) -> Tuple[float, bool]:
- #     """Optimized plagiarism check with sentence limit."""
- #     try:
- #         sentences = [s for s in sent_tokenize(text) if len(s.split()) > 5][:MAX_SENTENCES]
- #         if len(sentences) < 2:
- #             return 0.0, False
-
- #         embeddings = compute_embeddings(sentences)
- #         sim_matrix = cosine_similarity(embeddings)
- #         np.fill_diagonal(sim_matrix, 0)
-
- #         # Only check top 10 most similar pairs for speed
- #         top_indices = np.argpartition(sim_matrix.flatten(), -10)[-10:]
- #         avg_similarity = np.mean(sim_matrix.flatten()[top_indices])
-
- #         return round(float(avg_similarity) * 100, 2), bool(avg_similarity > PLAGIARISM_THRESHOLD)
- #     except Exception as e:
- #         logger.error(f"Plagiarism check failed: {str(e)}")
- #         return 0.0, False
-
- # @app.post("/detect")
- # async def detect_ai_content(file: UploadFile = File(...)):
- #     """Optimized detection endpoint with timeout."""
- #     start_time = time.time()
-
- #     try:
- #         # Validate file type quickly
- #         if not file.filename.lower().endswith('.pdf'):
- #             raise HTTPException(400, "Only PDF files are accepted")
-
- #         # Extract text with timeout check
- #         text = extract_text(await file.read())
- #         logger.info(f"Processing document with {len(text)} characters")
-
- #         # AI Detection with timeout check
- #         ai_score = 0.0
- #         try:
- #             result = ai_detector(text[:MAX_TEXT_LENGTH])  # Process only first MAX_TEXT_LENGTH chars
- #             ai_score = result[0]['score'] if result[0]['label'] == 'LABEL_1' else 1 - result[0]['score']
- #         except Exception as e:
- #             logger.error(f"AI detection failed: {str(e)}")
- #             raise HTTPException(500, "AI detection processing error")
-
- #         # Plagiarism check with timeout
- #         plagiarism_score, plagiarism_detected = 0.0, False
- #         if time.time() - start_time < TIMEOUT - 5:  # Leave 5 seconds for response
- #             plagiarism_score, plagiarism_detected = check_internal_plagiarism(text)
-
- #         # Final timeout check
- #         if time.time() - start_time > TIMEOUT:
- #             raise HTTPException(500, "Analysis timed out")
-
- #         return {
- #             "ai_generated_percentage": round(float(ai_score) * 100, 2),
- #             "plagiarism_risk": bool(plagiarism_detected),
- #             "plagiarism_score": float(plagiarism_score),
- #             "processing_time": round(time.time() - start_time, 2)
- #         }
-
- #     except HTTPException as he:
- #         raise
- #     except Exception as e:
- #         logger.error(f"Detection error: {str(e)}", exc_info=True)
- #         raise HTTPException(500, f"Analysis failed: {str(e)}")
-
+
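The removed implementation's `/detect` endpoint accepted a PDF upload and returned JSON with `ai_generated_percentage`, `plagiarism_risk`, `plagiarism_score`, and `processing_time`. A minimal client sketch (the host and port are hypothetical; the commit does not specify how the app is served):

    # Minimal client for the /detect endpoint shown in the removed code.
    # Assumes: pip install requests; a hypothetical local deployment.
    import requests

    with open("essay.pdf", "rb") as f:  # any local PDF
        resp = requests.post(
            "http://localhost:8000/detect",  # hypothetical host/port
            files={"file": ("essay.pdf", f, "application/pdf")},
        )
    resp.raise_for_status()
    report = resp.json()
    # Fields per the removed implementation's response dict
    print(report["ai_generated_percentage"], report["plagiarism_risk"])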
model_quantizer.py DELETED
@@ -1,23 +0,0 @@
- # # model_quantizer.py
-
- # from transformers import AutoModelForSequenceClassification
- # from optimum.onnxruntime import ORTOptimizer, ORTModelForSequenceClassification
- # from optimum.onnxruntime.configuration import OptimizationConfig
-
- # model = ORTModelForSequenceClassification.from_pretrained(
- #     "Essay-Grader/roberta-ai-detector-20250401_232702",
- #     from_transformers=True
- # )
-
- # optimizer = ORTOptimizer.from_pretrained(model)
- # optimization_config = OptimizationConfig(
- #     optimization_level=99,
- #     enable_transformers_specific_optimizations=True,
- #     optimize_for_gpu=True,
- #     fp16=True
- # )
-
- # optimizer.optimize(
- #     save_dir="./optimized_model",
- #     optimization_config=optimization_config
- # )
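Despite its name, the deleted script performed ONNX graph optimization rather than quantization, and it used the older `from_transformers=True` loading flag. A sketch of the same flow on newer Optimum releases, where that flag was replaced by `export=True` (the fp16 and GPU-oriented settings assume a GPU-enabled onnxruntime build):

    # Graph-optimization flow of the deleted model_quantizer.py, updated
    # for Optimum releases where from_transformers=True became export=True.
    from optimum.onnxruntime import ORTModelForSequenceClassification, ORTOptimizer
    from optimum.onnxruntime.configuration import OptimizationConfig

    model = ORTModelForSequenceClassification.from_pretrained(
        "Essay-Grader/roberta-ai-detector-20250401_232702",
        export=True,  # replaces the deprecated from_transformers=True
    )

    optimizer = ORTOptimizer.from_pretrained(model)
    config = OptimizationConfig(
        optimization_level=99,  # apply all available graph optimizations
        enable_transformers_specific_optimizations=True,
        optimize_for_gpu=True,  # requires a GPU execution provider
        fp16=True,
    )

    optimizer.optimize(save_dir="./optimized_model", optimization_config=config)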
requirements.txt CHANGED
@@ -10,16 +10,3 @@ torch==2.0.1
  scikit-learn==1.2.2
  numpy==1.24.3
 
-
-
-
-
- # fastapi>=0.95.2,<1.0.0         # Pinning to major version for stability
- # uvicorn>=0.22.0,<0.25.0        # With compatible ASGI server range
- # transformers>=4.31.0,<4.35.0   # Keeping within known-compatible versions
- # torch>=2.0.1,<2.1.0            # Matching your CUDA/CPU requirements
- # pymupdf>=1.22.5,<1.24.0        # Stable PDF processing
- # python-multipart>=0.0.6        # For file uploads
- # scikit-learn>=1.2.0,<1.4.0     # For cosine_similarity
- # nltk>=3.8.1,<3.9.0             # For sentence tokenization
- # numpy>=1.23.0,<1.26.0          # For numerical operations