Commit f4e8889
Parent(s): 554a307

Done fixing

Files changed:
- convert_model.py +0 -18
- main.py +3 -203
- model_quantizer.py +0 -23
- requirements.txt +0 -13
convert_model.py DELETED

@@ -1,18 +0,0 @@
-# # convert_model.py
-
-# from optimum.onnxruntime import ORTModelForSequenceClassification
-
-# # Convert and optimize model
-# model = ORTModelForSequenceClassification.from_pretrained(
-#     "Essay-Grader/roberta-ai-detector-20250401_232702",
-#     export=True,
-#     provider="CPUExecutionProvider"
-# )
-
-# # Save optimized model
-# model.save_pretrained(
-#     "./optimized_model",
-#     file_name="model_optimized.onnx"
-# )
-
-
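For context, the deleted script's only job was exporting the detector checkpoint to ONNX through optimum. A minimal runnable sketch of that flow, using the model ID and output path from the deleted file (the tokenizer save is an assumed convenience, not in the original):

# Sketch of the ONNX export flow from the deleted convert_model.py.
from optimum.onnxruntime import ORTModelForSequenceClassification
from transformers import AutoTokenizer

model_id = "Essay-Grader/roberta-ai-detector-20250401_232702"

# export=True converts the PyTorch checkpoint to an ONNX graph on load
model = ORTModelForSequenceClassification.from_pretrained(
    model_id,
    export=True,
    provider="CPUExecutionProvider",
)
model.save_pretrained("./optimized_model")

# Saving the tokenizer alongside makes the directory self-contained
# (an addition for completeness, not part of the deleted script)
AutoTokenizer.from_pretrained(model_id).save_pretrained("./optimized_model")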
main.py CHANGED

@@ -212,208 +212,8 @@ async def detect_ai_and_plagiarism(file: UploadFile = File(...), background_task
         raise HTTPException(500, f"Processing failed: {str(e)}")
 
 # Health check endpoint for debugging
-@app.get("/health")
-async def health_check():
-    return {"status": "healthy", "python_version": "3.11"}
-
-
-
-
-# # main.py: Optimized AI Detection and Plagiarism Check API
-
-# import os
-# from pathlib import Path
-# import logging
-# from typing import List, Tuple
-# import re
-# import time
-
-# # Configure cache directories first
-# os.environ["TRANSFORMERS_CACHE"] = "/tmp/.cache/huggingface"
-# os.environ["HF_HOME"] = "/tmp/.cache/huggingface"
-# os.environ["NLTK_DATA"] = "/tmp/.cache/nltk"
-
-# # Create cache directories
-# Path("/tmp/.cache/huggingface").mkdir(parents=True, exist_ok=True)
-# Path("/tmp/.cache/nltk").mkdir(parents=True, exist_ok=True)
-
-# # Import remaining dependencies
-# from fastapi import FastAPI, UploadFile, File, HTTPException
-# from fastapi.middleware.cors import CORSMiddleware
-# from transformers import pipeline, AutoTokenizer, AutoModel
-# import fitz  # PyMuPDF
-# import torch
-# import numpy as np
-# import nltk
-# from nltk.tokenize import sent_tokenize
-# from sklearn.metrics.pairwise import cosine_similarity
-
-# # Configure logging
-# logging.basicConfig(level=logging.INFO)
-# logger = logging.getLogger(__name__)
-
-# # Initialize NLTK data
-# try:
-#     nltk.data.path.append("/tmp/.cache/nltk")
-#     nltk.data.find('tokenizers/punkt')
-#     logger.info("NLTK punkt tokenizer available")
-# except LookupError:
-#     logger.info("Downloading NLTK punkt tokenizer...")
-#     nltk.download('punkt', download_dir="/tmp/.cache/nltk")
-#     nltk.data.path.append("/tmp/.cache/nltk")
-
-# app = FastAPI()
-
-# app.add_middleware(
-#     CORSMiddleware,
-#     allow_origins=["*"],
-#     allow_methods=["POST"],
-#     allow_headers=["*"],
-# )
-
-# # Configuration - Optimized for speed and accuracy
-# # https://huggingface.co/Essay-Grader/roberta-ai-detector-20250401_232702
-# # roberta-base-openai-detector
-
-# MODEL_NAME = "Essay-Grader/roberta-ai-detector-20250401_232702"  # More accurate model
-# EMBEDDING_MODEL = "sentence-transformers/paraphrase-MiniLM-L3-v2"  # Faster model
-# DEVICE = 0 if torch.cuda.is_available() else -1
-# CHUNK_SIZE = 768  # Increased chunk size for better performance
-# MIN_TEXT_LENGTH = 150
-# MAX_TEXT_LENGTH = 8000  # Reduced for faster processing
-# PLAGIARISM_THRESHOLD = 0.78  # Adjusted threshold
-# MAX_SENTENCES = 50  # Limit sentences for plagiarism check
-# TIMEOUT = 25  # Seconds before timeout
-
-# # Health check endpoint
 # @app.get("/health")
-# def health_check():
-#     return {"status": "healthy"}
-
-# # Load models at startup
-# try:
-#     logger.info("Loading optimized AI detection model...")
-#     ai_detector = pipeline(
-#         "text-classification",
-#         model=MODEL_NAME,
-#         device=DEVICE,
-#         truncation=True,
-#         max_length=CHUNK_SIZE,
-#         top_k=1  # Only return top prediction
-#     )
-
-#     logger.info("Loading optimized embedding model...")
-#     tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL)
-#     embed_model = AutoModel.from_pretrained(EMBEDDING_MODEL).to(DEVICE if DEVICE != -1 else "cpu")
-#     embed_model.eval()  # Set to evaluation mode
+# async def health_check():
+#     return {"status": "healthy", "python_version": "3.11"}
 
-
-# except Exception as e:
-#     logger.error(f"Model loading failed: {str(e)}", exc_info=True)
-#     raise RuntimeError(f"Failed to initialize models: {str(e)}")
-
-# def extract_text(pdf_bytes: bytes) -> str:
-#     """Optimized PDF text extraction with timeout check."""
-#     start_time = time.time()
-#     try:
-#         with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
-#             text = []
-#             for page in doc:
-#                 if time.time() - start_time > TIMEOUT/2:  # Half timeout for extraction
-#                     raise TimeoutError("PDF extraction taking too long")
-#                 text.append(page.get_text().strip())
-#             full_text = "\n".join(text).strip()
-
-#         if len(full_text) < MIN_TEXT_LENGTH:
-#             raise ValueError(f"Text too short (min {MIN_TEXT_LENGTH} chars required)")
-#         if len(full_text) > MAX_TEXT_LENGTH:
-#             full_text = full_text[:MAX_TEXT_LENGTH]
-#         return re.sub(r'\s+', ' ', full_text)
-#     except Exception as e:
-#         logger.error(f"PDF processing error: {str(e)}")
-#         raise HTTPException(400, "Invalid PDF content")
-
-# def compute_embeddings(sentences: List[str]) -> np.ndarray:
-#     """Optimized embedding computation with batch processing."""
-#     inputs = tokenizer(
-#         sentences,
-#         padding=True,
-#         truncation=True,
-#         return_tensors="pt",
-#         max_length=128  # Reduced max length for speed
-#     ).to(embed_model.device)
-
-#     with torch.no_grad():
-#         model_output = embed_model(**inputs)
-
-#     # Simplified mean pooling
-#     attention_mask = inputs['attention_mask']
-#     token_embeddings = model_output[0]
-#     input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
-#     return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
-
-# def check_internal_plagiarism(text: str) -> Tuple[float, bool]:
-#     """Optimized plagiarism check with sentence limit."""
-#     try:
-#         sentences = [s for s in sent_tokenize(text) if len(s.split()) > 5][:MAX_SENTENCES]
-#         if len(sentences) < 2:
-#             return 0.0, False
-
-#         embeddings = compute_embeddings(sentences)
-#         sim_matrix = cosine_similarity(embeddings)
-#         np.fill_diagonal(sim_matrix, 0)
-
-#         # Only check top 10 most similar pairs for speed
-#         top_indices = np.argpartition(sim_matrix.flatten(), -10)[-10:]
-#         avg_similarity = np.mean(sim_matrix.flatten()[top_indices])
-
-#         return round(float(avg_similarity) * 100, 2), bool(avg_similarity > PLAGIARISM_THRESHOLD)
-#     except Exception as e:
-#         logger.error(f"Plagiarism check failed: {str(e)}")
-#         return 0.0, False
-
-# @app.post("/detect")
-# async def detect_ai_content(file: UploadFile = File(...)):
-#     """Optimized detection endpoint with timeout."""
-#     start_time = time.time()
-
-#     try:
-#         # Validate file type quickly
-#         if not file.filename.lower().endswith('.pdf'):
-#             raise HTTPException(400, "Only PDF files are accepted")
-
-#         # Extract text with timeout check
-#         text = extract_text(await file.read())
-#         logger.info(f"Processing document with {len(text)} characters")
-
-#         # AI Detection with timeout check
-#         ai_score = 0.0
-#         try:
-#             result = ai_detector(text[:MAX_TEXT_LENGTH])  # Process only first MAX_TEXT_LENGTH chars
-#             ai_score = result[0]['score'] if result[0]['label'] == 'LABEL_1' else 1 - result[0]['score']
-#         except Exception as e:
-#             logger.error(f"AI detection failed: {str(e)}")
-#             raise HTTPException(500, "AI detection processing error")
-
-#         # Plagiarism check with timeout
-#         plagiarism_score, plagiarism_detected = 0.0, False
-#         if time.time() - start_time < TIMEOUT - 5:  # Leave 5 seconds for response
-#             plagiarism_score, plagiarism_detected = check_internal_plagiarism(text)
-
-#         # Final timeout check
-#         if time.time() - start_time > TIMEOUT:
-#             raise HTTPException(500, "Analysis timed out")
-
-#         return {
-#             "ai_generated_percentage": round(float(ai_score) * 100, 2),
-#             "plagiarism_risk": bool(plagiarism_detected),
-#             "plagiarism_score": float(plagiarism_score),
-#             "processing_time": round(time.time() - start_time, 2)
-#         }
-
-#     except HTTPException as he:
-#         raise
-#     except Exception as e:
-#         logger.error(f"Detection error: {str(e)}", exc_info=True)
-#         raise HTTPException(500, f"Analysis failed: {str(e)}")
-
+
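As background for the draft code this commit deletes: it embedded sentences with masked mean pooling and flagged a document when its most similar sentence pairs exceeded a cosine-similarity threshold. A self-contained sketch of that idea, with constants mirroring the deleted EMBEDDING_MODEL and PLAGIARISM_THRESHOLD (illustrative, not the live implementation):

# Sketch of the removed draft's embedding + internal-plagiarism logic.
import numpy as np
import torch
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoModel, AutoTokenizer

EMBEDDING_MODEL = "sentence-transformers/paraphrase-MiniLM-L3-v2"
PLAGIARISM_THRESHOLD = 0.78

tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL)
embed_model = AutoModel.from_pretrained(EMBEDDING_MODEL).eval()

def compute_embeddings(sentences):
    inputs = tokenizer(sentences, padding=True, truncation=True,
                       return_tensors="pt", max_length=128)
    with torch.no_grad():
        token_embeddings = embed_model(**inputs)[0]  # (batch, seq, hidden)
    # Masked mean pooling: average only over non-padding token positions
    mask = inputs["attention_mask"].unsqueeze(-1).expand(token_embeddings.size()).float()
    return ((token_embeddings * mask).sum(1) / mask.sum(1).clamp(min=1e-9)).numpy()

def check_internal_plagiarism(sentences):
    if len(sentences) < 2:
        return 0.0, False
    sim = cosine_similarity(compute_embeddings(sentences))
    np.fill_diagonal(sim, 0)  # ignore each sentence's similarity to itself
    # Average the most similar pairs (the deleted code used the top 10);
    # the min() guard for short inputs is an addition, not in the original
    flat = sim.flatten()
    k = min(10, flat.size)
    avg = float(np.mean(flat[np.argpartition(flat, -k)[-k:]]))
    return round(avg * 100, 2), avg > PLAGIARISM_THRESHOLD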
model_quantizer.py DELETED

@@ -1,23 +0,0 @@
-# # model_quantizer.py
-
-# from transformers import AutoModelForSequenceClassification
-# from optimum.onnxruntime import ORTOptimizer, ORTModelForSequenceClassification
-# from optimum.onnxruntime.configuration import OptimizationConfig
-
-# model = ORTModelForSequenceClassification.from_pretrained(
-#     "Essay-Grader/roberta-ai-detector-20250401_232702",
-#     from_transformers=True
-# )
-
-# optimizer = ORTOptimizer.from_pretrained(model)
-# optimization_config = OptimizationConfig(
-#     optimization_level=99,
-#     enable_transformers_specific_optimizations=True,
-#     optimize_for_gpu=True,
-#     fp16=True
-# )
-
-# optimizer.optimize(
-#     save_dir="./optimized_model",
-#     optimization_config=optimization_config
-# )
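A side note on this deleted file: despite its name, it ran ONNX Runtime graph optimization rather than quantization (and from_transformers=True is the older spelling of what current optimum releases call export=True). For comparison, actual dynamic int8 quantization with optimum would look roughly like this sketch; the paths are illustrative and not from the repo:

# Hypothetical sketch: dynamic int8 quantization with optimum's ORTQuantizer,
# which the deleted file's name suggests but its body never performed.
from optimum.onnxruntime import ORTQuantizer
from optimum.onnxruntime.configuration import AutoQuantizationConfig

# "./optimized_model" is assumed to hold a single exported ONNX model
quantizer = ORTQuantizer.from_pretrained("./optimized_model")
# Dynamic quantization needs no calibration data, unlike static quantization
qconfig = AutoQuantizationConfig.avx2(is_static=False)
quantizer.quantize(save_dir="./quantized_model", quantization_config=qconfig)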
requirements.txt CHANGED

@@ -10,16 +10,3 @@ torch==2.0.1
 scikit-learn==1.2.2
 numpy==1.24.3
 
-
-
-
-
-# fastapi>=0.95.2,<1.0.0  # Pinning to major version for stability
-# uvicorn>=0.22.0,<0.25.0  # With compatible ASGI server range
-# transformers>=4.31.0,<4.35.0  # Keeping within known-compatible versions
-# torch>=2.0.1,<2.1.0  # Matching your CUDA/CPU requirements
-# pymupdf>=1.22.5,<1.24.0  # Stable PDF processing
-# python-multipart>=0.0.6  # For file uploads
-# scikit-learn>=1.2.0,<1.4.0  # For cosine_similarity
-# nltk>=3.8.1,<3.9.0  # For sentence tokenization
-# numpy>=1.23.0,<1.26.0  # For numerical operations