Commit f4e8889
Parent(s): 554a307

Done fixing

Files changed:
- convert_model.py +0 -18
- main.py +3 -203
- model_quantizer.py +0 -23
- requirements.txt +0 -13
convert_model.py DELETED

@@ -1,18 +0,0 @@
-# # convert_model.py
-
-# from optimum.onnxruntime import ORTModelForSequenceClassification
-
-# # Convert and optimize model
-# model = ORTModelForSequenceClassification.from_pretrained(
-#     "Essay-Grader/roberta-ai-detector-20250401_232702",
-#     export=True,
-#     provider="CPUExecutionProvider"
-# )
-
-# # Save optimized model
-# model.save_pretrained(
-#     "./optimized_model",
-#     file_name="model_optimized.onnx"
-# )
-
-
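For context, the deleted script's only job was exporting the detector checkpoint to ONNX through optimum. A minimal runnable sketch of that flow, using the model ID and output path from the deleted file (the tokenizer save is an assumed convenience, not in the original):

# Sketch of the ONNX export flow from the deleted convert_model.py.
from optimum.onnxruntime import ORTModelForSequenceClassification
from transformers import AutoTokenizer

model_id = "Essay-Grader/roberta-ai-detector-20250401_232702"

# export=True converts the PyTorch checkpoint to an ONNX graph on load
model = ORTModelForSequenceClassification.from_pretrained(
    model_id,
    export=True,
    provider="CPUExecutionProvider",
)
model.save_pretrained("./optimized_model")

# Saving the tokenizer alongside makes the directory self-contained
# (an addition for completeness, not part of the deleted script)
AutoTokenizer.from_pretrained(model_id).save_pretrained("./optimized_model")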
main.py CHANGED

@@ -212,208 +212,8 @@ async def detect_ai_and_plagiarism(file: UploadFile = File(...), background_task
         raise HTTPException(500, f"Processing failed: {str(e)}")
 
 # Health check endpoint for debugging
-@app.get("/health")
-async def health_check():
-    return {"status": "healthy", "python_version": "3.11"}
-
-
-
-
-# # main.py: Optimized AI Detection and Plagiarism Check API
-
-# import os
-# from pathlib import Path
-# import logging
-# from typing import List, Tuple
-# import re
-# import time
-
-# # Configure cache directories first
-# os.environ["TRANSFORMERS_CACHE"] = "/tmp/.cache/huggingface"
-# os.environ["HF_HOME"] = "/tmp/.cache/huggingface"
-# os.environ["NLTK_DATA"] = "/tmp/.cache/nltk"
-
-# # Create cache directories
-# Path("/tmp/.cache/huggingface").mkdir(parents=True, exist_ok=True)
-# Path("/tmp/.cache/nltk").mkdir(parents=True, exist_ok=True)
-
-# # Import remaining dependencies
-# from fastapi import FastAPI, UploadFile, File, HTTPException
-# from fastapi.middleware.cors import CORSMiddleware
-# from transformers import pipeline, AutoTokenizer, AutoModel
-# import fitz  # PyMuPDF
-# import torch
-# import numpy as np
-# import nltk
-# from nltk.tokenize import sent_tokenize
-# from sklearn.metrics.pairwise import cosine_similarity
-
-# # Configure logging
-# logging.basicConfig(level=logging.INFO)
-# logger = logging.getLogger(__name__)
-
-# # Initialize NLTK data
-# try:
-#     nltk.data.path.append("/tmp/.cache/nltk")
-#     nltk.data.find('tokenizers/punkt')
-#     logger.info("NLTK punkt tokenizer available")
-# except LookupError:
-#     logger.info("Downloading NLTK punkt tokenizer...")
-#     nltk.download('punkt', download_dir="/tmp/.cache/nltk")
-#     nltk.data.path.append("/tmp/.cache/nltk")
-
-# app = FastAPI()
-
-# app.add_middleware(
-#     CORSMiddleware,
-#     allow_origins=["*"],
-#     allow_methods=["POST"],
-#     allow_headers=["*"],
-# )
-
-# # Configuration - Optimized for speed and accuracy
-# # https://huggingface.co/Essay-Grader/roberta-ai-detector-20250401_232702
-# # roberta-base-openai-detector
-
-# MODEL_NAME = "Essay-Grader/roberta-ai-detector-20250401_232702"  # More accurate model
-# EMBEDDING_MODEL = "sentence-transformers/paraphrase-MiniLM-L3-v2"  # Faster model
-# DEVICE = 0 if torch.cuda.is_available() else -1
-# CHUNK_SIZE = 768  # Increased chunk size for better performance
-# MIN_TEXT_LENGTH = 150
-# MAX_TEXT_LENGTH = 8000  # Reduced for faster processing
-# PLAGIARISM_THRESHOLD = 0.78  # Adjusted threshold
-# MAX_SENTENCES = 50  # Limit sentences for plagiarism check
-# TIMEOUT = 25  # Seconds before timeout
-
-# # Health check endpoint
 # @app.get("/health")
-# def health_check():
-#     return {"status": "healthy"}
-
-# # Load models at startup
-# try:
-#     logger.info("Loading optimized AI detection model...")
-#     ai_detector = pipeline(
-#         "text-classification",
-#         model=MODEL_NAME,
-#         device=DEVICE,
-#         truncation=True,
-#         max_length=CHUNK_SIZE,
-#         top_k=1  # Only return top prediction
-#     )
-
-#     logger.info("Loading optimized embedding model...")
-#     tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL)
-#     embed_model = AutoModel.from_pretrained(EMBEDDING_MODEL).to(DEVICE if DEVICE != -1 else "cpu")
-#     embed_model.eval()  # Set to evaluation mode
+# async def health_check():
+#     return {"status": "healthy", "python_version": "3.11"}
 
-
-# except Exception as e:
-#     logger.error(f"Model loading failed: {str(e)}", exc_info=True)
-#     raise RuntimeError(f"Failed to initialize models: {str(e)}")
-
-# def extract_text(pdf_bytes: bytes) -> str:
-#     """Optimized PDF text extraction with timeout check."""
-#     start_time = time.time()
-#     try:
-#         with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
-#             text = []
-#             for page in doc:
-#                 if time.time() - start_time > TIMEOUT/2:  # Half timeout for extraction
-#                     raise TimeoutError("PDF extraction taking too long")
-#                 text.append(page.get_text().strip())
-#             full_text = "\n".join(text).strip()
-
-#         if len(full_text) < MIN_TEXT_LENGTH:
-#             raise ValueError(f"Text too short (min {MIN_TEXT_LENGTH} chars required)")
-#         if len(full_text) > MAX_TEXT_LENGTH:
-#             full_text = full_text[:MAX_TEXT_LENGTH]
-#         return re.sub(r'\s+', ' ', full_text)
-#     except Exception as e:
-#         logger.error(f"PDF processing error: {str(e)}")
-#         raise HTTPException(400, "Invalid PDF content")
-
-# def compute_embeddings(sentences: List[str]) -> np.ndarray:
-#     """Optimized embedding computation with batch processing."""
-#     inputs = tokenizer(
-#         sentences,
-#         padding=True,
-#         truncation=True,
-#         return_tensors="pt",
-#         max_length=128  # Reduced max length for speed
-#     ).to(embed_model.device)
-
-#     with torch.no_grad():
-#         model_output = embed_model(**inputs)
-
-#     # Simplified mean pooling
-#     attention_mask = inputs['attention_mask']
-#     token_embeddings = model_output[0]
-#     input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
-#     return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
-
-# def check_internal_plagiarism(text: str) -> Tuple[float, bool]:
-#     """Optimized plagiarism check with sentence limit."""
-#     try:
-#         sentences = [s for s in sent_tokenize(text) if len(s.split()) > 5][:MAX_SENTENCES]
-#         if len(sentences) < 2:
-#             return 0.0, False
-
-#         embeddings = compute_embeddings(sentences)
-#         sim_matrix = cosine_similarity(embeddings)
-#         np.fill_diagonal(sim_matrix, 0)
-
-#         # Only check top 10 most similar pairs for speed
-#         top_indices = np.argpartition(sim_matrix.flatten(), -10)[-10:]
-#         avg_similarity = np.mean(sim_matrix.flatten()[top_indices])
-
-#         return round(float(avg_similarity) * 100, 2), bool(avg_similarity > PLAGIARISM_THRESHOLD)
-#     except Exception as e:
-#         logger.error(f"Plagiarism check failed: {str(e)}")
-#         return 0.0, False
-
-# @app.post("/detect")
-# async def detect_ai_content(file: UploadFile = File(...)):
-#     """Optimized detection endpoint with timeout."""
-#     start_time = time.time()
-
-#     try:
-#         # Validate file type quickly
-#         if not file.filename.lower().endswith('.pdf'):
-#             raise HTTPException(400, "Only PDF files are accepted")
-
-#         # Extract text with timeout check
-#         text = extract_text(await file.read())
-#         logger.info(f"Processing document with {len(text)} characters")
-
-#         # AI Detection with timeout check
-#         ai_score = 0.0
-#         try:
-#             result = ai_detector(text[:MAX_TEXT_LENGTH])  # Process only first MAX_TEXT_LENGTH chars
-#             ai_score = result[0]['score'] if result[0]['label'] == 'LABEL_1' else 1 - result[0]['score']
-#         except Exception as e:
-#             logger.error(f"AI detection failed: {str(e)}")
-#             raise HTTPException(500, "AI detection processing error")
-
-#         # Plagiarism check with timeout
-#         plagiarism_score, plagiarism_detected = 0.0, False
-#         if time.time() - start_time < TIMEOUT - 5:  # Leave 5 seconds for response
-#             plagiarism_score, plagiarism_detected = check_internal_plagiarism(text)
-
-#         # Final timeout check
-#         if time.time() - start_time > TIMEOUT:
-#             raise HTTPException(500, "Analysis timed out")
-
-#         return {
-#             "ai_generated_percentage": round(float(ai_score) * 100, 2),
-#             "plagiarism_risk": bool(plagiarism_detected),
-#             "plagiarism_score": float(plagiarism_score),
-#             "processing_time": round(time.time() - start_time, 2)
-#         }
-
-#     except HTTPException as he:
-#         raise
-#     except Exception as e:
-#         logger.error(f"Detection error: {str(e)}", exc_info=True)
-#         raise HTTPException(500, f"Analysis failed: {str(e)}")
-
+
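As background for the draft code this commit deletes: it embedded sentences with masked mean pooling and flagged a document when its most similar sentence pairs exceeded a cosine-similarity threshold. A self-contained sketch of that idea, with constants mirroring the deleted EMBEDDING_MODEL and PLAGIARISM_THRESHOLD (illustrative, not the live implementation):

# Sketch of the removed draft's embedding + internal-plagiarism logic.
import numpy as np
import torch
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoModel, AutoTokenizer

EMBEDDING_MODEL = "sentence-transformers/paraphrase-MiniLM-L3-v2"
PLAGIARISM_THRESHOLD = 0.78

tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL)
embed_model = AutoModel.from_pretrained(EMBEDDING_MODEL).eval()

def compute_embeddings(sentences):
    inputs = tokenizer(sentences, padding=True, truncation=True,
                       return_tensors="pt", max_length=128)
    with torch.no_grad():
        token_embeddings = embed_model(**inputs)[0]  # (batch, seq, hidden)
    # Masked mean pooling: average only over non-padding token positions
    mask = inputs["attention_mask"].unsqueeze(-1).expand(token_embeddings.size()).float()
    return ((token_embeddings * mask).sum(1) / mask.sum(1).clamp(min=1e-9)).numpy()

def check_internal_plagiarism(sentences):
    if len(sentences) < 2:
        return 0.0, False
    sim = cosine_similarity(compute_embeddings(sentences))
    np.fill_diagonal(sim, 0)  # ignore each sentence's similarity to itself
    # Average the most similar pairs (the deleted code used the top 10);
    # the min() guard for short inputs is an addition, not in the original
    flat = sim.flatten()
    k = min(10, flat.size)
    avg = float(np.mean(flat[np.argpartition(flat, -k)[-k:]]))
    return round(avg * 100, 2), avg > PLAGIARISM_THRESHOLD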
model_quantizer.py DELETED

@@ -1,23 +0,0 @@
-# # model_quantizer.py
-
-# from transformers import AutoModelForSequenceClassification
-# from optimum.onnxruntime import ORTOptimizer, ORTModelForSequenceClassification
-# from optimum.onnxruntime.configuration import OptimizationConfig
-
-# model = ORTModelForSequenceClassification.from_pretrained(
-#     "Essay-Grader/roberta-ai-detector-20250401_232702",
-#     from_transformers=True
-# )
-
-# optimizer = ORTOptimizer.from_pretrained(model)
-# optimization_config = OptimizationConfig(
-#     optimization_level=99,
-#     enable_transformers_specific_optimizations=True,
-#     optimize_for_gpu=True,
-#     fp16=True
-# )
-
-# optimizer.optimize(
-#     save_dir="./optimized_model",
-#     optimization_config=optimization_config
-# )
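A side note on this deleted file: despite its name, it ran ONNX Runtime graph optimization rather than quantization (and from_transformers=True is the older spelling of what current optimum releases call export=True). For comparison, actual dynamic int8 quantization with optimum would look roughly like this sketch; the paths are illustrative and not from the repo:

# Hypothetical sketch: dynamic int8 quantization with optimum's ORTQuantizer,
# which the deleted file's name suggests but its body never performed.
from optimum.onnxruntime import ORTQuantizer
from optimum.onnxruntime.configuration import AutoQuantizationConfig

# "./optimized_model" is assumed to hold a single exported ONNX model
quantizer = ORTQuantizer.from_pretrained("./optimized_model")
# Dynamic quantization needs no calibration data, unlike static quantization
qconfig = AutoQuantizationConfig.avx2(is_static=False)
quantizer.quantize(save_dir="./quantized_model", quantization_config=qconfig)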
requirements.txt CHANGED

@@ -10,16 +10,3 @@ torch==2.0.1
 scikit-learn==1.2.2
 numpy==1.24.3
 
-
-
-
-
-# fastapi>=0.95.2,<1.0.0  # Pinning to major version for stability
-# uvicorn>=0.22.0,<0.25.0  # With compatible ASGI server range
-# transformers>=4.31.0,<4.35.0  # Keeping within known-compatible versions
-# torch>=2.0.1,<2.1.0  # Matching your CUDA/CPU requirements
-# pymupdf>=1.22.5,<1.24.0  # Stable PDF processing
-# python-multipart>=0.0.6  # For file uploads
-# scikit-learn>=1.2.0,<1.4.0  # For cosine_similarity
-# nltk>=3.8.1,<3.9.0  # For sentence tokenization
-# numpy>=1.23.0,<1.26.0  # For numerical operations