COM ADMIN committed
Commit e4267ec · 1 Parent(s): ccf4e3f

Try another model for accuracy

Files changed (1)
  1. main.py +37 -73
main.py CHANGED
@@ -16,7 +16,7 @@ os.environ["NLTK_DATA"] = "/tmp/.cache/nltk"
 Path("/tmp/.cache/huggingface").mkdir(parents=True, exist_ok=True)
 Path("/tmp/.cache/nltk").mkdir(parents=True, exist_ok=True)
 
-# Import dependencies
+# Imports
 from fastapi import FastAPI, UploadFile, File, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
 from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification, AutoModel
@@ -27,11 +27,11 @@ import nltk
 from nltk.tokenize import sent_tokenize
 from sklearn.metrics.pairwise import cosine_similarity
 
-# Configure logging
+# Logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
-# Initialize NLTK
+# NLTK setup
 try:
     nltk.data.path.append("/tmp/.cache/nltk")
     nltk.data.find('tokenizers/punkt')
@@ -40,8 +40,8 @@ except LookupError:
     nltk.download('punkt', download_dir="/tmp/.cache/nltk")
     nltk.data.path.append("/tmp/.cache/nltk")
 
+# FastAPI init
 app = FastAPI()
-
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
@@ -49,104 +49,81 @@ app.add_middleware(
     allow_headers=["*"],
 )
 
-# Configuration - Using your fine-tuned model
-MODEL_NAME = "Essay-Grader/roberta-ai-detector-20250401_232702"  # Your fine-tuned
-EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"  # Efficient embedding model
+# Configs
+MODEL_NAME = "Essay-Grader/roberta-ai-detector-20250401_232702"
+EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
 DEVICE = 0 if torch.cuda.is_available() else -1
-MAX_TEXT_LENGTH = 6000  # Optimal balance between accuracy and speed
 PLAGIARISM_THRESHOLD = 0.75
-TIMEOUT = 25  # 25 second timeout
-AI_CHUNK_SIZE = 718  # Matches your model's expected input size
-
-# Health check endpoint
-@app.get("/health")
-def health_check():
-    return {"status": "healthy"}
+TIMEOUT = 25
+AI_CHUNK_SIZE = 718
 
 # Load models
 try:
-    logger.info("Loading fine-tuned AI detection model...")
     tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
     ai_model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME).to(DEVICE if DEVICE != -1 else "cpu")
     ai_model.eval()
-
-    logger.info("Loading embedding model...")
+
     embed_tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL)
    embed_model = AutoModel.from_pretrained(EMBEDDING_MODEL).to(DEVICE if DEVICE != -1 else "cpu")
     embed_model.eval()
-
-    logger.info("All models loaded successfully")
+    logger.info("Models loaded")
 except Exception as e:
     logger.error(f"Model loading failed: {str(e)}", exc_info=True)
     raise RuntimeError(f"Failed to initialize models: {str(e)}")
 
 def extract_text(pdf_bytes: bytes) -> str:
-    """Efficient text extraction with length control"""
     try:
         with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
             text = []
             for page in doc:
-                if len('\n'.join(text)) > MAX_TEXT_LENGTH:
-                    break
                 text.append(page.get_text().strip())
-            full_text = re.sub(r'\s+', ' ', '\n'.join(text))[:MAX_TEXT_LENGTH]
-            if len(full_text) < 150:  # Minimum viable text length
+            full_text = re.sub(r'\s+', ' ', '\n'.join(text))
+
+            # Cut off after "References" or similar
+            match = re.search(r'(references|bibliography|works cited)', full_text, re.IGNORECASE)
+            if match:
+                full_text = full_text[:match.start()]
+
+            if len(full_text) < 150:
                 raise ValueError("Text too short")
+
             return full_text
     except Exception as e:
         logger.error(f"PDF processing error: {str(e)}")
         raise HTTPException(400, "Invalid PDF content")
 
 def predict_ai(text: str) -> float:
-    """Run inference using your fine-tuned model"""
-    inputs = tokenizer(
-        text,
-        truncation=True,
-        max_length=AI_CHUNK_SIZE,
-        return_tensors="pt"
-    ).to(ai_model.device)
-
-    with torch.no_grad():
-        outputs = ai_model(**inputs)
-
-    probs = torch.softmax(outputs.logits, dim=1)
-    return float(probs[0][1])  # Assuming label '1' is AI-generated
+    chunks = [text[i:i+AI_CHUNK_SIZE] for i in range(0, len(text), AI_CHUNK_SIZE)]
+    total_score = 0.0
+    for chunk in chunks:
+        inputs = tokenizer(chunk, truncation=True, max_length=AI_CHUNK_SIZE, return_tensors="pt").to(ai_model.device)
+        with torch.no_grad():
+            outputs = ai_model(**inputs)
+        probs = torch.softmax(outputs.logits, dim=1)
+        total_score += float(probs[0][1])
+    avg_score = total_score / len(chunks)
+    return avg_score
 
 def compute_embeddings(sentences: List[str]) -> np.ndarray:
-    """Efficient embedding computation"""
-    inputs = embed_tokenizer(
-        sentences,
-        padding=True,
-        truncation=True,
-        max_length=128,
-        return_tensors="pt"
-    ).to(embed_model.device)
-
+    inputs = embed_tokenizer(sentences, padding=True, truncation=True, max_length=128, return_tensors="pt").to(embed_model.device)
     with torch.no_grad():
         outputs = embed_model(**inputs)
-
-    # Mean pooling
     attention_mask = inputs['attention_mask']
     last_hidden = outputs.last_hidden_state
     return (last_hidden * attention_mask.unsqueeze(-1)).sum(1) / attention_mask.sum(1, keepdim=True)
 
 def check_plagiarism(text: str) -> Tuple[float, bool]:
-    """Optimized plagiarism check"""
     try:
-        sentences = [s for s in sent_tokenize(text) if 5 < len(s.split()) < 100][:40]  # Limit sentences
+        sentences = [s for s in sent_tokenize(text) if 5 < len(s.split()) < 100][:40]
         if len(sentences) < 2:
             return 0.0, False
-
         embeddings = compute_embeddings(sentences).cpu().numpy()
         sim_matrix = cosine_similarity(embeddings)
         np.fill_diagonal(sim_matrix, 0)
-
-        # Check top 10% most similar pairs
         n = len(sim_matrix)
         top_k = max(1, int(0.1 * n * (n - 1) / 2))
         top_indices = np.argpartition(sim_matrix.flatten(), -top_k)[-top_k:]
         avg_similarity = float(np.mean(sim_matrix.flatten()[top_indices]))
-
         return round(avg_similarity * 100, 2), avg_similarity > PLAGIARISM_THRESHOLD
     except Exception as e:
         logger.error(f"Plagiarism check error: {str(e)}")
@@ -154,37 +131,25 @@ def check_plagiarism(text: str) -> Tuple[float, bool]:
 
 @app.post("/detect")
 async def detect_ai_content(file: UploadFile = File(...)):
-    """Optimized endpoint using your fine-tuned model"""
     start_time = time.time()
-
     try:
-        # Fast validation
         if not file.filename.lower().endswith('.pdf'):
             raise HTTPException(400, "Only PDF files are accepted")
 
-        # Extract text
         text = extract_text(await file.read())
-        logger.info(f"Processing {len(text)} characters")
+        logger.info(f"Text length: {len(text)}")
 
-        # AI Detection (using your fine-tuned model)
-        ai_score = predict_ai(text[:AI_CHUNK_SIZE])
+        ai_score = predict_ai(text)
         ai_percentage = round(ai_score * 100, 2)
 
-        # Plagiarism check (if time permits)
-        plagiarism_score, plagiarism_risk = 0.0, False
-        if time.time() - start_time < TIMEOUT - 5:  # 5 second buffer
-            plagiarism_score, plagiarism_risk = check_plagiarism(text)
+        if time.time() - start_time < TIMEOUT - 5:
+            check_plagiarism(text)  # Run, but don’t return
 
-        # Final timeout check
         if time.time() - start_time > TIMEOUT:
             raise HTTPException(500, "Processing timed out")
 
         return {
-            "ai_generated_percentage": ai_percentage,
-            "plagiarism_risk": plagiarism_risk,
-            "plagiarism_score": plagiarism_score,
-            "processing_time": round(time.time() - start_time, 2),
-            "model_used": MODEL_NAME  # Show which model was used
+            "ai_generated_percentage": ai_percentage
         }
 
     except HTTPException:
@@ -196,7 +161,6 @@ async def detect_ai_content(file: UploadFile = File(...)):
 
 
 
-
 # # main.py: Optimized AI Detection and Plagiarism Check API
 
 # import os
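
For reference, a minimal client sketch for exercising the endpoint as it stands after this commit. This is an illustrative assumption, not part of the repository: the host, port, and file name are made up, and after this change the JSON response carries only "ai_generated_percentage".

# Hypothetical usage sketch (assumes the API is served at localhost:8000).
import requests

def detect(pdf_path: str, base_url: str = "http://localhost:8000") -> float:
    """POST a PDF to /detect and return the reported AI-generated percentage."""
    with open(pdf_path, "rb") as f:
        resp = requests.post(
            f"{base_url}/detect",
            files={"file": (pdf_path, f, "application/pdf")},
        )
    resp.raise_for_status()
    return resp.json()["ai_generated_percentage"]

if __name__ == "__main__":
    print(f"AI-generated: {detect('essay.pdf')}%")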