Commit 164dd9f (parent: 1c12f42)

Fixed the main.py

main.py CHANGED
@@ -1,3 +1,5 @@
+# main.py: API for Detection and Plagiarism Check
+
 import os
 import re
 import time
@@ -5,7 +7,7 @@ import logging
 from pathlib import Path
 from typing import List, Tuple
 
-from fastapi import FastAPI, UploadFile, File, HTTPException
+from fastapi import FastAPI, UploadFile, File, HTTPException, BackgroundTasks
 from fastapi.middleware.cors import CORSMiddleware
 
 import fitz  # PyMuPDF
@@ -51,67 +53,79 @@ app.add_middleware(
 )
 
 # Model configs
-MODEL_NAME = "Essay-Grader/roberta-ai-detector-20250401_232702"
-EMBEDDING_MODEL = "sentence-transformers/
+MODEL_NAME = "Essay-Grader/roberta-ai-detector-20250401_232702"
+EMBEDDING_MODEL = "sentence-transformers/paraphrase-MiniLM-L3-v2"
 DEVICE = 0 if torch.cuda.is_available() else -1
-MAX_TEXT_LENGTH = 10000
+MAX_TEXT_LENGTH = 10000
 AI_CHUNK_SIZE = 512
 PLAGIARISM_THRESHOLD = 0.75
-TIMEOUT =
+TIMEOUT = 30
+MAX_SENTENCES = 20
 
 # Load models
-tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
-ai_model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME).to(
-    DEVICE if DEVICE != -1 else "cpu"
-)
-ai_model.eval()
-
-embed_tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL)
-embed_model = AutoModel.from_pretrained(EMBEDDING_MODEL).to(
-    DEVICE if DEVICE != -1 else "cpu"
-)
-embed_model.eval()
-
-# Health check
-# @app.get("/health")
-# def health_check():
-#     return {"status": "healthy"}
+try:
+    logger.info("Loading models...")
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+    ai_model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME).to(
+        DEVICE if DEVICE != -1 else "cpu"
+    )
+    ai_model.eval()
 
+    embed_tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL)
+    embed_model = AutoModel.from_pretrained(EMBEDDING_MODEL).to(
+        DEVICE if DEVICE != -1 else "cpu"
+    )
+    embed_model.eval()
+    logger.info("Models loaded successfully")
+except Exception as e:
+    logger.error(f"Model loading failed: {str(e)}", exc_info=True)
+    raise RuntimeError(f"Failed to initialize models: {str(e)}")
 
 def extract_text(pdf_bytes: bytes) -> str:
     try:
+        start_time = time.time()
         with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
             text = []
             for page in doc:
+                if time.time() - start_time > TIMEOUT / 3:  # Early timeout for extraction
+                    raise TimeoutError("PDF extraction timed out")
                 page_text = page.get_text().strip()
                 if "reference" in page_text.lower():
-                    break
+                    break
                 text.append(page_text)
 
         full_text = re.sub(r"\s+", " ", "\n".join(text))[:MAX_TEXT_LENGTH]
         if len(full_text) < 150:
             raise ValueError("Text too short")
+        logger.info(f"Extracted text: {len(full_text)} characters")
         return full_text
     except Exception as e:
         logger.error(f"PDF error: {str(e)}")
         raise HTTPException(400, "Invalid PDF")
 
+async def predict_ai(text: str) -> float:
+    try:
+        async with asyncio.timeout(TIMEOUT / 2):  # Per-task timeout
+            inputs = tokenizer(
+                text,
+                truncation=True,
+                max_length=AI_CHUNK_SIZE,
+                return_tensors="pt",
+            ).to(ai_model.device)
+
+            with torch.no_grad():
+                outputs = ai_model(**inputs)
+                probs = torch.softmax(outputs.logits, dim=1)
+            logger.info("AI detection completed")
+            return float(probs[0][1])  # AI-generated probability
+    except asyncio.TimeoutError:
+        logger.error("AI detection timed out")
+        raise HTTPException(500, "AI detection timed out")
+    except Exception as e:
+        logger.error(f"AI detection error: {str(e)}")
+        raise HTTPException(500, f"AI detection failed: {str(e)}")
 
-def predict_ai(text: str) -> float:
-    inputs = tokenizer(
-        text,
-        truncation=True,
-        max_length=AI_CHUNK_SIZE,
-        return_tensors="pt",
-    ).to(ai_model.device)
-
-    with torch.no_grad():
-        outputs = ai_model(**inputs)
-        probs = torch.softmax(outputs.logits, dim=1)
-        return float(probs[0][1])  # AI-generated probability
-
-
-def compute_embeddings(sentences: List[str]) -> np.ndarray:
+async def compute_embeddings(sentences: List[str]) -> np.ndarray:
     inputs = embed_tokenizer(
         sentences,
         padding=True,
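A note on the asyncio.timeout() guard in predict_ai() above: that context manager was added in Python 3.11, and it can only cancel a coroutine at an await point, so the synchronous tokenizer and forward-pass calls inside it are never actually pre-empted. A minimal sketch of one alternative for older interpreters, reusing the module's tokenizer, ai_model, AI_CHUNK_SIZE, and TIMEOUT globals (the name predict_ai_compat and the to_thread/wait_for combination are illustrative, not part of this commit):

import asyncio

import torch


async def predict_ai_compat(text: str) -> float:
    # Hypothetical variant of predict_ai for Python < 3.11; assumes the
    # tokenizer and ai_model globals from this module are already loaded.
    def _infer() -> float:
        inputs = tokenizer(
            text,
            truncation=True,
            max_length=AI_CHUNK_SIZE,
            return_tensors="pt",
        ).to(ai_model.device)
        with torch.no_grad():
            outputs = ai_model(**inputs)
            probs = torch.softmax(outputs.logits, dim=1)
        return float(probs[0][1])  # AI-generated probability

    # to_thread() keeps the blocking forward pass off the event loop;
    # wait_for() raises asyncio.TimeoutError once the deadline passes.
    return await asyncio.wait_for(asyncio.to_thread(_infer), timeout=TIMEOUT / 2)

Here wait_for() surfaces the timeout to the caller, although the worker thread itself still runs to completion in the background.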
@@ -127,34 +141,38 @@ def compute_embeddings(sentences: List[str]) -> np.ndarray:
     last_hidden = outputs.last_hidden_state
     return (last_hidden * attention_mask.unsqueeze(-1)).sum(1) / attention_mask.sum(
         1, keepdim=True
-    )
-
+    ).cpu().numpy()
 
-def check_plagiarism(text: str) -> Tuple[float, bool]:
+async def check_plagiarism(text: str) -> Tuple[float, bool]:
     try:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        async with asyncio.timeout(TIMEOUT / 2):  # Per-task timeout
+            sentences = [
+                s for s in sent_tokenize(text) if 5 < len(s.split()) < 100
+            ][:MAX_SENTENCES]
+            if len(sentences) < 2:
+                logger.info("Not enough sentences for plagiarism check")
+                return 0.0, False
+
+            embeddings = await compute_embeddings(sentences)
+            sim_matrix = cosine_similarity(embeddings)
+            np.fill_diagonal(sim_matrix, 0)
+
+            n = len(sim_matrix)
+            top_k = max(1, int(0.1 * n * (n - 1) / 2))
+            top_indices = np.argpartition(sim_matrix.flatten(), -top_k)[-top_k:]
+            avg_similarity = float(np.mean(sim_matrix.flatten()[top_indices]))
+
+            logger.info("Plagiarism check completed")
+            return round(avg_similarity * 100, 2), avg_similarity > PLAGIARISM_THRESHOLD
+    except asyncio.TimeoutError:
+        logger.error("Plagiarism check timed out")
+        return 0.0, False
     except Exception as e:
         logger.error(f"Plagiarism check error: {str(e)}")
         return 0.0, False
 
-
 @app.post("/detect")
-async def detect_ai_and_plagiarism(file: UploadFile = File(...)):
+async def detect_ai_and_plagiarism(file: UploadFile = File(...), background_tasks: BackgroundTasks = None):
     start_time = time.time()
 
     try:
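The rebuilt check_plagiarism() scores a document by the mean cosine similarity of its most similar sentence pairs; sent_tokenize presumably comes from NLTK, which needs its punkt data available at runtime. One detail worth noting: sim_matrix is symmetric, so the flattened matrix holds every pair twice while top_k counts unique pairs, meaning the "top 10%" effectively averages roughly the top 5% of distinct pairs. A self-contained toy run of the same top-k arithmetic, with made-up embeddings:

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Three toy sentence embeddings (rows); the values are illustrative only.
emb = np.array([[1.0, 0.0], [0.9, 0.1], [0.0, 1.0]])

sim = cosine_similarity(emb)  # symmetric 3x3 similarity matrix
np.fill_diagonal(sim, 0)      # drop self-similarity

n = len(sim)
top_k = max(1, int(0.1 * n * (n - 1) / 2))            # ~10% of unique pairs
top_idx = np.argpartition(sim.flatten(), -top_k)[-top_k:]
avg = float(np.mean(sim.flatten()[top_idx]))

print(round(avg * 100, 2), avg > 0.75)  # -> 99.39 True for these vectors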
@@ -164,14 +182,8 @@ async def detect_ai_and_plagiarism(file: UploadFile = File(...)):
         pdf_data = await file.read()
         text = extract_text(pdf_data)
 
-        async def run_ai():
-            return predict_ai(text)
-
-        async def run_plagiarism():
-            return check_plagiarism(text)
-
-        ai_future = asyncio.create_task(run_ai())
-        plagiarism_future = asyncio.create_task(run_plagiarism())
+        ai_future = asyncio.create_task(predict_ai(text))
+        plagiarism_future = asyncio.create_task(check_plagiarism(text))
 
         ai_score, (plag_score, plag_risk) = await asyncio.gather(
             ai_future, plagiarism_future
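This hunk replaces the run_ai()/run_plagiarism() wrappers with tasks created directly from predict_ai() and check_plagiarism(), which works because both are now coroutine functions. Since asyncio.gather() wraps bare coroutines in tasks on its own, an equivalent form could skip create_task() entirely; a sketch (run_checks is a hypothetical name, the two coroutines are the ones defined above):

import asyncio

async def run_checks(text: str):
    # gather() schedules both coroutines concurrently and preserves
    # argument order, so no explicit create_task() is needed here.
    ai_score, (plag_score, plag_risk) = await asyncio.gather(
        predict_ai(text), check_plagiarism(text)
    )
    return ai_score, plag_score, plag_risk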
@@ -179,19 +191,23 @@ async def detect_ai_and_plagiarism(file: UploadFile = File(...)):
 
         total_time = time.time() - start_time
         if total_time > TIMEOUT:
+            logger.error("Processing exceeded timeout")
             raise HTTPException(500, "Processing timed out")
 
+        logger.info(f"Processing completed in {total_time:.2f} seconds")
         return {
             "ai_generated_percentage": round(ai_score * 100, 2),
             "plagiarism_percentage": plag_score,
-
+            "plagiarism_risk": plag_risk,
+            "processing_time": round(total_time, 2),
         }
 
+    except HTTPException as he:
+        raise
     except Exception as e:
         logger.error(f"Error: {str(e)}", exc_info=True)
         raise HTTPException(500, f"Processing failed: {str(e)}")
-
-
+
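With these changes, /detect accepts a PDF upload and returns the enlarged response shape from the last hunk. A hypothetical client call with httpx, assuming the service is running at localhost:8000 and essay.pdf is any local PDF:

import httpx

with open("essay.pdf", "rb") as f:
    resp = httpx.post(
        "http://localhost:8000/detect",
        files={"file": ("essay.pdf", f, "application/pdf")},
        timeout=60.0,  # client-side budget; the server enforces its own TIMEOUT
    )
resp.raise_for_status()
print(resp.json())
# Illustrative shape: {"ai_generated_percentage": 12.34,
#   "plagiarism_percentage": 8.9, "plagiarism_risk": False,
#   "processing_time": 3.21}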