Spaces:

Essay-Grader
/

Detection_and_Plagiarism_Check

Running

App Files Community

Essay-Grader committed on May 2

Commit

109611c

1 Parent(s): c7b743d

Fix the api

Browse files

Files changed (1) hide show

main.py +63 -56

main.py CHANGED Viewed

@@ -1,4 +1,5 @@
 # main.py: AI Detection and Plagiarism Check API
 from fastapi import FastAPI, UploadFile, File, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
 from transformers import pipeline
@@ -6,6 +7,7 @@ from sentence_transformers import SentenceTransformer, util
 import fitz
 import logging
 import torch
 # Configure logging
 logging.basicConfig(level=logging.INFO)
@@ -25,7 +27,8 @@ AI_MODEL = "Hello-SimpleAI/chatgpt-detector-roberta"
 PLAGIARISM_MODEL = "sentence-transformers/all-mpnet-base-v2"
 DEVICE = 0 if torch.cuda.is_available() else -1
 MAX_SEQ_LENGTH = 512
-CHUNK_SIZE = 500  # Characters per chunk
 # Initialize models
 ai_pipeline = None
@@ -35,110 +38,114 @@ plagiarism_model = None
 def initialize_models():
     global ai_pipeline, plagiarism_model
     try:
-        # Configure AI pipeline with proper text handling
         ai_pipeline = pipeline(
             "text-classification",
             model=AI_MODEL,
             device=DEVICE,
-            padding="max_length",
             truncation=True,
             max_length=MAX_SEQ_LENGTH
         )
-        logger.info("AI model loaded successfully")
-        # Configure plagiarism detector
         plagiarism_model = SentenceTransformer(PLAGIARISM_MODEL)
-        logger.info("Plagiarism model loaded successfully")
     except Exception as e:
-        logger.error(f"Model initialization failed: {str(e)}", exc_info=True)
-        raise RuntimeError(f"Model loading failed: {str(e)}")
 def extract_text(pdf_bytes: bytes) -> str:
-    """Extract and validate PDF text content"""
     try:
         with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
-            text = " ".join(page.get_text() for page in doc).strip()
-            if not text:
-                raise ValueError("Empty PDF file")
             if len(text) < 100:
-                raise ValueError("Text too short (min 100 characters)")
             return text
     except Exception as e:
-        logger.error(f"PDF extraction error: {str(e)}")
-        raise HTTPException(400, f"PDF processing failed: {str(e)}")
 def analyze_ai_content(text: str) -> float:
-    """Analyze text for AI-generated content with chunking"""
     try:
-        # Split text into manageable chunks
-        chunks = [text[i:i+CHUNK_SIZE] for i in range(0, len(text), CHUNK_SIZE)]
-        if not chunks:
-            return 0.0
-        ai_scores = []
         for chunk in chunks:
             result = ai_pipeline(chunk)
-            score = next(
-                (r['score'] for r in result if r['label'] in ['AI', 'Fake']),
-                0.0
-            )
-            ai_scores.append(score)
-        return round((sum(ai_scores) / len(ai_scores)) * 100, 2)
     except Exception as e:
-        logger.error(f"AI analysis failed: {str(e)}", exc_info=True)
-        raise HTTPException(500, "AI analysis error")
 def analyze_plagiarism(text: str) -> float:
-    """Check for potential plagiarism"""
     try:
-        # Sample reference texts - replace with your database
         reference_texts = [
-            "Academic integrity is fundamental to learning.",
-            "Plagiarism undermines educational values.",
-            "Original thought is essential for innovation."
         ]
-        # Encode and compare
-        doc_emb = plagiarism_model.encode(text, convert_to_tensor=True)
         ref_embs = plagiarism_model.encode(reference_texts, convert_to_tensor=True)
-        similarities = util.cos_sim(doc_emb, ref_embs)[0]
-        # Calculate similarity percentage
-        match_count = sum(s > 0.75 for s in similarities)
-        return round((match_count / len(reference_texts)) * 100, 2)
     except Exception as e:
-        logger.error(f"Plagiarism check failed: {str(e)}", exc_info=True)
         return 0.0
 @app.post("/analyze")
 async def analyze_essay(file: UploadFile = File(...)):
-    """Main analysis endpoint"""
     try:
-        if not file.filename.lower().endswith(".pdf"):
-            raise HTTPException(400, "Only PDF files accepted")
-        # Process PDF
-        pdf_bytes = await file.read()
-        text = extract_text(pdf_bytes)
-        # Perform analyses
-        ai_score = analyze_ai_content(text)
-        plagiarism_score = analyze_plagiarism(text)
         return {
-            "ai_generated_percentage": ai_score,
-            "plagiarism_risk": plagiarism_score
         }
-    except HTTPException as he:
         raise
     except Exception as e:
-        logger.error(f"Unexpected error: {str(e)}", exc_info=True)
-        raise HTTPException(500, "Processing failed")
 # from fastapi import FastAPI, UploadFile, File, HTTPException, BackgroundTasks

 # main.py: AI Detection and Plagiarism Check API
 from fastapi import FastAPI, UploadFile, File, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
 from transformers import pipeline
 import fitz
 import logging
 import torch
+import numpy as np
 # Configure logging
 logging.basicConfig(level=logging.INFO)
 PLAGIARISM_MODEL = "sentence-transformers/all-mpnet-base-v2"
 DEVICE = 0 if torch.cuda.is_available() else -1
 MAX_SEQ_LENGTH = 512
+CHUNK_SIZE = 400  # Reduced chunk size for token safety
+SIMILARITY_THRESHOLD = 0.65  # Adjusted threshold
 # Initialize models
 ai_pipeline = None
 def initialize_models():
     global ai_pipeline, plagiarism_model
     try:
+        # Verify model labels
         ai_pipeline = pipeline(
             "text-classification",
             model=AI_MODEL,
             device=DEVICE,
+            padding=True,
             truncation=True,
             max_length=MAX_SEQ_LENGTH
         )
+        logger.info(f"AI model labels: {ai_pipeline.model.config.label2id}")
+        # Initialize plagiarism model
         plagiarism_model = SentenceTransformer(PLAGIARISM_MODEL)
+        logger.info("Models loaded successfully")
     except Exception as e:
+        logger.error(f"Initialization failed: {str(e)}", exc_info=True)
+        raise RuntimeError(f"Model loading error: {str(e)}")
 def extract_text(pdf_bytes: bytes) -> str:
+    """Improved PDF text extraction"""
     try:
         with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
+            text = "\n".join([page.get_text() for page in doc]).strip()
             if len(text) < 100:
+                raise ValueError("Minimum 100 characters required")
             return text
     except Exception as e:
+        logger.error(f"PDF Error: {str(e)}")
+        raise HTTPException(400, "Invalid PDF content")
 def analyze_ai_content(text: str) -> float:
+    """Robust AI detection with label verification"""
     try:
+        # Verify model labels
+        label_mapping = ai_pipeline.model.config.label2id
+        ai_labels = [k for k in label_mapping if k.lower() in ['ai', 'fake']]
+        if not ai_labels:
+            raise ValueError("No valid AI labels found in model")
+        # Process in token-aware chunks
+        chunks = [text[i:i+CHUNK_SIZE] for i in range(0, len(text), CHUNK_SIZE)]
+        scores = []
         for chunk in chunks:
             result = ai_pipeline(chunk)
+            for item in result:
+                if item['label'] in ai_labels:
+                    scores.append(item['score'])
+        return round((sum(scores)/len(scores)) * 100, 2) if scores else 0.0
     except Exception as e:
+        logger.error(f"AI Analysis Error: {str(e)}")
+        raise HTTPException(500, "AI analysis failed")
 def analyze_plagiarism(text: str) -> float:
+    """Enhanced plagiarism detection"""
     try:
+        # Use meaningful reference texts
         reference_texts = [
+            "The importance of academic integrity cannot be overstated.",
+            "Plagiarism detection systems help maintain educational standards.",
+            "Original work demonstrates true learning and understanding.",
+            "Proper citation is essential for avoiding plagiarism.",
+            "Educational institutions take academic honesty very seriously."
         ]
+        # Sentence-level comparison
+        sentences = [s.strip() for s in text.split('.') if len(s.strip()) > 20]
+        if not sentences:
+            return 0.0
+        # Batch processing
+        sentence_embs = plagiarism_model.encode(sentences, convert_to_tensor=True)
         ref_embs = plagiarism_model.encode(reference_texts, convert_to_tensor=True)
+        # Calculate similarities
+        similarities = util.cos_sim(sentence_embs, ref_embs)
+        max_similarities = np.max(similarities.cpu().numpy(), axis=1)
+        # Calculate percentage above threshold
+        match_count = sum(s > SIMILARITY_THRESHOLD for s in max_similarities)
+        return round((match_count / len(sentences)) * 100, 2)
     except Exception as e:
+        logger.error(f"Plagiarism Error: {str(e)}")
         return 0.0
 @app.post("/analyze")
 async def analyze_essay(file: UploadFile = File(...)):
     try:
+        if not file.filename.lower().endswith('.pdf'):
+            raise HTTPException(400, "PDF files only")
+        text = extract_text(await file.read())
         return {
+            "ai_generated_percentage": analyze_ai_content(text),
+            "plagiarism_risk": analyze_plagiarism(text)
         }
+    except HTTPException:
         raise
     except Exception as e:
+        logger.error(f"Critical Error: {str(e)}")
+        raise HTTPException(500, "Analysis failed")
 # from fastapi import FastAPI, UploadFile, File, HTTPException, BackgroundTasks