Commit f3d44f4
Parent(s): f177806

Further changed the model
app.py CHANGED
@@ -1,30 +1,34 @@
# app.py: AI Detection and Plagiarism Check API

-
import os
import re
import torch
import logging
import tempfile
import numpy as np
-from fastapi import FastAPI, UploadFile, File, HTTPException
from fastapi.responses import JSONResponse
-from transformers import AutoTokenizer, AutoModelForSequenceClassification,
from sentence_transformers import SentenceTransformer
from PyPDF2 import PdfReader
from sklearn.metrics.pairwise import cosine_similarity

# Configuration
-
-
-
-
-
-
-
-
-
-OVERLAP = 1

app = FastAPI(title="Essay Analyzer", version="2.0.0")

@@ -39,32 +43,65 @@ logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Global models
-
-
-
-
-

def load_models():
-
-
    try:
-        logger.info("Initializing

-        # Load
-
-
-
-            num_labels=2,
-            trust_remote_code=True
-        ).eval()

-
-
-

        # Load embedding model
-        embedder = SentenceTransformer(SENTENCE_MODEL)

        logger.info("All models initialized successfully")
        return True
@@ -73,135 +110,341 @@ def load_models():
        logger.error(f"Model initialization failed: {str(e)}")
        return False

-def
-    """Enhanced PDF text extraction"""
    try:
-        with tempfile.NamedTemporaryFile() as tmp:
            tmp.write(file.file.read())
-            tmp.
-
-
-
-
-
-
    except Exception as e:
        logger.error(f"PDF processing error: {str(e)}")
-        raise HTTPException(500, "PDF processing failed")

def calculate_perplexity(text: str) -> float:
-    """
    try:
-
-
-
-
-
-
-
-
-
-
    except Exception as e:
-        logger.error(f"Perplexity error: {str(e)}")
        return 100.0

def analyze_text(text: str) -> dict:
-    """
    try:
-
-
            raise HTTPException(400, "Text too short for accurate analysis")

-        #
-
-
-
-
-
-
-
-
-        for chunk in chunks:
-            inputs = detection_tokenizer(
-                chunk,
-                padding=True,
-                truncation=True,
-                max_length=MAX_SEQ_LENGTH,
-                return_tensors="pt"
-            )
-            with torch.no_grad():
-                outputs = detection_model(**inputs)
-                probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
-                ai_confidences.append(probs[0][1].item())  # AI class probability

-        #
        perplexity = calculate_perplexity(text)
-        perplexity_score = max(0, min(1, (perplexity - 20) / 60))  # 20-80 → 0-1

-        #
-
-
-

-        #
-
-
-        np.fill_diagonal(similarity, 0)
-        plagiarism_score = (similarity > PLAGIARISM_THRESHOLD).mean() * 100

        return {
-            "human_written": round(100 - ai_percentage, 2),
            "ai_generated": round(ai_percentage, 2),
            "plagiarism_risk": round(plagiarism_score, 2),
            "perplexity": round(perplexity, 2),
-            "
        }

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Analysis error: {str(e)}")
-        raise HTTPException(500, "Analysis failed")

@app.on_event("startup")
async def startup():
-    logger.info("Starting
    if not load_models():
        logger.error("Service initialization failed")
        raise RuntimeError("Failed to initialize models")

@app.post("/analyze")
-async def analyze(file: UploadFile = File(...)):
    try:
        if not file.filename.lower().endswith(".pdf"):
            raise HTTPException(400, "Only PDF files accepted")

-
-

    except HTTPException as he:
        raise he
    except Exception as e:
        logger.error(f"Unexpected error: {str(e)}")
-        raise HTTPException(500, "Internal server error")

@app.get("/health")
async def health():
    return {
        "status": "operational",
-        "
-
-
    }

@app.get("/")
async def root():
    return {
        "service": "Essay Analyzer",
        "version": "2.0.0",
-        "
-            "/analyze": "POST - Analyze PDF
            "/health": "GET - Service status"
        }
    }

# app.py: AI Detection and Plagiarism Check API

import os
import re
import torch
import logging
import tempfile
import numpy as np
+from fastapi import FastAPI, UploadFile, File, HTTPException, Form, BackgroundTasks
from fastapi.responses import JSONResponse
+from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from sentence_transformers import SentenceTransformer
from PyPDF2 import PdfReader
from sklearn.metrics.pairwise import cosine_similarity
+import nltk
+import scipy.stats
+from typing import List, Optional
+
+# Download NLTK data
+nltk.download('punkt', quiet=True)

# Configuration
+PRIMARY_DETECTOR = "roberta-base-openai-detector"  # More reliable base model
+SECONDARY_DETECTOR = "Hello-SimpleAI/chatgpt-detector-roberta"  # Current model as backup
+TERTIARY_DETECTOR = "mitchelldehaven/roberta-base-openai-detector-balanced"  # Balanced detector
+PERPLEXITY_MODEL = "gpt2-medium"  # Larger model for better perplexity estimation
+SENTENCE_MODEL = "sentence-transformers/all-mpnet-base-v2"  # Upgraded sentence embeddings
+BATCH_SIZE = 8
+MAX_TEXT_LENGTH = 10000  # Increased for better analysis
+CHUNK_SIZE = 5  # Sentences per chunk
+OVERLAP = 2  # Increased overlap for better continuity

app = FastAPI(title="Essay Analyzer", version="2.0.0")

...
logger = logging.getLogger(__name__)

# Global models
+models = {
+    "primary": None,
+    "secondary": None,
+    "tertiary": None,
+    "perplexity": None,
+    "embedder": None
+}

def load_models():
+    """Load and initialize all models with optimized settings"""
    try:
+        logger.info("Initializing ensemble models...")
+
+        # Primary detector
+        models["primary"] = pipeline(
+            "text-classification",
+            model=PRIMARY_DETECTOR,
+            tokenizer=PRIMARY_DETECTOR,
+            device=0 if torch.cuda.is_available() else -1,
+            top_k=None  # Return all classes
+        )
+
+        # Secondary detector
+        models["secondary"] = pipeline(
+            "text-classification",
+            model=SECONDARY_DETECTOR,
+            tokenizer=SECONDARY_DETECTOR,
+            device=0 if torch.cuda.is_available() else -1,
+            top_k=None
+        )
+
+        # Tertiary detector
+        models["tertiary"] = pipeline(
+            "text-classification",
+            model=TERTIARY_DETECTOR,
+            tokenizer=TERTIARY_DETECTOR,
+            device=0 if torch.cuda.is_available() else -1,
+            top_k=None
+        )

+        # Load perplexity model with FP16 optimization if available
+        from transformers import GPT2LMHeadModel, GPT2TokenizerFast
+        perp_tokenizer = GPT2TokenizerFast.from_pretrained(PERPLEXITY_MODEL)
+        perp_model = GPT2LMHeadModel.from_pretrained(PERPLEXITY_MODEL)

+        if torch.cuda.is_available():
+            perp_model = perp_model.half().cuda()  # Use FP16 on GPU
+        else:
+            perp_model = perp_model.eval()
+
+        models["perplexity"] = {
+            "model": perp_model,
+            "tokenizer": perp_tokenizer
+        }

        # Load embedding model
+        models["embedder"] = SentenceTransformer(SENTENCE_MODEL)
+        if torch.cuda.is_available():
+            models["embedder"].to(torch.device('cuda'))

        logger.info("All models initialized successfully")
        return True

...

        logger.error(f"Model initialization failed: {str(e)}")
        return False

+def extract_text_from_pdf(file: UploadFile) -> str:
+    """Enhanced PDF text extraction with error handling"""
    try:
+        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
            tmp.write(file.file.read())
+            tmp_path = tmp.name
+
+        reader = PdfReader(tmp_path)
+        text = []
+        for page in reader.pages:
+            page_text = page.extract_text() or ""
+            # Clean up text formatting
+            page_text = re.sub(r'\s+', ' ', page_text)
+            text.append(page_text.strip())
+
+        os.unlink(tmp_path)  # Clean up temp file
+        complete_text = " ".join(text).strip()
+
+        # Remove excessive whitespace and normalize
+        complete_text = re.sub(r'\s+', ' ', complete_text)
+
+        return complete_text[:MAX_TEXT_LENGTH]
    except Exception as e:
        logger.error(f"PDF processing error: {str(e)}")
+        raise HTTPException(500, "PDF processing failed: " + str(e))
+
+def get_segmented_texts(text: str) -> List[str]:
+    """Create multiple segmentations for robust analysis"""
+    sentences = nltk.sent_tokenize(text)
+
+    # Create segments of different sizes for analysis
+    segments = []
+
+    # Full text (if under limit)
+    if len(text) <= 1024:
+        segments.append(text)
+
+    # Regular chunks with overlap
+    for i in range(0, len(sentences), CHUNK_SIZE - OVERLAP):
+        chunk = ' '.join(sentences[i:i+CHUNK_SIZE])
+        if len(chunk) >= 100:  # Minimum meaningful length
+            segments.append(chunk)
+
+    # Paragraph-based segments (using double newlines as separators)
+    paragraphs = re.split(r'\n\s*\n', text)
+    for para in paragraphs:
+        clean_para = para.strip()
+        if len(clean_para) >= 200:  # Longer paragraph threshold
+            segments.append(clean_para)
+
+    return segments

def calculate_perplexity(text: str) -> float:
+    """Advanced perplexity calculation with sliding window"""
+    perp_model = models["perplexity"]["model"]
+    perp_tokenizer = models["perplexity"]["tokenizer"]
+
    try:
+        # Break into smaller chunks for accurate perplexity
+        sentences = nltk.sent_tokenize(text)
+        if not sentences:
+            return 100.0
+
+        # Process in sliding windows of 5 sentences
+        window_size = 5
+        stride = 2
+        perplexities = []
+
+        for i in range(0, max(1, len(sentences) - window_size + 1), stride):
+            window_text = " ".join(sentences[i:i+window_size])
+            if len(window_text) < 10:
+                continue
+
+            encodings = perp_tokenizer(window_text, return_tensors="pt", truncation=True, max_length=512)
+            if torch.cuda.is_available():
+                encodings = {k: v.cuda() for k, v in encodings.items()}
+
+            with torch.no_grad():
+                outputs = perp_model(**encodings, labels=encodings["input_ids"])
+                neg_log_likelihood = outputs.loss
+                perplexity = torch.exp(neg_log_likelihood).item()
+                perplexities.append(perplexity)
+
+        # Filter out extreme outliers
+        if perplexities:
+            filtered_perps = [p for p in perplexities if p < 1000]  # Remove extreme values
+            if filtered_perps:
+                return np.median(filtered_perps)  # Median is more robust than mean
+
+        return 100.0  # Default fallback
    except Exception as e:
+        logger.error(f"Perplexity calculation error: {str(e)}")
        return 100.0

+def detect_linguistic_patterns(text: str) -> dict:
+    """Detect linguistic patterns that differentiate human vs AI text"""
+    try:
+        sentences = nltk.sent_tokenize(text)
+        words = re.findall(r'\b\w+\b', text.lower())
+
+        # Analyze sentence length distribution (AI often has more uniform length)
+        sent_lengths = [len(re.findall(r'\b\w+\b', s)) for s in sentences]
+        sent_length_std = np.std(sent_lengths) if sent_lengths else 0
+
+        # Analyze lexical diversity (type-token ratio)
+        unique_words = len(set(words))
+        total_words = len(words)
+        lexical_diversity = unique_words / total_words if total_words > 0 else 0
+
+        # Sentence starter variety (AI often has repetitive starters)
+        starters = [s.split()[0].lower() if s.split() else "" for s in sentences]
+        starter_ratio = len(set(starters)) / len(starters) if starters else 0
+
+        # Paragraph length analysis
+        paragraphs = re.split(r'\n\s*\n', text)
+        para_lengths = [len(p.split()) for p in paragraphs if p.strip()]
+        para_length_std = np.std(para_lengths) if para_lengths else 0
+
+        return {
+            "sentence_length_std": sent_length_std,
+            "lexical_diversity": lexical_diversity,
+            "starter_diversity": starter_ratio,
+            "paragraph_length_std": para_length_std
+        }
+    except Exception as e:
+        logger.error(f"Linguistic pattern detection error: {str(e)}")
+        return {
+            "sentence_length_std": 0,
+            "lexical_diversity": 0,
+            "starter_diversity": 0,
+            "paragraph_length_std": 0
+        }
+
def analyze_text(text: str) -> dict:
+    """Enhanced analysis pipeline using ensemble methods"""
    try:
+        # Initial text preprocessing
+        text = re.sub(r'\s+', ' ', text).strip()
+        if len(text) < 150:
            raise HTTPException(400, "Text too short for accurate analysis")

+        # Get text segments for analysis
+        segments = get_segmented_texts(text)
+        if not segments:
+            raise HTTPException(400, "Could not extract meaningful text segments")
+
+        # AI Detection using ensemble approach
+        primary_scores = []
+        secondary_scores = []
+        tertiary_scores = []

+        # Process each segment with both models
+        for segment in segments:
+            # Skip segments that are too short
+            if len(segment) < 100:
+                continue
+
+            # Primary model prediction
+            primary_result = models["primary"](segment)
+            ai_score = next((item["score"] for item in primary_result[0] if item["label"] == "fake"), 0.5)
+            primary_scores.append(ai_score)
+
+            # Secondary model prediction
+            secondary_result = models["secondary"](segment)
+            ai_score = next((item["score"] for item in secondary_result[0] if item["label"] == "AI-generated"), 0.5)
+            secondary_scores.append(ai_score)
+
+            # Tertiary model prediction
+            tertiary_result = models["tertiary"](segment)
+            ai_score = next((item["score"] for item in tertiary_result[0] if item["label"] in ["fake", "AI-generated"]), 0.5)
+            tertiary_scores.append(ai_score)
+
+        # Calculate perplexity score
        perplexity = calculate_perplexity(text)

+        # Normalized perplexity score (lower perplexity = more likely AI-generated)
+        # GPT-2 typically shows perplexity of 20-60 for AI text and 50-120 for human text
+        perplexity_score = max(0, min(1, (120 - perplexity) / 100))
+
+        # Linguistic feature analysis
+        linguistic_features = detect_linguistic_patterns(text)
+
+        # Calculate linguistic pattern score (higher = more likely AI)
+        # AI text tends to have lower std devs and higher uniformity
+        linguistic_score = 0.0
+        if linguistic_features["sentence_length_std"] < 3.5:
+            linguistic_score += 0.2
+        if linguistic_features["lexical_diversity"] < 0.6:
+            linguistic_score += 0.2
+        if linguistic_features["starter_diversity"] < 0.7:
+            linguistic_score += 0.2
+        if linguistic_features["paragraph_length_std"] < 20:
+            linguistic_score += 0.2
+
+        # Ensemble scoring with weighted average
+        # Balance the different models based on empirical performance
+        weights = {
+            "primary": 0.35,
+            "secondary": 0.25,
+            "tertiary": 0.15,
+            "perplexity": 0.15,
+            "linguistic": 0.10
+        }
+
+        # Use percentiles to get more stable scores from each model
+        primary_confidence = np.percentile(primary_scores, 75) if primary_scores else 0.5
+        secondary_confidence = np.percentile(secondary_scores, 75) if secondary_scores else 0.5
+        tertiary_confidence = np.percentile(tertiary_scores, 75) if tertiary_scores else 0.5
+
+        # Calculate final weighted score
+        final_score = (
+            primary_confidence * weights["primary"] +
+            secondary_confidence * weights["secondary"] +
+            tertiary_confidence * weights["tertiary"] +
+            perplexity_score * weights["perplexity"] +
+            linguistic_score * weights["linguistic"]
+        )

+        # Convert to percentage with calibration factor
+        # Apply sigmoid curve to get smoother probability distribution
+        ai_percentage = 100 / (1 + np.exp(-10 * (final_score - 0.5)))

+        # Optional plagiarism check
+        sentences = nltk.sent_tokenize(text)
+        if len(sentences) > 5:
+            embeddings = models["embedder"].encode(sentences, batch_size=BATCH_SIZE)
+            similarity = cosine_similarity(embeddings)
+            np.fill_diagonal(similarity, 0)
+            plagiarism_score = (similarity > 0.85).mean() * 100  # Higher threshold for accuracy
+        else:
+            plagiarism_score = 0
+
+        # Return just the AI percentage for the simplified API
+        return {
+            "ai_generated": round(ai_percentage, 2)
+        }
+
+        # Alternatively, return detailed analytics for debugging
+        """
        return {
            "ai_generated": round(ai_percentage, 2),
+            "human_written": round(100 - ai_percentage, 2),
            "plagiarism_risk": round(plagiarism_score, 2),
            "perplexity": round(perplexity, 2),
+            "model_confidences": {
+                "primary": round(primary_confidence * 100, 2),
+                "secondary": round(secondary_confidence * 100, 2),
+                "tertiary": round(tertiary_confidence * 100, 2)
+            },
+            "linguistic_analysis": {
+                "sentence_length_variation": round(linguistic_features["sentence_length_std"], 2),
+                "lexical_diversity": round(linguistic_features["lexical_diversity"], 2),
+                "sentence_starter_variety": round(linguistic_features["starter_diversity"], 2)
+            },
+            "segments_analyzed": len(segments)
        }
+        """

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Analysis error: {str(e)}")
+        raise HTTPException(500, f"Analysis failed: {str(e)}")

@app.on_event("startup")
async def startup():
+    logger.info("Starting enhanced AI detection service...")
    if not load_models():
        logger.error("Service initialization failed")
        raise RuntimeError("Failed to initialize models")

@app.post("/analyze")
+async def analyze(file: UploadFile = File(...), background_tasks: BackgroundTasks = None):
+    """Analyze uploaded PDF for AI content detection"""
    try:
+        # Validate file type
        if not file.filename.lower().endswith(".pdf"):
            raise HTTPException(400, "Only PDF files accepted")

+        # Extract and analyze text
+        text = extract_text_from_pdf(file)
+        if not text or len(text) < 100:
+            return JSONResponse({
+                "ai_generated": 0,
+                "error": "Insufficient text extracted from PDF"
+            })
+
+        # Run analysis
+        results = analyze_text(text)
+        return JSONResponse(results)

    except HTTPException as he:
        raise he
    except Exception as e:
        logger.error(f"Unexpected error: {str(e)}")
+        raise HTTPException(500, f"Internal server error: {str(e)}")
+
+@app.post("/analyze/text")
+async def analyze_text_endpoint(text: str = Form(...)):
+    """Analyze raw text for AI content detection"""
+    try:
+        if not text or len(text) < 100:
+            raise HTTPException(400, "Text too short for analysis (minimum 100 characters)")
+
+        results = analyze_text(text)
+        return JSONResponse(results)
+
+    except HTTPException as he:
+        raise he
+    except Exception as e:
+        logger.error(f"Unexpected text analysis error: {str(e)}")
+        raise HTTPException(500, f"Analysis error: {str(e)}")

@app.get("/health")
async def health():
+    """Service health check endpoint"""
    return {
        "status": "operational",
+        "models": {
+            "primary": PRIMARY_DETECTOR,
+            "secondary": SECONDARY_DETECTOR,
+            "tertiary": TERTIARY_DETECTOR
+        },
+        "version": "3.0.0"
    }

@app.get("/")
async def root():
+    """API documentation endpoint"""
    return {
        "service": "Essay Analyzer",
        "version": "2.0.0",
+        "endpoints": {
+            "/analyze": "POST - Analyze PDF for AI detection",
+            "/analyze/text": "POST - Analyze raw text for AI detection",
            "/health": "GET - Service status"
        }
    }
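
To see how the new ensemble scoring behaves end to end, here is a minimal standalone sketch of the same weighted-average-plus-sigmoid calibration that analyze_text performs; the per-model confidences below are made-up illustrative inputs, not outputs from this commit:

import numpy as np

# Hypothetical confidences for one essay (illustrative values only).
primary_confidence = 0.82    # roberta-base-openai-detector
secondary_confidence = 0.74  # Hello-SimpleAI/chatgpt-detector-roberta
tertiary_confidence = 0.61   # balanced detector
perplexity = 43.0            # GPT-2-medium perplexity of the text
linguistic_score = 0.4       # two of the four uniformity checks fired

weights = {"primary": 0.35, "secondary": 0.25, "tertiary": 0.15,
           "perplexity": 0.15, "linguistic": 0.10}

# Same normalization as analyze_text: low perplexity leans toward "AI".
perplexity_score = max(0, min(1, (120 - perplexity) / 100))

final_score = (primary_confidence * weights["primary"]
               + secondary_confidence * weights["secondary"]
               + tertiary_confidence * weights["tertiary"]
               + perplexity_score * weights["perplexity"]
               + linguistic_score * weights["linguistic"])

# Sigmoid calibration from the commit: steepness 10, centered at 0.5.
ai_percentage = 100 / (1 + np.exp(-10 * (final_score - 0.5)))
print(round(final_score, 3), round(ai_percentage, 2))  # 0.719 -> 89.93

The steepness factor of 10 makes the curve saturate quickly: a weighted score of 0.6 already maps to roughly 73% "AI-generated", and 0.7 to roughly 88%.

Assuming the app is served the usual FastAPI way (for example uvicorn app:app --port 8000; the host and port are assumptions, not part of the commit), the new /analyze/text endpoint takes a form field, so a quick client check might look like:

import requests  # third-party HTTP client, assumed to be installed

# Needs at least 100 characters or the endpoint raises HTTP 400.
essay = "This essay discusses the role of perplexity in AI text detection. " * 3

# Hypothetical local deployment; adjust the URL to your setup.
resp = requests.post("http://localhost:8000/analyze/text", data={"text": essay})
print(resp.status_code, resp.json())  # e.g. 200 {"ai_generated": 42.13}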