Spaces:

Essay-Grader
/

Detection_and_Plagiarism_Check

Running

App Files Files Community

Essay-Grader committed on May 2

Commit

cd2a7b2

1 Parent(s): 9e462c0

Fix the api

Browse files

Files changed (2) hide show

main.py +119 -98
requirements.txt +3 -1

main.py CHANGED Viewed

@@ -1,24 +1,27 @@
-# main.py: AI Detection API for Flutter Integration
 from fastapi import FastAPI, UploadFile, File, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
-from transformers import AutoTokenizer, AutoModelForSequenceClassification
-import torch
 import fitz  # PyMuPDF
 import os
 import logging
 # Configure logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 app = FastAPI(
-    title="AI Text Detection API",
-    description="API endpoint for detecting AI-generated content in PDFs",
-    version="1.0.0"
 )
-# Enable CORS for Flutter app access
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
@@ -27,115 +30,133 @@ app.add_middleware(
     allow_headers=["*"],
 )
-# Set cache directory to a writable location within the container
-os.environ["TRANSFORMERS_CACHE"] = "/tmp/transformers_cache"
-os.environ["HF_HOME"] = "/tmp/hf_home"
-# Load model and tokenizer with proper error handling
-# Using a dedicated AI text detection model
-MODEL_NAME = "Hello-SimpleAI/chatgpt-detector-roberta"  # A fine-tuned model for detecting AI-generated text
-tokenizer = None
-model = None
-try:
-    logger.info(f"Loading model and tokenizer: {MODEL_NAME}")
-    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, cache_dir="/tmp/transformers_cache")
-    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, cache_dir="/tmp/transformers_cache")
-    logger.info("Model and tokenizer loaded successfully")
-except Exception as e:
-    logger.error(f"Error loading model: {str(e)}")
-    # Fallback to another model if the first one fails
     try:
-        FALLBACK_MODEL = "roberta-base-openai-detector"
-        logger.info(f"Trying fallback model: {FALLBACK_MODEL}")
-        tokenizer = AutoTokenizer.from_pretrained(FALLBACK_MODEL, cache_dir="/tmp/transformers_cache")
-        model = AutoModelForSequenceClassification.from_pretrained(FALLBACK_MODEL, cache_dir="/tmp/transformers_cache")
-        logger.info("Fallback model loaded successfully")
-    except Exception as e2:
-        logger.error(f"Error loading fallback model: {str(e2)}")
-        raise RuntimeError(f"Failed to load models: {str(e)} and {str(e2)}")
-# Helper: Extract text from PDF
 def extract_text_from_pdf(pdf_bytes):
     try:
         with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
-            return "".join([page.get_text() for page in doc]).strip()
     except Exception as e:
         logger.error(f"PDF extraction error: {str(e)}")
         raise RuntimeError(f"Failed to read PDF content: {str(e)}")
-# Health check endpoint
-@app.get("/")
-async def health_check():
-    return {
-        "status": "ok",
-        "model_loaded": model is not None and tokenizer is not None,
-        "model_name": MODEL_NAME
-    }
-# AI detection endpoint
 @app.post("/detect")
-async def detect_ai(file: UploadFile = File(...)):
-    # Check if model is loaded
-    if model is None or tokenizer is None:
-        raise HTTPException(status_code=503, detail="Model is not loaded. Please check server logs.")
-    if not file.filename.lower().endswith(".pdf"):
-        raise HTTPException(status_code=400, detail="Only PDF files are accepted.")
     try:
-        logger.info(f"Processing file: {file.filename}")
         pdf_bytes = await file.read()
         text = extract_text_from_pdf(pdf_bytes)
-        logger.info(f"Extracted {len(text)} characters from PDF")
-    except Exception as e:
-        logger.error(f"Error processing PDF: {str(e)}")
-        raise HTTPException(status_code=500, detail=str(e))
-    if not text:
-        raise HTTPException(status_code=400, detail="No readable text found in PDF.")
-    try:
-        # Split text into chunks if it's very long (transformers has a token limit)
-        text_chunks = [text[i:i+512] for i in range(0, len(text), 512)]
-        # Process each chunk and average the results
-        ai_scores = []
-        for chunk in text_chunks[:10]:  # Limit to first 10 chunks to avoid timeouts
-            if not chunk.strip():
-                continue
-            inputs = tokenizer(chunk, return_tensors="pt", truncation=True, padding=True, max_length=512)
-            with torch.no_grad():
-                outputs = model(**inputs)
-                logits = outputs.logits
-                # Get probabilities - models typically output [human_prob, ai_prob]
-                probs = torch.softmax(logits, dim=1).squeeze().tolist()
-                # Check if it's a single value or list (depends on model output format)
-                if isinstance(probs, list):
-                    # Most AI detection models output [human_prob, ai_prob]
-                    ai_prob = probs[1] if len(probs) > 1 else probs[0]
-                else:
-                    # Single value models typically output AI probability directly
-                    ai_prob = probs
-                ai_scores.append(ai_prob * 100)
-        # Calculate average AI probability across chunks
-        if ai_scores:
-            avg_ai_score = sum(ai_scores) / len(ai_scores)
-            logger.info(f"AI detection complete: {avg_ai_score:.2f}%")
-            return {"ai_generated_percentage": round(avg_ai_score, 2)}
-        else:
-            raise HTTPException(status_code=400, detail="Could not analyze text content.")
     except Exception as e:
-        logger.error(f"Error during AI detection: {str(e)}")
-        raise HTTPException(status_code=500, detail=f"Error analyzing text: {str(e)}")
 # from fastapi import FastAPI, UploadFile, File, HTTPException, BackgroundTasks

+# main.py: AI Detection and Plagiarism Check API
 from fastapi import FastAPI, UploadFile, File, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
+from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
+from sentence_transformers import SentenceTransformer, util
 import fitz  # PyMuPDF
+import numpy as np
 import os
 import logging
+import statistics
+import torch
 # Configure logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 app = FastAPI(
+    title="AI Text and Plagiarism Detection API",
+    description="API endpoint for detecting AI-generated content and semantic plagiarism in PDFs",
+    version="2.0.0"
 )
+# Enable CORS
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
     allow_headers=["*"],
 )
+# Model configurations
+# Sentence-embedding model used for the semantic-similarity (plagiarism) check.
+SENTENCE_MODEL = "sentence-transformers/all-roberta-large-v1"
+# Candidate AI-detection checkpoints, tried in this order by initialize_models();
+# the first one that loads is used.
+# NOTE(review): "distilroberta-base" looks like a base checkpoint rather than a
+# fine-tuned detector, so its classification head may be untrained - confirm
+# before relying on it as a fallback.
+AI_MODEL_CHOICES = [
+    "roberta-base-openai-detector",
+    "Hello-SimpleAI/chatgpt-detector-roberta",
+    "distilroberta-base"
+]
+# Initialize models
+# Use the GPU when torch reports CUDA as available, otherwise the CPU.
+device = "cuda" if torch.cuda.is_available() else "cpu"
+# Both stay None until initialize_models() assigns them.
+ai_model = None
+sentence_model = None
+# NOTE(review): no tuning evidence for 0.82 is visible here - validate before trusting it.
+similarity_threshold = 0.82  # Cosine-similarity cutoff above which a sentence counts as a match
+async def initialize_models():
+    """Load the AI-detection pipeline and the sentence model into module globals.
+
+    Each name in AI_MODEL_CHOICES is tried in order; the first one that loads
+    is wrapped in a "text-classification" pipeline and the loop stops. Load
+    failures are logged instead of raised, so ai_model and/or sentence_model
+    may still be None when this returns.
+    """
+    global ai_model, sentence_model
+    # Load AI detection model
+    for model_name in AI_MODEL_CHOICES:
+        try:
+            tokenizer = AutoTokenizer.from_pretrained(model_name)
+            model = AutoModelForSequenceClassification.from_pretrained(model_name)
+            ai_model = pipeline(
+                "text-classification",
+                model=model,
+                tokenizer=tokenizer,
+                # Map the module-level device string to the pipeline's integer form.
+                device=0 if device == "cuda" else -1
+            )
+            logger.info(f"Loaded AI model: {model_name}")
+            break
+        except Exception as e:
+            # Log and fall through to the next candidate checkpoint.
+            logger.error(f"Failed to load {model_name}: {str(e)}")
+    # Load sentence transformer model
     try:
+        sentence_model = SentenceTransformer(SENTENCE_MODEL, device=device)
+        logger.info(f"Loaded sentence model: {SENTENCE_MODEL}")
+    except Exception as e:
+        # Failure is only logged; sentence_model keeps its previous value.
+        logger.error(f"Failed to load sentence model: {str(e)}")
+@app.on_event("startup")
+async def startup_event():
+    """Run initialize_models() when the application's "startup" event fires."""
+    await initialize_models()
 def extract_text_from_pdf(pdf_bytes):
+    """Return the text of every page of a PDF, concatenated and stripped.
+
+    Args:
+        pdf_bytes: Raw PDF content, opened in memory through fitz (PyMuPDF).
+
+    Returns:
+        The page texts joined with no separator, with leading and trailing
+        whitespace removed.
+
+    Raises:
+        RuntimeError: If opening or reading the PDF fails for any reason;
+            the original error message is included in the text.
+    """
     try:
         with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
+            text = "".join([page.get_text() for page in doc]).strip()
+            logger.info(f"Extracted {len(text)} characters from PDF")
+            return text
     except Exception as e:
         logger.error(f"PDF extraction error: {str(e)}")
         raise RuntimeError(f"Failed to read PDF content: {str(e)}")
+def analyze_plagiarism(text, reference_texts):
+    """Return the percentage of sentences in *text* that match a reference text.
+
+    The text is split on '.', keeping only fragments longer than 50
+    characters. Each kept sentence is embedded with the module-level
+    sentence_model and compared with every reference text by cosine
+    similarity; a sentence counts as a match when its best score exceeds
+    similarity_threshold.
+
+    Args:
+        text: Essay text to check.
+        reference_texts: List of reference strings to compare against.
+
+    Returns:
+        Share of matching sentences as a percentage rounded to 2 decimals.
+        0.0 when there are no reference texts or fewer than 3 usable
+        sentences.
+
+    Raises:
+        Exception: Any embedding or similarity error is logged and re-raised.
+    """
+    try:
+        # With no references nothing can match. Returning early also avoids
+        # the row-wise max below, which fails on a zero-width score matrix.
+        if not reference_texts:
+            return 0.0
+        # Split into sentences
+        sentences = [s.strip() for s in text.split('.') if len(s.strip()) > 50]
+        if len(sentences) < 3:
+            return 0.0  # Not enough content to analyze
+        # Generate embeddings
+        query_embeddings = sentence_model.encode(sentences, convert_to_tensor=True)
+        ref_embeddings = sentence_model.encode(reference_texts, convert_to_tensor=True)
+        # Calculate cosine similarity: one row per sentence, one column per reference
+        cos_scores = util.cos_sim(query_embeddings, ref_embeddings)
+        # Best reference score for each sentence, then count those above threshold
+        max_scores = np.max(cos_scores.cpu().numpy(), axis=1)
+        matches = int(np.count_nonzero(max_scores > similarity_threshold))
+        # Calculate plagiarism percentage
+        plagiarism_percent = (matches / len(sentences)) * 100
+        return round(plagiarism_percent, 2)
+    except Exception as e:
+        logger.error(f"Plagiarism analysis failed: {str(e)}")
+        raise
 @app.post("/detect")
+async def analyze_essay(file: UploadFile = File(...)):
+    """Score an uploaded PDF for AI-generated content and semantic plagiarism.
+
+    Args:
+        file: Uploaded PDF file.
+
+    Returns:
+        A dict with an "ai_detection" and a "plagiarism" section, each holding
+        the measured percentage, its fixed threshold and a boolean warning.
+
+    Raises:
+        HTTPException: 503 when either model is not loaded, 400 when fewer
+            than 100 characters of text were extracted, 500 for any other
+            failure during analysis.
+    """
+    if ai_model is None or sentence_model is None:
+        raise HTTPException(status_code=503, detail="Models not loaded")
     try:
+        # Process PDF
         pdf_bytes = await file.read()
         text = extract_text_from_pdf(pdf_bytes)
+        if len(text) < 100:
+            raise HTTPException(status_code=400, detail="Insufficient text length")
+        # AI Detection on the first 5120 characters. Truncation keeps the input
+        # within the model's 512-token limit; top_k=None returns the score of
+        # every label, so the AI score is available even when it is not the top one.
+        ai_result = ai_model(text[:5120], truncation=True, max_length=512, top_k=None)
+        # Some pipeline versions wrap the per-label list in an outer list.
+        if ai_result and isinstance(ai_result[0], list):
+            ai_result = ai_result[0]
+        # Label names differ per checkpoint, so compare case-insensitively.
+        ai_labels = ('fake', 'ai', 'chatgpt')
+        ai_score = next((x['score'] for x in ai_result if str(x['label']).lower() in ai_labels), 0.0)
+        ai_percent = round(ai_score * 100, 2)
+        # Plagiarism Detection
+        # Load reference texts from database/known sources
+        reference_texts = load_reference_texts()  # Implement your reference text loading
+        plagiarism_percent = analyze_plagiarism(text, reference_texts)
+        return {
+            "ai_detection": {
+                "percentage": ai_percent,
+                "threshold": 85.0,
+                "warning": ai_percent > 85.0
+            },
+            "plagiarism": {
+                "percentage": plagiarism_percent,
+                "threshold": 15.0,
+                "warning": plagiarism_percent > 15.0,
+                "method": "semantic_similarity"
+            }
+        }
+    except HTTPException:
+        # Deliberate HTTP errors (such as the 400 above) keep their own
+        # status code instead of being re-wrapped as a 500 below.
+        raise
     except Exception as e:
+        logger.error(f"Analysis failed: {str(e)}")
+        raise HTTPException(status_code=500, detail=str(e))
+def load_reference_texts():
+    """Return the reference texts that essays are compared against.
+
+    Placeholder: no loading logic is implemented yet, so this always returns
+    an empty list.
+    """
+    # This should return a list of reference texts/sentences to compare against
+    # Example: return [ "Sample reference text 1", "Sample reference text 2" ]
+    return []
 # from fastapi import FastAPI, UploadFile, File, HTTPException, BackgroundTasks

requirements.txt CHANGED Viewed

@@ -6,7 +6,9 @@ transformers>=4.28.0
 torch>=2.0.0
 PyMuPDF>=1.22.0
 python-multipart>=0.0.6
 # --extra-index-url https://download.pytorch.org/whl/cpu
 # fastapi==0.103.2

 torch>=2.0.0
 PyMuPDF>=1.22.0
 python-multipart>=0.0.6
+huggingface-hub>=0.14.1
+numpy>=1.22.0
+scipy>=1.8.0
 # --extra-index-url https://download.pytorch.org/whl/cpu
 # fastapi==0.103.2