Spaces:

Essay-Grader
/

Detection_and_Plagiarism_Check

Running

App Files Files Community

Essay-Grader committed on May 2

Commit

9e462c0

1 Parent(s): bc6196e

Fix the api

Browse files

Files changed (2) hide show

main.py +91 -14
requirements.txt +7 -5

main.py CHANGED Viewed

@@ -5,6 +5,12 @@ from fastapi.middleware.cors import CORSMiddleware
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
 import torch
 import fitz  # PyMuPDF
 app = FastAPI(
     title="AI Text Detection API",
@@ -21,44 +27,115 @@ app.add_middleware(
     allow_headers=["*"],
 )
-# Load model and tokenizer
-MODEL_NAME = "roberta-base-openai-detector"
-tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
-model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
 # Helper: Extract text from PDF
 def extract_text_from_pdf(pdf_bytes):
     try:
         with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
             return "".join([page.get_text() for page in doc]).strip()
-    except Exception:
-        raise RuntimeError("Failed to read PDF content.")
 # AI detection endpoint
 @app.post("/detect")
 async def detect_ai(file: UploadFile = File(...)):
     if not file.filename.lower().endswith(".pdf"):
         raise HTTPException(status_code=400, detail="Only PDF files are accepted.")
     try:
         pdf_bytes = await file.read()
         text = extract_text_from_pdf(pdf_bytes)
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))
     if not text:
         raise HTTPException(status_code=400, detail="No readable text found in PDF.")
-    # Tokenize and predict
-    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
-    with torch.no_grad():
-        outputs = model(**inputs)
-        logits = outputs.logits
-        probs = torch.softmax(logits, dim=1).squeeze().tolist()
-    ai_generated_percentage = round(probs[1] * 100, 2)
-    return {"ai_generated_percentage": ai_generated_percentage}
 # from fastapi import FastAPI, UploadFile, File, HTTPException, BackgroundTasks

 from transformers import AutoTokenizer, AutoModelForSequenceClassification
 import torch
 import fitz  # PyMuPDF
+import os
+import logging
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
 app = FastAPI(
     title="AI Text Detection API",
     allow_headers=["*"],
 )
+# Set cache directory to a writable location within the container
+os.environ["TRANSFORMERS_CACHE"] = "/tmp/transformers_cache"
+os.environ["HF_HOME"] = "/tmp/hf_home"
+# Load model and tokenizer with proper error handling
+# Using a dedicated AI text detection model
+MODEL_NAME = "Hello-SimpleAI/chatgpt-detector-roberta"  # A fine-tuned model for detecting AI-generated text
+tokenizer = None
+model = None
+try:
+    logger.info(f"Loading model and tokenizer: {MODEL_NAME}")
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, cache_dir="/tmp/transformers_cache")
+    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, cache_dir="/tmp/transformers_cache")
+    logger.info("Model and tokenizer loaded successfully")
+except Exception as e:
+    logger.error(f"Error loading model: {str(e)}")
+    # Fallback to another model if the first one fails
+    try:
+        FALLBACK_MODEL = "roberta-base-openai-detector"
+        logger.info(f"Trying fallback model: {FALLBACK_MODEL}")
+        tokenizer = AutoTokenizer.from_pretrained(FALLBACK_MODEL, cache_dir="/tmp/transformers_cache")
+        model = AutoModelForSequenceClassification.from_pretrained(FALLBACK_MODEL, cache_dir="/tmp/transformers_cache")
+        logger.info("Fallback model loaded successfully")
+    except Exception as e2:
+        logger.error(f"Error loading fallback model: {str(e2)}")
+        raise RuntimeError(f"Failed to load models: {str(e)} and {str(e2)}")
 # Helper: Extract text from PDF
 def extract_text_from_pdf(pdf_bytes):
     try:
         with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
             return "".join([page.get_text() for page in doc]).strip()
+    except Exception as e:
+        logger.error(f"PDF extraction error: {str(e)}")
+        raise RuntimeError(f"Failed to read PDF content: {str(e)}")
+# Health check endpoint
+@app.get("/")
+async def health_check():
+    return {
+        "status": "ok",
+        "model_loaded": model is not None and tokenizer is not None,
+        "model_name": MODEL_NAME
+    }
 # AI detection endpoint
 @app.post("/detect")
 async def detect_ai(file: UploadFile = File(...)):
+    """Estimate how likely the text of an uploaded PDF is AI-generated.
+
+    The extracted text is cut into 512-character slices; at most the first
+    10 slices are scored by the loaded classifier and their AI
+    probabilities are averaged.
+
+    Returns:
+        A dict of the form {"ai_generated_percentage": <float>}, with the
+        averaged score rounded to 2 decimals.
+
+    Raises:
+        HTTPException: 503 if the model or tokenizer is not loaded; 400 if
+            the upload is not a .pdf, contains no readable text, or yields
+            no scorable chunk; 500 if reading the PDF or running the model
+            fails.
+    """
+    # Check if model is loaded
+    if model is None or tokenizer is None:
+        raise HTTPException(status_code=503, detail="Model is not loaded. Please check server logs.")
     if not file.filename.lower().endswith(".pdf"):
         raise HTTPException(status_code=400, detail="Only PDF files are accepted.")
     try:
+        logger.info(f"Processing file: {file.filename}")
         pdf_bytes = await file.read()
         text = extract_text_from_pdf(pdf_bytes)
+        logger.info(f"Extracted {len(text)} characters from PDF")
     except Exception as e:
+        logger.error(f"Error processing PDF: {str(e)}")
         raise HTTPException(status_code=500, detail=str(e))
     if not text:
         raise HTTPException(status_code=400, detail="No readable text found in PDF.")
+    try:
+        # Split the text into 512-character slices. This is a character count,
+        # not a token count; the tokenizer call below still truncates each
+        # slice to the model's 512-token limit.
+        text_chunks = [text[i:i+512] for i in range(0, len(text), 512)]
+        # Process each chunk and average the results
+        ai_scores = []
+        for chunk in text_chunks[:10]:  # Limit to first 10 chunks to avoid timeouts
+            if not chunk.strip():
+                continue
+            inputs = tokenizer(chunk, return_tensors="pt", truncation=True, padding=True, max_length=512)
+            with torch.no_grad():
+                outputs = model(**inputs)
+                logits = outputs.logits
+                # Get probabilities - models typically output [human_prob, ai_prob]
+                probs = torch.softmax(logits, dim=1).squeeze().tolist()
+                # Check if it's a single value or list (depends on model output format)
+                if isinstance(probs, list):
+                    # Most AI detection models output [human_prob, ai_prob]
+                    ai_prob = probs[1] if len(probs) > 1 else probs[0]
+                else:
+                    # Single value models typically output AI probability directly
+                    ai_prob = probs
+                ai_scores.append(ai_prob * 100)
+        # Calculate average AI probability across chunks
+        if ai_scores:
+            avg_ai_score = sum(ai_scores) / len(ai_scores)
+            logger.info(f"AI detection complete: {avg_ai_score:.2f}%")
+            return {"ai_generated_percentage": round(avg_ai_score, 2)}
+        else:
+            raise HTTPException(status_code=400, detail="Could not analyze text content.")
+    except HTTPException:
+        # Let deliberate HTTP errors (the 400 above) through unchanged; the
+        # generic handler below would otherwise rewrap them as a 500.
+        raise
+    except Exception as e:
+        logger.error(f"Error during AI detection: {str(e)}")
+        raise HTTPException(status_code=500, detail=f"Error analyzing text: {str(e)}")
 # from fastapi import FastAPI, UploadFile, File, HTTPException, BackgroundTasks

requirements.txt CHANGED Viewed

@@ -1,10 +1,12 @@
 # requirements.txt
-fastapi
-uvicorn
-transformers
-torch
-pymupdf
 # --extra-index-url https://download.pytorch.org/whl/cpu
 # fastapi==0.103.2

 # requirements.txt
+fastapi>=0.95.0
+uvicorn>=0.21.1
+transformers>=4.28.0
+torch>=2.0.0
+PyMuPDF>=1.22.0
+python-multipart>=0.0.6
 # --extra-index-url https://download.pytorch.org/whl/cpu
 # fastapi==0.103.2