COM ADMIN committed on
Commit
6d7d004
·
1 Parent(s): 2754062

Fix inaccurate API results

Browse files
Files changed (2) hide show
  1. Dockerfile +10 -19
  2. main.py +49 -21
Dockerfile CHANGED
@@ -1,34 +1,25 @@
# Base image (Python 3.9 is recommended for Spaces)
FROM python:3.9-slim

# Environment: no .pyc files, unbuffered stdout/stderr, model cache under /tmp
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    TRANSFORMERS_CACHE=/tmp/.cache

# System dependencies (required for PyMuPDF); drop apt lists to keep the layer small
RUN apt-get update && apt-get install -y --no-install-recommends \
    libgl1 \
    libglib2.0-0 \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Copy requirements first so the pip layer is cached across code-only changes
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Download NLTK data
RUN python -m nltk.downloader punkt

# Copy the application
COPY . .

# Expose Hugging Face Spaces default port
EXPOSE 7860

# Run with Uvicorn (single worker)
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1"]
 
FROM python:3.9-slim

# Unbuffered stdout/stderr so container logs appear immediately; skip .pyc files.
# Cache model downloads under /tmp — on Spaces $HOME may not be writable.
# NOTE(review): TRANSFORMERS_CACHE is deprecated in newer transformers releases
# in favor of HF_HOME — confirm the installed version.
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    TRANSFORMERS_CACHE=/tmp/.cache

# Install system dependencies (libgl1 / libglib2.0-0 for PyMuPDF) + NLTK data dir
RUN apt-get update && apt-get install -y --no-install-recommends \
    libgl1 \
    libglib2.0-0 \
    && rm -rf /var/lib/apt/lists/* \
    && mkdir -p /usr/share/nltk_data

# Install Python dependencies first (for caching)
WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Download NLTK data to persistent, image-baked directory
RUN python -m nltk.downloader -d /usr/share/nltk_data punkt

# Copy application code
COPY . .

# Set NLTK data path environment variable so runtime lookups find the baked data
ENV NLTK_DATA=/usr/share/nltk_data

EXPOSE 7860
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1"]
main.py CHANGED
@@ -12,6 +12,7 @@ import nltk
12
  from nltk.tokenize import sent_tokenize
13
  from typing import List, Tuple
14
  import re
 
15
 
16
  # Configure logging
17
  logging.basicConfig(level=logging.INFO)
@@ -19,6 +20,20 @@ logger = logging.getLogger(__name__)
19
 
20
  app = FastAPI()
21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  app.add_middleware(
23
  CORSMiddleware,
24
  allow_origins=["*"],
@@ -35,6 +50,11 @@ MIN_TEXT_LENGTH = 150 # Minimum characters to consider as valid text
35
  MAX_TEXT_LENGTH = 10000 # Maximum characters to process for performance
36
  PLAGIARISM_THRESHOLD = 0.75 # Similarity threshold for plagiarism detection
37
 
 
 
 
 
 
38
  # Load models at startup
39
  try:
40
  ai_detector = pipeline(
@@ -81,25 +101,29 @@ def compute_embeddings(sentences: List[str]) -> np.ndarray:
def check_internal_plagiarism(text: str) -> Tuple[float, bool]:
    """Check for internal plagiarism and return score + flag."""
    # Keep only sentences longer than five words; shorter ones are noise.
    candidates = [s for s in sent_tokenize(text) if len(s.split()) > 5]
    if len(candidates) < 2:
        return 0.0, False

    vectors = compute_embeddings(candidates)
    similarity = cosine_similarity(vectors)
    np.fill_diagonal(similarity, 0)  # a sentence always matches itself

    count = len(similarity)
    if count == 0:
        return 0.0, False

    # Average the top 5% most similar sentence pairs.
    pair_budget = max(1, int(0.05 * count * (count - 1) / 2))
    flat = similarity.flatten()
    best = np.argpartition(flat, -pair_budget)[-pair_budget:]
    avg = np.mean(flat[best])

    return round(avg * 100, 2), avg > PLAGIARISM_THRESHOLD
 
104
  def chunk_text(text: str, chunk_size: int = CHUNK_SIZE) -> List[str]:
105
  """Split text into chunks for processing."""
@@ -136,11 +160,15 @@ async def detect_ai_content(file: UploadFile = File(...)):
136
  plagiarism_score, plagiarism_detected = check_internal_plagiarism(text)
137
  logger.info(f"AI score: {ai_score:.2%}, Plagiarism score: {plagiarism_score}%")
138
 
139
- # Step 4: Return response (only AI percentage)
140
- return {"ai_generated_percentage": round(ai_score * 100, 2)}
 
 
 
141
 
142
  except HTTPException:
143
  raise
144
  except Exception as e:
145
  logger.error(f"Detection error: {str(e)}", exc_info=True)
146
- raise HTTPException(500, "Analysis failed")
 
 
12
  from nltk.tokenize import sent_tokenize
13
  from typing import List, Tuple
14
  import re
15
+ import os
16
 
17
  # Configure logging
18
  logging.basicConfig(level=logging.INFO)
 
app = FastAPI()

# Initialize NLTK data path: search the image-baked directory first so the
# tokenizer baked in at build time is found without a network download.
NLTK_DATA_PATH = "/usr/share/nltk_data"
os.makedirs(NLTK_DATA_PATH, exist_ok=True)
nltk.data.path.append(NLTK_DATA_PATH)

# Ensure punkt tokenizer is available; fall back to a runtime download when
# the image was built without it.
# NOTE(review): NLTK >= 3.8.2 looks for 'punkt_tab' for sent_tokenize —
# confirm the pinned nltk version, and add that resource if needed.
try:
    nltk.data.find('tokenizers/punkt')
    logger.info("NLTK punkt tokenizer available")
except LookupError:
    logger.info("Downloading NLTK punkt tokenizer...")
    # NLTK_DATA_PATH is already on nltk.data.path (appended above);
    # re-appending here would only create a duplicate search-path entry.
    nltk.download('punkt', download_dir=NLTK_DATA_PATH)
37
  app.add_middleware(
38
  CORSMiddleware,
39
  allow_origins=["*"],
 
50
  MAX_TEXT_LENGTH = 10000 # Maximum characters to process for performance
51
  PLAGIARISM_THRESHOLD = 0.75 # Similarity threshold for plagiarism detection
52
 
@app.get("/health")
def health_check():
    """Liveness probe: report that the service is up."""
    return {"status": "healthy"}
+
58
  # Load models at startup
59
  try:
60
  ai_detector = pipeline(
 
def check_internal_plagiarism(text: str) -> Tuple[float, bool]:
    """Check for internal plagiarism and return score + flag.

    Sentences longer than five words are embedded and compared pairwise;
    the score is the mean cosine similarity of the top ~5% most similar
    pairs, as a percentage. The flag is True when that mean exceeds
    PLAGIARISM_THRESHOLD. Best-effort: any failure is logged and reported
    as (0.0, False) rather than raised, so the endpoint keeps working.
    """
    try:
        # Very short sentences produce noisy embeddings; filter them out.
        sentences = [s for s in sent_tokenize(text) if len(s.split()) > 5]
        if len(sentences) < 2:
            return 0.0, False

        embeddings = compute_embeddings(sentences)
        sim_matrix = cosine_similarity(embeddings)
        np.fill_diagonal(sim_matrix, 0)  # Ignore self-similarity

        # Get top 5% most similar pairs. n >= 2 is guaranteed by the
        # sentence-count guard above, so no extra emptiness check is needed.
        n = len(sim_matrix)
        top_k = max(1, int(0.05 * n * (n - 1) / 2))  # 5% of possible pairs
        flat = sim_matrix.flatten()
        top_indices = np.argpartition(flat, -top_k)[-top_k:]
        avg_similarity = np.mean(flat[top_indices])

        plagiarism_detected = avg_similarity > PLAGIARISM_THRESHOLD
        # Cast NumPy scalars (np.float64 / np.bool_) to native Python types
        # so FastAPI's JSON encoder serializes the response reliably.
        return round(float(avg_similarity) * 100, 2), bool(plagiarism_detected)
    except Exception as e:
        logger.error(f"Plagiarism check failed: {str(e)}")
        return 0.0, False
 
127
 
128
  def chunk_text(text: str, chunk_size: int = CHUNK_SIZE) -> List[str]:
129
  """Split text into chunks for processing."""
 
160
  plagiarism_score, plagiarism_detected = check_internal_plagiarism(text)
161
  logger.info(f"AI score: {ai_score:.2%}, Plagiarism score: {plagiarism_score}%")
162
 
163
+ # Step 4: Return response
164
+ return {
165
+ "ai_generated_percentage": round(ai_score * 100, 2),
166
+ "plagiarism_risk": plagiarism_detected
167
+ }
168
 
169
  except HTTPException:
170
  raise
171
  except Exception as e:
172
  logger.error(f"Detection error: {str(e)}", exc_info=True)
173
+ raise HTTPException(500, f"Analysis failed: {str(e)}")
174
+