COM ADMIN committed on
Commit
6d7d004
·
1 Parent(s): 2754062

Fix inaccurate API results

Browse files
Files changed (2) hide show
  1. Dockerfile +10 -19
  2. main.py +49 -21
Dockerfile CHANGED
@@ -1,34 +1,25 @@
# Base image (Python 3.9 is recommended for Spaces)
FROM python:3.9-slim

# Environment: no .pyc files, unbuffered stdout/stderr, model cache under /tmp
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    TRANSFORMERS_CACHE=/tmp/.cache

# System dependencies (required for PyMuPDF); drop apt lists to keep the layer small
RUN apt-get update && apt-get install -y --no-install-recommends \
    libgl1 \
    libglib2.0-0 \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Copy requirements first so the pip layer is cached across code-only changes
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Download NLTK data
RUN python -m nltk.downloader punkt

# Copy the application
COPY . .

# Expose Hugging Face Spaces default port
EXPOSE 7860

# Run with Uvicorn (single worker)
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1"]
 
FROM python:3.9-slim

# Unbuffered stdout/stderr so container logs appear immediately; skip .pyc files.
# Cache model downloads under /tmp — on Spaces $HOME may not be writable.
# NOTE(review): TRANSFORMERS_CACHE is deprecated in newer transformers releases
# in favor of HF_HOME — confirm the installed version.
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    TRANSFORMERS_CACHE=/tmp/.cache

# Install system dependencies (libgl1 / libglib2.0-0 for PyMuPDF) + NLTK data dir
RUN apt-get update && apt-get install -y --no-install-recommends \
    libgl1 \
    libglib2.0-0 \
    && rm -rf /var/lib/apt/lists/* \
    && mkdir -p /usr/share/nltk_data

# Install Python dependencies first (for caching)
WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Download NLTK data to persistent, image-baked directory
RUN python -m nltk.downloader -d /usr/share/nltk_data punkt

# Copy application code
COPY . .

# Set NLTK data path environment variable so runtime lookups find the baked data
ENV NLTK_DATA=/usr/share/nltk_data

EXPOSE 7860
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1"]
main.py CHANGED
@@ -12,6 +12,7 @@ import nltk
12
  from nltk.tokenize import sent_tokenize
13
  from typing import List, Tuple
14
  import re
 
15
 
16
  # Configure logging
17
  logging.basicConfig(level=logging.INFO)
@@ -19,6 +20,20 @@ logger = logging.getLogger(__name__)
19
 
20
  app = FastAPI()
21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  app.add_middleware(
23
  CORSMiddleware,
24
  allow_origins=["*"],
@@ -35,6 +50,11 @@ MIN_TEXT_LENGTH = 150 # Minimum characters to consider as valid text
35
  MAX_TEXT_LENGTH = 10000 # Maximum characters to process for performance
36
  PLAGIARISM_THRESHOLD = 0.75 # Similarity threshold for plagiarism detection
37
 
 
 
 
 
 
38
  # Load models at startup
39
  try:
40
  ai_detector = pipeline(
@@ -81,25 +101,29 @@ def compute_embeddings(sentences: List[str]) -> np.ndarray:
def check_internal_plagiarism(text: str) -> Tuple[float, bool]:
    """Check for internal plagiarism and return score + flag."""
    # Keep only sentences longer than five words; shorter ones are noise.
    candidates = [s for s in sent_tokenize(text) if len(s.split()) > 5]
    if len(candidates) < 2:
        return 0.0, False

    vectors = compute_embeddings(candidates)
    similarity = cosine_similarity(vectors)
    np.fill_diagonal(similarity, 0)  # a sentence always matches itself

    count = len(similarity)
    if count == 0:
        return 0.0, False

    # Average the top 5% most similar sentence pairs.
    pair_budget = max(1, int(0.05 * count * (count - 1) / 2))
    flat = similarity.flatten()
    best = np.argpartition(flat, -pair_budget)[-pair_budget:]
    avg = np.mean(flat[best])

    return round(avg * 100, 2), avg > PLAGIARISM_THRESHOLD
 
104
  def chunk_text(text: str, chunk_size: int = CHUNK_SIZE) -> List[str]:
105
  """Split text into chunks for processing."""
@@ -136,11 +160,15 @@ async def detect_ai_content(file: UploadFile = File(...)):
136
  plagiarism_score, plagiarism_detected = check_internal_plagiarism(text)
137
  logger.info(f"AI score: {ai_score:.2%}, Plagiarism score: {plagiarism_score}%")
138
 
139
- # Step 4: Return response (only AI percentage)
140
- return {"ai_generated_percentage": round(ai_score * 100, 2)}
 
 
 
141
 
142
  except HTTPException:
143
  raise
144
  except Exception as e:
145
  logger.error(f"Detection error: {str(e)}", exc_info=True)
146
- raise HTTPException(500, "Analysis failed")
 
 
12
  from nltk.tokenize import sent_tokenize
13
  from typing import List, Tuple
14
  import re
15
+ import os
16
 
17
  # Configure logging
18
  logging.basicConfig(level=logging.INFO)
 
app = FastAPI()

# Initialize NLTK data path: search the image-baked directory first so the
# tokenizer baked in at build time is found without a network download.
NLTK_DATA_PATH = "/usr/share/nltk_data"
os.makedirs(NLTK_DATA_PATH, exist_ok=True)
nltk.data.path.append(NLTK_DATA_PATH)

# Ensure punkt tokenizer is available; fall back to a runtime download when
# the image was built without it.
# NOTE(review): NLTK >= 3.8.2 looks for 'punkt_tab' for sent_tokenize —
# confirm the pinned nltk version, and add that resource if needed.
try:
    nltk.data.find('tokenizers/punkt')
    logger.info("NLTK punkt tokenizer available")
except LookupError:
    logger.info("Downloading NLTK punkt tokenizer...")
    # NLTK_DATA_PATH is already on nltk.data.path (appended above);
    # re-appending here would only create a duplicate search-path entry.
    nltk.download('punkt', download_dir=NLTK_DATA_PATH)
37
  app.add_middleware(
38
  CORSMiddleware,
39
  allow_origins=["*"],
 
50
  MAX_TEXT_LENGTH = 10000 # Maximum characters to process for performance
51
  PLAGIARISM_THRESHOLD = 0.75 # Similarity threshold for plagiarism detection
52
 
@app.get("/health")
def health_check():
    """Liveness probe: report that the service is up."""
    return {"status": "healthy"}
+
58
  # Load models at startup
59
  try:
60
  ai_detector = pipeline(
 
def check_internal_plagiarism(text: str) -> Tuple[float, bool]:
    """Check for internal plagiarism and return score + flag.

    Sentences longer than five words are embedded and compared pairwise;
    the score is the mean cosine similarity of the top ~5% most similar
    pairs, as a percentage. The flag is True when that mean exceeds
    PLAGIARISM_THRESHOLD. Best-effort: any failure is logged and reported
    as (0.0, False) rather than raised, so the endpoint keeps working.
    """
    try:
        # Very short sentences produce noisy embeddings; filter them out.
        sentences = [s for s in sent_tokenize(text) if len(s.split()) > 5]
        if len(sentences) < 2:
            return 0.0, False

        embeddings = compute_embeddings(sentences)
        sim_matrix = cosine_similarity(embeddings)
        np.fill_diagonal(sim_matrix, 0)  # Ignore self-similarity

        # Get top 5% most similar pairs. n >= 2 is guaranteed by the
        # sentence-count guard above, so no extra emptiness check is needed.
        n = len(sim_matrix)
        top_k = max(1, int(0.05 * n * (n - 1) / 2))  # 5% of possible pairs
        flat = sim_matrix.flatten()
        top_indices = np.argpartition(flat, -top_k)[-top_k:]
        avg_similarity = np.mean(flat[top_indices])

        plagiarism_detected = avg_similarity > PLAGIARISM_THRESHOLD
        # Cast NumPy scalars (np.float64 / np.bool_) to native Python types
        # so FastAPI's JSON encoder serializes the response reliably.
        return round(float(avg_similarity) * 100, 2), bool(plagiarism_detected)
    except Exception as e:
        logger.error(f"Plagiarism check failed: {str(e)}")
        return 0.0, False
 
127
 
128
  def chunk_text(text: str, chunk_size: int = CHUNK_SIZE) -> List[str]:
129
  """Split text into chunks for processing."""
 
160
  plagiarism_score, plagiarism_detected = check_internal_plagiarism(text)
161
  logger.info(f"AI score: {ai_score:.2%}, Plagiarism score: {plagiarism_score}%")
162
 
163
+ # Step 4: Return response
164
+ return {
165
+ "ai_generated_percentage": round(ai_score * 100, 2),
166
+ "plagiarism_risk": plagiarism_detected
167
+ }
168
 
169
  except HTTPException:
170
  raise
171
  except Exception as e:
172
  logger.error(f"Detection error: {str(e)}", exc_info=True)
173
+ raise HTTPException(500, f"Analysis failed: {str(e)}")
174
+