Essay-Grader committed
Commit 9e462c0 · 1 Parent(s): bc6196e

Fix the API

Files changed (2)
  1. main.py +91 -14
  2. requirements.txt +7 -5
main.py CHANGED
@@ -5,6 +5,12 @@ from fastapi.middleware.cors import CORSMiddleware
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
 import torch
 import fitz  # PyMuPDF
+import os
+import logging
+
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)

 app = FastAPI(
     title="AI Text Detection API",
@@ -21,44 +27,115 @@ app.add_middleware(
     allow_headers=["*"],
 )

-# Load model and tokenizer
-MODEL_NAME = "roberta-base-openai-detector"
-tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
-model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
+# Set cache directory to a writable location within the container
+os.environ["TRANSFORMERS_CACHE"] = "/tmp/transformers_cache"
+os.environ["HF_HOME"] = "/tmp/hf_home"
+
+# Load model and tokenizer with proper error handling
+# Using a dedicated AI text detection model
+MODEL_NAME = "Hello-SimpleAI/chatgpt-detector-roberta"  # A fine-tuned model for detecting AI-generated text
+tokenizer = None
+model = None
+
+try:
+    logger.info(f"Loading model and tokenizer: {MODEL_NAME}")
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, cache_dir="/tmp/transformers_cache")
+    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, cache_dir="/tmp/transformers_cache")
+    logger.info("Model and tokenizer loaded successfully")
+except Exception as e:
+    logger.error(f"Error loading model: {str(e)}")
+    # Fallback to another model if the first one fails
+    try:
+        FALLBACK_MODEL = "roberta-base-openai-detector"
+        logger.info(f"Trying fallback model: {FALLBACK_MODEL}")
+        tokenizer = AutoTokenizer.from_pretrained(FALLBACK_MODEL, cache_dir="/tmp/transformers_cache")
+        model = AutoModelForSequenceClassification.from_pretrained(FALLBACK_MODEL, cache_dir="/tmp/transformers_cache")
+        logger.info("Fallback model loaded successfully")
+    except Exception as e2:
+        logger.error(f"Error loading fallback model: {str(e2)}")
+        raise RuntimeError(f"Failed to load models: {str(e)} and {str(e2)}")

 # Helper: Extract text from PDF
 def extract_text_from_pdf(pdf_bytes):
     try:
         with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
             return "".join([page.get_text() for page in doc]).strip()
-    except Exception:
-        raise RuntimeError("Failed to read PDF content.")
+    except Exception as e:
+        logger.error(f"PDF extraction error: {str(e)}")
+        raise RuntimeError(f"Failed to read PDF content: {str(e)}")
+
+# Health check endpoint
+@app.get("/")
+async def health_check():
+    return {
+        "status": "ok",
+        "model_loaded": model is not None and tokenizer is not None,
+        "model_name": MODEL_NAME
+    }

 # AI detection endpoint
 @app.post("/detect")
 async def detect_ai(file: UploadFile = File(...)):
+    # Check if model is loaded
+    if model is None or tokenizer is None:
+        raise HTTPException(status_code=503, detail="Model is not loaded. Please check server logs.")
+
     if not file.filename.lower().endswith(".pdf"):
         raise HTTPException(status_code=400, detail="Only PDF files are accepted.")

     try:
+        logger.info(f"Processing file: {file.filename}")
         pdf_bytes = await file.read()
         text = extract_text_from_pdf(pdf_bytes)
+        logger.info(f"Extracted {len(text)} characters from PDF")
     except Exception as e:
+        logger.error(f"Error processing PDF: {str(e)}")
         raise HTTPException(status_code=500, detail=str(e))

     if not text:
         raise HTTPException(status_code=400, detail="No readable text found in PDF.")

-    # Tokenize and predict
-    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
-    with torch.no_grad():
-        outputs = model(**inputs)
-        logits = outputs.logits
-    probs = torch.softmax(logits, dim=1).squeeze().tolist()
+    try:
+        # Split text into chunks if it's very long (transformers has a token limit)
+        text_chunks = [text[i:i+512] for i in range(0, len(text), 512)]
+
+        # Process each chunk and average the results
+        ai_scores = []
+        for chunk in text_chunks[:10]:  # Limit to first 10 chunks to avoid timeouts
+            if not chunk.strip():
+                continue
+
+            inputs = tokenizer(chunk, return_tensors="pt", truncation=True, padding=True, max_length=512)
+
+            with torch.no_grad():
+                outputs = model(**inputs)
+                logits = outputs.logits
+
+            # Get probabilities - models typically output [human_prob, ai_prob]
+            probs = torch.softmax(logits, dim=1).squeeze().tolist()
+
+            # Check if it's a single value or list (depends on model output format)
+            if isinstance(probs, list):
+                # Most AI detection models output [human_prob, ai_prob]
+                ai_prob = probs[1] if len(probs) > 1 else probs[0]
+            else:
+                # Single value models typically output AI probability directly
+                ai_prob = probs
+
+            ai_scores.append(ai_prob * 100)
+
+        # Calculate average AI probability across chunks
+        if ai_scores:
+            avg_ai_score = sum(ai_scores) / len(ai_scores)
+            logger.info(f"AI detection complete: {avg_ai_score:.2f}%")
+            return {"ai_generated_percentage": round(avg_ai_score, 2)}
+        else:
+            raise HTTPException(status_code=400, detail="Could not analyze text content.")

-    ai_generated_percentage = round(probs[1] * 100, 2)
+    except Exception as e:
+        logger.error(f"Error during AI detection: {str(e)}")
+        raise HTTPException(status_code=500, detail=f"Error analyzing text: {str(e)}")

-    return {"ai_generated_percentage": ai_generated_percentage}


 # from fastapi import FastAPI, UploadFile, File, HTTPException, BackgroundTasks
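For quick verification of the fix, here is a minimal client sketch against the two endpoints in the diff above. The base URL, port, and file name are assumptions (a locally served app); the routes and response keys come from the committed code.

import requests

BASE_URL = "http://localhost:8000"  # assumption: app served locally, e.g. `uvicorn main:app`

# Health check: reports whether the detector model loaded at startup.
print(requests.get(f"{BASE_URL}/").json())
# e.g. {"status": "ok", "model_loaded": True, "model_name": "Hello-SimpleAI/chatgpt-detector-roberta"}

# Detection: upload a PDF as multipart/form-data (this is what the
# python-multipart dependency added in this commit is needed for).
with open("essay.pdf", "rb") as f:  # "essay.pdf" is a placeholder file name
    resp = requests.post(f"{BASE_URL}/detect",
                         files={"file": ("essay.pdf", f, "application/pdf")})
print(resp.json())  # e.g. {"ai_generated_percentage": 42.13}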
requirements.txt CHANGED
@@ -1,10 +1,12 @@
 # requirements.txt

-fastapi
-uvicorn
-transformers
-torch
-pymupdf
+fastapi>=0.95.0
+uvicorn>=0.21.1
+transformers>=4.28.0
+torch>=2.0.0
+PyMuPDF>=1.22.0
+python-multipart>=0.0.6
+

 # --extra-index-url https://download.pytorch.org/whl/cpu
 # fastapi==0.103.2
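Note on the requirements change: python-multipart is the library FastAPI relies on to parse multipart/form-data, so the UploadFile parameter on /detect fails at request time without it, and adding minimum versions makes the Space's builds more reproducible. The pymupdf → PyMuPDF rename is cosmetic, since pip normalizes package names case-insensitively.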