Essay-Grader committed
Commit cd2a7b2 · 1 Parent(s): 9e462c0

Fix the api

Files changed (2):
1. main.py +119 -98
2. requirements.txt +3 -1
main.py CHANGED
@@ -1,24 +1,27 @@
-# main.py: AI Detection API for Flutter Integration
+# main.py: AI Detection and Plagiarism Check API
 
 from fastapi import FastAPI, UploadFile, File, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
-from transformers import AutoTokenizer, AutoModelForSequenceClassification
-import torch
+from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
+from sentence_transformers import SentenceTransformer, util
 import fitz  # PyMuPDF
+import numpy as np
 import os
 import logging
+import statistics
+import torch
 
 # Configure logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
 app = FastAPI(
-    title="AI Text Detection API",
-    description="API endpoint for detecting AI-generated content in PDFs",
-    version="1.0.0"
+    title="AI Text and Plagiarism Detection API",
+    description="API endpoint for detecting AI-generated content and semantic plagiarism in PDFs",
+    version="2.0.0"
 )
 
-# Enable CORS for Flutter app access
+# Enable CORS
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
@@ -27,115 +30,133 @@ app.add_middleware(
     allow_headers=["*"],
 )
 
-# Set cache directory to a writable location within the container
-os.environ["TRANSFORMERS_CACHE"] = "/tmp/transformers_cache"
-os.environ["HF_HOME"] = "/tmp/hf_home"
-
-# Load model and tokenizer with proper error handling
-# Using a dedicated AI text detection model
-MODEL_NAME = "Hello-SimpleAI/chatgpt-detector-roberta"  # A fine-tuned model for detecting AI-generated text
-tokenizer = None
-model = None
-
-try:
-    logger.info(f"Loading model and tokenizer: {MODEL_NAME}")
-    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, cache_dir="/tmp/transformers_cache")
-    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, cache_dir="/tmp/transformers_cache")
-    logger.info("Model and tokenizer loaded successfully")
-except Exception as e:
-    logger.error(f"Error loading model: {str(e)}")
-    # Fallback to another model if the first one fails
+# Model configurations
+SENTENCE_MODEL = "sentence-transformers/all-roberta-large-v1"
+AI_MODEL_CHOICES = [
+    "roberta-base-openai-detector",
+    "Hello-SimpleAI/chatgpt-detector-roberta",
+    "distilroberta-base"
+]
+
+# Initialize models
+device = "cuda" if torch.cuda.is_available() else "cpu"
+ai_model = None
+sentence_model = None
+similarity_threshold = 0.82  # Optimal threshold for plagiarism detection
+
+async def initialize_models():
+    global ai_model, sentence_model
+
+    # Load AI detection model
+    for model_name in AI_MODEL_CHOICES:
+        try:
+            tokenizer = AutoTokenizer.from_pretrained(model_name)
+            model = AutoModelForSequenceClassification.from_pretrained(model_name)
+            ai_model = pipeline(
+                "text-classification",
+                model=model,
+                tokenizer=tokenizer,
+                device=0 if device == "cuda" else -1
+            )
+            logger.info(f"Loaded AI model: {model_name}")
+            break
+        except Exception as e:
+            logger.error(f"Failed to load {model_name}: {str(e)}")
+
+    # Load sentence transformer model
     try:
-        FALLBACK_MODEL = "roberta-base-openai-detector"
-        logger.info(f"Trying fallback model: {FALLBACK_MODEL}")
-        tokenizer = AutoTokenizer.from_pretrained(FALLBACK_MODEL, cache_dir="/tmp/transformers_cache")
-        model = AutoModelForSequenceClassification.from_pretrained(FALLBACK_MODEL, cache_dir="/tmp/transformers_cache")
-        logger.info("Fallback model loaded successfully")
-    except Exception as e2:
-        logger.error(f"Error loading fallback model: {str(e2)}")
-        raise RuntimeError(f"Failed to load models: {str(e)} and {str(e2)}")
-
-# Helper: Extract text from PDF
+        sentence_model = SentenceTransformer(SENTENCE_MODEL, device=device)
+        logger.info(f"Loaded sentence model: {SENTENCE_MODEL}")
+    except Exception as e:
+        logger.error(f"Failed to load sentence model: {str(e)}")
+
+@app.on_event("startup")
+async def startup_event():
+    await initialize_models()
+
 def extract_text_from_pdf(pdf_bytes):
     try:
        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
-            return "".join([page.get_text() for page in doc]).strip()
+            text = "".join([page.get_text() for page in doc]).strip()
+            logger.info(f"Extracted {len(text)} characters from PDF")
+            return text
     except Exception as e:
         logger.error(f"PDF extraction error: {str(e)}")
         raise RuntimeError(f"Failed to read PDF content: {str(e)}")
 
-# Health check endpoint
-@app.get("/")
-async def health_check():
-    return {
-        "status": "ok",
-        "model_loaded": model is not None and tokenizer is not None,
-        "model_name": MODEL_NAME
-    }
+def analyze_plagiarism(text, reference_texts):
+    """Analyze text against reference texts using semantic similarity"""
+    try:
+        # Split into sentences
+        sentences = [s.strip() for s in text.split('.') if len(s.strip()) > 50]
+        if len(sentences) < 3:
+            return 0.0  # Not enough content to analyze
+
+        # Generate embeddings
+        query_embeddings = sentence_model.encode(sentences, convert_to_tensor=True)
+        ref_embeddings = sentence_model.encode(reference_texts, convert_to_tensor=True)
+
+        # Calculate cosine similarity
+        cos_scores = util.cos_sim(query_embeddings, ref_embeddings)
+
+        # Find matches above threshold
+        max_scores = np.max(cos_scores.cpu().numpy(), axis=1)
+        matches = sum(score > similarity_threshold for score in max_scores)
+
+        # Calculate plagiarism percentage
+        plagiarism_percent = (matches / len(sentences)) * 100
+        return round(plagiarism_percent, 2)
+
+    except Exception as e:
+        logger.error(f"Plagiarism analysis failed: {str(e)}")
+        raise
 
-# AI detection endpoint
 @app.post("/detect")
-async def detect_ai(file: UploadFile = File(...)):
-    # Check if model is loaded
-    if model is None or tokenizer is None:
-        raise HTTPException(status_code=503, detail="Model is not loaded. Please check server logs.")
-
-    if not file.filename.lower().endswith(".pdf"):
-        raise HTTPException(status_code=400, detail="Only PDF files are accepted.")
+async def analyze_essay(file: UploadFile = File(...)):
+    if not ai_model or not sentence_model:
+        raise HTTPException(status_code=503, detail="Models not loaded")
 
     try:
-        logger.info(f"Processing file: {file.filename}")
+        # Process PDF
         pdf_bytes = await file.read()
         text = extract_text_from_pdf(pdf_bytes)
-        logger.info(f"Extracted {len(text)} characters from PDF")
-    except Exception as e:
-        logger.error(f"Error processing PDF: {str(e)}")
-        raise HTTPException(status_code=500, detail=str(e))
-
-    if not text:
-        raise HTTPException(status_code=400, detail="No readable text found in PDF.")
-
-    try:
-        # Split text into chunks if it's very long (transformers has a token limit)
-        text_chunks = [text[i:i+512] for i in range(0, len(text), 512)]
 
-        # Process each chunk and average the results
-        ai_scores = []
-        for chunk in text_chunks[:10]:  # Limit to first 10 chunks to avoid timeouts
-            if not chunk.strip():
-                continue
-
-            inputs = tokenizer(chunk, return_tensors="pt", truncation=True, padding=True, max_length=512)
-
-            with torch.no_grad():
-                outputs = model(**inputs)
-                logits = outputs.logits
-
-            # Get probabilities - models typically output [human_prob, ai_prob]
-            probs = torch.softmax(logits, dim=1).squeeze().tolist()
-
-            # Check if it's a single value or list (depends on model output format)
-            if isinstance(probs, list):
-                # Most AI detection models output [human_prob, ai_prob]
-                ai_prob = probs[1] if len(probs) > 1 else probs[0]
-            else:
-                # Single value models typically output AI probability directly
-                ai_prob = probs
-
-            ai_scores.append(ai_prob * 100)
+        if len(text) < 100:
+            raise HTTPException(status_code=400, detail="Insufficient text length")
+
+        # AI Detection
+        ai_result = ai_model(text[:5120])  # Use first 5120 characters for analysis
+        ai_score = next((x['score'] for x in ai_result if x['label'] in ['Fake', 'AI']), 0.0)
+        ai_percent = round(ai_score * 100, 2)
+
+        # Plagiarism Detection
+        # Load reference texts from database/known sources
+        reference_texts = load_reference_texts()  # Implement your reference text loading
+        plagiarism_percent = analyze_plagiarism(text, reference_texts)
 
-        # Calculate average AI probability across chunks
-        if ai_scores:
-            avg_ai_score = sum(ai_scores) / len(ai_scores)
-            logger.info(f"AI detection complete: {avg_ai_score:.2f}%")
-            return {"ai_generated_percentage": round(avg_ai_score, 2)}
-        else:
-            raise HTTPException(status_code=400, detail="Could not analyze text content.")
+        return {
+            "ai_detection": {
+                "percentage": ai_percent,
+                "threshold": 85.0,
+                "warning": ai_percent > 85.0
+            },
+            "plagiarism": {
+                "percentage": plagiarism_percent,
+                "threshold": 15.0,
+                "warning": plagiarism_percent > 15.0,
+                "method": "semantic_similarity"
+            }
+        }
 
     except Exception as e:
-        logger.error(f"Error during AI detection: {str(e)}")
-        raise HTTPException(status_code=500, detail=f"Error analyzing text: {str(e)}")
-
+        logger.error(f"Analysis failed: {str(e)}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+def load_reference_texts():
+    """Implement your reference text loading logic here"""
+    # This should return a list of reference texts/sentences to compare against
+    # Example: return [ "Sample reference text 1", "Sample reference text 2" ]
+    return []
 
 
 # from fastapi import FastAPI, UploadFile, File, HTTPException, BackgroundTasks
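The new load_reference_texts helper is committed as a stub that returns an empty list, so the plagiarism score is only meaningful once real reference material is wired in. A minimal sketch of one way to fill the stub in, assuming reference essays are stored as plain .txt files in a local reference_texts/ directory (the directory name and file layout are illustrative assumptions, not part of this commit):

import glob
import os

def load_reference_texts(directory="reference_texts"):
    """Load comparison texts from plain-text files (hypothetical layout)."""
    texts = []
    for path in sorted(glob.glob(os.path.join(directory, "*.txt"))):
        with open(path, encoding="utf-8") as f:
            content = f.read().strip()
        if content:
            texts.append(content)
    return texts

The default parameter keeps the zero-argument call in analyze_essay working, so only the stub body changes; analyze_plagiarism then receives one string per reference document, which sentence_model.encode accepts as a list as-is.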
requirements.txt CHANGED
@@ -6,7 +6,9 @@ transformers>=4.28.0
 torch>=2.0.0
 PyMuPDF>=1.22.0
 python-multipart>=0.0.6
-
+huggingface-hub>=0.14.1
+numpy>=1.22.0
+scipy>=1.8.0
 
 # --extra-index-url https://download.pytorch.org/whl/cpu
 # fastapi==0.103.2
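For a quick end-to-end check of the reworked /detect endpoint, a minimal client sketch; the host, port, and sample.pdf path below are assumptions for local testing, not part of the commit:

import requests

# POST a PDF to /detect and read back the nested response
# structure introduced by this commit.
with open("sample.pdf", "rb") as f:
    resp = requests.post(
        "http://localhost:8000/detect",
        files={"file": ("sample.pdf", f, "application/pdf")},
    )
resp.raise_for_status()
result = resp.json()
print("AI-generated:", result["ai_detection"]["percentage"], "%")
print("Plagiarism:", result["plagiarism"]["percentage"], "% via", result["plagiarism"]["method"])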