Commit
·
cd2a7b2
1
Parent(s):
9e462c0
Fix the api
Browse files- main.py +119 -98
- requirements.txt +3 -1
main.py
CHANGED
@@ -1,24 +1,27 @@
|
|
1 |
-
# main.py: AI Detection
|
2 |
|
3 |
from fastapi import FastAPI, UploadFile, File, HTTPException
|
4 |
from fastapi.middleware.cors import CORSMiddleware
|
5 |
-
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
6 |
-
import
|
7 |
import fitz # PyMuPDF
|
|
|
8 |
import os
|
9 |
import logging
|
|
|
|
|
10 |
|
11 |
# Configure logging
|
12 |
logging.basicConfig(level=logging.INFO)
|
13 |
logger = logging.getLogger(__name__)
|
14 |
|
15 |
app = FastAPI(
|
16 |
-
title="AI Text Detection API",
|
17 |
-
description="API endpoint for detecting AI-generated content in PDFs",
|
18 |
-
version="
|
19 |
)
|
20 |
|
21 |
-
# Enable CORS
|
22 |
app.add_middleware(
|
23 |
CORSMiddleware,
|
24 |
allow_origins=["*"],
|
@@ -27,115 +30,133 @@ app.add_middleware(
|
|
27 |
allow_headers=["*"],
|
28 |
)
|
29 |
|
30 |
-
#
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
48 |
try:
|
49 |
-
|
50 |
-
logger.info(f"
|
51 |
-
|
52 |
-
model
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
# Helper: Extract text from PDF
|
59 |
def extract_text_from_pdf(pdf_bytes):
|
60 |
try:
|
61 |
with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
|
62 |
-
|
|
|
|
|
63 |
except Exception as e:
|
64 |
logger.error(f"PDF extraction error: {str(e)}")
|
65 |
raise RuntimeError(f"Failed to read PDF content: {str(e)}")
|
66 |
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
75 |
|
76 |
-
# AI detection endpoint
|
77 |
@app.post("/detect")
|
78 |
-
async def
|
79 |
-
|
80 |
-
|
81 |
-
raise HTTPException(status_code=503, detail="Model is not loaded. Please check server logs.")
|
82 |
-
|
83 |
-
if not file.filename.lower().endswith(".pdf"):
|
84 |
-
raise HTTPException(status_code=400, detail="Only PDF files are accepted.")
|
85 |
|
86 |
try:
|
87 |
-
|
88 |
pdf_bytes = await file.read()
|
89 |
text = extract_text_from_pdf(pdf_bytes)
|
90 |
-
logger.info(f"Extracted {len(text)} characters from PDF")
|
91 |
-
except Exception as e:
|
92 |
-
logger.error(f"Error processing PDF: {str(e)}")
|
93 |
-
raise HTTPException(status_code=500, detail=str(e))
|
94 |
-
|
95 |
-
if not text:
|
96 |
-
raise HTTPException(status_code=400, detail="No readable text found in PDF.")
|
97 |
-
|
98 |
-
try:
|
99 |
-
# Split text into chunks if it's very long (transformers has a token limit)
|
100 |
-
text_chunks = [text[i:i+512] for i in range(0, len(text), 512)]
|
101 |
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
# Get probabilities - models typically output [human_prob, ai_prob]
|
115 |
-
probs = torch.softmax(logits, dim=1).squeeze().tolist()
|
116 |
-
|
117 |
-
# Check if it's a single value or list (depends on model output format)
|
118 |
-
if isinstance(probs, list):
|
119 |
-
# Most AI detection models output [human_prob, ai_prob]
|
120 |
-
ai_prob = probs[1] if len(probs) > 1 else probs[0]
|
121 |
-
else:
|
122 |
-
# Single value models typically output AI probability directly
|
123 |
-
ai_prob = probs
|
124 |
-
|
125 |
-
ai_scores.append(ai_prob * 100)
|
126 |
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
134 |
|
135 |
except Exception as e:
|
136 |
-
logger.error(f"
|
137 |
-
raise HTTPException(status_code=500, detail=
|
138 |
-
|
|
|
|
|
|
|
|
|
|
|
139 |
|
140 |
|
141 |
# from fastapi import FastAPI, UploadFile, File, HTTPException, BackgroundTasks
|
|
|
1 |
+
# main.py: AI Detection and Plagiarism Check API
|
2 |
|
3 |
from fastapi import FastAPI, UploadFile, File, HTTPException
|
4 |
from fastapi.middleware.cors import CORSMiddleware
|
5 |
+
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
|
6 |
+
from sentence_transformers import SentenceTransformer, util
|
7 |
import fitz # PyMuPDF
|
8 |
+
import numpy as np
|
9 |
import os
|
10 |
import logging
|
11 |
+
import statistics
|
12 |
+
import torch
|
13 |
|
14 |
# Configure logging
|
15 |
logging.basicConfig(level=logging.INFO)
|
16 |
logger = logging.getLogger(__name__)
|
17 |
|
18 |
app = FastAPI(
|
19 |
+
title="AI Text and Plagiarism Detection API",
|
20 |
+
description="API endpoint for detecting AI-generated content and semantic plagiarism in PDFs",
|
21 |
+
version="2.0.0"
|
22 |
)
|
23 |
|
24 |
+
# Enable CORS
|
25 |
app.add_middleware(
|
26 |
CORSMiddleware,
|
27 |
allow_origins=["*"],
|
|
|
30 |
allow_headers=["*"],
|
31 |
)
|
32 |
|
33 |
+
# Hugging Face checkpoint used to embed sentences for the plagiarism check.
SENTENCE_MODEL = "sentence-transformers/all-roberta-large-v1"

# Candidate AI-detection checkpoints; initialize_models() tries them in order
# and keeps the first one that loads.
AI_MODEL_CHOICES = [
    "roberta-base-openai-detector",
    "Hello-SimpleAI/chatgpt-detector-roberta",
    "distilroberta-base",
]

# Runtime state shared by the loaders and the request handler.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
ai_model = None        # set by initialize_models()
sentence_model = None  # set by initialize_models()
# Cosine-similarity cutoff above which a sentence counts as a plagiarism match.
similarity_threshold = 0.82
|
46 |
+
|
47 |
+
async def initialize_models():
    """Populate the module-level ``ai_model`` and ``sentence_model`` globals.

    Walks ``AI_MODEL_CHOICES`` in order and keeps the first checkpoint that
    loads into a text-classification pipeline, then loads the sentence
    embedding model named by ``SENTENCE_MODEL``. Load failures are logged and
    swallowed, so either global may still be ``None`` when this returns.
    """
    global ai_model, sentence_model

    # The pipeline takes a device index rather than a device name.
    pipeline_device = -1 if device != "cuda" else 0

    # AI detection model: first candidate that loads wins.
    for model_name in AI_MODEL_CHOICES:
        try:
            tok = AutoTokenizer.from_pretrained(model_name)
            classifier = AutoModelForSequenceClassification.from_pretrained(model_name)
            ai_model = pipeline(
                "text-classification",
                model=classifier,
                tokenizer=tok,
                device=pipeline_device,
            )
        except Exception as e:
            logger.error(f"Failed to load {model_name}: {str(e)}")
        else:
            logger.info(f"Loaded AI model: {model_name}")
            break

    # Sentence embedding model used by the plagiarism check.
    try:
        sentence_model = SentenceTransformer(SENTENCE_MODEL, device=device)
    except Exception as e:
        logger.error(f"Failed to load sentence model: {str(e)}")
    else:
        logger.info(f"Loaded sentence model: {SENTENCE_MODEL}")
|
72 |
+
|
73 |
+
@app.on_event("startup")
async def startup_event():
    """Application startup hook: load the detection models."""
    await initialize_models()
|
76 |
+
|
|
|
77 |
def extract_text_from_pdf(pdf_bytes):
    """Return the text of every page of an in-memory PDF, concatenated.

    Args:
        pdf_bytes: Raw bytes of the PDF document.

    Returns:
        The pages' text joined with no separator, with leading and trailing
        whitespace stripped.

    Raises:
        RuntimeError: If the document cannot be opened or read.
    """
    try:
        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
            page_texts = [page.get_text() for page in doc]
        text = "".join(page_texts).strip()
        logger.info(f"Extracted {len(text)} characters from PDF")
        return text
    except Exception as e:
        logger.error(f"PDF extraction error: {str(e)}")
        raise RuntimeError(f"Failed to read PDF content: {str(e)}")
|
86 |
|
87 |
+
def analyze_plagiarism(text, reference_texts):
    """Estimate how much of ``text`` semantically matches the reference texts.

    The text is split on ``.``; only fragments longer than 50 characters are
    kept. Each kept sentence is embedded with the module-level
    ``sentence_model`` and compared to every reference by cosine similarity.
    A sentence counts as a match when its best similarity exceeds the
    module-level ``similarity_threshold``.

    Args:
        text: Document text to analyze.
        reference_texts: Sequence of reference strings to compare against.

    Returns:
        Percentage (0-100, rounded to two decimals) of kept sentences that
        match a reference. Returns 0.0 when fewer than three sentences
        qualify or when there are no reference texts.

    Raises:
        Exception: Any error from embedding or scoring is logged and re-raised.
    """
    try:
        # Split into sentences, keeping only substantial ones.
        stripped = (fragment.strip() for fragment in text.split('.'))
        sentences = [s for s in stripped if len(s) > 50]
        if len(sentences) < 3:
            return 0.0  # Not enough content to analyze

        # With nothing to compare against, the similarity matrix would have
        # zero columns and the row-wise max below would raise.
        if len(reference_texts) == 0:
            return 0.0

        # Generate embeddings
        query_embeddings = sentence_model.encode(sentences, convert_to_tensor=True)
        ref_embeddings = sentence_model.encode(reference_texts, convert_to_tensor=True)

        # Cosine similarity of every sentence against every reference.
        cos_scores = util.cos_sim(query_embeddings, ref_embeddings)

        # Best reference score per sentence, then count those above threshold.
        max_scores = np.max(cos_scores.cpu().numpy(), axis=1)
        matches = int((max_scores > similarity_threshold).sum())

        # Plain Python float so the value serializes cleanly in the response.
        plagiarism_percent = (matches / len(sentences)) * 100
        return round(float(plagiarism_percent), 2)

    except Exception as e:
        logger.error(f"Plagiarism analysis failed: {str(e)}")
        raise
|
113 |
|
|
|
114 |
@app.post("/detect")
async def analyze_essay(file: UploadFile = File(...)):
    """Score an uploaded PDF for AI-generated content and semantic plagiarism.

    Args:
        file: The uploaded PDF document.

    Returns:
        A dict with an ``"ai_detection"`` and a ``"plagiarism"`` section. Each
        holds the computed percentage, the fixed warning threshold, and
        whether the percentage exceeds that threshold.

    Raises:
        HTTPException: 503 if either model is not loaded, 400 if fewer than
            100 characters of text were extracted, 500 for any other failure.
    """
    # Compare against None explicitly: truthiness of model objects can depend
    # on container semantics rather than on whether loading succeeded.
    if ai_model is None or sentence_model is None:
        raise HTTPException(status_code=503, detail="Models not loaded")

    try:
        # Process PDF
        pdf_bytes = await file.read()
        text = extract_text_from_pdf(pdf_bytes)

        if len(text) < 100:
            raise HTTPException(status_code=400, detail="Insufficient text length")

        # AI Detection on the first 5120 characters. Truncate at the tokenizer
        # so long inputs cannot exceed the model's sequence limit, and request
        # every label's score so the AI score is available even when the top
        # label is the human one.
        ai_result = ai_model(
            text[:5120], truncation=True, max_length=512, top_k=None
        )
        # Some pipeline versions wrap a single input's scores in an outer list.
        if ai_result and isinstance(ai_result[0], list):
            ai_result = ai_result[0]
        # NOTE(review): label names differ per checkpoint; confirm this set
        # against the loaded model's id2label mapping.
        ai_labels = {"fake", "ai", "chatgpt"}
        ai_score = next(
            (x['score'] for x in ai_result if str(x['label']).lower() in ai_labels),
            0.0,
        )
        ai_percent = round(float(ai_score) * 100, 2)

        # Plagiarism Detection against the configured reference texts.
        reference_texts = load_reference_texts()
        plagiarism_percent = analyze_plagiarism(text, reference_texts)

        return {
            "ai_detection": {
                "percentage": ai_percent,
                "threshold": 85.0,
                "warning": ai_percent > 85.0
            },
            "plagiarism": {
                "percentage": plagiarism_percent,
                "threshold": 15.0,
                "warning": plagiarism_percent > 15.0,
                "method": "semantic_similarity"
            }
        }

    except HTTPException:
        # Deliberate client errors (e.g. the 400 above) must keep their status
        # instead of being rewrapped as a 500 below.
        raise
    except Exception as e:
        logger.error(f"Analysis failed: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e)) from e
|
154 |
+
|
155 |
+
def load_reference_texts():
    """Return the reference texts used for plagiarism comparison.

    Placeholder: no reference source is wired in yet, so this returns an
    empty list. A real implementation should return a list of reference
    texts/sentences, e.g. ["Sample reference text 1", "Sample reference text 2"].
    """
    references = []
    return references
|
160 |
|
161 |
|
162 |
# from fastapi import FastAPI, UploadFile, File, HTTPException, BackgroundTasks
|
requirements.txt
CHANGED
@@ -6,7 +6,9 @@ transformers>=4.28.0
|
|
6 |
torch>=2.0.0
|
7 |
PyMuPDF>=1.22.0
|
8 |
python-multipart>=0.0.6
|
9 |
-
|
|
|
|
|
10 |
|
11 |
# --extra-index-url https://download.pytorch.org/whl/cpu
|
12 |
# fastapi==0.103.2
|
|
|
6 |
torch>=2.0.0
|
7 |
PyMuPDF>=1.22.0
|
8 |
python-multipart>=0.0.6
|
9 |
+
huggingface-hub>=0.14.1
|
10 |
+
numpy>=1.22.0
|
11 |
+
scipy>=1.8.0
|
12 |
|
13 |
# --extra-index-url https://download.pytorch.org/whl/cpu
|
14 |
# fastapi==0.103.2
|