Spaces:

can-org
/

AI-Checker

Running

File size: 4,588 Bytes

6f034a7

import asyncio
import logging
import os
import secrets
from io import BytesIO

from fastapi import HTTPException, UploadFile, status, Depends
from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
from nltk.tokenize import sent_tokenize

from .inferencer import classify_text
from .preprocess import parse_docx, parse_pdf, parse_txt

# HTTP Bearer security scheme instance; injected into verify_token via Depends().
security = HTTPBearer()

# Verify Bearer token from Authorization header
async def verify_token(credentials: HTTPAuthorizationCredentials = Depends(security)):
    """Validate the bearer token supplied with the request.

    The presented token is compared against the value of the
    ``MY_SECRET_TOKEN`` environment variable.

    Args:
        credentials: Bearer credentials resolved by the ``security`` dependency.

    Returns:
        The presented token string when it matches the expected one.

    Raises:
        HTTPException: 403 when the token does not match, or when
            ``MY_SECRET_TOKEN`` is not set (the check fails closed).
    """
    token = credentials.credentials
    expected_token = os.getenv("MY_SECRET_TOKEN")

    # Constant-time comparison so response timing does not reveal how much of
    # the secret was guessed. Operands are encoded to bytes because
    # compare_digest() rejects str values containing non-ASCII characters.
    token_matches = expected_token is not None and secrets.compare_digest(
        token.encode("utf-8"), expected_token.encode("utf-8")
    )
    if not token_matches:
        raise HTTPException(
            status_code=status.HTTP_403_FORBIDDEN,
            detail="Invalid or expired token"
        )
    return token

# Classify plain text input
async def handle_text_analysis(text: str):
    """Validate a plain-text submission and classify it.

    Args:
        text: Raw text; surrounding whitespace is stripped before validation.

    Returns:
        A dict with keys ``result``, ``perplexity`` (rounded to 2 decimals)
        and ``ai_likelihood``, taken from the 3-tuple ``classify_text`` returns.

    Raises:
        HTTPException: 400 when the text has fewer than 10
            whitespace-separated words; 413 when it is longer than
            10,000 characters.
    """
    text = text.strip()

    # An empty string splits into zero words, so this also rejects empty input.
    word_count = len(text.split())
    if word_count < 10:
        raise HTTPException(status_code=400, detail="Text must contain at least 10 words")

    char_count = len(text)
    if char_count > 10000:
        raise HTTPException(status_code=413, detail="Text must be less than 10,000 characters")

    # classify_text is a plain function; run it in a worker thread so the
    # event loop is not blocked while it executes.
    verdict, ppl, likelihood = await asyncio.to_thread(classify_text, text)
    return dict(
        result=verdict,
        perplexity=round(ppl, 2),
        ai_likelihood=likelihood,
    )

# Extract text from uploaded files (.docx, .pdf, .txt)
async def extract_file_contents(file: UploadFile) -> str:
    """Read an uploaded file and return the text produced by the matching parser.

    The parser is selected from the upload's declared ``content_type``.

    Args:
        file: The uploaded file; it is read in full into memory.

    Returns:
        Whatever the selected parser (``parse_docx``, ``parse_pdf`` or
        ``parse_txt``) returns for the file's bytes.

    Raises:
        HTTPException: 415 when the content type is none of the supported ones.
    """
    raw_bytes = await file.read()
    buffer = BytesIO(raw_bytes)

    # Declared MIME type -> parser that receives the in-memory stream.
    parsers_by_mime = {
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document": parse_docx,
        "application/pdf": parse_pdf,
        "text/plain": parse_txt,
    }

    parser = parsers_by_mime.get(file.content_type)
    if parser is None:
        raise HTTPException(
            status_code=415,
            detail="Invalid file type. Only .docx, .pdf, and .txt are allowed."
        )
    return parser(buffer)

# Classify text from uploaded file
async def handle_file_upload(file: UploadFile):
    """Extract the text of an uploaded file and classify it as a whole.

    Args:
        file: The uploaded .docx, .pdf or .txt file.

    Returns:
        ``{"message": ...}`` when the extracted text exceeds 10,000
        characters; otherwise a dict with the extracted ``content`` plus
        ``result``, ``perplexity`` (rounded to 2 decimals) and
        ``ai_likelihood`` from ``classify_text``.

    Raises:
        HTTPException: 415 for an unsupported file type (raised by
            ``extract_file_contents``), 404 when the file has no
            non-whitespace text, 500 for any unexpected failure.
    """
    try:
        file_contents = await extract_file_contents(file)
        if len(file_contents) > 10000:
            return {"message": "File contains more than 10,000 characters."}

        # Flatten newlines and tabs to spaces before classification.
        cleaned_text = file_contents.replace("\n", " ").replace("\t", " ").strip()
        if not cleaned_text:
            raise HTTPException(status_code=404, detail="The file is empty or only contains whitespace.")

        # Run the classifier in a worker thread so the event loop stays free.
        label, perplexity, ai_likelihood = await asyncio.to_thread(classify_text, cleaned_text)
        return {
            "content": file_contents,
            "result": label,
            "perplexity": round(perplexity, 2),
            "ai_likelihood": ai_likelihood
        }
    except HTTPException:
        # HTTPException is itself an Exception subclass; without this clause
        # the deliberate 415/404 responses above were rewritten into a 500.
        raise
    except Exception as e:
        # logging.exception keeps the original message and adds the traceback.
        logging.exception(f"Error processing file: {e}")
        raise HTTPException(status_code=500, detail="Error processing the file") from e

# Analyze each sentence in plain text input
async def handle_sentence_level_analysis(text: str):
    """Split text into sentences and classify each sentence separately.

    Args:
        text: Raw text; surrounding whitespace is stripped first.

    Returns:
        ``{"analysis": [...]}`` where each entry holds the ``sentence``, its
        ``label``, ``perplexity`` (rounded to 2 decimals) and
        ``ai_likelihood`` as returned by ``classify_text``.

    Raises:
        HTTPException: 400 when the text is empty after stripping; 413 when
            it is longer than 10,000 characters.
    """
    text = text.strip()
    if not text:
        # Previously text[-1] raised IndexError here, surfacing as a 500.
        raise HTTPException(status_code=400, detail="Text must not be empty")

    # Enforce the limit on the caller's text, before any period is appended,
    # so an input of exactly 10,000 characters is not rejected.
    if len(text) > 10000:
        raise HTTPException(status_code=413, detail="Text must be less than 10,000 characters")

    # Terminate the final sentence, but do not stack a "." onto text that
    # already ends with sentence-final punctuation (e.g. "Really?" -> "Really?.").
    if not text.endswith((".", "!", "?")):
        text += "."

    sentences = sent_tokenize(text, language="english")
    results = []
    for sentence in sentences:
        if not sentence.strip():
            continue
        # Sentences are classified one at a time, each in a worker thread.
        label, perplexity, ai_likelihood = await asyncio.to_thread(classify_text, sentence)
        results.append({
            "sentence": sentence,
            "label": label,
            "perplexity": round(perplexity, 2),
            "ai_likelihood": ai_likelihood
        })
    return {"analysis": results}

# Analyze each sentence from uploaded file
async def handle_file_sentence(file: UploadFile):
    """Extract the text of an uploaded file and classify it sentence by sentence.

    Args:
        file: The uploaded .docx, .pdf or .txt file.

    Returns:
        ``{"message": ...}`` when the extracted text exceeds 10,000
        characters; otherwise a dict with the extracted ``content`` merged
        with the result of ``handle_sentence_level_analysis``.

    Raises:
        HTTPException: 415 for an unsupported file type (raised by
            ``extract_file_contents``), 404 when the file has no
            non-whitespace text, any HTTPException raised by
            ``handle_sentence_level_analysis``, and 500 for any unexpected
            failure.
    """
    try:
        file_contents = await extract_file_contents(file)
        if len(file_contents) > 10000:
            return {"message": "File contains more than 10,000 characters."}

        # Flatten newlines and tabs to spaces before sentence splitting.
        cleaned_text = file_contents.replace("\n", " ").replace("\t", " ").strip()
        if not cleaned_text:
            raise HTTPException(status_code=404, detail="The file is empty or only contains whitespace.")

        result = await handle_sentence_level_analysis(cleaned_text)
        return {
            "content": file_contents,
            **result
        }
    except HTTPException:
        # HTTPException is itself an Exception subclass; without this clause
        # deliberate client-error responses were rewritten into a 500.
        raise
    except Exception as e:
        # logging.exception keeps the original message and adds the traceback.
        logging.exception(f"Error processing file: {e}")
        raise HTTPException(status_code=500, detail="Error processing the file") from e

# Optional synchronous helper function
def classify(text: str):
    """Call ``classify_text`` on *text* synchronously and return its result unchanged."""
    outcome = classify_text(text)
    return outcome