Pujan-Dev committed
Commit 26de4c9 · Parent(s): f167a6a

feat: fixed it

Dockerfile CHANGED
@@ -11,6 +11,7 @@ WORKDIR /app
 
 COPY --chown=user ./requirements.txt requirements.txt
 RUN pip install --no-cache-dir --upgrade -r requirements.txt
+RUN python -m spacy download en_core_web_sm
 
 COPY --chown=user . /app
 CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
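Downloading en_core_web_sm at build time means spacy.load needs no network access at runtime. A minimal smoke test for the built image (the script name and sample text are illustrative, not part of the repo):

# check_model.py: hypothetical sanity check, run inside the container
import spacy

# Raises OSError if the pipeline was not baked into the image at build time.
nlp = spacy.load("en_core_web_sm")

doc = nlp("First sentence. Second one.")
assert len(list(doc.sents)) == 2
print("en_core_web_sm is available")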
features/text_classifier/controller.py CHANGED
@@ -9,9 +9,9 @@ from nltk.tokenize import sent_tokenize
 
 from .inferencer import classify_text
 from .preprocess import parse_docx, parse_pdf, parse_txt
-from nltk.tokenize import sent_tokenize
-
+import spacy
 security = HTTPBearer()
+nlp = spacy.load("en_core_web_sm")
 
 # Verify Bearer token from Authorization header
 async def verify_token(credentials: HTTPAuthorizationCredentials = Depends(security)):
@@ -78,18 +78,23 @@ async def handle_file_upload(file: UploadFile):
         logging.error(f"Error processing file: {e}")
         raise HTTPException(status_code=500, detail="Error processing the file")
 
-# Analyze each sentence in plain text input
+
+
 async def handle_sentence_level_analysis(text: str):
     text = text.strip()
-    if text[-1] != ".":
-        text+="."
+    if not text.endswith("."):
+        text += "."
+
     if len(text) > 10000:
         raise HTTPException(status_code=413, detail="Text must be less than 10,000 characters")
-
-    sentences = sent_tokenize(text, language="english")
+
+    # Use SpaCy for sentence splitting
+    doc = nlp(text)
+    sentences = [sent.text.strip() for sent in doc.sents]
+
     results = []
     for sentence in sentences:
-        if not sentence.strip():
+        if not sentence:
             continue
         label, perplexity, ai_likelihood = await asyncio.to_thread(classify_text, sentence)
         results.append({
@@ -98,9 +103,9 @@ async def handle_sentence_level_analysis(text: str):
             "perplexity": round(perplexity, 2),
             "ai_likelihood": ai_likelihood
         })
-    return {"analysis": results}
 
-# Analyze each sentence from uploaded file
+    return {"analysis": results}
+# Analyze each sentence from uploaded file
 async def handle_file_sentence(file: UploadFile):
     try:
         file_contents = await extract_file_contents(file)
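The commit swaps NLTK's sent_tokenize for spaCy's parser-driven sentence boundaries, with the en_core_web_sm pipeline loaded once at module import. A minimal sketch of the new splitting path in isolation (sample text is illustrative):

import spacy

# Loaded once at startup, as the controller now does at module level.
nlp = spacy.load("en_core_web_sm")

text = "Perplexity is computed per sentence. Low values hint at AI text"
if not text.endswith("."):  # mirrors the handler's normalization step
    text += "."

doc = nlp(text)
sentences = [sent.text.strip() for sent in doc.sents]
print(sentences)
# ['Perplexity is computed per sentence.', 'Low values hint at AI text.']

Because each sentence is already stripped in the list comprehension, the loop guard can be the simpler "if not sentence:" instead of re-stripping each item.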
features/text_classifier/model_loader.py CHANGED
@@ -5,7 +5,6 @@ from transformers import GPT2LMHeadModel, GPT2TokenizerFast, GPT2Config
 from huggingface_hub import snapshot_download
 import torch
 from dotenv import load_dotenv
-import nltk
 load_dotenv()
 REPO_ID = "Pujan-Dev/AI-Text-Detector"
 MODEL_DIR = "./models"
@@ -19,7 +18,6 @@ _model, _tokenizer = None, None
 def warmup():
     global _model, _tokenizer
     # Ensure punkt is available
-    nltk.download("punkt")
 
     download_model_repo()
     _model, _tokenizer = load_model()
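With nltk.download("punkt") removed, warmup() only has to fetch the GPT-2 checkpoint; the spaCy pipeline is installed when the image is built. The body of download_model_repo() is not shown in this diff, but given the imports and constants above it plausibly reduces to a call like the following (a sketch under that assumption, not the repo's actual implementation):

from huggingface_hub import snapshot_download

REPO_ID = "Pujan-Dev/AI-Text-Detector"
MODEL_DIR = "./models"

# Downloads the model repo into MODEL_DIR; cached files are reused on later calls.
snapshot_download(repo_id=REPO_ID, local_dir=MODEL_DIR)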
requirements.txt CHANGED
@@ -7,6 +7,6 @@ python-dotenv
 python-docx
 pydantic
 PyMuPDF
-nltk
 python-multipart
-slowapi
+slowapi
+spacy