Pujan-Dev committed
Commit 4fee431 · 1 Parent(s): 0b3d6d9

feat:added basic
README.md CHANGED
@@ -5,4 +5,5 @@ colorFrom: yellow
 colorTo: blue
 sdk: docker
 pinned: false
----
+---
+
app.py CHANGED
@@ -5,6 +5,7 @@ from slowapi.errors import RateLimitExceeded
 from slowapi.util import get_remote_address
 from fastapi.responses import JSONResponse
 from features.text_classifier.routes import router as text_classifier_router
+from features.nepali_text_classifier.routes import router as nepali_text_classifier_router
 from config import ACCESS_RATE
 import requests
 limiter = Limiter(key_func=get_remote_address, default_limits=[ACCESS_RATE])
@@ -25,7 +26,7 @@ app.add_middleware(SlowAPIMiddleware)
 
 # Include your routes
 app.include_router(text_classifier_router, prefix="/text")
-
+app.include_router(nepali_text_classifier_router,prefix="/NP")
 @app.get("/")
 @limiter.limit(ACCESS_RATE)
 async def root(request: Request):
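With this change the Nepali routes are mounted under the /NP prefix alongside the existing /text routes. A minimal check of the wiring, not part of the commit, assuming FastAPI's TestClient (backed by httpx) is available in the environment:

# Editor's sketch, not committed code: confirm the /NP mount without a live server.
from fastapi.testclient import TestClient

from app import app

client = TestClient(app)
print(client.get("/NP/health").json())  # expected: {"status": "ok"}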
__init__.py → features/nepali_text_classifier/__init__.py RENAMED
File without changes
features/nepali_text_classifier/controller.py ADDED
@@ -0,0 +1,36 @@
+import asyncio
+from fastapi import HTTPException, status, Depends
+from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
+import os
+
+from features.nepali_text_classifier.inferencer import classify_text
+
+security = HTTPBearer()
+
+async def verify_token(credentials: HTTPAuthorizationCredentials = Depends(security)):
+    token = credentials.credentials
+    expected_token = os.getenv("MY_SECRET_TOKEN")
+    if token != expected_token:
+        raise HTTPException(
+            status_code=status.HTTP_403_FORBIDDEN,
+            detail="Invalid or expired token"
+        )
+    return token
+
+async def nepali_text_analysis(text: str):
+    # Fix: split once and reuse
+    words = text.split()
+    if len(words) < 10:
+        raise HTTPException(status_code=400, detail="Text must contain at least 10 words")
+    if len(text) > 10000:
+        raise HTTPException(status_code=413, detail="Text must be less than 10,000 characters")
+
+    label, confidence = await asyncio.to_thread(classify_text, text)
+    return {
+        "result": label,
+        "ai_likelihood": confidence
+    }
+
+def classify(text: str):
+    return classify_text(text)
+
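One thing to note: classify_text (added in inferencer.py below) returns a dict of the form {"label": ..., "confidence": ...}, not a (label, confidence) pair, so the tuple unpack in nepali_text_analysis would silently bind the dict's key names rather than the prediction. A minimal sketch of one way to reconcile the handler with that return shape, assuming the dict return is kept; this is an editor's illustration, not part of the commit:

import asyncio

from fastapi import HTTPException

from features.nepali_text_classifier.inferencer import classify_text


async def nepali_text_analysis(text: str):
    # Same validation as the committed handler.
    words = text.split()
    if len(words) < 10:
        raise HTTPException(status_code=400, detail="Text must contain at least 10 words")
    if len(text) > 10000:
        raise HTTPException(status_code=413, detail="Text must be less than 10,000 characters")

    # classify_text returns a dict, so read the fields by key instead of unpacking.
    result = await asyncio.to_thread(classify_text, text)
    return {"result": result["label"], "ai_likelihood": result["confidence"]}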
features/nepali_text_classifier/inferencer.py ADDED
@@ -0,0 +1,21 @@
+import torch
+from .model_loader import get_model_tokenizer
+import torch.nn.functional as F
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+
+def classify_text(text: str):
+    model, tokenizer = get_model_tokenizer()
+    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
+    inputs = {k: v.to(device) for k, v in inputs.items()}
+
+    with torch.no_grad():
+        outputs = model(**inputs)
+    logits = outputs if isinstance(outputs, torch.Tensor) else outputs.logits
+    probs = F.softmax(logits, dim=1)
+    pred = torch.argmax(probs, dim=1).item()
+    prob_percent = probs[0][pred].item() * 100
+
+    return {"label": "Human" if pred == 0 else "AI", "confidence": round(prob_percent, 2)}
+
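classify_text can be exercised directly from a Python shell; the first call goes through get_model_tokenizer(), which downloads and caches the model. The input string and printed values below are placeholders, not real output:

from features.nepali_text_classifier.inferencer import classify_text

# First call triggers the snapshot download and weight load; later calls reuse the cached model.
result = classify_text("<Nepali passage of at least ten words>")
print(result)  # e.g. {'label': 'Human', 'confidence': 97.42}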
features/nepali_text_classifier/model_loader.py ADDED
@@ -0,0 +1,54 @@
+import os
+import shutil
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import logging
+from huggingface_hub import snapshot_download
+from transformers import AutoTokenizer, AutoModel
+
+# Configs
+REPO_ID = "Pujan-Dev/Nepali-AI-VS-HUMAN"
+BASE_DIR = "./np_text_model"
+TOKENIZER_DIR = os.path.join(BASE_DIR, "classifier")  # <- update this to match your uploaded folder
+WEIGHTS_PATH = os.path.join(BASE_DIR, "model_95_acc.pth")  # <- change to match actual uploaded weight
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+# Define model class
+class XLMRClassifier(nn.Module):
+    def __init__(self):
+        super(XLMRClassifier, self).__init__()
+        self.bert = AutoModel.from_pretrained("xlm-roberta-base")
+        self.classifier = nn.Linear(self.bert.config.hidden_size, 2)
+
+    def forward(self, input_ids, attention_mask):
+        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
+        cls_output = outputs.last_hidden_state[:, 0, :]
+        return self.classifier(cls_output)
+
+# Globals for caching
+_model = None
+_tokenizer = None
+
+def download_model_repo():
+    if os.path.exists(BASE_DIR) and os.path.isdir(BASE_DIR):
+        logging.info("Model already downloaded.")
+        return
+    snapshot_path = snapshot_download(repo_id=REPO_ID)
+    os.makedirs(BASE_DIR, exist_ok=True)
+    shutil.copytree(snapshot_path, BASE_DIR, dirs_exist_ok=True)
+
+def load_model():
+    download_model_repo()
+    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_DIR)
+    model = XLMRClassifier().to(device)
+    model.load_state_dict(torch.load(WEIGHTS_PATH, map_location=device))
+    model.eval()
+    return model, tokenizer
+
+def get_model_tokenizer():
+    global _model, _tokenizer
+    if _model is None or _tokenizer is None:
+        _model, _tokenizer = load_model()
+    return _model, _tokenizer
+
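The English classifier eagerly warms its model via warmup() (see its model_loader.py change further down), while this Nepali loader is only invoked lazily on the first request. A sketch of eager loading at startup, in case the same behaviour is wanted here; this is an editor's suggestion, not something this commit wires up:

# Editor's sketch, not committed code: pre-load the Nepali model when the API starts
# so the first /NP request does not pay the download + weight-load cost.
from fastapi import FastAPI

from features.nepali_text_classifier.model_loader import get_model_tokenizer

app = FastAPI()


@app.on_event("startup")
def warm_nepali_model():
    # First call runs download_model_repo() and load_model(); later calls hit the cached globals.
    get_model_tokenizer()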
features/nepali_text_classifier/preprocess.py ADDED
@@ -0,0 +1,32 @@
+import fitz  # PyMuPDF
+import docx
+from io import BytesIO
+import logging
+from fastapi import HTTPException
+
+
+def parse_docx(file: BytesIO):
+    doc = docx.Document(file)
+    text = ""
+    for para in doc.paragraphs:
+        text += para.text + "\n"
+    return text
+
+
+def parse_pdf(file: BytesIO):
+    try:
+        doc = fitz.open(stream=file, filetype="pdf")
+        text = ""
+        for page_num in range(doc.page_count):
+            page = doc.load_page(page_num)
+            text += page.get_text()
+        return text
+    except Exception as e:
+        logging.error(f"Error while processing PDF: {str(e)}")
+        raise HTTPException(
+            status_code=500, detail="Error processing PDF file")
+
+
+def parse_txt(file: BytesIO):
+    return file.read().decode("utf-8")
+
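These parsers expect an in-memory, file-like object. A small sketch of feeding them raw bytes wrapped in BytesIO; the file names are placeholders, and this commit does not yet add a file-upload route for the Nepali classifier:

from io import BytesIO

from features.nepali_text_classifier.preprocess import parse_docx, parse_pdf, parse_txt

# Placeholder file names; an UploadFile's bytes would be wrapped the same way.
with open("sample.pdf", "rb") as f:
    pdf_text = parse_pdf(BytesIO(f.read()))

with open("sample.docx", "rb") as f:
    docx_text = parse_docx(BytesIO(f.read()))

with open("sample.txt", "rb") as f:
    txt_text = parse_txt(BytesIO(f.read()))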
features/nepali_text_classifier/routes.py ADDED
@@ -0,0 +1,29 @@
+from slowapi import Limiter
+from config import ACCESS_RATE
+from .controller import nepali_text_analysis
+from .inferencer import classify_text
+from fastapi import APIRouter, Request, Depends, HTTPException
+from fastapi.security import HTTPBearer
+from slowapi import Limiter
+from slowapi.util import get_remote_address
+from pydantic import BaseModel
+router = APIRouter()
+limiter = Limiter(key_func=get_remote_address)
+security = HTTPBearer()
+
+# Input schema
+class TextInput(BaseModel):
+    text: str
+
+@router.post("/analyse")
+@limiter.limit(ACCESS_RATE)
+async def analyse(request: Request, data: TextInput, token: str = Depends(security)):
+    # Token is available as `token.credentials`, add validation if needed
+    result = classify_text(data.text)
+    return result
+
+@router.get("/health")
+@limiter.limit(ACCESS_RATE)
+def health(request: Request):
+    return {"status": "ok"}
+
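Mounted under the /NP prefix in app.py, these routes resolve to POST /NP/analyse and GET /NP/health. A client sketch using requests; the base URL is an assumption about the deployment, and note that in this commit /analyse only requires a Bearer header to be present (verify_token from controller.py is not applied yet):

import requests

BASE_URL = "http://localhost:8000"  # assumption: wherever the FastAPI app is served

# Health check (rate limited, no auth required).
print(requests.get(f"{BASE_URL}/NP/health").json())  # {"status": "ok"}

# Analyse a passage; any Bearer token passes the HTTPBearer scheme as committed.
resp = requests.post(
    f"{BASE_URL}/NP/analyse",
    json={"text": "<Nepali passage to analyse>"},
    headers={"Authorization": "Bearer <token>"},
)
print(resp.json())  # e.g. {'label': 'AI', 'confidence': 88.15}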
features/text_classifier/controller.py CHANGED
@@ -60,7 +60,7 @@ async def handle_file_upload(file: UploadFile):
     try:
         file_contents = await extract_file_contents(file)
         if len(file_contents) > 10000:
-            return {"message": "File contains more than 10,000 characters."}
+            raise HTTPException(status_code=413, detail="Text must be less than 10,000 characters")
 
         cleaned_text = file_contents.replace("\n", " ").replace("\t", " ").strip()
         if not cleaned_text:
@@ -87,7 +87,6 @@ async def handle_sentence_level_analysis(text: str):
     if len(text) > 10000:
         raise HTTPException(status_code=413, detail="Text must be less than 10,000 characters")
 
-    # Use SpaCy for sentence splitting
     doc = nlp(text)
     sentences = [sent.text.strip() for sent in doc.sents]
 
@@ -108,7 +107,7 @@ async def handle_file_sentence(file: UploadFile):
     try:
         file_contents = await extract_file_contents(file)
         if len(file_contents) > 10000:
-            return {"message": "File contains more than 10,000 characters."}
+            raise HTTPException(status_code=413, detail="Text must be less than 10,000 characters")
 
         cleaned_text = file_contents.replace("\n", " ").replace("\t", " ").strip()
         if not cleaned_text:
@@ -123,7 +122,6 @@ async def handle_file_sentence(file: UploadFile):
         logging.error(f"Error processing file: {e}")
         raise HTTPException(status_code=500, detail="Error processing the file")
 
-# Optional synchronous helper function
 def classify(text: str):
     return classify_text(text)
 
features/text_classifier/model_loader.py CHANGED
@@ -18,9 +18,9 @@ _model, _tokenizer = None, None
 def warmup():
     global _model, _tokenizer
     # Ensure punkt is available
-
     download_model_repo()
     _model, _tokenizer = load_model()
+    logging.info("Its ready")
 
 
 def download_model_repo():
np_text_model/.gitattributes ADDED
@@ -0,0 +1,36 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+classifier/tokenizer.json filter=lfs diff=lfs merge=lfs -text
np_text_model/classifier/sentencepiece.bpe.model ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865
+size 5069051
np_text_model/classifier/special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
+{
+  "bos_token": "<s>",
+  "cls_token": "<s>",
+  "eos_token": "</s>",
+  "mask_token": {
+    "content": "<mask>",
+    "lstrip": true,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<pad>",
+  "sep_token": "</s>",
+  "unk_token": "<unk>"
+}
np_text_model/classifier/tokenizer.json ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:883b037111086fd4dfebbbc9b7cee11e1517b5e0c0514879478661440f137085
+size 17082987
np_text_model/classifier/tokenizer_config.json ADDED
@@ -0,0 +1,55 @@
+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "250001": {
+      "content": "<mask>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": false,
+  "cls_token": "<s>",
+  "eos_token": "</s>",
+  "extra_special_tokens": {},
+  "mask_token": "<mask>",
+  "model_max_length": 512,
+  "pad_token": "<pad>",
+  "sep_token": "</s>",
+  "tokenizer_class": "XLMRobertaTokenizer",
+  "unk_token": "<unk>"
+}