Pujan-Dev committed
Commit 26de4c9 · Parent(s): f167a6a

feat: fixed it

Dockerfile CHANGED
@@ -11,6 +11,7 @@ WORKDIR /app
 
 COPY --chown=user ./requirements.txt requirements.txt
 RUN pip install --no-cache-dir --upgrade -r requirements.txt
+RUN python -m spacy download en_core_web_sm
 
 COPY --chown=user . /app
 CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
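Downloading en_core_web_sm at build time means spacy.load needs no network access at runtime. A minimal smoke test for the built image (the script name and sample text are illustrative, not part of the repo):

# check_model.py: hypothetical sanity check, run inside the container
import spacy

# Raises OSError if the pipeline was not baked into the image at build time.
nlp = spacy.load("en_core_web_sm")

doc = nlp("First sentence. Second one.")
assert len(list(doc.sents)) == 2
print("en_core_web_sm is available")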
features/text_classifier/controller.py CHANGED
@@ -9,9 +9,9 @@ from nltk.tokenize import sent_tokenize
 
 from .inferencer import classify_text
 from .preprocess import parse_docx, parse_pdf, parse_txt
-from nltk.tokenize import sent_tokenize
-
+import spacy
 security = HTTPBearer()
+nlp = spacy.load("en_core_web_sm")
 
 # Verify Bearer token from Authorization header
 async def verify_token(credentials: HTTPAuthorizationCredentials = Depends(security)):
@@ -78,18 +78,23 @@ async def handle_file_upload(file: UploadFile):
         logging.error(f"Error processing file: {e}")
         raise HTTPException(status_code=500, detail="Error processing the file")
 
-# Analyze each sentence in plain text input
+
+
 async def handle_sentence_level_analysis(text: str):
     text = text.strip()
-    if text[-1] != ".":
-        text+="."
+    if not text.endswith("."):
+        text += "."
+
     if len(text) > 10000:
         raise HTTPException(status_code=413, detail="Text must be less than 10,000 characters")
-
-    sentences = sent_tokenize(text, language="english")
+
+    # Use SpaCy for sentence splitting
+    doc = nlp(text)
+    sentences = [sent.text.strip() for sent in doc.sents]
+
     results = []
     for sentence in sentences:
-        if not sentence.strip():
+        if not sentence:
             continue
         label, perplexity, ai_likelihood = await asyncio.to_thread(classify_text, sentence)
         results.append({
@@ -98,9 +103,9 @@ async def handle_sentence_level_analysis(text: str):
             "perplexity": round(perplexity, 2),
             "ai_likelihood": ai_likelihood
         })
-    return {"analysis": results}
 
-# Analyze each sentence from uploaded file
+    return {"analysis": results}
+# Analyze each sentence from uploaded file
 async def handle_file_sentence(file: UploadFile):
     try:
         file_contents = await extract_file_contents(file)
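The commit swaps NLTK's sent_tokenize for spaCy's parser-driven sentence boundaries, with the en_core_web_sm pipeline loaded once at module import. A minimal sketch of the new splitting path in isolation (sample text is illustrative):

import spacy

# Loaded once at startup, as the controller now does at module level.
nlp = spacy.load("en_core_web_sm")

text = "Perplexity is computed per sentence. Low values hint at AI text"
if not text.endswith("."):  # mirrors the handler's normalization step
    text += "."

doc = nlp(text)
sentences = [sent.text.strip() for sent in doc.sents]
print(sentences)
# ['Perplexity is computed per sentence.', 'Low values hint at AI text.']

Because each sentence is already stripped in the list comprehension, the loop guard can be the simpler "if not sentence:" instead of re-stripping each item.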
features/text_classifier/model_loader.py CHANGED
@@ -5,7 +5,6 @@ from transformers import GPT2LMHeadModel, GPT2TokenizerFast, GPT2Config
 from huggingface_hub import snapshot_download
 import torch
 from dotenv import load_dotenv
-import nltk
 load_dotenv()
 REPO_ID = "Pujan-Dev/AI-Text-Detector"
 MODEL_DIR = "./models"
@@ -19,7 +18,6 @@ _model, _tokenizer = None, None
 def warmup():
     global _model, _tokenizer
     # Ensure punkt is available
-    nltk.download("punkt")
 
     download_model_repo()
     _model, _tokenizer = load_model()
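With nltk.download("punkt") removed, warmup() only has to fetch the GPT-2 checkpoint; the spaCy pipeline is installed when the image is built. The body of download_model_repo() is not shown in this diff, but given the imports and constants above it plausibly reduces to a call like the following (a sketch under that assumption, not the repo's actual implementation):

from huggingface_hub import snapshot_download

REPO_ID = "Pujan-Dev/AI-Text-Detector"
MODEL_DIR = "./models"

# Downloads the model repo into MODEL_DIR; cached files are reused on later calls.
snapshot_download(repo_id=REPO_ID, local_dir=MODEL_DIR)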
requirements.txt CHANGED
@@ -7,6 +7,6 @@ python-dotenv
 python-docx
 pydantic
 PyMuPDF
-nltk
 python-multipart
-slowapi
+slowapi
+spacy