import re
from typing import Any, List

import numpy as np
import pdfplumber
import pytesseract
from pdf2image import convert_from_path
from transformers import AutoTokenizer

from src.utils.config import DALAT5_MODEL, CHUNK_SIZE, CHUNK_OVERLAP

# Load DalaT5's tokeniser
tokeniser = AutoTokenizer.from_pretrained(DALAT5_MODEL)


def extract_text_with_pdfplumber(file: Any) -> str:
    """
    Extract text by leveraging PDFPlumber, which is particularly useful
    for PDF files with tabular data.
    """
    if file.name.endswith(".pdf"):
        try:
            with pdfplumber.open(file.name) as pdf:
                texts = [page.extract_text() or "" for page in pdf.pages]
            return "\n".join(texts).strip()
        except Exception as e:
            print(f"[ERROR] PDFPlumber failed: {e}")
            return ""
    return ""


def extract_text_with_ocr(file: Any) -> str:
    """
    Extract text data by leveraging Tesseract.
    """
    if file.name.endswith(".pdf"):
        try:
            images = convert_from_path(file.name, dpi=300)
            page_texts = []
            for img in images:
                raw = pytesseract.image_to_string(img, lang="kaz+eng")
                # Clean page-by-page
                cleaned = repair_extracted_text(raw)
                page_texts.append(cleaned)
            return "\n".join(page_texts).strip()
        except Exception as e:
            print(f"[ERROR] OCR failed: {e}")
            return ""
    return ""


def clean_text(text: str) -> str:
    """
    Pre-clean text before chunking.
    """
    # Collapse multiple newlines into a space
    text = re.sub(r"\n+", " ", text)
    # Strip runs of two or more punctuation/symbol characters
    text = re.sub(r"[^\w\s]{2,}", "", text)
    # Replace bullets and dash variants with a space
    text = re.sub(r"[•●–—―]+", " ", text)
    # Normalize extra spacing
    text = re.sub(r"\s{2,}", " ", text)
    return text.strip()


def is_valid_chunk(chunk: str) -> bool:
    """
    Heuristic to filter out low-quality chunks.
    """
    if len(chunk) < 20:
        return False
    symbols = sum(1 for c in chunk if not c.isalnum() and c != ' ')
    if symbols / len(chunk) > 0.4:
        return False
    return True


def deduplicate_chunks(chunks: List[str], embedder: Any, threshold: float = 0.95) -> List[str]:
    """
    Deduplicate chunks based on cosine similarity.
    Only retains semantically distinct segments.
    """
    unique_chunks = []
    seen_embeddings = []
    for chunk in chunks:
        emb = embedder.embed_text(chunk)
        # Keep the chunk only if it is dissimilar to everything kept so far
        if all(
            np.dot(emb, e) / (np.linalg.norm(emb) * np.linalg.norm(e)) < threshold
            for e in seen_embeddings
        ):
            unique_chunks.append(chunk)
            seen_embeddings.append(emb)
    return unique_chunks


def chunk_text(text: str) -> List[str]:
    """
    Chunk text into overlapping token-based segments using DalaT5's tokeniser.
    """
    # Clean text before doing anything
    cleaned_text = clean_text(text)
    # Encode with the tokeniser
    tokens = tokeniser.encode(cleaned_text, add_special_tokens=False)
    total_tokens = len(tokens)

    # Short texts fit into a single chunk
    if total_tokens <= CHUNK_SIZE:
        single_chunk = tokeniser.decode(tokens, skip_special_tokens=True).strip()
        return [single_chunk] if is_valid_chunk(single_chunk) else []

    chunks = []
    start = 0
    while start < total_tokens:
        end = min(start + CHUNK_SIZE, total_tokens)
        chunk_tokens = tokens[start:end]
        chunk = tokeniser.decode(chunk_tokens, skip_special_tokens=True).strip()
        if is_valid_chunk(chunk):
            chunks.append(chunk)
        # Slide the window forward, keeping CHUNK_OVERLAP tokens of context
        start += CHUNK_SIZE - CHUNK_OVERLAP
    return chunks


def repair_extracted_text(text: str) -> str:
    """
    Additional logic to repair broken line splits, hyphenations,
    and common repetition artifacts.
    """
    # Collapse an immediately repeated word (4+ characters) into one
    text = re.sub(r'\b(\w{4,})\s+\1\b', r'\1', text)
    # Re-join words hyphenated across line breaks
    text = re.sub(r'(\w+)-\s+(\w+)', r'\1\2', text)
    # Drop long runs of one- or two-character tokens (OCR noise)
    text = re.sub(r'(\b\w{1,2}\b\s+){5,}', '', text)
    # Remove some previously observed junk tokens
    text = re.sub(r'\b(Googsoft|Hoogsoft|biometriialyq|avtorometriia)\b', '', text)
    # Collapse multiple spaces
    text = re.sub(r'\s{2,}', ' ', text)
    return text.strip()