import re
from typing import Any, List

import numpy as np
import pdfplumber
import pytesseract
from pdf2image import convert_from_path
from transformers import AutoTokenizer

from src.utils.config import DALAT5_MODEL, CHUNK_SIZE, CHUNK_OVERLAP

# Load DalaT5's tokeniser
tokeniser = AutoTokenizer.from_pretrained(DALAT5_MODEL)

def extract_text_with_pdfplumber(file: Any) -> str:
    """
    Extract text by leveraging PDFPlumber, which is particularly useful for PDF files
    with tabular data.
    """
    if file.name.endswith(".pdf"):
        try:
            with pdfplumber.open(file.name) as pdf:
                texts = [page.extract_text() or "" for page in pdf.pages]
            return "\n".join(texts).strip()
        except Exception as e:
            print(f"[ERROR] PDFPlumber failed: {e}")
            return ""
    return ""

def extract_text_with_ocr(file: Any) -> str:
    """
    Extract text data by leveraging Tesseract.
    """
    if file.name.endswith(".pdf"):
        try:
            images = convert_from_path(file.name, dpi=300)
            page_texts = []
            for img in images:
                raw = pytesseract.image_to_string(img, lang="kaz+eng")
                # Clean page by page
                cleaned = repair_extracted_text(raw)
                page_texts.append(cleaned)
            return "\n".join(page_texts).strip()
        except Exception as e:
            print(f"[ERROR] OCR failed: {e}")
            return ""
    # Non-PDF input: return an empty string rather than falling through to None
    return ""

def clean_text(text: str) -> str:
    """
    Pre-clean text before chunking.
    """
    # Collapse newlines into a single space
    text = re.sub(r"\n+", " ", text)
    # Strip runs of two or more punctuation/symbol characters
    text = re.sub(r"[^\w\s]{2,}", "", text)
    # Replace bullets and dash-like characters with a space
    text = re.sub(r"[•‣–—―]+", " ", text)
    # Normalise extra spacing
    text = re.sub(r"\s{2,}", " ", text)
    return text.strip()

def is_valid_chunk(chunk: str) -> bool:
    """
    Heuristic to filter out low-quality chunks.
    """
    if len(chunk) < 20:
        return False
    symbols = sum(1 for c in chunk if not c.isalnum() and c != " ")
    if symbols / len(chunk) > 0.4:
        return False
    return True

def deduplicate_chunks(chunks: List[str], embedder: Any, threshold: float = 0.95) -> List[str]:
    """
    Deduplicate chunks based on cosine similarity.
    Only retains semantically distinct segments.
    """
    unique_chunks = []
    seen_embeddings = []
    for chunk in chunks:
        emb = embedder.embed_text(chunk)
        # Keep the chunk only if its cosine similarity to every retained embedding stays below the threshold
        if all(
            np.dot(emb, e) / (np.linalg.norm(emb) * np.linalg.norm(e)) < threshold
            for e in seen_embeddings
        ):
            unique_chunks.append(chunk)
            seen_embeddings.append(emb)
    return unique_chunks

def chunk_text(text: str) -> List[str]:
    """
    Chunk text into overlapping token-based segments using DalaT5's tokeniser.
    """
    # Clean text before doing anything
    cleaned_text = clean_text(text)
    # Encode with the tokeniser
    tokens = tokeniser.encode(cleaned_text, add_special_tokens=False)
    total_tokens = len(tokens)
    if total_tokens <= CHUNK_SIZE:
        single_chunk = tokeniser.decode(tokens, skip_special_tokens=True).strip()
        return [single_chunk] if is_valid_chunk(single_chunk) else []
    chunks = []
    start = 0
    while start < total_tokens:
        end = min(start + CHUNK_SIZE, total_tokens)
        chunk_tokens = tokens[start:end]
        chunk = tokeniser.decode(chunk_tokens, skip_special_tokens=True).strip()
        if is_valid_chunk(chunk):
            chunks.append(chunk)
        start += CHUNK_SIZE - CHUNK_OVERLAP
    return chunks

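# Sliding-window illustration for chunk_text (a sketch with assumed values; the real
# CHUNK_SIZE and CHUNK_OVERLAP are read from src.utils.config):
#   with CHUNK_SIZE = 512 and CHUNK_OVERLAP = 64, window starts advance by 448 tokens,
#   i.e. offsets 0, 448, 896, ..., so consecutive chunks share 64 tokens of context.
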
def repair_extracted_text(text: str) -> str:
    """
    Additional logic to repair broken line splits, hyphenations, and common repetition artifacts.
    """
    # Collapse immediately repeated words of four or more characters
    text = re.sub(r'\b(\w{4,})\s+\1\b', r'\1', text)
    # Re-join words hyphenated across line breaks
    text = re.sub(r'(\w+)-\s+(\w+)', r'\1\2', text)
    # Drop runs of five or more 1-2 character tokens (common OCR noise)
    text = re.sub(r'(\b\w{1,2}\b\s+){5,}', '', text)
    # Remove some previously observed junk tokens
    text = re.sub(r'\b(Googsoft|Hoogsoft|biometriialyq|avtorometriia)\b', '', text)
    # Collapse multiple spaces
    text = re.sub(r'\s{2,}', ' ', text)
    return text.strip()
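
# Minimal usage sketch (illustrative only, not part of the original pipeline): compose the
# helpers above — extract, chunk, then optionally deduplicate. The file object and embedder
# here are hypothetical; deduplicate_chunks only needs an object exposing embed_text(str).
if __name__ == "__main__":
    class _UploadedFile:
        # Stand-in for the uploaded-file object the extractors expect (anything with a .name)
        name = "example.pdf"  # hypothetical path

    uploaded = _UploadedFile()
    text = extract_text_with_pdfplumber(uploaded) or extract_text_with_ocr(uploaded)
    chunks = chunk_text(text)
    print(f"Produced {len(chunks)} chunk(s)")
    # unique = deduplicate_chunks(chunks, embedder)  # embedder: any object with embed_text()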