# semanticdala/src/utils/data_utils.py
import re
from typing import Any, List

import numpy as np
import pdfplumber
import pytesseract
from pdf2image import convert_from_path
from transformers import AutoTokenizer

from src.utils.config import DALAT5_MODEL, CHUNK_SIZE, CHUNK_OVERLAP

# Load DalaT5's tokeniser
tokeniser = AutoTokenizer.from_pretrained(DALAT5_MODEL)

def extract_text_with_pdfplumber(file: Any) -> str:
    """
    Extract text by leveraging PDFPlumber, which is particularly useful for PDF files
    with tabular data.
    """
    if file.name.endswith(".pdf"):
        try:
            with pdfplumber.open(file.name) as pdf:
                texts = [page.extract_text() or "" for page in pdf.pages]
                return "\n".join(texts).strip()
        except Exception as e:
            print(f"[ERROR] PDFPlumber failed: {e}")
            return ""
    return ""


def extract_text_with_ocr(file: Any) -> str:
    """
    Extract text data by leveraging Tesseract (Kazakh + English), intended for
    scanned or image-based PDF files.
    """
    if file.name.endswith(".pdf"):
        try:
            images = convert_from_path(file.name, dpi=300)
            page_texts = []

            for img in images:
                raw = pytesseract.image_to_string(img, lang="kaz+eng")

                # Clean page-by-page
                cleaned = repair_extracted_text(raw)
                page_texts.append(cleaned)

            return "\n".join(page_texts).strip()
        except Exception as e:
            print(f"[ERROR] OCR failed: {e}")
            return ""
    return ""

def clean_text(text: str) -> str:
    """
    Pre-clean text before chunking.
    """
    # Collapse newlines into a single space
    text = re.sub(r"\n+", " ", text)

    # Strip runs of two or more punctuation/symbol characters
    text = re.sub(r"[^\w\s]{2,}", "", text)

    # Replace bullets, dashes and similar separators with a space
    text = re.sub(r"[‒●–—―]+", " ", text)

    # Normalize extra spacing
    text = re.sub(r"\s{2,}", " ", text)

    return text.strip()


def is_valid_chunk(chunk: str) -> bool:
    """
    Heuristic to filter out low-quality chunks: discard very short chunks and
    chunks dominated by non-alphanumeric symbols.
    """
    if len(chunk) < 20:
        return False

    symbols = sum(1 for c in chunk if not c.isalnum() and c != ' ')

    if symbols / len(chunk) > 0.4:
        return False

    return True

def deduplicate_chunks(chunks: List[str], embedder: Any, threshold: float = 0.95) -> List[str]:
    """
    Deduplicate chunks based on cosine similarity.
    Only retains semantically distinct segments.
    """
    unique_chunks = []
    seen_embeddings = []

    for chunk in chunks:
        emb = embedder.embed_text(chunk)

        # Keep the chunk only if its cosine similarity to every previously
        # retained embedding stays below the threshold
        if all(np.dot(emb, e) / (np.linalg.norm(emb) * np.linalg.norm(e)) < threshold for e in seen_embeddings):
            unique_chunks.append(chunk)
            seen_embeddings.append(emb)

    return unique_chunks

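
# Note: deduplicate_chunks() types `embedder` as Any; the only interface it relies on
# is an `embed_text` method returning a 1-D numpy vector. The structural type below is
# an illustrative sketch of that assumption, not something the functions above require.
from typing import Protocol


class EmbedderLike(Protocol):
    def embed_text(self, text: str) -> np.ndarray: ...
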
def chunk_text(text: str) -> List[str]:
    """
    Chunk text into overlapping token-based segments using DalaT5's tokeniser.
    """
    # Clean text before doing anything
    cleaned_text = clean_text(text)

    # Encode with the tokeniser
    tokens = tokeniser.encode(cleaned_text, add_special_tokens=False)
    total_tokens = len(tokens)

    if total_tokens <= CHUNK_SIZE:
        single_chunk = tokeniser.decode(tokens, skip_special_tokens=True).strip()
        return [single_chunk] if is_valid_chunk(single_chunk) else []

    chunks = []
    start = 0

    while start < total_tokens:
        end = min(start + CHUNK_SIZE, total_tokens)
        chunk_tokens = tokens[start:end]
        chunk = tokeniser.decode(chunk_tokens, skip_special_tokens=True).strip()

        if is_valid_chunk(chunk):
            chunks.append(chunk)

        start += CHUNK_SIZE - CHUNK_OVERLAP

    return chunks

def repair_extracted_text(text: str) -> str:
    """
    Additional logic to repair broken line splits, hyphenations, and common OCR
    repetition artifacts.
    """
    # Collapse immediately repeated words (4+ characters) into a single occurrence
    text = re.sub(r'\b(\w{4,})\s+\1\b', r'\1', text)

    # Re-join words hyphenated across line breaks
    text = re.sub(r'(\w+)-\s+(\w+)', r'\1\2', text)

    # Drop long runs of very short (1-2 character) tokens, a common OCR artifact
    text = re.sub(r'(\b\w{1,2}\b\s+){5,}', '', text)

    # Remove some previously observed junk tokens
    text = re.sub(r'\b(Googsoft|Hoogsoft|biometriialyq|avtorometriia)\b', '', text)

    # Collapse multiple spaces
    text = re.sub(r'\s{2,}', ' ', text)

    return text.strip()