from typing import Dict, List

import PyPDF2


class PDFProcessor:
    """Extract per-page text from a PDF and split text into word-aligned chunks."""

    def __init__(self):
        # Maps 0-based page index -> extracted text of the most recent
        # extract_text() call.
        self.pages: Dict[int, str] = {}

    def extract_text(self, pdf_file) -> Dict[int, str]:
        """Extract text from a PDF, keyed by 0-based page index.

        Args:
            pdf_file: Passed unchanged to ``PyPDF2.PdfReader``
                (presumably a path or a binary file-like object; see the
                PyPDF2 documentation for the accepted types).

        Returns:
            A dict mapping each 0-based page index to that page's extracted
            text. The same dict is also stored on ``self.pages``.
        """
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # Build a fresh mapping on every call so that pages left over from a
        # previously processed (longer) document cannot leak into this result.
        self.pages = {
            page_num: page.extract_text()
            for page_num, page in enumerate(pdf_reader.pages)
        }
        return self.pages

    def chunk_text(self, text: str, chunk_size: int = 1000) -> List[str]:
        """Split text into whitespace-normalised chunks of bounded length.

        Words (as produced by ``str.split()``) are packed greedily and joined
        with single spaces, so each chunk is at most ``chunk_size`` characters
        long. Words are never split: a single word longer than ``chunk_size``
        is emitted as its own, oversized chunk.

        Args:
            text: The text to split.
            chunk_size: Maximum length of a chunk in characters, counting the
                joining spaces. Must be at least 1.

        Returns:
            The list of chunks in original order; empty if ``text`` contains
            no words.

        Raises:
            ValueError: If ``chunk_size`` is less than 1.
        """
        if chunk_size < 1:
            raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")

        chunks: List[str] = []
        current_chunk: List[str] = []
        current_size = 0  # exact length of ' '.join(current_chunk)

        for word in text.split():
            # A joining space is only needed when the chunk already has a word.
            separator = 1 if current_chunk else 0
            if current_chunk and current_size + separator + len(word) > chunk_size:
                chunks.append(' '.join(current_chunk))
                current_chunk = [word]
                current_size = len(word)
            else:
                # Also taken for an oversized first word, which must start a
                # chunk on its own rather than flushing an empty one.
                current_chunk.append(word)
                current_size += separator + len(word)

        if current_chunk:
            chunks.append(' '.join(current_chunk))
        return chunks