from typing import List

import fitz
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter


def get_processed_documents(
    pdf_path: str,
    *,
    chunk_size: int = 500,
    chunk_overlap: int = 50,
) -> List[Document]:
    """Load a PDF and split its text into overlapping chunks.

    Each non-empty page is first loaded as one ``Document`` (see
    ``load_pdf_as_documents``) and then split with a
    ``RecursiveCharacterTextSplitter``.

    Args:
        pdf_path: Path of the PDF file to read.
        chunk_size: Value passed to the splitter as ``chunk_size``.
        chunk_overlap: Value passed to the splitter as ``chunk_overlap``.

    Returns:
        The list of chunk documents returned by
        ``text_splitter.split_documents``.
    """
    source_docs = load_pdf_as_documents(pdf_path)

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        add_start_index=True,
        strip_whitespace=True,
        # Ordered from coarsest to finest: paragraph, line, sentence, word,
        # and finally individual characters.
        separators=["\n\n", "\n", ".", " ", ""],
    )

    return text_splitter.split_documents(source_docs)


def load_pdf_as_documents(pdf_path: str) -> List[Document]:
    """Read a PDF and return one ``Document`` per page that contains text.

    Pages whose extracted text is empty after stripping whitespace are
    skipped. Each document's ``metadata["source"]`` is ``"page_<n>"``, where
    ``n`` is the 1-based page number in the PDF.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        Page-level documents in page order; empty if no page has text.
    """
    documents = []

    # Use the document as a context manager so the underlying file is closed
    # even if text extraction raises partway through.
    with fitz.open(pdf_path) as doc:
        for page_number, page in enumerate(doc, start=1):
            text = page.get_text().strip()
            if not text:
                continue
            documents.append(
                Document(
                    page_content=text,
                    metadata={"source": f"page_{page_number}"},
                )
            )

    return documents