plantuml-agent / src /documents.py
savi8sant8s's picture
Finished initial app
f6662f9
raw
history blame
920 Bytes
import fitz
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
def get_processed_documents(pdf_path: str, *, chunk_size: int = 500, chunk_overlap: int = 50):
    """Load a PDF and split its page texts into overlapping chunks.

    The PDF is first converted to one document per non-empty page via
    ``load_pdf_as_documents``; those page documents are then split with a
    ``RecursiveCharacterTextSplitter``.

    Args:
        pdf_path: Path of the PDF file to read.
        chunk_size: Maximum chunk size handed to the splitter. Defaults to
            500, the value this function previously hard-coded.
        chunk_overlap: Overlap between consecutive chunks handed to the
            splitter. Defaults to 50, the previously hard-coded value.

    Returns:
        The list of chunk documents produced by
        ``text_splitter.split_documents``.
    """
    source_docs = load_pdf_as_documents(pdf_path)
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        add_start_index=True,
        strip_whitespace=True,
        # Separators are tried in order: paragraph breaks first, then line
        # breaks, sentences, words, and finally individual characters.
        separators=["\n\n", "\n", ".", " ", ""],
    )
    return text_splitter.split_documents(source_docs)
def load_pdf_as_documents(pdf_path: str):
    """Read a PDF and return one ``Document`` per page that contains text.

    Each page's text is extracted and stripped of surrounding whitespace;
    pages whose stripped text is empty are skipped. Every returned document
    carries ``metadata["source"]`` set to ``"page_<n>"``, where ``n`` is the
    1-based page position in the PDF.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        A list of ``Document`` objects in page order.
    """
    documents = []
    # Use the PDF as a context manager so the underlying file handle is
    # closed even if text extraction raises part-way through.
    with fitz.open(pdf_path) as pdf:
        for page_number, page in enumerate(pdf, start=1):
            text = page.get_text().strip()
            if not text:
                # Nothing extractable on this page; skip it.
                continue
            documents.append(
                Document(
                    page_content=text,
                    metadata={"source": f"page_{page_number}"},
                )
            )
    return documents