Spaces:
Sleeping
Sleeping
import fitz | |
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
from langchain.docstore.document import Document | |
def get_processed_documents(
    pdf_path: str,
    *,
    chunk_size: int = 500,
    chunk_overlap: int = 50,
):
    """Load a PDF and split its page texts into overlapping chunks.

    The PDF is first converted to per-page documents with
    ``load_pdf_as_documents``; those documents are then passed through a
    ``RecursiveCharacterTextSplitter``.

    Args:
        pdf_path: Path of the PDF file to process.
        chunk_size: Maximum chunk size handed to the splitter. Defaults to
            500, the value that was previously hard-coded.
        chunk_overlap: Overlap between consecutive chunks handed to the
            splitter. Defaults to 50, the value that was previously
            hard-coded.

    Returns:
        The list of chunked documents produced by
        ``RecursiveCharacterTextSplitter.split_documents``.
    """
    source_docs = load_pdf_as_documents(pdf_path)
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        # Ask the splitter to record where each chunk starts in its page.
        add_start_index=True,
        strip_whitespace=True,
        # Tried in order: paragraph, line, sentence, word, then character.
        separators=["\n\n", "\n", ".", " ", ""],
    )
    return text_splitter.split_documents(source_docs)
def load_pdf_as_documents(pdf_path: str):
    """Load a PDF and return one ``Document`` per non-empty page.

    Each page's text is extracted with ``page.get_text()`` and stripped of
    leading/trailing whitespace. Pages whose stripped text is empty are
    skipped. Every remaining page becomes a ``Document`` whose
    ``metadata["source"]`` is ``"page_<n>"``, where ``n`` is the 1-based
    page number in the original PDF (skipped pages still consume a number).

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        A list of ``Document`` objects in page order.
    """
    documents = []
    # Open via the context manager so the PDF handle is released even when
    # text extraction raises; previously the document was never closed.
    with fitz.open(pdf_path) as doc:
        for page_number, page in enumerate(doc, start=1):
            text = page.get_text().strip()
            if text:
                documents.append(
                    Document(
                        page_content=text,
                        metadata={"source": f"page_{page_number}"},
                    )
                )
    return documents