# plantuml-agent/src/documents.py

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from docling.document_converter import DocumentConverter
from bs4 import BeautifulSoup
import requests
plantuml_url = "https://plantuml.com"


def get_diagrams_names():
    """Scrape the PlantUML homepage menu and return the relative links to the diagram pages."""
    response = requests.get(plantuml_url)
    soup = BeautifulSoup(response.content, 'html.parser')
    # The 'menu1' div holds the site navigation, with one link per diagram type.
    menu = soup.find('div', class_='menu1')
    links = menu.find_all('a')
    diagrams = [link['href'] for link in links if 'href' in link.attrs]
    return diagrams


def load_documents(diagrams_names):
    """Convert each diagram page to Markdown with Docling and wrap it in a LangChain Document."""
    converter = DocumentConverter()
    documents = []
    for diagram_name in diagrams_names:
        # Docling fetches the page and exposes the converted content as Markdown.
        result = converter.convert(f"{plantuml_url}/{diagram_name}")
        markdown = result.document.export_to_markdown()
        documents.append(
            Document(
                page_content=markdown,
                metadata={"source": diagram_name},
            )
        )
    return documents


def get_processed_documents(diagrams_names):
    """Load the diagram pages and split them into overlapping chunks for retrieval."""
    source_docs = load_documents(diagrams_names)
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=50,
        add_start_index=True,
        strip_whitespace=True,
        separators=["\n\n", "\n", ".", " ", ""],
    )
    docs_processed = text_splitter.split_documents(source_docs)
    return docs_processed
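

if __name__ == "__main__":
    # Minimal usage sketch, not part of the original module: it assumes network
    # access to https://plantuml.com and that the menu structure scraped in
    # get_diagrams_names() is unchanged. The page slice below is only to keep a
    # test run short.
    names = get_diagrams_names()
    print(f"Found {len(names)} diagram pages")
    chunks = get_processed_documents(names[:2])
    for chunk in chunks[:3]:
        print(chunk.metadata, len(chunk.page_content))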