from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from docling.document_converter import DocumentConverter
from bs4 import BeautifulSoup
import requests

# Base URL of the PlantUML documentation site scraped below.
plantuml_url = "https://plantuml.com"


def get_diagrams_names():
    """Scrape the PlantUML homepage and return the diagram menu link targets.

    Returns:
        list[str]: the ``href`` values of the anchors inside the site's
        ``menu1`` navigation div (relative paths to diagram pages).

    Raises:
        requests.HTTPError: if the homepage request returns an error status.
        RuntimeError: if the expected ``menu1`` div is missing from the page.
    """
    # Explicit timeout: requests has none by default and can hang forever.
    response = requests.get(plantuml_url, timeout=30)
    # Fail loudly on 4xx/5xx instead of silently parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    menu = soup.find('div', class_='menu1')
    if menu is None:
        # Site layout changed; a clear error beats AttributeError on None.
        raise RuntimeError("Could not find the 'menu1' div on the PlantUML homepage")
    links = menu.find_all('a')
    return [link['href'] for link in links if 'href' in link.attrs]


def load_documents(diagrams_names):
    """Convert each PlantUML documentation page into a LangChain Document.

    Args:
        diagrams_names: iterable of relative paths (e.g. the output of
            ``get_diagrams_names``) appended to ``plantuml_url``.

    Returns:
        list[Document]: one Document per page, holding the page's Markdown
        export as content and the relative path under the ``source``
        metadata key.
    """
    converter = DocumentConverter()
    documents = []
    for diagram_name in diagrams_names:
        result = converter.convert(f"{plantuml_url}/{diagram_name}")
        markdown = result.document.export_to_markdown()
        documents.append(
            Document(
                page_content=markdown,
                metadata={"source": diagram_name},
            )
        )
    return documents


def get_processed_documents(diagrams_names):
    """Load the given PlantUML pages and split them into overlapping chunks.

    Args:
        diagrams_names: iterable of relative paths to documentation pages.

    Returns:
        list[Document]: chunks of at most 500 characters with a 50-character
        overlap; each chunk records its offset in the source document under
        the ``start_index`` metadata key.
    """
    source_docs = load_documents(diagrams_names)
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=50,
        add_start_index=True,  # keep each chunk's offset for traceability
        strip_whitespace=True,
        separators=["\n\n", "\n", ".", " ", ""],
    )
    return text_splitter.split_documents(source_docs)