|
import requests
from bs4 import BeautifulSoup
from docling.document_converter import DocumentConverter
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Base URL of the official PlantUML site, whose pages we scrape and convert.
plantuml_url = "https://plantuml.com"


def get_diagrams_names():
    # Fetch the PlantUML home page and parse its navigation menu.
    response = requests.get(plantuml_url)
    soup = BeautifulSoup(response.content, "html.parser")
    menu = soup.find("div", class_="menu1")
    links = menu.find_all("a")
    # Keep the href of each menu entry; every one points at a diagram page.
    diagrams = [link["href"] for link in links if "href" in link.attrs]
    return diagrams


def load_documents(diagrams_names):
    converter = DocumentConverter()
    documents = []
    for diagram_name in diagrams_names:
        # Docling downloads the page and converts it to a structured document,
        # which we export as Markdown for downstream chunking.
        result = converter.convert(f"{plantuml_url}/{diagram_name}")
        markdown = result.document.export_to_markdown()
        documents.append(
            Document(
                page_content=markdown,
                metadata={"source": diagram_name},
            )
        )
    return documents


def get_processed_documents(diagrams_names):
    source_docs = load_documents(diagrams_names)

    # Split on progressively finer separators so chunks break at paragraph,
    # line, and sentence boundaries before falling back to single characters.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=50,
        add_start_index=True,
        strip_whitespace=True,
        separators=["\n\n", "\n", ".", " ", ""],
    )

    docs_processed = text_splitter.split_documents(source_docs)

    return docs_processed
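

# A minimal usage sketch, not part of the pipeline above: wire the helpers
# together and inspect a few resulting chunks. Assumes network access to
# plantuml.com and that the menu structure scraped above is still in place.
if __name__ == "__main__":
    names = get_diagrams_names()
    chunks = get_processed_documents(names)
    print(f"Fetched {len(names)} pages, produced {len(chunks)} chunks")
    for chunk in chunks[:3]:
        # Each chunk keeps its source page and start offset in the metadata.
        print(chunk.metadata, chunk.page_content[:80])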