File size: 1,396 Bytes
f6662f9
 
b5c4d16
 
 
f6662f9
b5c4d16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f6662f9
 
 
 
 
 
 
 
 
 
 
b5c4d16
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from docling.document_converter import DocumentConverter
from bs4 import BeautifulSoup
import requests

# Base URL of the PlantUML website: fetched to read the diagram menu and used
# as the prefix when building each diagram page URL.
plantuml_url = "https://plantuml.com"

def get_diagrams_names(timeout=30):
    """Return the link targets listed in the PlantUML home page menu.

    Downloads ``plantuml_url``, locates the ``<div class="menu1">`` element
    and collects the ``href`` attribute of every anchor inside it.

    Args:
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout ``requests`` may block indefinitely.

    Returns:
        A list of ``href`` strings in document order. The list is empty when
        the page contains no ``menu1`` div.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    response = requests.get(plantuml_url, timeout=timeout)
    # Fail here on 4xx/5xx instead of parsing an error page as the menu.
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')
    menu = soup.find('div', class_='menu1')
    if menu is None:
        # find() returns None when nothing matches; no menu means no diagrams.
        return []

    # Anchors without an href (e.g. named anchors) carry no target to follow.
    return [link['href'] for link in menu.find_all('a') if 'href' in link.attrs]

def load_documents(diagrams_names):
    """Download each diagram page and convert it to a Markdown ``Document``.

    Args:
        diagrams_names: Iterable of link targets (hrefs) identifying the
            pages to fetch. Each one is resolved against ``plantuml_url``.

    Returns:
        A list with one ``Document`` per entry of ``diagrams_names``, in the
        same order. ``page_content`` holds the page exported as Markdown and
        ``metadata["source"]`` holds the original, unresolved name.
    """
    from urllib.parse import urljoin

    # A trailing slash makes urljoin treat the base as a directory, so plain
    # names, "/rooted" hrefs and absolute URLs all resolve to a valid address
    # (plain string concatenation produced "//" or a doubled scheme for the
    # latter two).
    base_url = plantuml_url.rstrip("/") + "/"

    converter = DocumentConverter()
    documents = []
    for diagram_name in diagrams_names:
        result = converter.convert(urljoin(base_url, diagram_name))
        markdown = result.document.export_to_markdown()
        documents.append(
            Document(
                page_content=markdown,
                metadata={"source": diagram_name}
            )
        )
    return documents

def get_processed_documents(diagrams_names, chunk_size=500, chunk_overlap=50):
    """Load the given diagram pages and split them into overlapping chunks.

    Args:
        diagrams_names: Iterable of diagram page names, forwarded unchanged
            to ``load_documents``.
        chunk_size: Maximum size of each chunk, passed to
            ``RecursiveCharacterTextSplitter``. Defaults to 500.
        chunk_overlap: Amount of content shared between consecutive chunks,
            passed to ``RecursiveCharacterTextSplitter``. Defaults to 50.

    Returns:
        The list of chunked documents produced by
        ``RecursiveCharacterTextSplitter.split_documents``.
    """
    source_docs = load_documents(diagrams_names)

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        add_start_index=True,
        strip_whitespace=True,
        # Ordered from coarsest to finest: paragraphs, lines, sentences,
        # words, and finally single characters as a last resort.
        separators=["\n\n", "\n", ".", " ", ""],
    )

    return text_splitter.split_documents(source_docs)