import math
import os

import bs4
import pandas as pd
from bs4 import BeautifulSoup


def parse_section(nodes: list[bs4.element.NavigableString]) -> str:
    """Concatenate the text of the given nodes, rendering tables as GitHub markdown."""
    section = []
    for node in nodes:
        if node.name == "table":
            node_text = pd.read_html(node.prettify())[0].to_markdown(index=False, tablefmt="github")
        elif node.name == "script":
            continue
        else:
            node_text = node.text
        section.append(node_text)
    section = "".join(section)

    return section


class Parser:
    def __init__(
        self,
        soup: BeautifulSoup,
        base_url: str,
        filename: str,
        min_section_length: int = 100,
        max_section_length: int = 2000,
    ):
        self.soup = soup
        self.base_url = base_url
        self.filename = filename
        self.min_section_length = min_section_length
        self.max_section_length = max_section_length

    def parse(self) -> tuple[list[str], list[str], list[str]]:
        ...

    def find_sections(self) -> bs4.element.ResultSet:
        ...

    def build_url(self, suffix: str) -> str:
        ...


class SphinxParser(Parser):
    def parse(self) -> tuple[list[str], list[str], list[str]]:
        found = self.find_sections()
        sections = []
        urls = []
        names = []
        for i in range(len(found)):
            section_found = found[i]

            section_soup = section_found.parent.parent
            section_href = section_soup.find_all("a", href=True, class_="headerlink")

            # If the section has subsections, keep only the part before the first subsection
            if len(section_href) > 1 and section_soup.section is not None:
                section_siblings = list(section_soup.section.previous_siblings)[::-1]
                section = parse_section(section_siblings)
            else:
                section = parse_section(section_soup.children)

            # Remove special characters, plus newlines in some url and section names.
            section = section.strip()
            url = section_found["href"].strip().replace("\n", "")
            # Drop the trailing headerlink symbol (e.g. the pilcrow) from the heading text.
            name = section_found.parent.text.strip()[:-1].replace("\n", "")

            url = self.build_url(url)

            # If the text is too long, split it into chunks of roughly equal size
            if len(section) > self.max_section_length:
                n_chunks = math.ceil(len(section) / float(self.max_section_length))
                separator_index = math.floor(len(section) / n_chunks)

                section_chunks = [section[separator_index * j : separator_index * (j + 1)] for j in range(n_chunks)]
                url_chunks = [url] * n_chunks
                name_chunks = [name] * n_chunks

                sections.extend(section_chunks)
                urls.extend(url_chunks)
                names.extend(name_chunks)
            # If the text is not too short, add it as a single chunk
            elif len(section) > self.min_section_length:
                sections.append(section)
                urls.append(url)
                names.append(name)

        return sections, urls, names

    def find_sections(self) -> bs4.element.ResultSet:
        return self.soup.find_all("a", href=True, class_="headerlink")

    def build_url(self, suffix: str) -> str:
        return self.base_url + self.filename + suffix


class HuggingfaceParser(Parser):
    def parse(self) -> tuple[list[str], list[str], list[str]]:
        found = self.find_sections()
        sections = []
        urls = []
        names = []
        for i in range(len(found)):
            section_href = found[i].find("a", href=True, class_="header-link")

            # Collect every sibling node up to (but not including) the next heading.
            section_nodes = []
            for element in found[i].find_next_siblings():
                if i + 1 < len(found) and element == found[i + 1]:
                    break
                section_nodes.append(element)
            section = parse_section(section_nodes)

            # Remove special characters, plus newlines in some url and section names.
            section = section.strip()
            url = section_href["href"].strip().replace("\n", "")
            name = found[i].text.strip().replace("\n", "")

            url = self.build_url(url)

            # If the text is too long, split it into chunks of roughly equal size
            if len(section) > self.max_section_length:
                n_chunks = math.ceil(len(section) / float(self.max_section_length))
                separator_index = math.floor(len(section) / n_chunks)

                section_chunks = [section[separator_index * j : separator_index * (j + 1)] for j in range(n_chunks)]
                url_chunks = [url] * n_chunks
                name_chunks = [name] * n_chunks

                sections.extend(section_chunks)
                urls.extend(url_chunks)
                names.extend(name_chunks)
            # If the text is not too short, add it as a single chunk
            elif len(section) > self.min_section_length:
                sections.append(section)
                urls.append(url)
                names.append(name)

        return sections, urls, names

    def find_sections(self) -> bs4.element.ResultSet:
        return self.soup.find_all(["h1", "h2", "h3"], class_="relative group")

    def build_url(self, suffix: str) -> str:
        # The splitext is to remove the .html extension
        return self.base_url + os.path.splitext(self.filename)[0] + suffix
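

if __name__ == "__main__":
    # Hedged usage sketch (not part of the original module): shows how one of the
    # parsers might be driven end to end on a saved documentation page. The file
    # path "docs/index.html", the base URL, and the filename below are placeholder
    # assumptions, not values taken from the source.
    with open("docs/index.html", "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f.read(), "html.parser")

    parser = SphinxParser(
        soup=soup,
        base_url="https://example.readthedocs.io/en/latest/",
        filename="index.html",
    )
    sections, urls, names = parser.parse()
    print(f"Parsed {len(sections)} sections from {parser.filename}")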