hbertrand committed
Commit ebace01 · unverified · 1 Parent(s): 06bca0c

PR: parser improvement (#79)


* clean up

* using pathlib
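The "using pathlib" bullet refers to the new Path(filepath).name call in get_document below. A minimal standalone sketch of what that call does (the path is a made-up example):

from pathlib import Path

# Path(...).name returns the final component of a path, i.e. the bare filename.
print(Path("docs/guide/index.html").name)  # -> index.html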

Files changed (2)
  1. buster/docparser.py +37 -14
  2. buster/parser.py +11 -1
buster/docparser.py CHANGED
@@ -1,6 +1,7 @@
 import glob
 import logging
 import os
+from pathlib import Path
 from typing import Type
 
 import click
@@ -54,6 +55,38 @@ supported_docs = {
 }
 
 
+def get_document(
+    filepath: str,
+    base_url: str,
+    parser_cls: Type[Parser],
+    min_section_length: int = 100,
+    max_section_length: int = 2000,
+) -> pd.DataFrame:
+    """Extract all sections from one file.
+
+    Sections are broken into subsections if they are longer than `max_section_length`.
+    Sections correspond to `section` HTML tags that have a headerlink attached.
+    """
+    with open(filepath, "r") as f:
+        source = f.read()
+
+    filename = Path(filepath).name
+    soup = BeautifulSoup(source, "html.parser")
+    parser = parser_cls(soup, base_url, filename, min_section_length, max_section_length)
+
+    sections = []
+    urls = []
+    names = []
+    for section in parser.parse():
+        sections.append(section.text)
+        urls.append(section.url)
+        names.append(section.name)
+
+    documents_df = pd.DataFrame.from_dict({"title": names, "url": urls, "content": sections})
+
+    return documents_df
+
+
 def get_all_documents(
     root_dir: str,
     base_url: str,
@@ -68,23 +101,13 @@ def get_all_documents(
     """
     files = glob.glob("**/*.html", root_dir=root_dir, recursive=True)
 
-    sections = []
-    urls = []
-    names = []
+    dfs = []
     for file in files:
         filepath = os.path.join(root_dir, file)
-        with open(filepath, "r") as f:
-            source = f.read()
-
-        soup = BeautifulSoup(source, "html.parser")
-        parser = parser_cls(soup, base_url, file, min_section_length, max_section_length)
-        # sections_file, urls_file, names_file =
-        for section in parser.parse():
-            sections.append(section.text)
-            urls.append(section.url)
-            names.append(section.name)
+        df = get_document(filepath, base_url, parser_cls, min_section_length, max_section_length)
+        dfs.append(df)
 
-    documents_df = pd.DataFrame.from_dict({"title": names, "url": urls, "content": sections})
+    documents_df = pd.concat(dfs, ignore_index=True)
 
     return documents_df
 
 
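For context, a sketch of how the refactored functions might be called after this change. This is an assumed usage example: SphinxParser stands in for whichever Parser subclass applies, and the paths and URL are placeholders.

from buster.docparser import get_all_documents, get_document
from buster.parser import SphinxParser  # assumed Parser subclass, for illustration

# Parse a single HTML file into a DataFrame with title/url/content columns.
df = get_document("docs/index.html", "https://example.org/docs/", SphinxParser)

# Parse every HTML file under a root directory; the per-file DataFrames
# are concatenated by the new pd.concat call in get_all_documents.
documents_df = get_all_documents("docs/", "https://example.org/docs/", SphinxParser)

Factoring out get_document also means a single file can now be parsed without walking a whole directory tree.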
buster/parser.py CHANGED
@@ -1,4 +1,5 @@
 import os
+import re
 from abc import ABC, abstractmethod
 from dataclasses import InitVar, dataclass, field
 from itertools import takewhile, zip_longest
@@ -26,7 +27,16 @@ class Section:
         else:
             node_text = node.text
         section.append(node_text)
-        self.text = "".join(section).strip()
+        self.text = "\n".join(section).strip()
+
+        # Remove tabs
+        self.text = self.text.replace("\t", "")
+
+        # Replace group of newlines with a single newline
+        self.text = re.sub("\n{2,}", "\n", self.text)
+
+        # Replace non-breaking spaces with regular spaces
+        self.text = self.text.replace("\xa0", " ")
 
     def __len__(self) -> int:
         return len(self.text)
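The net effect of the new cleanup in Section can be checked in isolation. A minimal sketch, using a made-up section list rather than code from the repo:

import re

section = ["Header\n", "\n\n", "\tIndented paragraph.\xa0End.\n"]

text = "\n".join(section).strip()
text = text.replace("\t", "")        # remove tabs
text = re.sub("\n{2,}", "\n", text)  # collapse runs of newlines into one
text = text.replace("\xa0", " ")     # non-breaking spaces -> regular spaces

print(text)
# Header
# Indented paragraph. End.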