hbertrand committed
Commit ebace01 · unverified · 1 Parent(s): 06bca0c

PR: parser improvement (#79)


* clean up

* using pathlib
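The "using pathlib" bullet refers to the new Path(filepath).name call in get_document below. A minimal standalone sketch of what that call does (the path is a made-up example):

from pathlib import Path

# Path(...).name returns the final component of a path, i.e. the bare filename.
print(Path("docs/guide/index.html").name)  # -> index.html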

Files changed (2)
  1. buster/docparser.py +37 -14
  2. buster/parser.py +11 -1
buster/docparser.py CHANGED
@@ -1,6 +1,7 @@
 import glob
 import logging
 import os
+from pathlib import Path
 from typing import Type
 
 import click
@@ -54,6 +55,38 @@ supported_docs = {
 }
 
 
+def get_document(
+    filepath: str,
+    base_url: str,
+    parser_cls: Type[Parser],
+    min_section_length: int = 100,
+    max_section_length: int = 2000,
+) -> pd.DataFrame:
+    """Extract all sections from one file.
+
+    Sections are broken into subsections if they are longer than `max_section_length`.
+    Sections correspond to `section` HTML tags that have a headerlink attached.
+    """
+    with open(filepath, "r") as f:
+        source = f.read()
+
+    filename = Path(filepath).name
+    soup = BeautifulSoup(source, "html.parser")
+    parser = parser_cls(soup, base_url, filename, min_section_length, max_section_length)
+
+    sections = []
+    urls = []
+    names = []
+    for section in parser.parse():
+        sections.append(section.text)
+        urls.append(section.url)
+        names.append(section.name)
+
+    documents_df = pd.DataFrame.from_dict({"title": names, "url": urls, "content": sections})
+
+    return documents_df
+
+
 def get_all_documents(
     root_dir: str,
     base_url: str,
@@ -68,23 +101,13 @@ def get_all_documents(
     """
     files = glob.glob("**/*.html", root_dir=root_dir, recursive=True)
 
-    sections = []
-    urls = []
-    names = []
+    dfs = []
     for file in files:
         filepath = os.path.join(root_dir, file)
-        with open(filepath, "r") as f:
-            source = f.read()
-
-        soup = BeautifulSoup(source, "html.parser")
-        parser = parser_cls(soup, base_url, file, min_section_length, max_section_length)
-        # sections_file, urls_file, names_file =
-        for section in parser.parse():
-            sections.append(section.text)
-            urls.append(section.url)
-            names.append(section.name)
+        df = get_document(filepath, base_url, parser_cls, min_section_length, max_section_length)
+        dfs.append(df)
 
-    documents_df = pd.DataFrame.from_dict({"title": names, "url": urls, "content": sections})
+    documents_df = pd.concat(dfs, ignore_index=True)
 
     return documents_df
 
 
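For context, a sketch of how the refactored functions might be called after this change. This is an assumed usage example: SphinxParser stands in for whichever Parser subclass applies, and the paths and URL are placeholders.

from buster.docparser import get_all_documents, get_document
from buster.parser import SphinxParser  # assumed Parser subclass, for illustration

# Parse a single HTML file into a DataFrame with title/url/content columns.
df = get_document("docs/index.html", "https://example.org/docs/", SphinxParser)

# Parse every HTML file under a root directory; the per-file DataFrames
# are concatenated by the new pd.concat call in get_all_documents.
documents_df = get_all_documents("docs/", "https://example.org/docs/", SphinxParser)

Factoring out get_document also means a single file can now be parsed without walking a whole directory tree.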
buster/parser.py CHANGED
@@ -1,4 +1,5 @@
 import os
+import re
 from abc import ABC, abstractmethod
 from dataclasses import InitVar, dataclass, field
 from itertools import takewhile, zip_longest
@@ -26,7 +27,16 @@ class Section:
         else:
             node_text = node.text
         section.append(node_text)
-        self.text = "".join(section).strip()
+        self.text = "\n".join(section).strip()
+
+        # Remove tabs
+        self.text = self.text.replace("\t", "")
+
+        # Replace group of newlines with a single newline
+        self.text = re.sub("\n{2,}", "\n", self.text)
+
+        # Replace non-breaking spaces with regular spaces
+        self.text = self.text.replace("\xa0", " ")
 
     def __len__(self) -> int:
         return len(self.text)
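The net effect of the new cleanup in Section can be checked in isolation. A minimal sketch, using a made-up section list rather than code from the repo:

import re

section = ["Header\n", "\n\n", "\tIndented paragraph.\xa0End.\n"]

text = "\n".join(section).strip()
text = text.replace("\t", "")        # remove tabs
text = re.sub("\n{2,}", "\n", text)  # collapse runs of newlines into one
text = text.replace("\xa0", " ")     # non-breaking spaces -> regular spaces

print(text)
# Header
# Indented paragraph. End.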