Update mdr_pdf_parser.py
Browse files
- mdr_pdf_parser.py +6 -3
mdr_pdf_parser.py
CHANGED
@@ -1855,7 +1855,7 @@ class MDROcrEngine:
|
|
1855 |
det_model_dir=paths["det"],
|
1856 |
cls_model_dir=paths["cls"],
|
1857 |
rec_model_dir=paths["rec"],
|
1858 |
-
rec_char_dict_path=
|
1859 |
# much lower thresholds so we actually get some candidate masks:
|
1860 |
det_db_thresh=0.1,
|
1861 |
det_db_box_thresh=0.3,
|
@@ -3084,9 +3084,12 @@ class MDRDocumentIterator:
|
|
3084 |
def iterate_sections(self, params: MDRProcessingParams) -> Generator[
|
3085 |
tuple[int, MDRExtractionResult, list[MDRLayoutElement]], None, None]:
|
3086 |
"""Yields page index, extraction result, and content layouts for each requested page."""
|
|
|
3087 |
for res, sec in self._process_and_link_sections(params):
|
3088 |
-
framework
|
3089 |
-
|
|
|
|
|
3090 |
yield sec.page_index, res, content
|
3091 |
|
3092 |
def _process_and_link_sections(self, params: MDRProcessingParams) -> Generator[
|
|
|
1855 |
det_model_dir=paths["det"],
|
1856 |
cls_model_dir=paths["cls"],
|
1857 |
rec_model_dir=paths["rec"],
|
1858 |
+
rec_char_dict_path=paths["keys"],
|
1859 |
# much lower thresholds so we actually get some candidate masks:
|
1860 |
det_db_thresh=0.1,
|
1861 |
det_db_box_thresh=0.3,
|
|
|
3084 |
def iterate_sections(self, params: MDRProcessingParams) -> Generator[
|
3085 |
tuple[int, MDRExtractionResult, list[MDRLayoutElement]], None, None]:
|
3086 |
"""Yields page index, extraction result, and content layouts for each requested page."""
|
3087 |
+
# In MDRDocumentIterator.iterate_sections
|
3088 |
for res, sec in self._process_and_link_sections(params):
|
3089 |
+
# Get the IDs of the framework elements
|
3090 |
+
framework_element_ids = {id(fw_el) for fw_el in sec.find_framework_elements()}
|
3091 |
+
# Filter content layouts by checking if their ID is not in the set of framework element IDs
|
3092 |
+
content = [l for l in res.layouts if id(l) not in framework_element_ids]
|
3093 |
yield sec.page_index, res, content
|
3094 |
|
3095 |
def _process_and_link_sections(self, params: MDRProcessingParams) -> Generator[
|