rodrigomasini committed on
Commit
8bb89d1
·
verified ·
1 Parent(s): 90c536f

Update mdr_pdf_parser.py

Browse files
Files changed (1) hide show
  1. mdr_pdf_parser.py +6 -3
mdr_pdf_parser.py CHANGED
@@ -1855,7 +1855,7 @@ class MDROcrEngine:
1855
  det_model_dir=paths["det"],
1856
  cls_model_dir=paths["cls"],
1857
  rec_model_dir=paths["rec"],
1858
- rec_char_dict_path=None,
1859
  # much lower thresholds so we actually get some candidate masks:
1860
  det_db_thresh=0.1,
1861
  det_db_box_thresh=0.3,
@@ -3084,9 +3084,12 @@ class MDRDocumentIterator:
3084
  def iterate_sections(self, params: MDRProcessingParams) -> Generator[
3085
  tuple[int, MDRExtractionResult, list[MDRLayoutElement]], None, None]:
3086
  """Yields page index, extraction result, and content layouts for each requested page."""
 
3087
  for res, sec in self._process_and_link_sections(params):
3088
- framework = set(sec.find_framework_elements());
3089
- content = [l for l in res.layouts if l not in framework];
 
 
3090
  yield sec.page_index, res, content
3091
 
3092
  def _process_and_link_sections(self, params: MDRProcessingParams) -> Generator[
 
1855
  det_model_dir=paths["det"],
1856
  cls_model_dir=paths["cls"],
1857
  rec_model_dir=paths["rec"],
1858
+ rec_char_dict_path=paths["keys"],
1859
  # much lower thresholds so we actually get some candidate masks:
1860
  det_db_thresh=0.1,
1861
  det_db_box_thresh=0.3,
 
3084
  def iterate_sections(self, params: MDRProcessingParams) -> Generator[
3085
  tuple[int, MDRExtractionResult, list[MDRLayoutElement]], None, None]:
3086
  """Yields page index, extraction result, and content layouts for each requested page."""
3087
+ # In MDRDocumentIterator.iterate_sections
3088
  for res, sec in self._process_and_link_sections(params):
3089
+ # Get the IDs of the framework elements
3090
+ framework_element_ids = {id(fw_el) for fw_el in sec.find_framework_elements()}
3091
+ # Filter content layouts by checking if their ID is not in the set of framework element IDs
3092
+ content = [l for l in res.layouts if id(l) not in framework_element_ids]
3093
  yield sec.page_index, res, content
3094
 
3095
  def _process_and_link_sections(self, params: MDRProcessingParams) -> Generator[