Spaces:

leonarb
/

olmocr-demo

Running

leonarb committed on May 9

Commit

8a21578

verified ·

1 Parent(s): 9f080c3

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -40,6 +40,19 @@ def clean_page_headers(text):
             cleaned.append(line)
     return "\n".join(cleaned)
 def process_pdf_to_html(pdf_file, title, author):
     pdf_path = pdf_file.name
     doc = fitz.open(pdf_path)
@@ -117,15 +130,13 @@ def process_pdf_to_html(pdf_file, title, author):
         print(f"Decoded content for page {page_num}: {decoded}")
         cleaned_text = clean_page_headers(decoded)
         mathml_converted = convert_latex_to_mathml(cleaned_text)
         markdown_converted = markdown2.markdown(mathml_converted)
         html_page = markdown_converted.replace("\n", "<br>")
-        if page_num in toc_by_page:
-            for level, header in toc_by_page[page_num]:
-                tag = f"h{min(level, 6)}"
-                html_page = f"<{tag}>{html.escape(header)}</{tag}>\n" + html_page
         all_text += f"<div>{html_page}</div>\n"
         if page_num == 1:

             cleaned.append(line)
     return "\n".join(cleaned)
def replace_headers_in_text(text, page_headers):
    """Turn a page's table-of-contents titles into HTML heading lines.

    For each ``(level, header)`` pair, the first line of *text* whose
    stripped content equals the stripped header (case-insensitively) is
    replaced by ``<hN>header</hN>``. Headers that match no line are placed
    at the top of the page instead, in the order they were given.

    Args:
        text: Page text, with lines separated by ``"\\n"``.
        page_headers: Iterable of ``(level, header)`` pairs. ``level`` is
            clamped to the valid HTML heading range 1..6.

    Returns:
        The page text with matched lines replaced and unmatched headings
        prepended, joined with ``"\\n"``.
    """
    lines = text.split("\n")
    # Headings with no matching line, collected in input order. Inserting
    # each one at index 0 as it is found would reverse their order.
    unmatched = []
    for level, header in page_headers:
        title = header.strip()
        if not title:
            # An empty pattern would fullmatch the first blank line and
            # turn it into an empty heading; there is nothing to render.
            continue
        # Clamp on both sides so a level <= 0 cannot produce "<h0>".
        tag = f"h{max(1, min(level, 6))}"
        heading = f"<{tag}>{html.escape(title)}</{tag}>"
        pattern = re.compile(re.escape(title), re.IGNORECASE)
        for idx, line in enumerate(lines):
            if pattern.fullmatch(line.strip()):
                lines[idx] = heading
                break  # only replace first match
        else:
            unmatched.append(heading)
    return "\n".join(unmatched + lines)
 def process_pdf_to_html(pdf_file, title, author):
     pdf_path = pdf_file.name
     doc = fitz.open(pdf_path)
         print(f"Decoded content for page {page_num}: {decoded}")
         cleaned_text = clean_page_headers(decoded)
+        if page_num in toc_by_page:
+            cleaned_text = replace_headers_in_text(cleaned_text, toc_by_page[page_num])
         mathml_converted = convert_latex_to_mathml(cleaned_text)
         markdown_converted = markdown2.markdown(mathml_converted)
         html_page = markdown_converted.replace("\n", "<br>")
         all_text += f"<div>{html_page}</div>\n"
         if page_num == 1: