leonarb committed on
Commit
8a21578
·
verified ·
1 Parent(s): 9f080c3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +16 -5
app.py CHANGED
@@ -40,6 +40,19 @@ def clean_page_headers(text):
40
  cleaned.append(line)
41
  return "\n".join(cleaned)
42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
  def process_pdf_to_html(pdf_file, title, author):
44
  pdf_path = pdf_file.name
45
  doc = fitz.open(pdf_path)
@@ -117,15 +130,13 @@ def process_pdf_to_html(pdf_file, title, author):
117
  print(f"Decoded content for page {page_num}: {decoded}")
118
 
119
  cleaned_text = clean_page_headers(decoded)
 
 
 
120
  mathml_converted = convert_latex_to_mathml(cleaned_text)
121
  markdown_converted = markdown2.markdown(mathml_converted)
122
  html_page = markdown_converted.replace("\n", "<br>")
123
 
124
- if page_num in toc_by_page:
125
- for level, header in toc_by_page[page_num]:
126
- tag = f"h{min(level, 6)}"
127
- html_page = f"<{tag}>{html.escape(header)}</{tag}>\n" + html_page
128
-
129
  all_text += f"<div>{html_page}</div>\n"
130
 
131
  if page_num == 1:
 
40
  cleaned.append(line)
41
  return "\n".join(cleaned)
42
 
43
def replace_headers_in_text(text, page_headers):
    """Turn a page's table-of-contents titles into HTML heading lines.

    For every ``(level, header)`` pair, the first line of ``text`` whose
    stripped content equals the stripped header (case-insensitively) is
    replaced by ``<hN>header</hN>``. Headers that do not occur as a full
    line are prepended to the page instead, in the order they were given.

    Args:
        text: Page text, lines separated by ``"\\n"``.
        page_headers: Iterable of ``(level, header)`` pairs for this page.

    Returns:
        The page text with heading tags substituted or prepended.
    """
    lines = text.split("\n")
    # Collected separately and prepended once at the end: inserting each one
    # at index 0 as it is found would reverse the order of the headers.
    unmatched = []

    for level, header in page_headers:
        title = header.strip()
        if not title:
            # A blank title would match the first empty line and replace it
            # with an empty heading; there is nothing useful to emit.
            continue

        # HTML only defines h1..h6, so clamp the level on both ends.
        tag = f"h{max(1, min(level, 6))}"
        heading = f"<{tag}>{html.escape(title)}</{tag}>"
        pattern = re.compile(re.escape(title), re.IGNORECASE)

        for idx, line in enumerate(lines):
            if pattern.fullmatch(line.strip()):
                lines[idx] = heading
                break  # only the first occurrence becomes the heading
        else:
            # The title never appears as a line of its own: fall back to
            # putting the heading at the top of the page.
            unmatched.append(heading)

    return "\n".join(unmatched + lines)
55
+
56
  def process_pdf_to_html(pdf_file, title, author):
57
  pdf_path = pdf_file.name
58
  doc = fitz.open(pdf_path)
 
130
  print(f"Decoded content for page {page_num}: {decoded}")
131
 
132
  cleaned_text = clean_page_headers(decoded)
133
+ if page_num in toc_by_page:
134
+ cleaned_text = replace_headers_in_text(cleaned_text, toc_by_page[page_num])
135
+
136
  mathml_converted = convert_latex_to_mathml(cleaned_text)
137
  markdown_converted = markdown2.markdown(mathml_converted)
138
  html_page = markdown_converted.replace("\n", "<br>")
139
 
 
 
 
 
 
140
  all_text += f"<div>{html_page}</div>\n"
141
 
142
  if page_num == 1: