Spaces:

intuitive262
/

Doc_Reader

Sleeping

intuitive262 committed on Sep 29, 2024

Commit

2e50c21

1 Parent(s): 86115e8

Update code files

Files changed (1) hide show

app.py CHANGED Viewed

@@ -38,12 +38,24 @@ def extract_text(image, query):
         return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
 def post_process_text(text):
-    # Remove extra whitespace
-    text = re.sub(r'\s+', ' ', text).strip()
     # Remove repeated phrases
-    phrases = text.split('. ')
-    unique_phrases = list(dict.fromkeys(phrases))
-    text = '. '.join(unique_phrases)
     return text
 def ocr(image):

         return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
 def post_process_text(text):
+    # Split the text into lines
+    lines = text.split('. ')
+    processed_lines = []
+    for line in lines:
+        # Separate Hindi and English text
+        parts = re.split(r'([^\u0900-\u097F\s]+:)', line, 1)
+        if len(parts) > 1:
+            processed_lines.append(f"{parts[0]}{parts[1]}\n    {parts[2]}")
+        else:
+            processed_lines.append(line)
+    # Join the lines with double line breaks
+    text = '\n\n'.join(processed_lines)
     # Remove repeated phrases
+    unique_phrases = list(dict.fromkeys(text.split('\n\n')))
+    text = '\n\n'.join(unique_phrases)
     return text
 def ocr(image):