Spaces:
Sleeping
Sleeping
Commit
·
2e50c21
1
Parent(s):
86115e8
Update code files
Browse files
app.py
CHANGED
@@ -38,12 +38,24 @@ def extract_text(image, query):
|
|
38 |
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
|
39 |
|
40 |
def post_process_text(text):
|
41 |
-
#
|
42 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
43 |
# Remove repeated phrases
|
44 |
-
|
45 |
-
|
46 |
-
text = '. '.join(unique_phrases)
|
47 |
return text
|
48 |
|
49 |
def ocr(image):
|
|
|
38 |
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
|
39 |
|
40 |
def post_process_text(text):
    """Reformat decoded text into de-duplicated chunks separated by blank lines.

    The text is split into sentences on ``'. '``. Within each sentence, the
    first colon-terminated token that contains no Devanagari characters and
    no whitespace is treated as a label, and the rest of the sentence is
    moved onto a new line after it. Repeated chunks are then dropped, keeping
    the first occurrence.

    Args:
        text: The string to clean up.

    Returns:
        The processed chunks joined by a blank line.
    """
    # A "label" is a run of characters outside the Devanagari block
    # (U+0900-U+097F), with no whitespace, immediately followed by a colon.
    # Compiled once here instead of being looked up on every iteration.
    label_pattern = re.compile(r'([^\u0900-\u097F\s]+:)')

    processed_lines = []
    for line in text.split('. '):
        # maxsplit is passed by keyword: passing it positionally is
        # deprecated since Python 3.13.
        parts = label_pattern.split(line, maxsplit=1)
        if len(parts) > 1:
            # One capturing group + maxsplit=1 always yields exactly
            # [text before label, label, text after label].
            before, label, after = parts
            processed_lines.append(f"{before}{label}\n {after}")
        else:
            processed_lines.append(line)

    # Join and re-split on blank lines so that blank lines already present
    # in the input also act as chunk boundaries for de-duplication.
    chunks = '\n\n'.join(processed_lines).split('\n\n')

    # Remove repeated phrases. Splitting on '. ' strips the period from every
    # sentence except the last, so "foo. foo. foo." produces "foo", "foo",
    # "foo."; compare on a key without surrounding whitespace or trailing
    # periods so the final copy is recognised as a repeat too.
    seen = set()
    unique_chunks = []
    for chunk in chunks:
        key = chunk.strip().rstrip('.').rstrip()
        if key in seen:
            continue
        seen.add(key)
        unique_chunks.append(chunk)

    return '\n\n'.join(unique_chunks)
|
60 |
|
61 |
def ocr(image):
|