intuitive262 committed on
Commit
2e50c21
·
1 Parent(s): 86115e8

Update code files

Browse files
Files changed (1) hide show
  1. app.py +17 -5
app.py CHANGED
@@ -38,12 +38,24 @@ def extract_text(image, query):
38
  return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
39
 
40
  def post_process_text(text):
41
- # Remove extra whitespace
42
- text = re.sub(r'\s+', ' ', text).strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
43
  # Remove repeated phrases
44
- phrases = text.split('. ')
45
- unique_phrases = list(dict.fromkeys(phrases))
46
- text = '. '.join(unique_phrases)
47
  return text
48
 
49
  def ocr(image):
 
38
  return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
39
 
40
  def post_process_text(text):
41
+ # Split the text into lines
42
+ lines = text.split('. ')
43
+
44
+ processed_lines = []
45
+ for line in lines:
46
+ # Separate Hindi and English text
47
+ parts = re.split(r'([^\u0900-\u097F\s]+:)', line, 1)
48
+ if len(parts) > 1:
49
+ processed_lines.append(f"{parts[0]}{parts[1]}\n {parts[2]}")
50
+ else:
51
+ processed_lines.append(line)
52
+
53
+ # Join the lines with double line breaks
54
+ text = '\n\n'.join(processed_lines)
55
+
56
  # Remove repeated phrases
57
+ unique_phrases = list(dict.fromkeys(text.split('\n\n')))
58
+ text = '\n\n'.join(unique_phrases)
 
59
  return text
60
 
61
  def ocr(image):