Spaces:

UniquePratham
/

DualTextOCRFusion

Sleeping

App Files Files Community

UniquePratham committed on Sep 27, 2024

Commit

afedbd6

verified ·

1 Parent(s): 8b34af2

Update ocr_cpu.py

Browse files

Files changed (1) hide show

ocr_cpu.py +70 -25

ocr_cpu.py CHANGED Viewed

@@ -1,27 +1,54 @@
 import os
 import torch
-from transformers import AutoModel, AutoTokenizer
-# Load model and tokenizer
-model_name = "srimanth-d/GOT_CPU"  # Using GOT model on CPU
-tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, return_tensors='pt')
-# Load the model
-model = AutoModel.from_pretrained(
-    model_name,
     trust_remote_code=True,
     low_cpu_mem_usage=True,
     use_safetensors=True,
-    pad_token_id=tokenizer.eos_token_id,
 )
-# Ensure the model is in evaluation mode and loaded on CPU
-device = torch.device("cpu")
-model = model.eval()
-# OCR function to extract text
 def extract_text_got(uploaded_file):
-    """Use GOT-OCR2.0 model to extract text from the uploaded image."""
     temp_file_path = 'temp_image.jpg'
     try:
@@ -38,7 +65,7 @@ def extract_text_got(uploaded_file):
         for ocr_type in ocr_types:
             with torch.no_grad():
                 print(f"Running OCR with type: {ocr_type}")
-                outputs = model.chat(tokenizer, temp_file_path, ocr_type=ocr_type)
                 if isinstance(outputs, list) and outputs[0].strip():
                     return outputs[0].strip()  # Return the result if successful
@@ -56,22 +83,40 @@ def extract_text_got(uploaded_file):
             os.remove(temp_file_path)
             print(f"Temporary file {temp_file_path} removed.")
-# Function to clean extracted text using AI
 def clean_text_with_ai(extracted_text):
     """
-    Cleans extracted text by leveraging an AI model to intelligently remove extra spaces.
     """
     try:
-        # Prepare the input for the AI model
-        inputs = tokenizer(extracted_text, return_tensors="pt").to(device)
-        # Generate cleaned text using the AI model
         with torch.no_grad():
-            outputs = model.generate(**inputs, max_new_tokens=100)  # Adjust max_new_tokens as needed
-        # Decode the generated output
-        cleaned_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
-        return cleaned_text.strip()  # Return the cleaned text
     except Exception as e:
         return f"Error during AI text cleaning: {str(e)}"

+# ocr_cpu.py
 import os
 import torch
+from transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM
+import re
+# -----------------------------
+# OCR Model Initialization
+# -----------------------------
+# Load OCR model and tokenizer
+ocr_model_name = "srimanth-d/GOT_CPU"  # Using GOT model on CPU
+ocr_tokenizer = AutoTokenizer.from_pretrained(
+    ocr_model_name, trust_remote_code=True, return_tensors='pt'
+)
+# Load the OCR model
+ocr_model = AutoModel.from_pretrained(
+    ocr_model_name,
     trust_remote_code=True,
     low_cpu_mem_usage=True,
     use_safetensors=True,
+    pad_token_id=ocr_tokenizer.eos_token_id,
 )
+# Ensure the OCR model is in evaluation mode and loaded on CPU
+ocr_device = torch.device("cpu")
+ocr_model = ocr_model.eval().to(ocr_device)
+# -----------------------------
+# Text Cleaning Model Initialization
+# -----------------------------
+# Load Text Cleaning model and tokenizer
+clean_model_name = "gpt2"  # You can choose a different model if preferred
+clean_tokenizer = AutoTokenizer.from_pretrained(clean_model_name)
+clean_model = AutoModelForCausalLM.from_pretrained(clean_model_name)
+# Ensure the Text Cleaning model is in evaluation mode and loaded on CPU
+clean_device = torch.device("cpu")
+clean_model = clean_model.eval().to(clean_device)
+# -----------------------------
+# OCR Function
+# -----------------------------
 def extract_text_got(uploaded_file):
+    """
+    Use GOT-OCR2.0 model to extract text from the uploaded image.
+    """
     temp_file_path = 'temp_image.jpg'
     try:
         for ocr_type in ocr_types:
             with torch.no_grad():
                 print(f"Running OCR with type: {ocr_type}")
+                outputs = ocr_model.chat(ocr_tokenizer, temp_file_path, ocr_type=ocr_type)
                 if isinstance(outputs, list) and outputs[0].strip():
                     return outputs[0].strip()  # Return the result if successful
             os.remove(temp_file_path)
             print(f"Temporary file {temp_file_path} removed.")
+# -----------------------------
+# Text Cleaning Function
+# -----------------------------
 def clean_text_with_ai(extracted_text):
     """
     Clean OCR output by prompting the text-cleaning language model.
 
     The text is wrapped in an instruction prompt, the model samples a
     continuation, and only the newly generated tokens are returned.
 
     Args:
         extracted_text: Raw text produced by the OCR step.
 
     Returns:
         The generated continuation with surrounding whitespace stripped,
         an empty string when there is no text to clean, or a message
         starting with "Error during AI text cleaning:" if anything fails.
     """
     try:
         # Nothing to clean: do not let the model invent text for an empty prompt.
         if not extracted_text or not extracted_text.strip():
             return ""
         # Same prompt wording as before, kept in two parts so that only the
         # OCR text is truncated and the trailing cue always survives.
         prompt_head = "Please clean the following text by removing extra spaces and ensuring proper formatting:\n\n"
         prompt_tail = "\n\nCleaned Text:"
         head_ids = clean_tokenizer.encode(prompt_head, add_special_tokens=False)
         tail_ids = clean_tokenizer.encode(prompt_tail, add_special_tokens=False)
         text_ids = clean_tokenizer.encode(extracted_text, add_special_tokens=False)
         # Prompt plus generated tokens must fit the model's context window
         # (1024 positions for GPT-2), so reserve room for the output first.
         context_limit = getattr(clean_model.config, "max_position_embeddings", 1024)
         new_token_budget = min(500, context_limit // 2)
         text_budget = context_limit - new_token_budget - len(head_ids) - len(tail_ids)
         text_ids = text_ids[:max(text_budget, 0)]
         input_ids = torch.tensor(
             [head_ids + text_ids + tail_ids], dtype=torch.long, device=clean_device
         )
         # The pad token is the EOS token, so the mask cannot be inferred
         # from the ids; pass it explicitly.
         attention_mask = torch.ones_like(input_ids)
         with torch.no_grad():
             outputs = clean_model.generate(
                 input_ids=input_ids,
                 attention_mask=attention_mask,
                 # max_new_tokens bounds only the continuation; max_length
                 # also counted the prompt and left no room for long inputs.
                 max_new_tokens=new_token_budget,
                 temperature=0.7,
                 top_p=0.9,
                 do_sample=True,
                 eos_token_id=clean_tokenizer.eos_token_id,
                 pad_token_id=clean_tokenizer.eos_token_id,
             )
         # Decode only what was generated after the prompt. Splitting the full
         # text on "Cleaned Text:" breaks when that marker occurs in the OCR
         # text or in the model output.
         generated_ids = outputs[0][input_ids.shape[-1]:]
         cleaned_text = clean_tokenizer.decode(generated_ids, skip_special_tokens=True)
         return cleaned_text.strip()
     except Exception as e:
         # Callers receive failures as a returned string rather than an exception.
         return f"Error during AI text cleaning: {str(e)}"