Spaces:

mike23415
/

Vlm-test

Sleeping

App Files Files Community

mike23415 committed 5 days ago

Commit

035a6f9

verified ·

1 Parent(s): bace547

Update app.py

Browse files

Files changed (1) hide show

app.py +120 -22

app.py CHANGED Viewed

@@ -4,58 +4,156 @@ from PIL import Image
 import io
 import os
-from transformers import DonutProcessor, VisionEncoderDecoderModel
 import torch
 import fitz  # PyMuPDF
 # Initialize Flask
 app = Flask(__name__)
 CORS(app)
-# Load Donut model and processor
-device = "cpu"
-processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base")
-model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base").to(device)
 model.eval()
 def convert_pdf_to_image(file_stream):
     doc = fitz.open(stream=file_stream.read(), filetype="pdf")
     page = doc.load_page(0)
-    pix = page.get_pixmap(dpi=150)
     img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
     return img
 @app.route("/ocr", methods=["POST"])
 def ocr():
     if "file" not in request.files:
         return jsonify({"error": "No file uploaded"}), 400
     file = request.files["file"]
     filename = file.filename.lower()
-    # Convert input to PIL image
-    if filename.endswith(".pdf"):
-        image = convert_pdf_to_image(file)
-    else:
-        image = Image.open(io.BytesIO(file.read())).convert("RGB")
-    # Preprocess image
-    pixel_values = processor(image, return_tensors="pt").pixel_values.to(device)
-    # Run model
-    with torch.no_grad():
-        output = model.generate(pixel_values, max_length=512, return_dict_in_generate=True)
-    # Decode output
-    parsed_text = processor.batch_decode(output.sequences)[0]
-    parsed_text = processor.tokenizer.decode(output.sequences[0], skip_special_tokens=True)
-    return jsonify({"text": parsed_text})
 @app.route("/", methods=["GET"])
 def index():
-    return "Smart OCR Flask API (Donut-based)"
 if __name__ == "__main__":
-    app.run(host="0.0.0.0", port=7860)

 import io
 import os
+# Option 1: Using TrOCR (Transformer-based OCR)
+from transformers import TrOCRProcessor, VisionEncoderDecoderModel
 import torch
+# Option 2: Using EasyOCR (commented out - uncomment if you prefer this)
+# import easyocr
+# Option 3: Using Tesseract (commented out - uncomment if you prefer this)
+# import pytesseract
 import fitz  # PyMuPDF
 # Initialize Flask
 app = Flask(__name__)
 CORS(app)
+# Load TrOCR model and processor (better for text extraction)
+device = "cuda" if torch.cuda.is_available() else "cpu"
+processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-printed")
+model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-printed").to(device)
 model.eval()
+# Alternative: Initialize EasyOCR reader (uncomment if using EasyOCR)
+# reader = easyocr.Reader(['en'])
 def convert_pdf_to_image(file_stream):
+    """Convert PDF to image with higher DPI for better OCR"""
     doc = fitz.open(stream=file_stream.read(), filetype="pdf")
     page = doc.load_page(0)
+    # Increase DPI for better text recognition
+    pix = page.get_pixmap(dpi=300)  # Higher DPI
     img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
+    doc.close()
     return img
def preprocess_image(image, min_side=1000):
    """Prepare an image for OCR: force RGB mode and enlarge small images.

    Args:
        image: PIL image to prepare.
        min_side: Minimum length, in whole pixels, required of both sides.
            When either side is shorter, the image is enlarged with its
            aspect ratio kept so that the shorter side becomes exactly
            ``min_side``. Defaults to 1000.

    Returns:
        The image in "RGB" mode. It is the same object that was passed in
        when neither a mode conversion nor a resize was needed.
    """
    # Make sure the image is in RGB mode.
    if image.mode != 'RGB':
        image = image.convert('RGB')

    width, height = image.size
    shorter_side = min(width, height)
    if shorter_side <= 0:
        # A zero-sized image cannot be scaled (the ratio would divide by
        # zero); hand it back unchanged and let the caller find no text.
        return image

    if shorter_side < min_side:
        # Integer arithmetic makes the shorter side exactly min_side; a float
        # scale factor followed by int() can land one pixel short of it.
        # NOTE(review): a very elongated image is enlarged by the same factor
        # on its long side, so the result can become very large; consider
        # capping the long side.
        new_width = width * min_side // shorter_side
        new_height = height * min_side // shorter_side
        image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
    return image
def extract_text_trocr(image, chunk_height=400):
    """Extract text with TrOCR by reading the image in horizontal strips.

    The image is cut from top to bottom into full-width strips of
    ``chunk_height`` pixels (the last strip may be shorter). Each strip is
    run through the module-level ``processor`` and ``model``, and the
    non-empty results are joined with newlines.

    Args:
        image: PIL image to read.
        chunk_height: Height in pixels of each strip. Defaults to 400.

    Returns:
        The recognised text, one strip per line. An empty string is returned
        when nothing was recognised or when any strip fails: the error is
        printed and the text of earlier strips is discarded.

    Raises:
        ValueError: If ``chunk_height`` is not positive.
    """
    if chunk_height <= 0:
        raise ValueError("chunk_height must be a positive number of pixels")

    try:
        width, height = image.size
        strip_texts = []
        # NOTE(review): strips are cut at fixed offsets, so a line of text
        # lying across a boundary is split between two strips.
        for top in range(0, height, chunk_height):
            strip = image.crop((0, top, width, min(top + chunk_height, height)))
            pixel_values = processor(strip, return_tensors="pt").pixel_values.to(device)
            with torch.no_grad():  # inference only, no gradients needed
                generated_ids = model.generate(pixel_values, max_length=512)
            strip_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
            if strip_text:
                strip_texts.append(strip_text)
        return "\n".join(strip_texts)
    except Exception as e:
        # Best-effort by design: the caller treats "" as "try another engine".
        print(f"TrOCR error: {e}")
        return ""
def extract_text_easyocr(image, ocr_reader=None):
    """Extract text with EasyOCR, keeping detections above 0.5 confidence.

    Args:
        image: PIL image (or anything ``numpy.asarray`` can convert) to read.
        ocr_reader: Object exposing EasyOCR's ``readtext`` method. When
            omitted, the module-level ``reader`` is used; that requires the
            EasyOCR import and initialisation near the top of the file to be
            uncommented.

    Returns:
        The accepted text fragments joined with newlines, or an empty string
        when no reader is available, nothing was detected, or EasyOCR raised
        (the error is printed).
    """
    if ocr_reader is None:
        try:
            ocr_reader = reader
        except NameError:
            # Say what is wrong instead of relying on a swallowed NameError.
            print("EasyOCR error: no reader configured (module-level 'reader' is not defined)")
            return ""

    try:
        # Imported here so the dependency is only needed when this fallback
        # engine is actually used.
        import numpy as np

        # readtext() takes a file path/URL, raw bytes or a numpy array, so the
        # PIL image has to be converted first.
        results = ocr_reader.readtext(np.asarray(image))
        return "\n".join(
            text for _bbox, text, confidence in results if confidence > 0.5
        )
    except Exception as e:
        # Best-effort by design: the caller treats "" as "no text found".
        print(f"EasyOCR error: {e}")
        return ""
def extract_text_tesseract(image, config='--psm 6'):
    """Extract text with Tesseract via pytesseract.

    Requires the ``pytesseract`` import near the top of the file to be
    uncommented.

    Args:
        image: PIL image to read; it is converted to grayscale first.
        config: Extra command-line options handed to Tesseract. Defaults to
            ``'--psm 6'`` (page segmentation mode 6, which treats the image
            as a single uniform block of text).

    Returns:
        The recognised text with surrounding whitespace removed, or an empty
        string when pytesseract is not imported or recognition fails (the
        error is printed).
    """
    try:
        engine = pytesseract
    except NameError:
        # Say what is wrong instead of relying on a swallowed NameError.
        print("Tesseract error: pytesseract is not imported")
        return ""

    try:
        # Convert to grayscale for better OCR
        gray_image = image.convert('L')
        return engine.image_to_string(gray_image, config=config).strip()
    except Exception as e:
        # Best-effort by design: the caller treats "" as "no text found".
        print(f"Tesseract error: {e}")
        return ""
 @app.route("/ocr", methods=["POST"])
 def ocr():
     if "file" not in request.files:
         return jsonify({"error": "No file uploaded"}), 400
     file = request.files["file"]
+    if not file.filename:
+        return jsonify({"error": "No file selected"}), 400
     filename = file.filename.lower()
+    try:
+        # Convert input to PIL image
+        if filename.endswith(".pdf"):
+            image = convert_pdf_to_image(file)
+        else:
+            image = Image.open(io.BytesIO(file.read())).convert("RGB")
+        # Preprocess image
+        image = preprocess_image(image)
+        # Extract text using TrOCR (primary method)
+        extracted_text = extract_text_trocr(image)
+        # If TrOCR fails or returns empty, try alternative methods
+        if not extracted_text:
+            print("TrOCR failed, trying alternative methods...")
+            # Uncomment one of these if you have the libraries installed:
+            # extracted_text = extract_text_easyocr(image)
+            # extracted_text = extract_text_tesseract(image)
+        if not extracted_text:
+            return jsonify({
+                "text": "",
+                "message": "No text could be extracted from the image. The image might be too blurry, have low contrast, or contain handwritten text."
+            })
+        return jsonify({
+            "text": extracted_text,
+            "message": "Text extracted successfully"
+        })
+    except Exception as e:
+        print(f"OCR processing error: {e}")
+        return jsonify({"error": f"Failed to process file: {str(e)}"}), 500
 @app.route("/", methods=["GET"])
 def index():
+    return "Smart OCR Flask API (TrOCR-based)"
 if __name__ == "__main__":
+    app.run(host="0.0.0.0", port=7860, debug=True)