Spaces:

mdasad3617
/

lab-report-analyzer

Sleeping

File size: 6,891 Bytes

import logging
import subprocess
import sys
import textwrap
from concurrent.futures import ThreadPoolExecutor

import streamlit as st

# Import the third-party dependencies inside a guard: if any of them cannot
# be imported, report the problem through st.error and call st.stop() rather
# than continuing with missing names.
try:
    import pytesseract  # OCR engine wrapper, used by extract_text_from_image
    import cv2  # OpenCV, used for thresholding and deskewing
    import numpy as np
    from PIL import Image
    import fitz  # PyMuPDF for PDF processing
    from transformers import pipeline  # factory for the translation/summarization models
except ImportError:
    st.error("Required libraries are missing. Please install them using pip.")
    st.stop()

# Setup logging
def setup_logging():
    """Configure the root logger at INFO level.

    Each record is rendered as ``<timestamp> - <level> - <message>``.
    """
    record_layout = "%(asctime)s - %(levelname)s - %(message)s"
    logging.basicConfig(format=record_layout, level=logging.INFO)

# Tesseract installation check and guide
def check_tesseract():
    """Check that the ``tesseract`` executable can be run.

    Runs ``tesseract --version``. When the command cannot be started or exits
    with a non-zero status, an error and a platform-specific installation
    guide are rendered in the Streamlit page.

    Returns:
        True if ``tesseract --version`` ran successfully, False otherwise.
    """
    try:
        # Only the exit status matters, so the output is discarded rather
        # than captured into an unused variable.
        subprocess.run(
            ['tesseract', '--version'],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            check=True,
        )
    except (subprocess.CalledProcessError, OSError):
        # OSError covers FileNotFoundError (binary missing) as well as
        # PermissionError (binary present but not executable).
        # Provide installation instructions based on operating system
        st.error("Tesseract OCR is not installed.")
        st.markdown("### Tesseract Installation Guide:")

        if sys.platform.startswith('linux'):
            st.code("""
            # For Ubuntu/Debian
            sudo apt-get update
            sudo apt-get install -y tesseract-ocr
            
            # For Fedora
            sudo dnf install -y tesseract
            
            # For CentOS/RHEL
            sudo yum install -y tesseract
            """)
        elif sys.platform.startswith('darwin'):
            st.code("""
            # For macOS (using Homebrew)
            brew install tesseract
            """)
        elif sys.platform.startswith('win'):
            st.markdown("""
            1. Download Tesseract installer from:
               https://github.com/UB-Mannheim/tesseract/wiki
            2. Run the installer
            3. Add Tesseract directory to your system PATH
            """)

        st.info("After installation, restart your application.")
        return False

    return True

# Load models globally for faster performance
@st.cache_resource
def load_models():
    """Create the Hugging Face pipelines used by the app.

    The function is decorated with ``st.cache_resource``. The pipelines are
    built in this order: English->Hindi translation, English->Urdu
    translation, then summarization.

    Returns:
        A ``(translator_hi, translator_ur, summarizer)`` tuple of pipelines.
    """
    logging.info("Loading Hugging Face models...")

    # Translation checkpoints, Hindi first and Urdu second.
    translation_checkpoints = (
        "Helsinki-NLP/opus-mt-en-hi",
        "Helsinki-NLP/opus-mt-en-ur",
    )
    hindi_pipeline, urdu_pipeline = [
        pipeline("translation", model=checkpoint)
        for checkpoint in translation_checkpoints
    ]

    # Summarization checkpoint.
    summary_pipeline = pipeline("summarization", model="facebook/bart-large-cnn")

    return hindi_pipeline, urdu_pipeline, summary_pipeline

# Function to preprocess image for better OCR
def preprocess_image(image):
    """Binarize and deskew an image before OCR.

    Args:
        image: A PIL image (any mode) or an RGB array-like accepted by
            ``np.array``.

    Returns:
        A single-channel array holding the Otsu-thresholded image, rotated
        about its centre by the estimated skew correction. When the
        thresholded image has no non-zero pixels, or no rotation is needed,
        the thresholded image is returned unrotated.
    """
    # Normalise PIL inputs to 3-channel RGB first: grayscale, palette and
    # 1-bit images convert to 2-D arrays, which COLOR_RGB2GRAY rejects.
    rgb_image = image.convert("RGB") if hasattr(image, "convert") else image
    img_np = np.array(rgb_image)

    # Convert to grayscale
    gray = cv2.cvtColor(img_np, cv2.COLOR_RGB2GRAY)

    # Otsu picks the threshold automatically; the result is 0/255 only.
    gray = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]

    # Coordinates of all non-zero pixels, used to estimate the skew.
    # NOTE(review): after THRESH_BINARY the non-zero pixels are the bright
    # ones; for dark text on a light page that is the background, not the
    # text -- confirm whether an inverted threshold was intended.
    coords = np.column_stack(np.where(gray > 0))

    # Prevent error if no foreground pixels found
    if coords.size == 0:
        return gray

    raw_angle = cv2.minAreaRect(coords)[-1]

    # cv2.minAreaRect reports the angle in [-90, 0) on older OpenCV builds
    # and in (0, 90] on OpenCV >= 4.5.1. A rectangle's orientation is only
    # defined modulo 90 degrees, so fold the value into [-45, 45) and negate
    # it to get the correcting rotation. For the old range this gives
    # exactly the values the previous branch-based code produced; for the
    # new range an upright page (reported as 90) now yields 0 instead of a
    # quarter-turn.
    skew = (raw_angle + 45) % 90 - 45
    angle = -skew

    # Nothing to correct: skip the resampling pass.
    if angle == 0:
        return gray

    # Rotate the image to deskew
    (h, w) = gray.shape[:2]
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, angle, 1.0)
    rotated = cv2.warpAffine(
        gray, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE
    )

    return rotated

# Function to extract text from images
def extract_text_from_image(image):
    """Run OCR on an image and return the recognized text.

    The image is first passed through ``preprocess_image`` and the result is
    handed to ``pytesseract.image_to_string``.

    Args:
        image: The image to read, as accepted by ``preprocess_image``.

    Returns:
        The OCR output with leading and trailing whitespace removed.
    """
    logging.info("Extracting text from image...")

    # Clean the image up, then let Tesseract read it.
    ocr_output = pytesseract.image_to_string(preprocess_image(image))

    return ocr_output.strip()

# Function to extract text from PDFs
def extract_text_from_pdf(pdf_file):
    """Extract the text of every page of a PDF.

    Args:
        pdf_file: A file-like object with ``read()`` (such as the object
            returned by ``st.file_uploader``), raw ``bytes``/``bytearray``,
            or a filesystem path.

    Returns:
        The text of all pages concatenated in page order.
    """
    logging.info("Extracting text from PDF...")

    # fitz.open() treats its positional argument as a file name, so in-memory
    # data (uploaded files, raw bytes) must be passed via ``stream=`` together
    # with an explicit file type.
    if hasattr(pdf_file, "read"):
        document = fitz.open(stream=pdf_file.read(), filetype="pdf")
    elif isinstance(pdf_file, (bytes, bytearray)):
        document = fitz.open(stream=pdf_file, filetype="pdf")
    else:
        document = fitz.open(pdf_file)

    # The context manager closes the document even if a page fails to parse.
    with document as doc:
        return "".join(page.get_text() for page in doc)

# Function to process text in chunks for better performance
def process_chunks(text, model, chunk_size=500):
    """Translate ``text`` piecewise and join the translated pieces.

    The text is split into chunks of at most ``chunk_size`` characters. Splits
    happen at whitespace so that words are not cut in half at a chunk
    boundary; only a single word longer than ``chunk_size`` is broken. Each
    chunk is passed to ``model`` with ``max_length=200`` from a thread pool,
    and results are combined in the original chunk order.

    Args:
        text: The text to process.
        model: Callable invoked as ``model(chunk, max_length=200)`` that
            returns a sequence whose first item is a mapping with a
            ``"translation_text"`` key.
        chunk_size: Maximum number of characters per chunk.

    Returns:
        The translated chunks joined with single spaces, or ``""`` when
        ``text`` is empty or contains only whitespace.
    """
    chunks = textwrap.wrap(
        text,
        width=chunk_size,
        break_long_words=True,
        break_on_hyphens=False,
    )
    if not chunks:
        return ""

    # executor.map yields results in input order, so the join below keeps
    # the translated chunks in reading order.
    with ThreadPoolExecutor() as executor:
        results = list(
            executor.map(lambda chunk: model(chunk, max_length=200), chunks)
        )
    return " ".join(result[0]["translation_text"] for result in results)

# Main app logic
def main():
    """Render the Streamlit page and process an uploaded lab report.

    Flow: configure logging, verify Tesseract is available, load the models,
    then wait for an upload. Text is taken from the upload by OCR (images),
    PDF text extraction (PDFs) or UTF-8 decoding (plain text). The text is
    then summarized in English and translated to Hindi and Urdu, and all
    results are written to the page. Any exception raised while handling the
    file is logged and shown as an error message instead of propagating.
    """
    # Configure logging before anything else runs.
    setup_logging()

    # Check Tesseract installation first
    if not check_tesseract():
        return

    st.title("Advanced Lab Report Analyzer")
    st.write("Upload a file (Image, PDF, or Text) to analyze and summarize the lab report in English, Hindi, and Urdu.")

    # Load all models
    translator_hi, translator_ur, summarizer = load_models()

    uploaded_file = st.file_uploader("Upload a file (Image, PDF, or Text):", type=["jpg", "png", "jpeg", "pdf", "txt"])

    if not uploaded_file:
        st.info("Please upload a file to begin.")
        return

    try:
        text = ""
        if uploaded_file.type in ["image/jpeg", "image/png", "image/jpg"]:
            image = Image.open(uploaded_file)
            text = extract_text_from_image(image)
        elif uploaded_file.type == "application/pdf":
            text = extract_text_from_pdf(uploaded_file)
        elif uploaded_file.type == "text/plain":
            text = uploaded_file.read().decode("utf-8")

        # Whitespace-only extractions carry nothing to summarize or
        # translate, so treat them the same as an empty result.
        text = text.strip()
        if not text:
            st.warning("No text could be extracted. Please check the file and try again.")
            return

        with st.spinner("Analyzing the report..."):
            # Generate summary. truncation=True clips inputs that exceed the
            # model's maximum input length instead of letting the pipeline
            # raise on long reports.
            summary = summarizer(
                text, max_length=130, min_length=30, truncation=True
            )[0]["summary_text"]

            # Generate translations
            hindi_translation = process_chunks(text, translator_hi)
            urdu_translation = process_chunks(text, translator_ur)

            # Display results
            st.subheader("Original Text:")
            st.write(text)

            st.subheader("Analysis Summary (English):")
            st.write(summary)

            st.subheader("Hindi Translation:")
            st.write(hindi_translation)

            st.subheader("Urdu Translation:")
            st.write(urdu_translation)

    except Exception as e:
        # Top-level UI boundary: keep the traceback in the log and show a
        # short message to the user.
        logging.exception("Error processing the file: %s", e)
        st.error(f"An error occurred while processing the file: {e}")

# Run the app only when this file is executed as the main script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()