File size: 6,891 Bytes
ca69a0e
8b18b7b
 
cc10da2
 
 
 
 
 
 
 
 
 
 
 
 
 
ae7d660
8b18b7b
 
 
 
 
 
fcfc162
cc10da2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8b18b7b
 
 
 
bd2b9ae
5244794
8b18b7b
 
 
 
 
 
233d635
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cc10da2
 
 
 
 
233d635
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ca69a0e
5244794
233d635
8b18b7b
bd2b9ae
233d635
 
bd2b9ae
233d635
 
bd2b9ae
233d635
ca69a0e
8b18b7b
 
 
 
 
 
 
 
ca69a0e
8b18b7b
 
 
 
 
 
 
ca69a0e
8b18b7b
ddb299c
cc10da2
 
 
 
8b18b7b
5244794
8b18b7b
 
5244794
233d635
8b18b7b
 
5244794
8b18b7b
 
 
 
 
233d635
8b18b7b
 
 
 
5244794
8b18b7b
 
 
 
5244794
8b18b7b
 
 
5244794
8b18b7b
5244794
 
 
8b18b7b
 
5244794
8b18b7b
 
5244794
8b18b7b
 
 
 
5244794
8b18b7b
 
5244794
8b18b7b
 
ae7d660
 
5244794
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
import streamlit as st
import logging
from concurrent.futures import ThreadPoolExecutor
import subprocess
import sys

# Attempt to import libraries, with fallback
try:
    import pytesseract
    import cv2
    import numpy as np
    from PIL import Image
    import fitz  # PyMuPDF for PDF processing
    from transformers import pipeline
except ImportError:
    st.error("Required libraries are missing. Please install them using pip.")
    st.stop()

# Setup logging
def setup_logging():
    """Configure the root logger for INFO-level, timestamped output."""
    log_format = "%(asctime)s - %(levelname)s - %(message)s"
    logging.basicConfig(format=log_format, level=logging.INFO)

# Tesseract installation check and guide
def check_tesseract():
    """Verify that the ``tesseract`` executable can be launched.

    Runs ``tesseract --version`` and discards its output. When the command
    cannot be started or exits with a non-zero status, an error plus
    platform-specific installation instructions are rendered in the
    Streamlit page.

    Returns:
        True if the command ran successfully, False otherwise.
    """
    try:
        # Only the exit status matters; the version text itself is not used.
        subprocess.run(
            ['tesseract', '--version'],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            check=True,
        )
    except (subprocess.CalledProcessError, OSError):
        # OSError covers FileNotFoundError (binary missing) as well as other
        # launch failures such as PermissionError.
        st.error("Tesseract OCR is not installed.")
        st.markdown("### Tesseract Installation Guide:")

        # Provide installation instructions based on operating system
        if sys.platform.startswith('linux'):
            st.code("""
            # For Ubuntu/Debian
            sudo apt-get update
            sudo apt-get install -y tesseract-ocr
            
            # For Fedora
            sudo dnf install -y tesseract
            
            # For CentOS/RHEL
            sudo yum install -y tesseract
            """)
        elif sys.platform.startswith('darwin'):
            st.code("""
            # For macOS (using Homebrew)
            brew install tesseract
            """)
        elif sys.platform.startswith('win'):
            st.markdown("""
            1. Download Tesseract installer from:
               https://github.com/UB-Mannheim/tesseract/wiki
            2. Run the installer
            3. Add Tesseract directory to your system PATH
            """)

        st.info("After installation, restart your application.")
        return False

    return True

# Load models globally for faster performance
@st.cache_resource
def load_models():
    """Create the Hugging Face pipelines used by the app.

    Decorated with ``st.cache_resource`` so the pipelines can be reused
    rather than rebuilt.

    Returns:
        A 3-tuple ``(translator_hi, translator_ur, summarizer)``.
    """
    logging.info("Loading Hugging Face models...")

    # (task, model) pairs, listed in the order they are loaded and returned:
    # English->Hindi translator, English->Urdu translator, summarizer.
    model_specs = (
        ("translation", "Helsinki-NLP/opus-mt-en-hi"),
        ("translation", "Helsinki-NLP/opus-mt-en-ur"),
        ("summarization", "facebook/bart-large-cnn"),
    )
    return tuple(pipeline(task, model=name) for task, name in model_specs)

# Function to preprocess image for better OCR
def preprocess_image(image):
    """Binarise and deskew an image to improve OCR results.

    Args:
        image: A PIL image (any mode) or an RGB array-like that
            ``np.array`` can convert.

    Returns:
        A 2-D array holding the Otsu-thresholded image, rotated by the
        estimated skew angle. If thresholding leaves no non-zero pixels the
        thresholded image is returned unrotated.
    """
    # cv2.COLOR_RGB2GRAY only accepts 3-channel input, but uploaded images
    # may be grayscale, palette-based or RGBA. Normalise PIL images to RGB.
    if hasattr(image, "convert"):
        image = image.convert("RGB")

    # Convert PIL Image to OpenCV format
    img_np = np.array(image)

    # Convert to grayscale
    gray = cv2.cvtColor(img_np, cv2.COLOR_RGB2GRAY)

    # Apply Otsu thresholding to get a black/white image
    gray = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]

    # Coordinates of all non-zero (white) pixels, used to estimate skew.
    # NOTE(review): with dark text on a light page these are background
    # pixels, so the rectangle tends to follow the page rather than the
    # text - confirm whether THRESH_BINARY_INV was intended for this step.
    coords = np.column_stack(np.where(gray > 0))

    # Prevent error if no foreground pixels found
    if coords.size == 0:
        return gray

    angle = cv2.minAreaRect(coords)[-1]

    # Older OpenCV releases report the rectangle angle in [-90, 0), newer
    # ones (4.5.1+) in (0, 90]. Fold either convention into a correction in
    # [-45, 45] so an upright page is never rotated by a quarter turn.
    if angle < -45:
        angle = -(90 + angle)
    elif angle > 45:
        angle = 90 - angle
    else:
        angle = -angle

    # Rotate the image about its centre to deskew
    (h, w) = gray.shape[:2]
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, angle, 1.0)
    rotated = cv2.warpAffine(gray, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)

    return rotated

# Function to extract text from images
def extract_text_from_image(image):
    """Run OCR on an image and return the recognised text.

    The image is passed through ``preprocess_image`` before being handed to
    ``pytesseract``; surrounding whitespace is stripped from the result.
    """
    logging.info("Extracting text from image...")

    # Clean the image up first, then let Tesseract read it
    ocr_output = pytesseract.image_to_string(preprocess_image(image))
    return ocr_output.strip()

# Function to extract text from PDFs
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page in a PDF.

    Args:
        pdf_file: Either a filesystem path, or a binary file-like object
            (anything with ``read()``, such as an uploaded file) holding the
            PDF data.

    Returns:
        The text of all pages joined in page order.
    """
    logging.info("Extracting text from PDF...")

    if hasattr(pdf_file, "read"):
        # File-like objects must be opened from memory; passing them as the
        # first argument would be treated as a file name.
        doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
    else:
        doc = fitz.open(pdf_file)

    # The context manager closes the document once the text is collected.
    with doc:
        return "".join(page.get_text() for page in doc)

# Function to process text in chunks for better performance
def process_chunks(text, model, chunk_size=500, max_length=200):
    """Run *model* over *text* in chunks and join the translated pieces.

    The text is split into pieces of at most ``chunk_size`` characters.
    Splits are made at whitespace whenever the window contains any, so words
    are not cut in half; a single word longer than ``chunk_size`` is still
    split at the size limit.

    Args:
        text: The text to process.
        model: Callable invoked as ``model(chunk, max_length=...)`` that
            returns a sequence whose first item is a mapping with a
            ``"translation_text"`` key.
        chunk_size: Maximum number of characters per chunk (must be >= 1).
        max_length: Value forwarded to *model* as ``max_length``.

    Returns:
        The ``"translation_text"`` of every chunk, joined with single
        spaces in the original order. Empty or whitespace-only text yields
        an empty string without calling *model*.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    chunks = []
    start, total = 0, len(text)
    while start < total:
        end = min(start + chunk_size, total)
        if end < total and not text[end].isspace():
            # The window would end mid-word: back up to the last whitespace
            # inside it, if there is one.
            cut = max(text.rfind(sep, start + 1, end) for sep in (" ", "\n", "\t"))
            if cut > start:
                end = cut
        piece = text[start:end].strip()
        if piece:
            chunks.append(piece)
        start = end

    if not chunks:
        return ""

    # executor.map yields results in input order, so the chunks stay in sequence.
    with ThreadPoolExecutor() as executor:
        results = list(
            executor.map(lambda chunk: model(chunk, max_length=max_length), chunks)
        )
    return " ".join(result[0]["translation_text"] for result in results)

# Main app logic
def main():
    """Streamlit entry point for the lab report analyzer.

    Checks that Tesseract is available, loads the models, lets the user
    upload an image, PDF or text file, extracts its text, and shows an
    English summary plus Hindi and Urdu translations. Any error raised
    while processing the upload is logged and shown in the page.
    """
    # Check Tesseract installation first
    if not check_tesseract():
        return

    setup_logging()
    st.title("Advanced Lab Report Analyzer")
    st.write("Upload a file (Image, PDF, or Text) to analyze and summarize the lab report in English, Hindi, and Urdu.")

    # Load all models
    translator_hi, translator_ur, summarizer = load_models()

    file = st.file_uploader("Upload a file (Image, PDF, or Text):", type=["jpg", "png", "jpeg", "pdf", "txt"])

    if not file:
        st.info("Please upload a file to begin.")
        return

    text = ""
    try:
        # Pick the extraction method from the uploaded file's MIME type
        if file.type in ["image/jpeg", "image/png", "image/jpg"]:
            image = Image.open(file)
            text = extract_text_from_image(image)
        elif file.type == "application/pdf":
            text = extract_text_from_pdf(file)
        elif file.type == "text/plain":
            text = file.read().decode("utf-8")

        # Whitespace-only extractions carry nothing to summarise or translate
        if text.strip():
            with st.spinner("Analyzing the report..."):
                # Generate summary; truncation keeps long reports within the
                # summarization model's maximum input length.
                summary = summarizer(
                    text, max_length=130, min_length=30, truncation=True
                )[0]["summary_text"]

                # Generate translations
                hindi_translation = process_chunks(text, translator_hi)
                urdu_translation = process_chunks(text, translator_ur)

                # Display results
                st.subheader("Original Text:")
                st.write(text)

                st.subheader("Analysis Summary (English):")
                st.write(summary)

                st.subheader("Hindi Translation:")
                st.write(hindi_translation)

                st.subheader("Urdu Translation:")
                st.write(urdu_translation)
        else:
            st.warning("No text could be extracted. Please check the file and try again.")

    except Exception as e:
        # Top-level boundary: record the traceback and surface the error in the page
        logging.exception("Error processing the file: %s", e)
        st.error(f"An error occurred while processing the file: {e}")

# Start the app only when this file is run as the entry script, not when it is imported.
if __name__ == "__main__":
    main()