Praful Nayak committed on
Commit
bf65784
·
1 Parent(s): e4db3c7

Deploy Flask Summarization App

Browse files
Files changed (2) hide show
  1. Dockerfile +18 -8
  2. app.py +88 -29
Dockerfile CHANGED
@@ -1,7 +1,7 @@
1
  # Use a lightweight Python image
2
- FROM python:3.9
3
 
4
- # Create a user and set environment
5
  RUN useradd -m -u 1000 user
6
  USER user
7
  ENV PATH="/home/user/.local/bin:$PATH"
@@ -9,15 +9,25 @@ ENV PATH="/home/user/.local/bin:$PATH"
9
  # Set working directory
10
  WORKDIR /app
11
 
12
- # Copy requirements file and install dependencies
13
- COPY --chown=user requirements.txt requirements.txt
14
- RUN pip install --no-cache-dir --upgrade -r requirements.txt
 
 
 
 
 
 
 
 
 
 
15
 
16
  # Copy application files
17
- COPY --chown=user . /app
18
 
19
  # Expose the necessary port
20
  EXPOSE 7860
21
 
22
- # Run the Flask app using Gunicorn
23
- CMD ["gunicorn", "-w", "2", "-b", "0.0.0.0:7860", "app:app"]
 
# Use a lightweight Python image
FROM python:3.9-slim

# Install system dependencies for pdfplumber and pytesseract.
# This has to happen while the build is still running as root: apt-get
# cannot write to the package lists or install packages once the image
# has switched to the unprivileged user below.
RUN apt-get update && apt-get install -y \
    libpng-dev \
    libjpeg-dev \
    zlib1g-dev \
    tesseract-ocr \
    libtesseract-dev \
    && rm -rf /var/lib/apt/lists/*

# Create a non-root user and set environment
RUN useradd -m -u 1000 user
USER user
ENV PATH="/home/user/.local/bin:$PATH"

# Set working directory
WORKDIR /app

# Copy requirements file and install Python dependencies
COPY --chown=user:user requirements.txt requirements.txt
RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir -r requirements.txt

# Copy application files
COPY --chown=user:user . /app

# Expose the necessary port
EXPOSE 7860

# Run the Flask app using Gunicorn with a higher timeout
CMD ["gunicorn", "--workers", "2", "--timeout", "300", "--bind", "0.0.0.0:7860", "app:app"]
app.py CHANGED
@@ -5,61 +5,120 @@ import pytesseract
5
  from PIL import Image
6
  from transformers import PegasusForConditionalGeneration, PegasusTokenizer
7
  import torch
 
8
 
9
  app = Flask(__name__)
10
 
 
 
 
 
11
  # Load Pegasus Model
 
12
  tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-xsum")
13
  model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum")
 
14
 
15
- # Extract text from PDF
16
- def extract_text_from_pdf(file_path):
17
  text = ""
18
- with pdfplumber.open(file_path) as pdf:
19
- for page in pdf.pages:
20
- text += page.extract_text() or ""
21
- return text
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
  # Extract text from image (OCR)
24
  def extract_text_from_image(file_path):
25
- image = Image.open(file_path)
26
- text = pytesseract.image_to_string(image)
27
- return text
 
 
 
 
 
28
 
29
- # Summarize text using Pegasus
30
- def summarize_text(text):
31
- inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
32
- summary_ids = model.generate(inputs["input_ids"], max_length=150, min_length=30, num_beams=4)
33
- summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
34
- return summary
 
 
 
 
 
 
 
 
 
 
 
 
35
 
36
  @app.route('/summarize', methods=['POST'])
37
  def summarize_document():
38
  if 'file' not in request.files:
 
39
  return jsonify({"error": "No file uploaded"}), 400
40
 
41
  file = request.files['file']
42
  filename = file.filename
 
 
 
 
43
  file_path = os.path.join("/tmp", filename)
44
- file.save(file_path)
45
-
46
  try:
47
- if filename.endswith('.pdf'):
48
- text = extract_text_from_pdf(file_path)
49
- elif filename.endswith(('.png', '.jpeg', '.jpg')):
 
 
 
50
  text = extract_text_from_image(file_path)
51
  else:
52
- return jsonify({"error": "Unsupported file format"}), 400
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  except Exception as e:
 
54
  return jsonify({"error": str(e)}), 500
 
55
  finally:
56
- os.remove(file_path)
57
-
58
- if not text.strip():
59
- return jsonify({"error": "No text extracted"}), 400
60
-
61
- summary = summarize_text(text)
62
- return jsonify({"summary": summary})
63
 
64
  if __name__ == '__main__':
65
- app.run(host='0.0.0.0', port=7860)
 
 
5
  from PIL import Image
6
  from transformers import PegasusForConditionalGeneration, PegasusTokenizer
7
  import torch
8
+ import logging
9
 
# Flask application object (served as "app:app").
app = Flask(__name__)

# Module-wide logger; INFO and above go to the root handler.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Load the Pegasus tokenizer and model once, when the module is imported,
# so request handlers can reuse them.
_PEGASUS_CHECKPOINT = "google/pegasus-xsum"

logger.info("Loading Pegasus model and tokenizer...")
tokenizer = PegasusTokenizer.from_pretrained(_PEGASUS_CHECKPOINT)
model = PegasusForConditionalGeneration.from_pretrained(_PEGASUS_CHECKPOINT)
logger.info("Model loaded successfully.")
21
 
def extract_text_from_pdf(file_path, max_pages=10):
    """Extract text from the leading pages of a PDF.

    Args:
        file_path: Path of the PDF file to read.
        max_pages: Maximum number of leading pages to process. Values
            below zero are treated as zero.

    Returns:
        The text of the processed pages joined with newlines and stripped
        of surrounding whitespace. Returns "" when the PDF cannot be
        processed or when no page yields any text. A page whose
        extraction fails is logged and skipped.
    """
    page_texts = []
    try:
        with pdfplumber.open(file_path) as pdf:
            total_pages = len(pdf.pages)
            # Clamp at zero: a negative bound would turn the slice below
            # into "all but the last N pages" instead of a page limit.
            pages_to_process = max(0, min(total_pages, max_pages))
            logger.info(f"Extracting text from {pages_to_process} of {total_pages} pages in {file_path}")
            for i, page in enumerate(pdf.pages[:pages_to_process]):
                try:
                    extracted = page.extract_text()
                except Exception as e:
                    # Best effort: one unreadable page must not lose the rest.
                    logger.warning(f"Error extracting text from page {i+1}: {e}")
                    continue
                if extracted:
                    page_texts.append(extracted)
    except Exception as e:
        logger.error(f"Failed to process PDF {file_path}: {e}")
        return ""
    return "\n".join(page_texts).strip()
42
 
def extract_text_from_image(file_path):
    """Run OCR on an image file and return the recognised text.

    Args:
        file_path: Path of the image file to read.

    Returns:
        The OCR output stripped of surrounding whitespace, or "" if the
        image cannot be opened or OCR fails (the error is logged).
    """
    try:
        logger.info(f"Extracting text from image {file_path} using OCR...")
        # Open via the context manager so the underlying file handle is
        # released as soon as OCR finishes rather than whenever the image
        # object happens to be garbage collected.
        with Image.open(file_path) as image:
            text = pytesseract.image_to_string(image)
        return text.strip()
    except Exception as e:
        logger.error(f"Failed to process image {file_path}: {e}")
        return ""
53
 
def summarize_text(text, max_input_length=512, max_output_length=150):
    """Summarize text with the module-level Pegasus tokenizer and model.

    Args:
        text: The text to summarize.
        max_input_length: Token limit passed to the tokenizer; longer
            input is truncated.
        max_output_length: Value passed to generation as ``max_length``.

    Returns:
        The decoded summary with special tokens removed, or "" if any
        step raises (the error is logged).
    """
    try:
        logger.info("Summarizing text...")
        encoded = tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            max_length=max_input_length,
        )
        generation_options = {
            "max_length": max_output_length,
            "min_length": 30,
            "num_beams": 4,
            "early_stopping": True,
        }
        generated = model.generate(encoded["input_ids"], **generation_options)
        result = tokenizer.decode(generated[0], skip_special_tokens=True)
        logger.info("Summarization completed.")
        return result
    except Exception as e:
        logger.error(f"Error during summarization: {e}")
        return ""
72
 
@app.route('/summarize', methods=['POST'])
def summarize_document():
    """Summarize an uploaded PDF or image.

    Expects a multipart upload in the ``file`` field. PDF files have text
    extracted from their first 5 pages; PNG/JPEG/JPG files go through OCR.
    The extracted text is then summarized.

    Returns:
        A JSON response: ``{"summary": ...}`` on success, or
        ``{"error": ...}`` with status 400 (missing file, unsupported
        format, no text extracted) or 500 (summarization failure or an
        unexpected error).
    """
    # Function-scope import keeps this block self-contained.
    import tempfile

    if 'file' not in request.files:
        logger.error("No file uploaded in request.")
        return jsonify({"error": "No file uploaded"}), 400

    file = request.files['file']
    filename = file.filename
    if not filename:
        logger.error("Empty filename in request.")
        return jsonify({"error": "No file uploaded"}), 400

    # Reject unsupported formats before anything is written to disk.
    lowered = filename.lower()
    is_pdf = lowered.endswith('.pdf')
    if not is_pdf and not lowered.endswith(('.png', '.jpeg', '.jpg')):
        logger.error(f"Unsupported file format: {filename}")
        return jsonify({"error": "Unsupported file format. Use PDF, PNG, JPEG, or JPG"}), 400

    file_path = None
    try:
        # The client-supplied filename is untrusted: joining it onto /tmp
        # would allow "../" or absolute paths to write (and later delete)
        # files elsewhere, and two uploads with the same name would clash.
        # Use a unique server-generated path; only the extension is kept.
        fd, file_path = tempfile.mkstemp(suffix=os.path.splitext(lowered)[1], dir="/tmp")
        os.close(fd)
        file.save(file_path)
        logger.info(f"File saved to {file_path}")

        if is_pdf:
            text = extract_text_from_pdf(file_path, max_pages=5)
        else:
            text = extract_text_from_image(file_path)

        if not text:
            logger.warning(f"No text extracted from {filename}")
            return jsonify({"error": "No text extracted from the file"}), 400

        summary = summarize_text(text)
        if not summary:
            logger.warning("Summarization failed to produce output.")
            return jsonify({"error": "Failed to generate summary"}), 500

        logger.info(f"Summary generated for {filename}")
        return jsonify({"summary": summary})

    except Exception as e:
        logger.error(f"Unexpected error processing {filename}: {e}")
        return jsonify({"error": str(e)}), 500

    finally:
        # Always remove the temporary upload, whatever path was taken above.
        if file_path is not None and os.path.exists(file_path):
            try:
                os.remove(file_path)
                logger.info(f"Cleaned up file: {file_path}")
            except Exception as e:
                logger.warning(f"Failed to delete {file_path}: {e}")
 
121
 
# Development entry point: start Flask's built-in server on all
# interfaces, port 7860, when the module is run directly.
if __name__ == "__main__":
    logger.info("Starting Flask app...")
    app.run(
        host="0.0.0.0",
        port=7860,
    )