ApsidalSolid4 committed on
Commit
c38f00e
·
verified ·
1 Parent(s): 8fa279d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +95 -48
app.py CHANGED
@@ -57,7 +57,7 @@ if not ADMIN_PASSWORD_HASH:
57
  EXCEL_LOG_PATH = "/tmp/prediction_logs.xlsx"
58
 
59
  # OCR API settings
60
- OCR_API_KEY = "9e11346f1288957" # This is a partial key - replace with the full one
61
  OCR_API_ENDPOINT = "https://api.ocr.space/parse/image"
62
  OCR_MAX_PDF_PAGES = 3
63
  OCR_MAX_FILE_SIZE_MB = 1
@@ -95,10 +95,6 @@ class OCRProcessor:
95
  file_type = self._get_file_type(file_path)
96
  ocr_logger.info(f"Detected file type: {file_type}")
97
 
98
- # Prepare the API request
99
- with open(file_path, 'rb') as f:
100
- file_data = f.read()
101
-
102
  # Set up API parameters
103
  payload = {
104
  'isOverlayRequired': 'false',
@@ -113,9 +109,9 @@ class OCRProcessor:
113
  ocr_logger.info("PDF document detected, enforcing page limit")
114
  payload['filetype'] = 'PDF'
115
 
116
- # Prepare file for OCR API
117
  files = {
118
- 'file': (os.path.basename(file_path), file_data, file_type)
119
  }
120
 
121
  headers = {
@@ -124,33 +120,52 @@ class OCRProcessor:
124
 
125
  # Make the OCR API request
126
  try:
127
- ocr_logger.info("Sending request to OCR.space API")
128
  response = requests.post(
129
  self.endpoint,
130
  files=files,
131
  data=payload,
132
- headers=headers
 
133
  )
 
 
 
 
 
 
134
  response.raise_for_status()
135
- result = response.json()
136
 
137
- # Process the OCR results
138
- if result.get('OCRExitCode') in [1, 2]: # Success or partial success
139
- extracted_text = self._extract_text_from_result(result)
140
- processing_time = time.time() - start_time
141
- ocr_logger.info(f"OCR processing completed in {processing_time:.2f} seconds")
142
 
143
- return {
144
- "success": True,
145
- "text": extracted_text,
146
- "word_count": len(extracted_text.split()),
147
- "processing_time_ms": int(processing_time * 1000)
148
- }
149
- else:
150
- ocr_logger.error(f"OCR API error: {result.get('ErrorMessage', 'Unknown error')}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
151
  return {
152
  "success": False,
153
- "error": result.get('ErrorMessage', 'OCR processing failed'),
154
  "text": ""
155
  }
156
 
@@ -161,6 +176,9 @@ class OCRProcessor:
161
  "error": f"OCR API request failed: {str(e)}",
162
  "text": ""
163
  }
 
 
 
164
 
165
  def _extract_text_from_result(self, result: Dict) -> str:
166
  """
@@ -504,35 +522,50 @@ def handle_file_upload_and_analyze(file_obj, mode: str, classifier: TextClassifi
504
  "No file uploaded for analysis"
505
  )
506
 
507
- # Create a temporary file with an appropriate extension based on content
508
- content_start = file_obj[:20] # Look at the first few bytes
509
-
510
- # Default to .bin extension
511
- file_ext = ".bin"
512
-
513
- # Try to detect PDF files
514
- if content_start.startswith(b'%PDF'):
515
- file_ext = ".pdf"
516
- # For images, detect by common magic numbers
517
- elif content_start.startswith(b'\xff\xd8'): # JPEG
518
- file_ext = ".jpg"
519
- elif content_start.startswith(b'\x89PNG'): # PNG
520
- file_ext = ".png"
521
- elif content_start.startswith(b'GIF'): # GIF
522
- file_ext = ".gif"
523
-
524
- # Create a temporary file with the detected extension
525
- with tempfile.NamedTemporaryFile(delete=False, suffix=file_ext) as temp_file:
526
- temp_file_path = temp_file.name
527
- # Write uploaded file data to the temporary file
528
- temp_file.write(file_obj)
529
 
530
  try:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
531
  # Process the file with OCR
532
  ocr_processor = OCRProcessor()
 
533
  ocr_result = ocr_processor.process_file(temp_file_path)
534
 
535
  if not ocr_result["success"]:
 
536
  return (
537
  "OCR Processing Error",
538
  ocr_result["error"],
@@ -541,9 +574,11 @@ def handle_file_upload_and_analyze(file_obj, mode: str, classifier: TextClassifi
541
 
542
  # Get the extracted text
543
  extracted_text = ocr_result["text"]
 
544
 
545
  # If no text was extracted
546
  if not extracted_text.strip():
 
547
  return (
548
  "No text extracted",
549
  "The OCR process did not extract any text from the uploaded file.",
@@ -551,12 +586,24 @@ def handle_file_upload_and_analyze(file_obj, mode: str, classifier: TextClassifi
551
  )
552
 
553
  # Call the original text analysis function with the extracted text
 
554
  return analyze_text(extracted_text, mode, classifier)
555
 
 
 
 
 
 
 
 
556
  finally:
557
  # Clean up the temporary file
558
- if os.path.exists(temp_file_path):
559
- os.remove(temp_file_path)
 
 
 
 
560
 
561
  def initialize_excel_log():
562
  """Initialize the Excel log file if it doesn't exist."""
 
57
  EXCEL_LOG_PATH = "/tmp/prediction_logs.xlsx"
58
 
59
  # OCR API settings
60
+ OCR_API_KEY = "9e11346f1288957" # Now using the complete key
61
  OCR_API_ENDPOINT = "https://api.ocr.space/parse/image"
62
  OCR_MAX_PDF_PAGES = 3
63
  OCR_MAX_FILE_SIZE_MB = 1
 
95
  file_type = self._get_file_type(file_path)
96
  ocr_logger.info(f"Detected file type: {file_type}")
97
 
 
 
 
 
98
  # Set up API parameters
99
  payload = {
100
  'isOverlayRequired': 'false',
 
109
  ocr_logger.info("PDF document detected, enforcing page limit")
110
  payload['filetype'] = 'PDF'
111
 
112
+ # Prepare file for OCR API - using file object directly as in the test script
113
  files = {
114
+ 'file': (os.path.basename(file_path), open(file_path, 'rb'))
115
  }
116
 
117
  headers = {
 
120
 
121
  # Make the OCR API request
122
  try:
123
+ ocr_logger.info(f"Sending request to OCR.space API for file: {os.path.basename(file_path)}")
124
  response = requests.post(
125
  self.endpoint,
126
  files=files,
127
  data=payload,
128
+ headers=headers,
129
+ timeout=60 # Add 60 second timeout
130
  )
131
+
132
+ ocr_logger.info(f"OCR API status code: {response.status_code}")
133
+
134
+ # Log response text for debugging (first 200 chars)
135
+ ocr_logger.info(f"OCR API response preview: {response.text[:200]}...")
136
+
137
  response.raise_for_status()
 
138
 
139
+ try:
140
+ result = response.json()
141
+ ocr_logger.info(f"OCR API exit code: {result.get('OCRExitCode')}")
 
 
142
 
143
+ # Process the OCR results
144
+ if result.get('OCRExitCode') in [1, 2]: # Success or partial success
145
+ extracted_text = self._extract_text_from_result(result)
146
+ processing_time = time.time() - start_time
147
+ ocr_logger.info(f"OCR processing completed in {processing_time:.2f} seconds")
148
+ ocr_logger.info(f"Extracted text word count: {len(extracted_text.split())}")
149
+
150
+ return {
151
+ "success": True,
152
+ "text": extracted_text,
153
+ "word_count": len(extracted_text.split()),
154
+ "processing_time_ms": int(processing_time * 1000)
155
+ }
156
+ else:
157
+ error_msg = result.get('ErrorMessage', 'OCR processing failed')
158
+ ocr_logger.error(f"OCR API error: {error_msg}")
159
+ return {
160
+ "success": False,
161
+ "error": error_msg,
162
+ "text": ""
163
+ }
164
+ except ValueError as e:
165
+ ocr_logger.error(f"Invalid JSON response: {str(e)}")
166
  return {
167
  "success": False,
168
+ "error": f"Invalid response from OCR API: {str(e)}",
169
  "text": ""
170
  }
171
 
 
176
  "error": f"OCR API request failed: {str(e)}",
177
  "text": ""
178
  }
179
+ finally:
180
+ # Close the file handle
181
+ files['file'][1].close()
182
 
183
  def _extract_text_from_result(self, result: Dict) -> str:
184
  """
 
522
  "No file uploaded for analysis"
523
  )
524
 
525
+ # Log the type of file object received
526
+ logger.info(f"Received file upload of type: {type(file_obj)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
527
 
528
  try:
529
+ # Create a temporary file with an appropriate extension based on content
530
+ if isinstance(file_obj, bytes):
531
+ content_start = file_obj[:20] # Look at the first few bytes
532
+
533
+ # Default to .bin extension
534
+ file_ext = ".bin"
535
+
536
+ # Try to detect PDF files
537
+ if content_start.startswith(b'%PDF'):
538
+ file_ext = ".pdf"
539
+ # For images, detect by common magic numbers
540
+ elif content_start.startswith(b'\xff\xd8'): # JPEG
541
+ file_ext = ".jpg"
542
+ elif content_start.startswith(b'\x89PNG'): # PNG
543
+ file_ext = ".png"
544
+ elif content_start.startswith(b'GIF'): # GIF
545
+ file_ext = ".gif"
546
+
547
+ # Create a temporary file with the detected extension
548
+ with tempfile.NamedTemporaryFile(delete=False, suffix=file_ext) as temp_file:
549
+ temp_file_path = temp_file.name
550
+ # Write uploaded file data to the temporary file
551
+ temp_file.write(file_obj)
552
+ logger.info(f"Saved uploaded file to {temp_file_path}")
553
+ else:
554
+ # Handle other file object types (should not typically happen with Gradio)
555
+ logger.error(f"Unexpected file object type: {type(file_obj)}")
556
+ return (
557
+ "File upload error",
558
+ "Unexpected file format",
559
+ "Unable to process this file format"
560
+ )
561
+
562
  # Process the file with OCR
563
  ocr_processor = OCRProcessor()
564
+ logger.info(f"Starting OCR processing for file: {temp_file_path}")
565
  ocr_result = ocr_processor.process_file(temp_file_path)
566
 
567
  if not ocr_result["success"]:
568
+ logger.error(f"OCR processing failed: {ocr_result['error']}")
569
  return (
570
  "OCR Processing Error",
571
  ocr_result["error"],
 
574
 
575
  # Get the extracted text
576
  extracted_text = ocr_result["text"]
577
+ logger.info(f"OCR processing complete. Extracted {len(extracted_text.split())} words")
578
 
579
  # If no text was extracted
580
  if not extracted_text.strip():
581
+ logger.warning("No text extracted from file")
582
  return (
583
  "No text extracted",
584
  "The OCR process did not extract any text from the uploaded file.",
 
586
  )
587
 
588
  # Call the original text analysis function with the extracted text
589
+ logger.info("Proceeding with text analysis")
590
  return analyze_text(extracted_text, mode, classifier)
591
 
592
+ except Exception as e:
593
+ logger.error(f"Error in file upload processing: {str(e)}")
594
+ return (
595
+ "Error Processing File",
596
+ f"An error occurred while processing the file: {str(e)}",
597
+ "File processing error. Please try again or try a different file."
598
+ )
599
  finally:
600
  # Clean up the temporary file
601
+ if 'temp_file_path' in locals() and os.path.exists(temp_file_path):
602
+ try:
603
+ os.remove(temp_file_path)
604
+ logger.info(f"Removed temporary file: {temp_file_path}")
605
+ except Exception as e:
606
+ logger.warning(f"Could not remove temporary file: {str(e)}")
607
 
608
  def initialize_excel_log():
609
  """Initialize the Excel log file if it doesn't exist."""