ApsidalSolid4 committed on
Commit
5e42313
·
verified ·
1 Parent(s): 608e53d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +98 -146
app.py CHANGED
@@ -33,28 +33,36 @@ MODEL_NAME = "microsoft/deberta-v3-small"
33
  WINDOW_SIZE = 6
34
  WINDOW_OVERLAP = 2
35
  CONFIDENCE_THRESHOLD = 0.65
36
- BATCH_SIZE = 8
37
- MAX_WORKERS = 4
38
-
39
 
 
 
40
  if not torch.cuda.is_available():
 
41
  torch.set_num_threads(MAX_WORKERS)
42
  try:
 
43
  torch.set_num_interop_threads(MAX_WORKERS)
44
  except RuntimeError as e:
45
  logger.warning(f"Could not set interop threads: {str(e)}")
46
 
 
47
  ADMIN_PASSWORD_HASH = os.environ.get('ADMIN_PASSWORD_HASH')
48
 
49
  if not ADMIN_PASSWORD_HASH:
50
  ADMIN_PASSWORD_HASH = "5e22d1ed71b273b1b2b5331f2d3e0f6cf34595236f201c6924d6bc81de27cdcb"
51
 
 
52
  EXCEL_LOG_PATH = "/tmp/prediction_logs.xlsx"
53
- OCR_API_KEY = "9e11346f1288957"
 
 
54
  OCR_API_ENDPOINT = "https://api.ocr.space/parse/image"
55
  OCR_MAX_PDF_PAGES = 3
56
  OCR_MAX_FILE_SIZE_MB = 1
57
 
 
58
  ocr_logger = logging.getLogger("ocr_module")
59
  ocr_logger.setLevel(logging.INFO)
60
 
@@ -87,6 +95,10 @@ class OCRProcessor:
87
  file_type = self._get_file_type(file_path)
88
  ocr_logger.info(f"Detected file type: {file_type}")
89
 
 
 
 
 
90
  # Set up API parameters
91
  payload = {
92
  'isOverlayRequired': 'false',
@@ -101,10 +113,7 @@ class OCRProcessor:
101
  ocr_logger.info("PDF document detected, enforcing page limit")
102
  payload['filetype'] = 'PDF'
103
 
104
- # Prepare file for OCR API - using file data as bytes to avoid file handle issues
105
- with open(file_path, 'rb') as f:
106
- file_data = f.read()
107
-
108
  files = {
109
  'file': (os.path.basename(file_path), file_data, file_type)
110
  }
@@ -115,61 +124,33 @@ class OCRProcessor:
115
 
116
  # Make the OCR API request
117
  try:
118
- ocr_logger.info(f"Sending request to OCR.space API for file: {os.path.basename(file_path)}")
119
  response = requests.post(
120
  self.endpoint,
121
  files=files,
122
  data=payload,
123
- headers=headers,
124
- timeout=60 # Add 60 second timeout
125
  )
 
 
126
 
127
- ocr_logger.info(f"OCR API status code: {response.status_code}")
128
-
129
- # Log response text for debugging (first 200 chars)
130
- response_preview = response.text[:200] if hasattr(response, 'text') else "No text content"
131
- ocr_logger.info(f"OCR API response preview: {response_preview}...")
132
-
133
- try:
134
- response.raise_for_status()
135
- except Exception as e:
136
- ocr_logger.error(f"HTTP Error: {str(e)}")
137
  return {
138
- "success": False,
139
- "error": f"OCR API HTTP Error: {str(e)}",
140
- "text": ""
 
141
  }
142
-
143
- try:
144
- result = response.json()
145
- ocr_logger.info(f"OCR API exit code: {result.get('OCRExitCode')}")
146
-
147
- # Process the OCR results
148
- if result.get('OCRExitCode') in [1, 2]: # Success or partial success
149
- extracted_text = self._extract_text_from_result(result)
150
- processing_time = time.time() - start_time
151
- ocr_logger.info(f"OCR processing completed in {processing_time:.2f} seconds")
152
- ocr_logger.info(f"Extracted text word count: {len(extracted_text.split())}")
153
-
154
- return {
155
- "success": True,
156
- "text": extracted_text,
157
- "word_count": len(extracted_text.split()),
158
- "processing_time_ms": int(processing_time * 1000)
159
- }
160
- else:
161
- error_msg = result.get('ErrorMessage', 'OCR processing failed')
162
- ocr_logger.error(f"OCR API error: {error_msg}")
163
- return {
164
- "success": False,
165
- "error": error_msg,
166
- "text": ""
167
- }
168
- except ValueError as e:
169
- ocr_logger.error(f"Invalid JSON response: {str(e)}")
170
  return {
171
  "success": False,
172
- "error": f"Invalid response from OCR API: {str(e)}",
173
  "text": ""
174
  }
175
 
@@ -180,9 +161,6 @@ class OCRProcessor:
180
  "error": f"OCR API request failed: {str(e)}",
181
  "text": ""
182
  }
183
- finally:
184
- # No need to close file handle as we're using bytes directly
185
- pass
186
 
187
  def _extract_text_from_result(self, result: Dict) -> str:
188
  """
@@ -515,14 +493,10 @@ class TextClassifier:
515
  }
516
 
517
  # Function to handle file upload, OCR processing, and text analysis
518
- def handle_file_upload_and_analyze(file_obj, mode: str) -> tuple:
519
  """
520
  Handle file upload, OCR processing, and text analysis
521
  """
522
- # Use the global classifier
523
- global classifier
524
- classifier_to_use = classifier
525
-
526
  if file_obj is None:
527
  return (
528
  "No file uploaded",
@@ -530,50 +504,35 @@ def handle_file_upload_and_analyze(file_obj, mode: str) -> tuple:
530
  "No file uploaded for analysis"
531
  )
532
 
533
- # Log the type of file object received
534
- logger.info(f"Received file upload of type: {type(file_obj)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
535
 
536
  try:
537
- # Create a temporary file with an appropriate extension based on content
538
- if isinstance(file_obj, bytes):
539
- content_start = file_obj[:20] # Look at the first few bytes
540
-
541
- # Default to .bin extension
542
- file_ext = ".bin"
543
-
544
- # Try to detect PDF files
545
- if content_start.startswith(b'%PDF'):
546
- file_ext = ".pdf"
547
- # For images, detect by common magic numbers
548
- elif content_start.startswith(b'\xff\xd8'): # JPEG
549
- file_ext = ".jpg"
550
- elif content_start.startswith(b'\x89PNG'): # PNG
551
- file_ext = ".png"
552
- elif content_start.startswith(b'GIF'): # GIF
553
- file_ext = ".gif"
554
-
555
- # Create a temporary file with the detected extension
556
- with tempfile.NamedTemporaryFile(delete=False, suffix=file_ext) as temp_file:
557
- temp_file_path = temp_file.name
558
- # Write uploaded file data to the temporary file
559
- temp_file.write(file_obj)
560
- logger.info(f"Saved uploaded file to {temp_file_path}")
561
- else:
562
- # Handle other file object types (should not typically happen with Gradio)
563
- logger.error(f"Unexpected file object type: {type(file_obj)}")
564
- return (
565
- "File upload error",
566
- "Unexpected file format",
567
- "Unable to process this file format"
568
- )
569
-
570
  # Process the file with OCR
571
  ocr_processor = OCRProcessor()
572
- logger.info(f"Starting OCR processing for file: {temp_file_path}")
573
  ocr_result = ocr_processor.process_file(temp_file_path)
574
 
575
  if not ocr_result["success"]:
576
- logger.error(f"OCR processing failed: {ocr_result['error']}")
577
  return (
578
  "OCR Processing Error",
579
  ocr_result["error"],
@@ -582,11 +541,9 @@ def handle_file_upload_and_analyze(file_obj, mode: str) -> tuple:
582
 
583
  # Get the extracted text
584
  extracted_text = ocr_result["text"]
585
- logger.info(f"OCR processing complete. Extracted {len(extracted_text.split())} words")
586
 
587
  # If no text was extracted
588
  if not extracted_text.strip():
589
- logger.warning("No text extracted from file")
590
  return (
591
  "No text extracted",
592
  "The OCR process did not extract any text from the uploaded file.",
@@ -594,24 +551,12 @@ def handle_file_upload_and_analyze(file_obj, mode: str) -> tuple:
594
  )
595
 
596
  # Call the original text analysis function with the extracted text
597
- logger.info("Proceeding with text analysis")
598
- return analyze_text(extracted_text, mode, classifier_to_use)
599
 
600
- except Exception as e:
601
- logger.error(f"Error in file upload processing: {str(e)}")
602
- return (
603
- "Error Processing File",
604
- f"An error occurred while processing the file: {str(e)}",
605
- "File processing error. Please try again or try a different file."
606
- )
607
  finally:
608
  # Clean up the temporary file
609
- if 'temp_file_path' in locals() and os.path.exists(temp_file_path):
610
- try:
611
- os.remove(temp_file_path)
612
- logger.info(f"Removed temporary file: {temp_file_path}")
613
- except Exception as e:
614
- logger.warning(f"Could not remove temporary file: {str(e)}")
615
 
616
  def initialize_excel_log():
617
  """Initialize the Excel log file if it doesn't exist."""
@@ -825,7 +770,7 @@ def analyze_text(text: str, mode: str, classifier: TextClassifier) -> tuple:
825
  # Initialize the classifier globally
826
  classifier = TextClassifier()
827
 
828
- # Create Gradio interface with a file upload button matched to the radio buttons
829
  def create_interface():
830
  # Custom CSS for the interface
831
  css = """
@@ -835,46 +780,49 @@ def create_interface():
835
  color: white !important;
836
  }
837
 
838
- /* Style the file upload to be more compact */
839
- .file-upload {
840
- width: 150px !important;
841
- margin-left: 15px !important;
 
842
  }
843
 
844
- /* Hide file preview elements */
845
- .file-upload .file-preview,
846
- .file-upload p:not(.file-upload p:first-child),
847
- .file-upload svg,
848
- .file-upload [data-testid="chunkFileDropArea"],
849
- .file-upload .file-drop {
850
  display: none !important;
851
  }
852
 
853
- /* Style the upload button */
854
- .file-upload button {
 
855
  height: 40px !important;
856
- width: 100% !important;
857
  background-color: #f0f0f0 !important;
858
  border: 1px solid #d9d9d9 !important;
859
  border-radius: 4px !important;
860
- color: #333 !important;
861
- font-size: 14px !important;
862
  display: flex !important;
863
  align-items: center !important;
864
  justify-content: center !important;
 
865
  margin: 0 !important;
866
- padding: 0 !important;
867
  }
868
 
869
- /* Hide the "or" text */
870
- .file-upload .or {
871
  display: none !important;
872
  }
873
 
874
- /* Make the container compact */
875
- .file-upload [data-testid="block"] {
876
- margin: 0 !important;
877
- padding: 0 !important;
 
 
 
 
 
 
 
 
878
  }
879
  """
880
 
@@ -905,12 +853,15 @@ def create_interface():
905
  show_label=False
906
  )
907
 
908
- # Revert to File component but with better styling
909
- file_upload = gr.File(
910
- file_types=["pdf", "doc", "docx"],
911
- type="binary",
912
- elem_classes=["file-upload"]
913
- )
 
 
 
914
 
915
  # Analyze button
916
  analyze_btn = gr.Button("Analyze Text", elem_id="analyze-btn")
@@ -922,13 +873,14 @@ def create_interface():
922
  output_result = gr.Textbox(label="Overall Result", lines=4)
923
 
924
  # Connect components
 
925
  analyze_btn.click(
926
  fn=lambda text, mode: analyze_text(text, mode, classifier),
927
  inputs=[text_input, mode_selection],
928
  outputs=[output_html, output_sentences, output_result]
929
  )
930
 
931
- # Use the file upload handler without passing classifier (will use global)
932
  file_upload.change(
933
  fn=handle_file_upload_and_analyze,
934
  inputs=[file_upload, mode_selection],
@@ -936,7 +888,7 @@ def create_interface():
936
  )
937
 
938
  return demo
939
-
940
  # Setup the app with CORS middleware
941
  def setup_app():
942
  demo = create_interface()
 
33
  WINDOW_SIZE = 6
34
  WINDOW_OVERLAP = 2
35
  CONFIDENCE_THRESHOLD = 0.65
36
+ BATCH_SIZE = 8 # Reduced batch size for CPU
37
+ MAX_WORKERS = 4 # Number of worker threads for processing
 
38
 
39
+ # IMPORTANT: Set PyTorch thread configuration at the module level
40
+ # before any parallel work starts
41
  if not torch.cuda.is_available():
42
+ # Set thread configuration only once at the beginning
43
  torch.set_num_threads(MAX_WORKERS)
44
  try:
45
+ # Only set interop threads if it hasn't been set already
46
  torch.set_num_interop_threads(MAX_WORKERS)
47
  except RuntimeError as e:
48
  logger.warning(f"Could not set interop threads: {str(e)}")
49
 
50
+ # Get password hash from environment variable (more secure)
51
  ADMIN_PASSWORD_HASH = os.environ.get('ADMIN_PASSWORD_HASH')
52
 
53
  if not ADMIN_PASSWORD_HASH:
54
  ADMIN_PASSWORD_HASH = "5e22d1ed71b273b1b2b5331f2d3e0f6cf34595236f201c6924d6bc81de27cdcb"
55
 
56
+ # Excel file path for logs
57
  EXCEL_LOG_PATH = "/tmp/prediction_logs.xlsx"
58
+
59
+ # OCR API settings
60
+ OCR_API_KEY = "9e11346f1288957" # This is a partial key - replace with the full one
61
  OCR_API_ENDPOINT = "https://api.ocr.space/parse/image"
62
  OCR_MAX_PDF_PAGES = 3
63
  OCR_MAX_FILE_SIZE_MB = 1
64
 
65
+ # Configure logging for OCR module
66
  ocr_logger = logging.getLogger("ocr_module")
67
  ocr_logger.setLevel(logging.INFO)
68
 
 
95
  file_type = self._get_file_type(file_path)
96
  ocr_logger.info(f"Detected file type: {file_type}")
97
 
98
+ # Prepare the API request
99
+ with open(file_path, 'rb') as f:
100
+ file_data = f.read()
101
+
102
  # Set up API parameters
103
  payload = {
104
  'isOverlayRequired': 'false',
 
113
  ocr_logger.info("PDF document detected, enforcing page limit")
114
  payload['filetype'] = 'PDF'
115
 
116
+ # Prepare file for OCR API
 
 
 
117
  files = {
118
  'file': (os.path.basename(file_path), file_data, file_type)
119
  }
 
124
 
125
  # Make the OCR API request
126
  try:
127
+ ocr_logger.info("Sending request to OCR.space API")
128
  response = requests.post(
129
  self.endpoint,
130
  files=files,
131
  data=payload,
132
+ headers=headers
 
133
  )
134
+ response.raise_for_status()
135
+ result = response.json()
136
 
137
+ # Process the OCR results
138
+ if result.get('OCRExitCode') in [1, 2]: # Success or partial success
139
+ extracted_text = self._extract_text_from_result(result)
140
+ processing_time = time.time() - start_time
141
+ ocr_logger.info(f"OCR processing completed in {processing_time:.2f} seconds")
142
+
 
 
 
 
143
  return {
144
+ "success": True,
145
+ "text": extracted_text,
146
+ "word_count": len(extracted_text.split()),
147
+ "processing_time_ms": int(processing_time * 1000)
148
  }
149
+ else:
150
+ ocr_logger.error(f"OCR API error: {result.get('ErrorMessage', 'Unknown error')}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
151
  return {
152
  "success": False,
153
+ "error": result.get('ErrorMessage', 'OCR processing failed'),
154
  "text": ""
155
  }
156
 
 
161
  "error": f"OCR API request failed: {str(e)}",
162
  "text": ""
163
  }
 
 
 
164
 
165
  def _extract_text_from_result(self, result: Dict) -> str:
166
  """
 
493
  }
494
 
495
  # Function to handle file upload, OCR processing, and text analysis
496
+ def handle_file_upload_and_analyze(file_obj, mode: str, classifier) -> tuple:
497
  """
498
  Handle file upload, OCR processing, and text analysis
499
  """
 
 
 
 
500
  if file_obj is None:
501
  return (
502
  "No file uploaded",
 
504
  "No file uploaded for analysis"
505
  )
506
 
507
+ # Create a temporary file with an appropriate extension based on content
508
+ content_start = file_obj[:20] # Look at the first few bytes
509
+
510
+ # Default to .bin extension
511
+ file_ext = ".bin"
512
+
513
+ # Try to detect PDF files
514
+ if content_start.startswith(b'%PDF'):
515
+ file_ext = ".pdf"
516
+ # For images, detect by common magic numbers
517
+ elif content_start.startswith(b'\xff\xd8'): # JPEG
518
+ file_ext = ".jpg"
519
+ elif content_start.startswith(b'\x89PNG'): # PNG
520
+ file_ext = ".png"
521
+ elif content_start.startswith(b'GIF'): # GIF
522
+ file_ext = ".gif"
523
+
524
+ # Create a temporary file with the detected extension
525
+ with tempfile.NamedTemporaryFile(delete=False, suffix=file_ext) as temp_file:
526
+ temp_file_path = temp_file.name
527
+ # Write uploaded file data to the temporary file
528
+ temp_file.write(file_obj)
529
 
530
  try:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
531
  # Process the file with OCR
532
  ocr_processor = OCRProcessor()
 
533
  ocr_result = ocr_processor.process_file(temp_file_path)
534
 
535
  if not ocr_result["success"]:
 
536
  return (
537
  "OCR Processing Error",
538
  ocr_result["error"],
 
541
 
542
  # Get the extracted text
543
  extracted_text = ocr_result["text"]
 
544
 
545
  # If no text was extracted
546
  if not extracted_text.strip():
 
547
  return (
548
  "No text extracted",
549
  "The OCR process did not extract any text from the uploaded file.",
 
551
  )
552
 
553
  # Call the original text analysis function with the extracted text
554
+ return analyze_text(extracted_text, mode, classifier)
 
555
 
 
 
 
 
 
 
 
556
  finally:
557
  # Clean up the temporary file
558
+ if os.path.exists(temp_file_path):
559
+ os.remove(temp_file_path)
 
 
 
 
560
 
561
  def initialize_excel_log():
562
  """Initialize the Excel log file if it doesn't exist."""
 
770
  # Initialize the classifier globally
771
  classifier = TextClassifier()
772
 
773
+ # Create Gradio interface with a properly sized file upload button
774
  def create_interface():
775
  # Custom CSS for the interface
776
  css = """
 
780
  color: white !important;
781
  }
782
 
783
+ /* Style the file upload container to match the radio buttons */
784
+ .file-upload-container {
785
+ margin-left: 15px;
786
+ display: inline-block;
787
+ vertical-align: middle;
788
  }
789
 
790
+ /* Hide file info and preview */
791
+ .file-upload-container .file-preview {
 
 
 
 
792
  display: none !important;
793
  }
794
 
795
+ /* Style the upload button to a proper size */
796
+ .file-upload-container [data-testid="chunkFileDropArea"] {
797
+ width: 150px !important;
798
  height: 40px !important;
 
799
  background-color: #f0f0f0 !important;
800
  border: 1px solid #d9d9d9 !important;
801
  border-radius: 4px !important;
 
 
802
  display: flex !important;
803
  align-items: center !important;
804
  justify-content: center !important;
805
+ padding: 0 10px !important;
806
  margin: 0 !important;
 
807
  }
808
 
809
+ /* Show only the "Upload Document" text */
810
+ .file-upload-container [data-testid="chunkFileDropArea"] * {
811
  display: none !important;
812
  }
813
 
814
+ /* Add a new label */
815
+ .file-upload-container [data-testid="chunkFileDropArea"]::before {
816
+ content: "Upload Document" !important;
817
+ display: block !important;
818
+ font-size: 14px !important;
819
+ color: #444 !important;
820
+ }
821
+
822
+ /* Hover effect */
823
+ .file-upload-container [data-testid="chunkFileDropArea"]:hover {
824
+ background-color: #e0e0e0 !important;
825
+ cursor: pointer !important;
826
  }
827
  """
828
 
 
853
  show_label=False
854
  )
855
 
856
+ # File upload component with compact styling
857
+ with gr.Column(elem_classes=["file-upload-container"], scale=0):
858
+ file_upload = gr.File(
859
+ file_types=["image", "pdf", "doc", "docx"],
860
+ type="binary",
861
+ label="",
862
+ show_label=False,
863
+ elem_id="file-upload"
864
+ )
865
 
866
  # Analyze button
867
  analyze_btn = gr.Button("Analyze Text", elem_id="analyze-btn")
 
873
  output_result = gr.Textbox(label="Overall Result", lines=4)
874
 
875
  # Connect components
876
+ # 1. Analyze button click
877
  analyze_btn.click(
878
  fn=lambda text, mode: analyze_text(text, mode, classifier),
879
  inputs=[text_input, mode_selection],
880
  outputs=[output_html, output_sentences, output_result]
881
  )
882
 
883
+ # 2. File upload change event
884
  file_upload.change(
885
  fn=handle_file_upload_and_analyze,
886
  inputs=[file_upload, mode_selection],
 
888
  )
889
 
890
  return demo
891
+
892
  # Setup the app with CORS middleware
893
  def setup_app():
894
  demo = create_interface()