Update app.py
app.py
CHANGED
@@ -27,14 +27,12 @@ except Exception:
     logger.exception("Failed to load GLiNER model")
     raise
 
 # Regex patterns
 EMAIL_REGEX = re.compile(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b")
 WEBSITE_REGEX = re.compile(r"(?:https?://)?(?:www\.)?([A-Za-z0-9-]+\.[A-Za-z]{2,})")
 
-#
-SAUDI_CODE = '+966'
+# UAE phone country code
 UAE_CODE = '+971'
-PHONE_REGEX = re.compile(r'^(?:\+9665\d{8}|\+9715\d{8}|05\d{8}|5\d{8})$')
 
 # Utility functions
 def extract_emails(text: str) -> list[str]:
@@ -47,50 +45,54 @@ def normalize_website(url: str) -> str | None:
     u = url.lower().replace('www.', '').split('/')[0]
     return f"www.{u}" if re.match(r"^[a-z0-9-]+\.[a-z]{2,}$", u) else None
 
+# Phone cleaning: treat all local '0XXXXXXXXX' as UAE
+
 def clean_phone_number(phone: str) -> str | None:
-    cleaned = re.sub(r"
-    #
-    if cleaned
-    return cleaned
-
-
-    # Local to international
-    if cleaned.startswith('05') and len(cleaned) == 10:
-        # Determine country by leading digit after 0 (6 Saudi, 5 UAE)
-        return (SAUDI_CODE if cleaned[1]=='5' and cleaned[1:2] == '5' else UAE_CODE) + cleaned[1:]
-    if cleaned.startswith('5') and len(cleaned) == 9:
-        return UAE_CODE + cleaned
-    if cleaned.startswith('9665') and len(cleaned) == 12:
+    cleaned = re.sub(r"\D", "", phone)
+    # Local UAE numbers (10 digits starting with 0)
+    if len(cleaned) == 10 and cleaned.startswith('0'):
+        return UAE_CODE + cleaned[1:]
+    # International UAE numbers without plus (12 digits starting '971')
+    if len(cleaned) == 12 and cleaned.startswith('971'):
         return '+' + cleaned
+    # Already plus-prefixed UAE number
+    if phone.strip().startswith('+971') and len(cleaned) == 12:
+        return phone.strip()
     return None
 
+# Extract phone numbers from text
+
 def process_phone_numbers(text: str) -> list[str]:
     found = []
-
+    # Match '05' followed by 8 digits or plus variant
+    for match in re.finditer(r'(?:05\d{8}|\+?\d{8,12})', text):
         raw = match.group().strip()
         if (c := clean_phone_number(raw)):
             found.append(c)
     return list(set(found))
 
+# Address extraction
+
 def extract_address(ocr_texts: list[str]) -> str | None:
     keywords = ["block","street","ave","area","industrial","road"]
     parts = [t for t in ocr_texts if any(kw in t.lower() for kw in keywords)]
     return " ".join(parts) if parts else None
 
 # QR scanning
+
 def scan_qr_code(image: Image.Image) -> str | None:
     try:
         with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
             image.save(tmp, format="PNG")
             path = tmp.name
         img_cv = cv2.imread(path)
-        # Direct
+        # Direct decoding
         try:
             res = zxingcpp.read_barcodes(img_cv)
             if res and res[0].text:
                 return res[0].text.strip()
         except:
-            logger.warning("Direct
+            logger.warning("Direct QR decode failed")
         # Fallback recolor
         default_color = (0, 0, 0)
         tol = 50
@@ -107,6 +109,7 @@ def scan_qr_code(image: Image.Image) -> str | None:
     return None
 
 # Deduplication
+
 def deduplicate_data(results: dict[str, list[str]]) -> None:
     def clean_list(items, normalizer=lambda x: x):
         seen = set(); out = []
@@ -118,11 +121,11 @@ def deduplicate_data(results: dict[str, list[str]]) -> None:
             if norm and norm not in seen:
                 seen.add(norm); out.append(norm)
         return out
-
+
     results['Email Address'] = clean_list(results.get('Email Address', []), lambda e: e.lower())
     results['Website'] = clean_list(results.get('Website', []), normalize_website)
     results['Phone Number'] = clean_list(results.get('Phone Number', []), clean_phone_number)
-
+
     for key in ['Person Name','Company Name','Job Title','Address','QR Code']:
         seen = set(); out = []
         for v in results.get(key, []):
@@ -134,10 +137,7 @@ def deduplicate_data(results: dict[str, list[str]]) -> None:
 # Inference pipeline
 def inference(img: Image.Image, confidence: float):
     try:
-        ocr = PaddleOCR(use_angle_cls=True, lang='en', use_gpu=False,
-                        det_model_dir='./models/det/en',
-                        cls_model_dir='./models/cls/en',
-                        rec_model_dir='./models/rec/en')
+        ocr = PaddleOCR(use_angle_cls=True, lang='en', use_gpu=False)
         arr = np.array(img)
         raw = ocr.ocr(arr, cls=True)[0]
         ocr_texts = [ln[1][0] for ln in raw]
@@ -147,63 +147,61 @@ def inference(img: Image.Image, confidence: float):
         entities = gliner_model.predict_entities(full_text, labels, threshold=confidence, flat_ner=True)
 
         results = {k: [] for k in ['Person Name','Company Name','Job Title','Phone Number','Email Address','Address','Website','QR Code']}
-
+
+        # Process NER entities
         for ent in entities:
             txt, lbl = ent['text'].strip(), ent['label'].lower()
-            if lbl == 'person name':
-                results['Person Name'].append(txt)
-            elif lbl == 'company name':
-                results['Company Name'].append(txt)
-            elif lbl == 'job title':
-                results['Job Title'].append(txt.title())
+            if lbl == 'person name': results['Person Name'].append(txt)
+            elif lbl == 'company name': results['Company Name'].append(txt)
+            elif lbl == 'job title': results['Job Title'].append(txt.title())
             elif lbl == 'phone number':
-                if (c:=clean_phone_number(txt)):
-                    results['Phone Number'].append(c)
+                if (c := clean_phone_number(txt)): results['Phone Number'].append(c)
             elif lbl == 'email address' and EMAIL_REGEX.fullmatch(txt):
                 results['Email Address'].append(txt.lower())
             elif lbl == 'website' and WEBSITE_REGEX.search(txt):
-                if (n:=normalize_website(txt)):
-                    results['Website'].append(n)
-            elif lbl == 'address':
-                results['Address'].append(txt)
+                if (n := normalize_website(txt)): results['Website'].append(n)
+            elif lbl == 'address': results['Address'].append(txt)
+
         # Regex fallbacks
         results['Email Address'] += extract_emails(full_text)
         results['Website'] += extract_websites(full_text)
-        # Phone regex fallback
        results['Phone Number'] += process_phone_numbers(full_text)
+
         # QR code
         if qr := scan_qr_code(img):
             results['QR Code'].append(qr)
+
         # Address fallback
-        if not results['Address']:
-            addr = extract_address(ocr_texts)
-            if addr: results['Address'].append(addr)
-        # Deduplicate
+        if not results['Address'] and (addr := extract_address(ocr_texts)):
+            results['Address'].append(addr)
+
+        # Deduplicate all fields
         deduplicate_data(results)
+
         # Company fallback
-        if not results['Company Name']:
-            if results['Email Address']:
-                dom = results['Email Address'][0].split('@')[1].split('.')[0]
-                results['Company Name'].append(dom.title())
-            elif results['Website']:
-                dom = results['Website'][0].split('.')[1]
-                results['Company Name'].append(dom.title())
+        if not results['Company Name'] and (dom := (results['Email Address'] or results['Website'])):
+            domain = dom[0].split('@')[-1].removeprefix('www.').split('.')[0]
+            results['Company Name'].append(domain.title())
+
         # Name fallback
         if not results['Person Name']:
             for t in ocr_texts:
                 if re.match(r'^(?:[A-Z][a-z]+\s?){2,}$', t):
                     results['Person Name'].append(t)
                     break
-
-
+
+        # Prepare CSV
+        csv_map = {k: '; '.join(v) for k, v in results.items()}
         with tempfile.NamedTemporaryFile(suffix='.csv', delete=False, mode='w') as f:
             pd.DataFrame([csv_map]).to_csv(f, index=False)
             csv_path = f.name
+
         return full_text, results, csv_path, ''
     except Exception:
         err = traceback.format_exc()
         logger.error(f"Processing failed: {err}")
-
+        empty = {k: [] for k in ['Person Name','Company Name','Job Title','Phone Number','Email Address','Address','Website','QR Code']}
+        return '', empty, None, f"Error:\n{err}"
 
 # Gradio Interface
 if __name__ == '__main__':
@@ -216,7 +214,8 @@ if __name__ == '__main__':
             gr.File(label="Download CSV"),
             gr.Textbox(label="Error Log")],
         title='Enhanced Business Card Parser',
-        description='
+        description='Entity extraction with AI and regex validation (UAE-focused phone support)',
         css=".gr-interface {max-width: 800px !important;}"
     )
     demo.launch()
+
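A few notes on the new behavior follow. First, the phone normalization: a standalone sanity check (the function body is copied from the diff above; the sample numbers are invented):

    import re

    UAE_CODE = '+971'

    def clean_phone_number(phone: str) -> str | None:
        cleaned = re.sub(r"\D", "", phone)
        # Local UAE numbers (10 digits starting with 0)
        if len(cleaned) == 10 and cleaned.startswith('0'):
            return UAE_CODE + cleaned[1:]
        # International UAE numbers without plus (12 digits starting '971')
        if len(cleaned) == 12 and cleaned.startswith('971'):
            return '+' + cleaned
        # Already plus-prefixed UAE number (unreachable: the branch above
        # already matches any 12-digit string starting with '971')
        if phone.strip().startswith('+971') and len(cleaned) == 12:
            return phone.strip()
        return None

    # Invented samples: all spellings of one mobile number converge
    assert clean_phone_number('050 123 4567') == '+971501234567'
    assert clean_phone_number('971-50-123-4567') == '+971501234567'
    assert clean_phone_number('+971 50 123 4567') == '+971501234567'
    assert clean_phone_number('12345') is None

Note that the last branch can never fire: any '+971' input with twelve digits is already caught by the branch above it, so it could be dropped in a follow-up commit.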
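The regex fallback in process_phone_numbers only sees unbroken digit runs, which the sketch below makes visible (clean_phone_number as above; the OCR text is made up):

    def process_phone_numbers(text: str) -> list[str]:
        found = []
        # Match '05' followed by 8 digits or plus variant
        for match in re.finditer(r'(?:05\d{8}|\+?\d{8,12})', text):
            raw = match.group().strip()
            if (c := clean_phone_number(raw)):
                found.append(c)
        return list(set(found))

    # Made-up OCR text: the spaced fax number never reaches the cleaner
    sample = "Mob: 0501234567  Fax: 04 333 4444"
    assert process_phone_numbers(sample) == ['+971501234567']

Numbers that OCR splits with internal spaces have to come from the GLiNER path instead. Also, list(set(...)) does not preserve order, so output ordering is only deterministic when a single number survives, as here; deduplicate_data re-normalizes the list later anyway.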
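deduplicate_data relies on the inner clean_list helper for order-preserving, normalizer-aware deduplication. A minimal standalone check (the loop lines elided from the diff context are reconstructed here; the sample values are invented):

    def clean_list(items, normalizer=lambda x: x):
        seen = set(); out = []
        for item in items:              # reconstructed from context
            norm = normalizer(item)     # reconstructed from context
            if norm and norm not in seen:
                seen.add(norm); out.append(norm)
        return out

    # Invented duplicates differing only in case collapse to one entry
    emails = ['Sales@Acme.COM', 'sales@acme.com', 'info@acme.com']
    assert clean_list(emails, lambda e: e.lower()) == ['sales@acme.com', 'info@acme.com']

Note that clean_list stores the normalized form, so the deduplicated lists come back already lowercased (emails), 'www.'-prefixed (websites), or in '+971…' form (phones).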
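One thing this commit does not change: inference() still constructs PaddleOCR on every call, so the detection, classification, and recognition weights are re-loaded per request. A possible follow-up, sketched here rather than part of this commit (get_ocr is a hypothetical helper; the constructor arguments are exactly the ones used in the diff):

    from functools import lru_cache

    @lru_cache(maxsize=1)
    def get_ocr():
        # Hypothetical helper: build the OCR pipeline once, reuse across requests
        from paddleocr import PaddleOCR
        return PaddleOCR(use_angle_cls=True, lang='en', use_gpu=False)

inference() would then call get_ocr() instead of the constructor, which matters for a Space that wakes from sleep and then serves several requests in a row.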
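Finally, the company-name fallback in inference() derives a label from the first e-mail or website. The 'www.' strip in that line matters: without it, a normalized website such as 'www.acme.ae' would yield 'Www' rather than 'Acme', because split('@')[-1] leaves the prefix in place for websites. A quick check, extracted into a hypothetical helper with invented values:

    def company_from_contact(emails: list[str], websites: list[str]) -> str | None:
        # Hypothetical extraction of the fallback logic in inference()
        src = emails or websites
        if not src:
            return None
        domain = src[0].split('@')[-1].removeprefix('www.').split('.')[0]
        return domain.title()

    assert company_from_contact(['info@acme-trading.com'], []) == 'Acme-Trading'
    assert company_from_contact([], ['www.bluewave.ae']) == 'Bluewave'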