Spaces:

KennethTM
/

DocDestroyer11000

Sleeping

App Files Files Community

KennethTM committed on Jan 20

Commit

1594055

verified ·

1 Parent(s): 35cf863

Upload 2 files

Browse files

Files changed (2) hide show

app.py +126 -0
requirements.txt +6 -0

app.py ADDED Viewed

	@@ -0,0 +1,126 @@

+from gradio import Interface, File, Dropdown, Textbox, Slider
+import json
+from gliner import GLiNER
+from doctr.io import DocumentFile
+from doctr.models import ocr_predictor
class DoctrHandler:
    """OCR wrapper that turns a PDF or image file into plain text via docTR."""

    def __init__(self):
        # Build the predictor once per handler; every extract_text() call
        # reuses this same model object.
        self.model = ocr_predictor(det_arch="fast_base", reco_arch="crnn_vgg16_bn", pretrained=True)

    def extract_text(self, file_path):
        """Run OCR on ``file_path`` and return the recognised words as one string.

        Args:
            file_path: Path to the document. A path ending in ``.pdf``
                (compared case-insensitively) is loaded as a PDF; anything
                else is handed to the image loader.

        Returns:
            The recognised word values joined by single spaces, in the order
            the OCR result lists them (page, block, line, word), with leading
            and trailing whitespace stripped. Empty string when no words are
            found.

        Raises:
            RuntimeError: Any failure while loading the document or running
                the model, re-raised with the original exception chained as
                ``__cause__``.
        """
        try:
            # Lower-case the path for the extension test so "scan.PDF" is not
            # mistaken for an image; the loader still gets the original path.
            if str(file_path).lower().endswith(".pdf"):
                doc = DocumentFile.from_pdf(file_path)
            else:
                doc = DocumentFile.from_images(file_path)

            result = self.model(doc)

            # Collect the words first and join once instead of growing a
            # string with repeated "+=".
            words = []
            for page in result.pages:
                for block in page.blocks:
                    for line in block.lines:
                        words.extend(word.value for word in line.words)
            return " ".join(words).strip()
        except Exception as e:
            # Single translation point: callers only need to handle one error
            # type, and "from e" keeps the underlying traceback attached.
            raise RuntimeError(f"Error during OCR processing: {e}") from e
class GlinerHandler:
    """Named-entity extractor backed by the ``urchade/gliner_multi-v2.1`` checkpoint."""

    def __init__(self):
        # Forwarded to GLiNER.from_pretrained below.
        # NOTE(review): presumably the model's input-length limit - confirm
        # against the GLiNER documentation.
        self.max_length = 384
        self.model = GLiNER.from_pretrained("urchade/gliner_multi-v2.1", max_length=self.max_length)

    def predict_entities(self, text, labels, threshold):
        """Return the entities the model finds in ``text`` for the given ``labels``.

        Args:
            text: Text to search.
            labels: Entity label names to look for.
            threshold: Passed to the model as its ``threshold`` keyword.

        Returns:
            Whatever the model's ``predict_entities`` returns, or an empty
            list when ``text`` is empty/whitespace-only or ``labels`` is empty.
        """
        # Nothing can be found in blank text or without labels, so skip the
        # model call instead of relying on how it treats degenerate input.
        if not text or not text.strip() or not labels:
            return []
        return self.model.predict_entities(text, labels, threshold=threshold)
# Module-level model wrappers, constructed once when the module is imported
ocr_handler = DoctrHandler()
ner_handler = GlinerHandler()

# Entity labels offered by default
DEFAULT_ENTITIES = [
    "name",
    "person",
    "bank account number",
    "email",
    "address",
    "phone number",
    "date",
    "currency",
    "amount",
    "document number",
    "iban",
    "country",
]
def _build_response(message, searched_for, entities):
    """Serialise one result payload; ``hits`` is always ``len(entities)``."""
    return json.dumps(
        {
            "message": message,
            "hits": len(entities),
            "searched_for": searched_for,
            "entities": entities,
        },
        indent=4,
        # Keep non-ASCII characters readable in the output instead of
        # emitting \uXXXX escapes.
        ensure_ascii=False,
    )


def process_file(uploaded_file, selected_entities, custom_entities, threshold=0.5):
    """OCR an uploaded document, run NER on the text and return the result as JSON.

    Args:
        uploaded_file: The upload, either an object exposing the file path as
            ``.name`` or the path itself. Falsy means nothing was uploaded.
        selected_entities: Labels picked from the predefined list; ``None`` is
            treated as no selection.
        custom_entities: Comma-separated extra labels; ``None``, empty or
            whitespace-only entries are ignored.
        threshold: Minimum confidence forwarded to the NER handler.

    Returns:
        A JSON string (4-space indent) with the keys ``message``, ``hits``,
        ``searched_for`` and ``entities``. Each entity has ``text``, ``label``
        and ``confidence``; entities are sorted by confidence, highest first.

    Raises:
        Exception: Whatever ``ocr_handler.extract_text`` or
            ``ner_handler.predict_entities`` raises is propagated unchanged.
    """
    # Parse the free-text labels *before* validating, so input such as " , "
    # counts as "nothing provided" rather than slipping past the check.
    custom_entity_list = [e.strip() for e in (custom_entities or "").split(",") if e.strip()]

    # Merge both sources; dict.fromkeys drops duplicates while keeping the
    # first-seen order.
    all_entities = list(dict.fromkeys([*(selected_entities or []), *custom_entity_list]))

    if not all_entities:
        return _build_response("Please select or provide at least one entity to search for", [], [])

    if not uploaded_file:
        return _build_response("No file uploaded", [], [])

    # Accept both an object carrying the path in ``.name`` and a bare path.
    file_path = getattr(uploaded_file, "name", uploaded_file)
    extracted_text = ocr_handler.extract_text(file_path)

    # No text means nothing to tag; skip the model call entirely.
    entities = ner_handler.predict_entities(extracted_text, all_entities, threshold) if extracted_text else []
    if not entities:
        return _build_response("No entities were found in the document", all_entities, [])

    cleaned_entities = [
        {
            "text": entity["text"],
            "label": entity["label"],
            # float() keeps the value JSON-serialisable even if the model
            # hands back a non-builtin numeric scalar.
            "confidence": float(entity["score"]),
        }
        for entity in entities
    ]
    cleaned_entities.sort(key=lambda item: item["confidence"], reverse=True)

    return _build_response("Document destroyed successfully!", all_entities, cleaned_entities)
# Create Gradio interface: one upload, two ways of naming entities, and a
# confidence slider feed process_file; its JSON string lands in a text box.
iface = Interface(
    fn=process_file,
    inputs=[
        File(label="Upload Document (PDF or Image)"),
        Dropdown(choices=DEFAULT_ENTITIES, label="Select Entities", multiselect=True),
        Textbox(label="Custom Entities (comma-separated)", placeholder="entity1, entity2, ..."),
        Slider(minimum=0.1, maximum=1.0, value=0.5, step=0.1, label="Confidence Threshold"),
    ],
    outputs=Textbox(label="Extracted Entities (JSON)"),
    title="DocDestroyer11000",
    # Gradio documents string values here ("never" / "auto" / "manual"); the
    # bare boolean is a deprecated spelling of "never".
    # NOTE(review): recent Gradio releases rename this to flagging_mode -
    # confirm against the installed version.
    allow_flagging="never",
    description=(
        "Extract valuable information from your documents in a snap! "
        "Upload your PDFs or images, select the entities you care about, "
        "get started now and watch your documents be **destroyed** "
        "(or in other words - turned into JSON)! 🚀"
        "<br>Tech: Copilot/Claude Sonnet + https://mindee.github.io/doctr/ + "
        "https://huggingface.co/urchade/gliner_multi-v2.1"
    ),
)

if __name__ == "__main__":
    iface.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

# Search the PyTorch CPU wheel index *in addition to* PyPI.
# (--index-url would replace PyPI, where gliner, python-doctr and gradio are
# published, so those packages could not be resolved.)
--extra-index-url https://download.pytorch.org/whl/cpu
torch
torchvision
gliner
python-doctr
gradio