import os
import re

import gradio as gr
import PyPDF2
from docx import Document
from transformers import pipeline


# Zero-shot classifier used by is_contract() to score a document against the
# candidate labels "contract" / "not a contract".
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Named-entity tagger used by extract_parties_with_rules().
# `aggregation_strategy="simple"` is the non-deprecated spelling of
# `grouped_entities=True`: results carry `entity_group` and `word` keys.
ner = pipeline(
    "ner",
    model="Jean-Baptiste/roberta-large-ner-english",
    aggregation_strategy="simple",
)
def read_file(file_obj):
    """Extract plain text from an uploaded .txt, .pdf or .docx document.

    Args:
        file_obj: Either a filesystem path (``str`` / ``os.PathLike``) or an
            open file-like object that exposes a ``name`` attribute and, for
            .txt input, a ``read()`` method.

    Returns:
        The extracted text. For an unrecognised extension the literal string
        ``"Unsupported file format"`` is returned instead of raising.
    """
    is_path = isinstance(file_obj, (str, os.PathLike))
    if is_path:
        source = os.fsdecode(file_obj)
        name = source
    else:
        source = file_obj
        name = str(getattr(file_obj, "name", "") or "")

    # Match extensions case-insensitively so "REPORT.PDF" is accepted too.
    lowered = name.lower()

    if lowered.endswith(".txt"):
        if is_path:
            with open(source, "rb") as handle:
                raw = handle.read()
        else:
            raw = source.read()
        # A text-mode stream already yields str; only bytes need decoding.
        if isinstance(raw, bytes):
            return raw.decode("utf-8", errors="ignore")
        return raw

    if lowered.endswith(".pdf"):
        reader = PyPDF2.PdfReader(source)
        # Extract each page once; pages that yield no text are skipped.
        page_texts = (page.extract_text() for page in reader.pages)
        return " ".join(chunk for chunk in page_texts if chunk)

    if lowered.endswith(".docx"):
        doc = Document(source)
        return "\n".join(para.text for para in doc.paragraphs)

    return "Unsupported file format"
def is_contract(text, *, model=None, max_chars=1000):
    """Decide whether *text* reads like a contract.

    Args:
        text: Full document text; only the first ``max_chars`` characters are
            passed to the classifier.
        model: Optional callable invoked as ``model(sequence, labels)`` and
            returning a mapping with a ``"labels"`` list. Defaults to the
            module-level ``classifier`` pipeline.
        max_chars: Number of leading characters to classify.

    Returns:
        A ``(flag, result)`` tuple: ``flag`` is True when the first label in
        the classifier output is ``"contract"``; ``result`` is the raw
        classifier output.
    """
    # The global is only looked up when no explicit model is supplied.
    model = classifier if model is None else model
    result = model(text[:max_chars], ["contract", "not a contract"])
    return result["labels"][0] == "contract", result
def extract_parties_with_rules(text, *, ner_model=None, max_chars=1000):
    """Collect candidate contract parties using regex rules plus NER.

    Three sources are combined, in this order:
      1. ``between X and Y`` phrases (case-insensitive);
      2. names in curly quotes followed by a ``(... Party A)`` or
         ``(... Party B)`` label;
      3. ``ORG`` / ``PER`` entities found in the first ``max_chars``
         characters of the text.

    Args:
        text: Document text to scan.
        ner_model: Optional callable invoked as ``ner_model(chunk)`` and
            returning dicts with ``"entity_group"`` and ``"word"`` keys.
            Defaults to the module-level ``ner`` pipeline.
        max_chars: Number of leading characters passed to the NER model.

    Returns:
        A list of unique, whitespace-stripped party names in first-seen order.
    """
    candidates = []

    # Rule 1: "between <party> and <party>" ended by '.', ',' or a newline.
    between_pairs = re.findall(
        r'between\s+(.*?)\s+and\s+(.*?)[\.,\n]', text, re.IGNORECASE
    )
    for first, second in between_pairs:
        candidates.extend((first, second))

    # Rule 2: a curly-quoted name followed by a "(... Party A/B)" label.
    # The quotes are written as escapes so the pattern survives re-encoding;
    # the label itself may optionally be closed by a quote before ')'.
    candidates.extend(
        re.findall(r'\u201c([^\u201d]+)\u201d\s*\(.*?Party [AB]["\u201d]?\)', text)
    )

    # Rule 3: organisations and persons reported by the NER model.
    tagger = ner if ner_model is None else ner_model
    candidates.extend(
        entity['word']
        for entity in tagger(text[:max_chars])
        if entity['entity_group'] in ('ORG', 'PER')
    )

    # Strip padding, drop blanks, and de-duplicate while keeping first-seen
    # order so the output is deterministic between runs.
    stripped = (candidate.strip() for candidate in candidates)
    return list(dict.fromkeys(name for name in stripped if name))
def process_file(file):
    """Gradio callback: classify an uploaded document and list its parties.

    Args:
        file: The value supplied by the upload component, or ``None`` when
            nothing was uploaded. It is passed straight to ``read_file``.

    Returns:
        A ``(status_message, parties)`` pair for the two output text boxes.
        ``parties`` is a comma-separated string for contracts, an empty
        string otherwise, and ``None`` when the document has no text.
    """
    if file is None:
        return "No file uploaded.", ""

    text = read_file(file)

    # read_file signals an unknown extension with this sentinel string;
    # report it instead of classifying the sentinel as document content.
    if text == "Unsupported file format":
        return text, ""

    if not text.strip():
        return "Empty or unreadable file.", None

    is_contract_flag, _classification = is_contract(text)
    if not is_contract_flag:
        # U+274C CROSS MARK, written as an escape to survive re-encoding.
        return "\u274c This is NOT a contract.", ""

    parties = extract_parties_with_rules(text)
    # U+2705 WHITE HEAVY CHECK MARK.
    return "\u2705 This is a contract.", ", ".join(parties)
# Gradio UI: a single document upload in, two text boxes out
# (classification verdict and detected parties), wired to process_file.
iface = gr.Interface(
    fn=process_file,
    inputs=gr.File(file_types=[".txt", ".pdf", ".docx"], label="Upload a document"),
    outputs=[
        gr.Textbox(label="Classification Result"),
        gr.Textbox(label="Detected Parties (ORG/PER or Rule-based)"),
    ],
    title="Contract Classifier with RoBERTa",
    description=(
        "Upload a document (.pdf, .txt, .docx) to detect if it's a contract "
        "and extract involved parties using RoBERTa + Rule-based matching."
    ),
)

# Launch the app when the module is executed.
iface.launch()