Spaces:

manish-aggarwal
/

file-classification

Running

App Files Files Community

file-classification / app.py

manish-aggarwal

Update app.py

3318c67 verified 3 months ago

raw

history blame contribute delete

2.51 kB

	import gradio as gr
	from transformers import pipeline
	import PyPDF2
	from docx import Document
	import re

	# Load both Hugging Face pipelines once at import time; the functions
	# below reference them as module-level globals.
	# Zero-shot classifier used to label text as "contract" / "not a contract".
	classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
	# NER pipeline. aggregation_strategy="simple" is the current spelling of the
	# deprecated grouped_entities=True flag: results are still grouped and
	# expose an 'entity_group' key.
	ner = pipeline(
	    "ner",
	    model="Jean-Baptiste/roberta-large-ner-english",
	    aggregation_strategy="simple",
	)

	# File reading
	def read_file(file_obj):
	    """Extract the text of a .txt, .pdf or .docx upload.

	    Args:
	        file_obj: Either a file-like object exposing a ``name`` attribute
	            or a plain path string. For ``.txt`` input a file-like object
	            is read directly; a path string is opened from disk.

	    Returns:
	        The extracted text, or the literal string
	        ``"Unsupported file format"`` when the extension is not recognised.
	    """
	    path = file_obj if isinstance(file_obj, str) else file_obj.name
	    # Compare extensions case-insensitively so "REPORT.PDF" is accepted too.
	    lowered = path.lower()

	    if lowered.endswith(".txt"):
	        if hasattr(file_obj, "read"):
	            data = file_obj.read()
	        else:
	            # No open handle was supplied, only a path: read the bytes ourselves.
	            with open(path, "rb") as handle:
	                data = handle.read()
	        # A handle opened in text mode already yields str; only bytes need decoding.
	        if isinstance(data, bytes):
	            return data.decode("utf-8", errors="ignore")
	        return data

	    if lowered.endswith(".pdf"):
	        reader = PyPDF2.PdfReader(file_obj)
	        # Extract each page only once (the original called extract_text twice
	        # per page) and skip pages that yield no text.
	        page_texts = (page.extract_text() for page in reader.pages)
	        return " ".join(chunk for chunk in page_texts if chunk)

	    if lowered.endswith(".docx"):
	        doc = Document(file_obj)
	        return "\n".join(para.text for para in doc.paragraphs)

	    return "Unsupported file format"

	# Contract classification
	def is_contract(text):
	    """Decide whether *text* reads like a contract.

	    Only the first 1000 characters are passed to the module-level
	    zero-shot ``classifier`` together with the two candidate labels.

	    Returns:
	        A ``(flag, raw_result)`` tuple: ``flag`` is True when the first
	        entry of the classifier's ``labels`` list is ``'contract'``
	        (presumably the best-scoring label -- confirm against the
	        pipeline documentation); ``raw_result`` is the classifier output.
	    """
	    candidate_labels = ["contract", "not a contract"]
	    snippet = text[:1000]
	    outcome = classifier(snippet, candidate_labels)
	    top_label = outcome['labels'][0]
	    return top_label == 'contract', outcome

	# Rule-based + NER-based party extraction
	def extract_parties_with_rules(text):
	    """Collect candidate contract parties from *text*.

	    Three sources are combined:
	      1. phrases of the form "between X and Y" (case-insensitive);
	      2. curly-quoted names followed by a parenthesised "... Party A" or
	         "... Party B" tag;
	      3. ORG / PER entities returned by the module-level ``ner`` pipeline
	         for the first 1000 characters of the text.

	    Returns:
	        A list of unique, whitespace-stripped, non-empty party strings in
	        first-seen order.
	    """
	    candidates = []

	    # Rule-based: between X and Y. Each side is captured lazily; Y stops at
	    # the first period, comma, newline or the end of the text. The earlier
	    # pattern used ``.?`` (at most one character), so real names never matched.
	    between_pattern = r'between\s+(.+?)\s+and\s+(.+?)(?:[.,\n]|$)'
	    for first, second in re.findall(between_pattern, text, re.IGNORECASE):
	        candidates.extend((first, second))

	    # Rule-based: “X” (Party A), “X” (the Party B), etc. Any amount of
	    # whitespace may precede the parenthesis, and any text that stays inside
	    # the parentheses may precede the "Party A/B" tag.
	    named_pattern = r'“([^”]+)”\s*\([^)]*?Party [AB]\)'
	    candidates.extend(re.findall(named_pattern, text))

	    # NER fallback: keep organisation and person entities only.
	    entities = ner(text[:1000])
	    candidates.extend(
	        ent['word'] for ent in entities if ent['entity_group'] in ('ORG', 'PER')
	    )

	    # Strip stray whitespace, drop empty strings and de-duplicate while
	    # keeping first-seen order so the output is deterministic.
	    stripped = (candidate.strip() for candidate in candidates)
	    return list(dict.fromkeys(name for name in stripped if name))

	# Main logic
	def process_file(file):
	    """Classify an uploaded document and list the parties it names.

	    Args:
	        file: The uploaded file as passed in by the interface's file input,
	            or ``None`` when nothing was uploaded.

	    Returns:
	        A ``(status_message, parties)`` tuple, one value per output text
	        box. ``parties`` is a comma-separated string for contracts, ``""``
	        for non-contracts and ``None`` when the file could not be processed.
	    """
	    # Submitting without a file yields None; report it instead of failing
	    # with an AttributeError inside read_file.
	    if file is None:
	        return "No file uploaded.", None

	    text = read_file(file)
	    if not text.strip():
	        return "Empty or unreadable file.", None

	    # read_file signals an unknown extension with this sentinel string;
	    # surface it to the user rather than classifying the sentinel itself.
	    if text == "Unsupported file format":
	        return text, None

	    # The raw classifier output is not needed here, only the flag.
	    is_contract_flag, _ = is_contract(text)
	    if not is_contract_flag:
	        return "❌ This is NOT a contract.", ""

	    parties = extract_parties_with_rules(text)
	    return "✅ This is a contract.", ", ".join(parties)

	# Gradio interface: one file upload as input, two text boxes as output.
	_upload_input = gr.File(file_types=[".txt", ".pdf", ".docx"], label="Upload a document")
	_result_boxes = [
	    gr.Textbox(label="Classification Result"),
	    gr.Textbox(label="Detected Parties (ORG/PER or Rule-based)"),
	]
	iface = gr.Interface(
	    fn=process_file,
	    inputs=_upload_input,
	    outputs=_result_boxes,
	    title="Contract Classifier with RoBERTa",
	    description="Upload a document (.pdf, .txt, .docx) to detect if it's a contract and extract involved parties using RoBERTa + Rule-based matching.",
	)

	# Launch the interface.
	iface.launch()