# Extraction artifacts (file metadata / line-number dump) — commented out so the module parses:
# File size: 7,562 Bytes
# 14cb7ae
# models/pdf_analysis.py
import fitz # PyMuPDF
import re
from .model_loader import load_model
from .logging_config import logger
from sentence_transformers import SentenceTransformer, util
from .property_relation import check_if_property_related
from .utils import summarize_text
# Initialize sentence transformer
# Module-level singleton: loading the embedding model is expensive, so it is
# done once at import time. On failure (e.g. missing weights / no network)
# fall back to None; check_document_consistency() checks for None and
# degrades to a neutral 0.5 score instead of crashing.
try:
    sentence_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
    logger.info("Sentence transformer loaded successfully in pdf_analysis.py")
except Exception as e:
    logger.error(f"Error loading sentence transformer in pdf_analysis.py: {str(e)}")
    sentence_model = None
def extract_pdf_text(pdf_file):
    """Extract plain text from an uploaded PDF file-like object.

    Args:
        pdf_file: File-like object with a ``read()`` method returning PDF bytes.

    Returns:
        The concatenated text of all pages, or '' on any failure (never raises).
    """
    try:
        # Context manager guarantees the document handle is closed even if a
        # page fails to render — the original leaked the handle when
        # page.get_text() raised mid-loop, because close() was only reached
        # after a fully successful loop.
        with fitz.open(stream=pdf_file.read(), filetype="pdf") as pdf_document:
            # join() avoids quadratic += string concatenation on large PDFs.
            return "".join(page.get_text() for page in pdf_document)
    except Exception as e:
        logger.error(f"Error extracting PDF text: {str(e)}")
        return ""
def _empty_analysis(summary, error=None):
    """Return a neutral fallback result for analyze_pdf_content.

    Shared by the empty-input and error paths so the two fallback payloads
    stay structurally identical and cannot drift apart.

    Args:
        summary: Human-readable reason placed in the 'summary' field.
        error: Optional error text; added under 'error' only when present.
    """
    result = {
        'document_type': {'classification': 'unknown', 'confidence': 0.0},
        'authenticity': {'assessment': 'could not verify', 'confidence': 0.0},
        'key_info': {},
        'consistency_score': 0.0,
        'is_property_related': False,
        'summary': summary,
        'has_signatures': False,
        'has_dates': False,
        'verification_score': 0.0
    }
    if error is not None:
        result['error'] = error
    return result


def analyze_pdf_content(document_text, property_data):
    """Classify and score a property document's extracted text.

    Args:
        document_text: Full text extracted from the PDF ('' allowed).
        property_data: Dict of property fields (property_type, city,
            property_name, ... — missing keys are tolerated).

    Returns:
        Dict with document_type, authenticity, key_info, consistency_score,
        is_property_related, summary, has_signatures, has_dates and a
        weighted verification_score. Never raises; failures return a
        fallback payload carrying an 'error' key.
    """
    try:
        if not document_text:
            return _empty_analysis('Empty document')
        # Zero-shot classifier; MobileBERT-MNLI trades accuracy for latency.
        classifier = load_model("zero-shot-classification", "typeform/mobilebert-uncased-mnli")
        # Candidate labels for document-type classification.
        doc_types = [
            "property deed", "sales agreement", "mortgage document",
            "property tax record", "title document", "khata certificate",
            "encumbrance certificate", "lease agreement", "rental agreement",
            "property registration document", "building permit", "other document"
        ]
        # Only the first 1000 chars are classified; property metadata is
        # appended to give the classifier extra context.
        doc_context = f"{document_text[:1000]} property_type:{property_data.get('property_type', '')} location:{property_data.get('city', '')}"
        doc_result = classifier(doc_context, doc_types)
        doc_type = doc_result['labels'][0]      # pipeline sorts labels by score desc
        doc_confidence = doc_result['scores'][0]
        # Authenticity: binary verdict based on whether the top label is the
        # "authentic" one; confidence is the top label's score either way.
        authenticity_aspects = [
            "authentic legal document",
            "questionable document",
            "forged document",
            "template document",
            "official document"
        ]
        authenticity_result = classifier(document_text[:1000], authenticity_aspects)
        authenticity = "likely authentic" if authenticity_result['labels'][0] == "authentic legal document" else "questionable"
        authenticity_confidence = authenticity_result['scores'][0]
        # Regex-based extraction of address, price, parties, dates, etc.
        key_info = extract_document_key_info(document_text)
        # Embedding similarity between document text and the property record.
        consistency_score = check_document_consistency(document_text, property_data)
        # Property-relation check gets the same metadata-augmented context.
        property_context = f"{document_text[:1000]} property:{property_data.get('property_name', '')} type:{property_data.get('property_type', '')}"
        is_property_related = check_if_property_related(property_context)['is_related']
        summary = summarize_text(document_text[:2000])
        # Heuristic signature / date detection via regex.
        has_signatures = bool(re.search(r'(?:sign|signature|signed|witness|notary|authorized).{0,50}(?:by|of|for)', document_text.lower()))
        has_dates = bool(re.search(r'\d{1,2}[/-]\d{1,2}[/-]\d{2,4}|\d{4}[/-]\d{1,2}[/-]\d{1,2}', document_text))
        # Weighted blend of the individual signals (weights sum to 1.0).
        verification_weights = {
            'doc_type': 0.3,
            'authenticity': 0.3,
            'consistency': 0.2,
            'property_relation': 0.1,
            'signatures_dates': 0.1
        }
        verification_score = (
            doc_confidence * verification_weights['doc_type'] +
            authenticity_confidence * verification_weights['authenticity'] +
            consistency_score * verification_weights['consistency'] +
            float(is_property_related) * verification_weights['property_relation'] +
            float(has_signatures and has_dates) * verification_weights['signatures_dates']
        )
        return {
            'document_type': {'classification': doc_type, 'confidence': float(doc_confidence)},
            'authenticity': {'assessment': authenticity, 'confidence': float(authenticity_confidence)},
            'key_info': key_info,
            'consistency_score': float(consistency_score),
            'is_property_related': is_property_related,
            'summary': summary,
            'has_signatures': has_signatures,
            'has_dates': has_dates,
            'verification_score': float(verification_score)
        }
    except Exception as e:
        logger.error(f"Error analyzing PDF content: {str(e)}")
        return _empty_analysis('Could not analyze document', error=str(e))
def check_document_consistency(document_text, property_data):
    """Score semantic similarity between the document and the property record.

    Args:
        document_text: Extracted document text (only the first 1000 chars are embedded).
        property_data: Dict of property fields; values may be strings or numbers.

    Returns:
        Similarity clamped to [0.0, 1.0]; 0.5 when the embedding model is
        unavailable; 0.0 on error. Never raises.
    """
    try:
        if not sentence_model:
            logger.warning("Sentence model unavailable")
            return 0.5
        # str() guards against numeric fields (market_value, sq_ft, bedrooms):
        # ' '.join raises TypeError on non-string values, which previously sent
        # every such call down the error path and returned a flat 0.0.
        property_text = ' '.join(
            str(property_data.get(key, '')) for key in (
                'property_name', 'property_type', 'address', 'city',
                'state', 'market_value', 'sq_ft', 'bedrooms'
            )
        )
        property_embedding = sentence_model.encode(property_text)
        document_embedding = sentence_model.encode(document_text[:1000])
        similarity = util.cos_sim(property_embedding, document_embedding)[0][0].item()
        # Clamp: cosine similarity can fall slightly outside [0, 1] numerically.
        return max(0.0, min(1.0, float(similarity)))
    except Exception as e:
        logger.error(f"Error checking document consistency: {str(e)}")
        return 0.0
def extract_document_key_info(text):
    """Pull key fields out of a property document via regex heuristics.

    Args:
        text: Raw document text.

    Returns:
        Dict mapping field name to the first match found (stripped); fields
        with no match are omitted. Returns {} on error — never raises.
    """
    try:
        field_patterns = {
            'property_address': r'(?:property|premises|located at)[:\s]+([^\n.]+)',
            'price': r'(?:price|value|amount)[:\s]+(?:Rs\.?|₹)?[\s]*([0-9,.]+)',
            'date': r'(?:date|dated|executed on)[:\s]+([^\n.]+\d{4})',
            'seller': r'(?:seller|grantor|owner)[:\s]+([^\n.]+)',
            'buyer': r'(?:buyer|grantee|purchaser)[:\s]+([^\n.]+)',
            'size': r'(?:area|size|extent)[:\s]+([0-9,.]+)[\s]*(?:sq\.?[\s]*(?:ft|feet))',
            'registration_number': r'(?:registration|reg\.?|document)[\s]*(?:no\.?|number|#)[:\s]*([A-Za-z0-9\-/]+)'
        }
        extracted = {}
        for field, expr in field_patterns.items():
            hit = re.search(expr, text, re.IGNORECASE)
            if hit is not None:
                extracted[field] = hit.group(1).strip()
        return extracted
    except Exception as e:
        logger.error(f"Error extracting document key info: {str(e)}")
        return {}