# models/pdf_analysis.py
import re

import fitz  # PyMuPDF
from sentence_transformers import SentenceTransformer, util

from .logging_config import logger
from .model_loader import load_model
from .property_relation import check_if_property_related
from .utils import summarize_text

# Initialize the sentence transformer used for document/property consistency checks
try:
    sentence_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
    logger.info("Sentence transformer loaded successfully in pdf_analysis.py")
except Exception as e:
    logger.error(f"Error loading sentence transformer in pdf_analysis.py: {str(e)}")
    sentence_model = None


def extract_pdf_text(pdf_file):
    """Extract plain text from an uploaded PDF file object."""
    try:
        pdf_document = fitz.Document(stream=pdf_file.read(), filetype="pdf")
        text = ""
        for page in pdf_document:
            text += page.get_text()
        pdf_document.close()
        return text
    except Exception as e:
        logger.error(f"Error extracting PDF text: {str(e)}")
        return ""


def analyze_pdf_content(document_text, property_data):
    """Run document-type, authenticity, consistency, and key-info checks on a PDF's text."""
    try:
        if not document_text:
            return {
                'document_type': {'classification': 'unknown', 'confidence': 0.0},
                'authenticity': {'assessment': 'could not verify', 'confidence': 0.0},
                'key_info': {},
                'consistency_score': 0.0,
                'is_property_related': False,
                'summary': 'Empty document',
                'has_signatures': False,
                'has_dates': False,
                'verification_score': 0.0
            }

        # Zero-shot model used for both document classification and authenticity checks
        classifier = load_model("zero-shot-classification", "typeform/mobilebert-uncased-mnli")

        # Candidate document types, including India-specific records
        doc_types = [
            "property deed", "sales agreement", "mortgage document",
            "property tax record", "title document", "khata certificate",
            "encumbrance certificate", "lease agreement", "rental agreement",
            "property registration document", "building permit", "other document"
        ]

        # Classify the document type, with listing fields appended as context hints
        doc_context = (
            f"{document_text[:1000]} "
            f"property_type:{property_data.get('property_type', '')} "
            f"location:{property_data.get('city', '')}"
        )
        doc_result = classifier(doc_context, doc_types)
        doc_type = doc_result['labels'][0]
        doc_confidence = doc_result['scores'][0]

        # Authenticity check across several candidate labels
        authenticity_aspects = [
            "authentic legal document",
            "questionable document",
            "forged document",
            "template document",
            "official document"
        ]
        authenticity_result = classifier(document_text[:1000], authenticity_aspects)
        authenticity = (
            "likely authentic"
            if authenticity_result['labels'][0] == "authentic legal document"
            else "questionable"
        )
        authenticity_confidence = authenticity_result['scores'][0]

        # Extract key fields (parties, price, dates, registration number, ...)
        key_info = extract_document_key_info(document_text)

        # Semantic consistency between the document and the listing data
        consistency_score = check_document_consistency(document_text, property_data)

        # Check whether the document is property-related at all
        property_context = (
            f"{document_text[:1000]} "
            f"property:{property_data.get('property_name', '')} "
            f"type:{property_data.get('property_type', '')}"
        )
        is_property_related = check_if_property_related(property_context)['is_related']

        # Generate a summary using BART (via summarize_text)
        summary = summarize_text(document_text[:2000])

        # Heuristic signature and date detection
        has_signatures = bool(re.search(
            r'(?:sign|signature|signed|witness|notary|authorized).{0,50}(?:by|of|for)',
            document_text.lower()
        ))
        has_dates = bool(re.search(
            r'\d{1,2}[/-]\d{1,2}[/-]\d{2,4}|\d{4}[/-]\d{1,2}[/-]\d{1,2}',
            document_text
        ))

        # Combine the individual checks into a weighted verification score
        verification_weights = {
            'doc_type': 0.3,
            'authenticity': 0.3,
            'consistency': 0.2,
            'property_relation': 0.1,
            'signatures_dates': 0.1
        }
        verification_score = (
            doc_confidence * verification_weights['doc_type'] +
            authenticity_confidence * verification_weights['authenticity'] +
            consistency_score * verification_weights['consistency'] +
            float(is_property_related) * verification_weights['property_relation'] +
            float(has_signatures and has_dates) * verification_weights['signatures_dates']
        )

        return {
            'document_type': {'classification': doc_type, 'confidence': float(doc_confidence)},
            'authenticity': {'assessment': authenticity, 'confidence': float(authenticity_confidence)},
            'key_info': key_info,
            'consistency_score': float(consistency_score),
            'is_property_related': is_property_related,
            'summary': summary,
            'has_signatures': has_signatures,
            'has_dates': has_dates,
            'verification_score': float(verification_score)
        }
    except Exception as e:
        logger.error(f"Error analyzing PDF content: {str(e)}")
        return {
            'document_type': {'classification': 'unknown', 'confidence': 0.0},
            'authenticity': {'assessment': 'could not verify', 'confidence': 0.0},
            'key_info': {},
            'consistency_score': 0.0,
            'is_property_related': False,
            'summary': 'Could not analyze document',
            'has_signatures': False,
            'has_dates': False,
            'verification_score': 0.0,
            'error': str(e)
        }


def check_document_consistency(document_text, property_data):
    """Score how closely the document text matches the listing details (0.0 to 1.0)."""
    try:
        if sentence_model is None:
            logger.warning("Sentence model unavailable")
            return 0.5

        # Concatenate listing fields; cast to str in case some values are numeric
        property_text = ' '.join(
            str(property_data.get(key, '')) for key in [
                'property_name', 'property_type', 'address', 'city',
                'state', 'market_value', 'sq_ft', 'bedrooms'
            ]
        )

        property_embedding = sentence_model.encode(property_text)
        document_embedding = sentence_model.encode(document_text[:1000])

        similarity = util.cos_sim(property_embedding, document_embedding)[0][0].item()
        return max(0.0, min(1.0, float(similarity)))
    except Exception as e:
        logger.error(f"Error checking document consistency: {str(e)}")
        return 0.0


def extract_document_key_info(text):
    """Pull common fields (address, price, parties, date, size, registration number) via regex."""
    try:
        info = {}
        patterns = {
            'property_address': r'(?:property|premises|located at)[:\s]+([^\n.]+)',
            'price': r'(?:price|value|amount)[:\s]+(?:Rs\.?|₹)?[\s]*([0-9,.]+)',
            'date': r'(?:date|dated|executed on)[:\s]+([^\n.]+\d{4})',
            'seller': r'(?:seller|grantor|owner)[:\s]+([^\n.]+)',
            'buyer': r'(?:buyer|grantee|purchaser)[:\s]+([^\n.]+)',
            'size': r'(?:area|size|extent)[:\s]+([0-9,.]+)[\s]*(?:sq\.?[\s]*(?:ft|feet))',
            'registration_number': r'(?:registration|reg\.?|document)[\s]*(?:no\.?|number|#)[:\s]*([A-Za-z0-9\-/]+)'
        }

        for key, pattern in patterns.items():
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                info[key] = match.group(1).strip()

        return info
    except Exception as e:
        logger.error(f"Error extracting document key info: {str(e)}")
        return {}
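

# --- Usage sketch (illustrative only, not part of the pipeline) ---
# A minimal example of how the helpers above might be wired together. The listing
# values and the 'sample_deed.pdf' path are hypothetical placeholders; run as a
# module (e.g. `python -m models.pdf_analysis`) so the relative imports resolve.
if __name__ == "__main__":
    sample_property = {
        'property_name': 'Sample Residency',   # hypothetical listing data
        'property_type': 'apartment',
        'address': '12 MG Road',
        'city': 'Bengaluru',
        'state': 'Karnataka',
        'market_value': '9500000',
        'sq_ft': '1200',
        'bedrooms': '2',
    }
    with open('sample_deed.pdf', 'rb') as pdf_file:  # hypothetical file
        document_text = extract_pdf_text(pdf_file)
    report = analyze_pdf_content(document_text, sample_property)
    logger.info(f"Verification score: {report['verification_score']:.2f}")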