# models/pdf_analysis.py
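"""PDF document analysis for property verification.

Extracts text from uploaded PDFs, classifies document type, checks
authenticity and consistency against listing data, and computes a
weighted verification score.
"""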

import fitz  # PyMuPDF
import re
from .model_loader import load_model
from .logging_config import logger
from sentence_transformers import SentenceTransformer, util
from .property_relation import check_if_property_related
from .utils import summarize_text

# Initialize sentence transformer
try:
    sentence_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
    logger.info("Sentence transformer loaded successfully in pdf_analysis.py")
except Exception as e:
    logger.error(f"Error loading sentence transformer in pdf_analysis.py: {str(e)}")
    sentence_model = None

def extract_pdf_text(pdf_file):
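    """Extract plain text from an uploaded PDF file object, page by page."""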
    try:
        # Open from an in-memory stream; the context manager guarantees the
        # document is closed even if text extraction raises
        with fitz.open(stream=pdf_file.read(), filetype="pdf") as pdf_document:
            text = ""
            for page in pdf_document:
                text += page.get_text()
        return text
    except Exception as e:
        logger.error(f"Error extracting PDF text: {str(e)}")
        return ""

def analyze_pdf_content(document_text, property_data):
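    """Classify, verify, and summarize a property document.

    Returns a dict with document type, authenticity assessment, extracted
    key fields, consistency/verification scores, and a text summary.
    """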
    try:
        if not document_text:
            return {
                'document_type': {'classification': 'unknown', 'confidence': 0.0},
                'authenticity': {'assessment': 'could not verify', 'confidence': 0.0},
                'key_info': {},
                'consistency_score': 0.0,
                'is_property_related': False,
                'summary': 'Empty document',
                'has_signatures': False,
                'has_dates': False,
                'verification_score': 0.0
            }

        # Zero-shot classification for document typing (MobileBERT fine-tuned on MNLI)
        classifier = load_model("zero-shot-classification", "typeform/mobilebert-uncased-mnli")

        # Enhanced document types with more specific categories
        doc_types = [
            "property deed", "sales agreement", "mortgage document",
            "property tax record", "title document", "khata certificate",
            "encumbrance certificate", "lease agreement", "rental agreement",
            "property registration document", "building permit", "other document"
        ]

        # Analyze document type with context
        doc_context = f"{document_text[:1000]} property_type:{property_data.get('property_type', '')} location:{property_data.get('city', '')}"
        doc_result = classifier(doc_context, doc_types)
        doc_type = doc_result['labels'][0]
        doc_confidence = doc_result['scores'][0]

        # Enhanced authenticity check with multiple aspects
        authenticity_aspects = [
            "authentic legal document",
            "questionable document",
            "forged document",
            "template document",
            "official document"
        ]
        authenticity_result = classifier(document_text[:1000], authenticity_aspects)
        authenticity = "likely authentic" if authenticity_result['labels'][0] == "authentic legal document" else "questionable"
        authenticity_confidence = authenticity_result['scores'][0]

        # Extract key information using NLP
        key_info = extract_document_key_info(document_text)

        # Enhanced consistency check
        consistency_score = check_document_consistency(document_text, property_data)

        # Property relation check with context
        property_context = f"{document_text[:1000]} property:{property_data.get('property_name', '')} type:{property_data.get('property_type', '')}"
        is_property_related = check_if_property_related(property_context)['is_related']

        # Generate summary using BART
        summary = summarize_text(document_text[:2000])

        # Enhanced signature and date detection
        has_signatures = bool(re.search(r'(?:sign|signature|signed|witness|notary|authorized).{0,50}(?:by|of|for)', document_text.lower()))
        has_dates = bool(re.search(r'\d{1,2}[/-]\d{1,2}[/-]\d{2,4}|\d{4}[/-]\d{1,2}[/-]\d{1,2}', document_text))

        # Calculate verification score with weighted components
        verification_weights = {
            'doc_type': 0.3,
            'authenticity': 0.3,
            'consistency': 0.2,
            'property_relation': 0.1,
            'signatures_dates': 0.1
        }

        verification_score = (
            doc_confidence * verification_weights['doc_type'] +
            authenticity_confidence * verification_weights['authenticity'] +
            consistency_score * verification_weights['consistency'] +
            float(is_property_related) * verification_weights['property_relation'] +
            float(has_signatures and has_dates) * verification_weights['signatures_dates']
        )

        return {
            'document_type': {'classification': doc_type, 'confidence': float(doc_confidence)},
            'authenticity': {'assessment': authenticity, 'confidence': float(authenticity_confidence)},
            'key_info': key_info,
            'consistency_score': float(consistency_score),
            'is_property_related': is_property_related,
            'summary': summary,
            'has_signatures': has_signatures,
            'has_dates': has_dates,
            'verification_score': float(verification_score)
        }
    except Exception as e:
        logger.error(f"Error analyzing PDF content: {str(e)}")
        return {
            'document_type': {'classification': 'unknown', 'confidence': 0.0},
            'authenticity': {'assessment': 'could not verify', 'confidence': 0.0},
            'key_info': {},
            'consistency_score': 0.0,
            'is_property_related': False,
            'summary': 'Could not analyze document',
            'has_signatures': False,
            'has_dates': False,
            'verification_score': 0.0,
            'error': str(e)
        }

def check_document_consistency(document_text, property_data):
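    """Score semantic similarity between the listing data and the document.

    Returns a cosine similarity clamped to [0, 1]; 0.5 (neutral) if the
    sentence model failed to load, 0.0 on error.
    """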
    try:
        if not sentence_model:
            logger.warning("Sentence model unavailable; returning neutral consistency score")
            return 0.5
        # Cast each value to str: numeric fields (market_value, sq_ft,
        # bedrooms) would otherwise break ' '.join()
        property_text = ' '.join([
            str(property_data.get(key, '')) for key in [
                'property_name', 'property_type', 'address', 'city',
                'state', 'market_value', 'sq_ft', 'bedrooms'
            ]
        ])
        property_embedding = sentence_model.encode(property_text)
        document_embedding = sentence_model.encode(document_text[:1000])
        similarity = util.cos_sim(property_embedding, document_embedding)[0][0].item()
        return max(0.0, min(1.0, float(similarity)))
    except Exception as e:
        logger.error(f"Error checking document consistency: {str(e)}")
        return 0.0

def extract_document_key_info(text):
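    """Pull common fields (address, price, parties, dates, etc.) from the text via regex."""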
    try:
        info = {}
        patterns = {
            'property_address': r'(?:property|premises|located at)[:\s]+([^\n.]+)',
            'price': r'(?:price|value|amount)[:\s]+(?:Rs\.?|₹)?[\s]*([0-9,.]+)',
            'date': r'(?:date|dated|executed on)[:\s]+([^\n.]+\d{4})',
            'seller': r'(?:seller|grantor|owner)[:\s]+([^\n.]+)',
            'buyer': r'(?:buyer|grantee|purchaser)[:\s]+([^\n.]+)',
            'size': r'(?:area|size|extent)[:\s]+([0-9,.]+)[\s]*(?:sq\.?[\s]*(?:ft|feet))',
            'registration_number': r'(?:registration|reg\.?|document)[\s]*(?:no\.?|number|#)[:\s]*([A-Za-z0-9\-/]+)'
        }
        for key, pattern in patterns.items():
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                info[key] = match.group(1).strip()
        return info
    except Exception as e:
        logger.error(f"Error extracting document key info: {str(e)}")
        return {}
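
# Minimal usage sketch (not part of the module API): shows the expected call
# order. 'sample.pdf' and the property_data fields below are hypothetical
# placeholders, not fixtures that ship with this repo.
if __name__ == "__main__":
    property_data = {
        'property_name': 'Green Acres Villa',
        'property_type': 'villa',
        'address': '12 MG Road',
        'city': 'Bengaluru',
        'state': 'Karnataka',
    }
    with open('sample.pdf', 'rb') as pdf_file:
        text = extract_pdf_text(pdf_file)
    result = analyze_pdf_content(text, property_data)
    logger.info(f"Verification score: {result['verification_score']:.2f}")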