|
|
|
|
|
from .model_loader import load_model |
|
from geopy.geocoders import Nominatim |
|
from .logging_config import logger |
|
import re |
|
import time |
|
from typing import Dict, Any |
|
from geopy.distance import geodesic |
|
|
|
# Module-level Nominatim client; the verification helpers below call
# ``geocoder.geocode`` on it. Constructed with timeout=10.
geocoder = Nominatim(
    user_agent="indian_property_verifier",
    timeout=10,
)
|
|
|
def validate_address_format(address: str) -> bool:
    """Return True when ``address`` looks like a plausible street address.

    The address must be non-empty, at least 10 characters long once trimmed,
    and contain at least two comma-separated parts. It must also satisfy at
    least two of three regex heuristics (a digit run, a run of letters or
    whitespace, a known address keyword) and mention at least one
    street/administrative or area/building keyword.
    """
    if not address or len(address.strip()) < 10:
        return False

    # A usable address has more than one comma-separated part.
    parts = [piece.strip() for piece in address.split(',')]
    if len(parts) < 2:
        return False

    lowered = address.lower()

    heuristics = (
        r'\d+',
        r'[A-Za-z\s]+',
        r'(?:street|road|avenue|lane|colony|society|apartment|flat|house|building|plot|block|sector|phase|floor|wing|area|locality|main|cross|circle|square|market|ward|zone|mandal|municipal|corporation|greater)',
    )
    hits = 0
    for heuristic in heuristics:
        if re.search(heuristic, lowered):
            hits += 1
    if hits < 2:
        return False

    location_terms = (
        'ward', 'zone', 'mandal', 'municipal', 'corporation', 'greater',
        'street', 'road', 'avenue', 'lane', 'colony', 'society',
    )
    area_terms = (
        'colony', 'society', 'apartment', 'flat', 'house', 'plot', 'block', 'sector',
        'area', 'locality', 'main', 'cross', 'circle', 'square', 'market',
    )

    # Plain substring search: one hit from either keyword group is enough.
    for term in location_terms + area_terms:
        if term in lowered:
            return True
    return False
|
|
|
def validate_postal_code(postal_code: str) -> bool:
    """Check that ``postal_code`` is shaped like an Indian PIN code.

    The value is converted to text, trimmed, and stripped of spaces. It is
    accepted only when it is exactly six digits and the leading digit is
    between 1 and 8 inclusive.
    """
    if not postal_code:
        return False

    digits = str(postal_code).strip().replace(' ', '')
    if re.match(r'^\d{6}$', digits) is None:
        return False

    # The leading digit must fall in 1..8.
    return 1 <= int(digits[0]) <= 8
|
|
|
def validate_coordinates(latitude: str, longitude: str) -> bool:
    """Validate coordinate format and range for India.

    Args:
        latitude: Latitude as a number or numeric string.
        longitude: Longitude as a number or numeric string.

    Returns:
        True when both values parse as floats and fall inside the bounding
        box 6.0 <= lat <= 38.0, 67.0 <= lng <= 98.0; False otherwise
        (including unparseable, NaN, or infinite values).
    """
    try:
        lat = float(str(latitude).strip())
        lng = float(str(longitude).strip())
    except (ValueError, TypeError):
        return False

    # The former "precision" check compared each value with itself rounded to
    # six decimals and rejected differences above 1e-6. Rounding to six
    # decimals can shift a value by at most 5e-7, so that branch could never
    # fire and has been removed.
    #
    # NaN fails every comparison and infinities lie outside the box, so
    # non-finite values are rejected by the range test itself.
    return 6.0 <= lat <= 38.0 and 67.0 <= lng <= 98.0
|
|
|
def verify_location_in_city(address: str, city: str) -> bool:
    """Verify if the address exists in the given city.

    Two strategies are tried in order:

    1. Geocode several textual variants of the address and accept when the
       result mentions the city and at least two administrative components
       of the address (those containing ward, zone, mandal, municipal,
       corporation or greater) appear in the geocoded address.
    2. Geocode the city and the full address and accept when the geodesic
       distance between them is within a city-dependent radius: 50 km for
       the listed metro cities, 30 km for the listed tier-2 cities, and
       20 km otherwise.

    Args:
        address: Free-form, comma-separated address.
        city: City name the address is expected to lie in.

    Returns:
        True when either strategy confirms the address; False otherwise,
        including on empty input or any geocoding error.
    """
    if not address or not city:
        return False

    try:
        address = address.strip()
        city = city.strip()
        if not address or not city:
            return False

        city_lower = city.lower()
        address_components = [comp.strip() for comp in address.split(',')]

        def first_component_with(keyword: str) -> str:
            """Return the first address component containing ``keyword``, or ''."""
            for comp in address_components:
                if keyword in comp.lower():
                    return comp
            return ''

        first = address_components[0]
        # Guarded lookup: an address without a comma has no second component.
        # Indexing [1] unconditionally used to raise IndexError, which the
        # outer handler turned into an immediate False.
        second = address_components[1] if len(address_components) > 1 else ''
        municipal = first_component_with('municipal corporation')
        mandal = first_component_with('mandal')
        zone = first_component_with('zone')

        # Candidate queries in their original priority order. Variants built
        # from a missing component are skipped instead of sending a
        # degenerate query such as "city, , India".
        candidates = [f"{address}, India"]
        if first:
            candidates.append(f"{city}, {first}, India")
        if second:
            candidates.append(f"{city}, {second}, India")
        if municipal:
            candidates.append(f"{city}, {municipal}, India")
        if mandal:
            candidates.append(f"{city}, {mandal}, India")
        if first:
            candidates.append(f"{first}, {city}, India")
        if zone:
            candidates.append(f"{zone}, {city}, India")

        # Drop duplicate queries while keeping the order.
        address_formats = []
        seen = set()
        for query in candidates:
            if query not in seen:
                seen.add(query)
                address_formats.append(query)

        city_variations = [
            city_lower,
            city_lower.replace(' ', ''),
            city_lower.replace(' ', '-'),
            f"{city_lower} city",
            f"{city_lower} district",
            f"{city_lower} municipal corporation",
            f"greater {city_lower}",
            f"greater {city_lower} municipal corporation"
        ]

        admin_keywords = ['ward', 'zone', 'mandal', 'municipal', 'corporation', 'greater']
        key_components = [
            comp.lower() for comp in address_components
            if any(keyword in comp.lower() for keyword in admin_keywords)
        ]

        # The textual strategy needs at least two matching administrative
        # components, so it cannot succeed with fewer than two available.
        # Skipping it then avoids pointless requests and sleeps.
        if len(key_components) >= 2:
            for addr_format in address_formats:
                try:
                    location = geocoder.geocode(addr_format, timeout=10)
                    if location:
                        location_address = location.address.lower()
                        if any(var in location_address for var in city_variations):
                            location_components = [
                                comp.strip().lower() for comp in location_address.split(',')
                            ]
                            matching_components = sum(
                                1 for comp in key_components
                                if any(comp in loc_comp for loc_comp in location_components)
                            )
                            if matching_components >= 2:
                                return True
                except Exception as e:
                    logger.debug(f"Error in address verification: {str(e)}")
                    continue
                # Pause between consecutive requests (presumably to stay
                # within the geocoding service's rate limit).
                time.sleep(1)

        # Fallback: compare the geocoded address with the geocoded city centre.
        try:
            city_location = geocoder.geocode(f"{city}, India", timeout=10)
            if city_location:
                address_location = geocoder.geocode(f"{address}, {city}, India", timeout=10)
                if address_location:
                    city_coords = (city_location.latitude, city_location.longitude)
                    address_coords = (address_location.latitude, address_location.longitude)
                    distance = geodesic(city_coords, address_coords).kilometers

                    metro_cities = ["mumbai", "delhi", "bangalore", "hyderabad", "chennai", "kolkata", "pune"]
                    tier2_cities = ["ahmedabad", "jaipur", "surat", "lucknow", "kanpur", "nagpur", "indore",
                                    "thane", "bhopal", "visakhapatnam", "patna", "vadodara", "ghaziabad",
                                    "ludhiana", "agra", "nashik", "faridabad", "meerut", "rajkot", "varanasi"]

                    if any(name in city_lower for name in metro_cities):
                        max_distance = 50
                    elif any(name in city_lower for name in tier2_cities):
                        max_distance = 30
                    else:
                        max_distance = 20

                    return distance <= max_distance
        except Exception as e:
            logger.debug(f"Error in reverse geocoding: {str(e)}")

        return False
    except Exception as e:
        logger.error(f"Error in location verification: {str(e)}")
        return False
|
|
|
def verify_city_in_state(city: str, state: str) -> bool:
    """Verify if the city exists in the given state.

    Geocodes up to three textual combinations of city and state and accepts
    the first result whose address text mentions both the city and the state
    (also trying space-less and hyphenated spellings).

    Args:
        city: City name.
        state: State name.

    Returns:
        True when a geocoded result mentions both names; False otherwise,
        including on empty input or any geocoding error.
    """
    if not city or not state:
        return False

    try:
        city = city.strip()
        state = state.strip()
        if not city or not state:
            return False

        city_lower = city.lower()
        state_lower = state.lower()

        city_variations = [
            city_lower,
            city_lower.replace(' ', ''),
            city_lower.replace(' ', '-')
        ]
        state_variations = [
            state_lower,
            state_lower.replace(' ', ''),
            state_lower.replace(' ', '-')
        ]

        formats = [
            f"{city}, {state}, India",
            f"{state}, {city}, India",
            f"{city}, {state}"
        ]

        for fmt in formats:
            try:
                location = geocoder.geocode(fmt, timeout=10)
                if location:
                    location_address = location.address.lower()
                    if any(city_var in location_address for city_var in city_variations) and \
                            any(state_var in location_address for state_var in state_variations):
                        return True
            except Exception as e:
                # Was a bare ``except:``, which also swallowed
                # KeyboardInterrupt/SystemExit and hid the failure reason.
                logger.debug(f"Error in city/state verification: {str(e)}")
                continue
            # Pause between consecutive requests (presumably to stay within
            # the geocoding service's rate limit).
            time.sleep(1)

        return False
    except Exception as e:
        logger.error(f"Error in city/state verification: {str(e)}")
        return False
|
|
|
def verify_state_in_country(state: str, country: str = "India") -> bool:
    """Verify if the state exists in the given country.

    Only India's states and union territories are known to this function.

    Args:
        state: State or union-territory name. Matching ignores case and
            surrounding or repeated whitespace.
        country: Country to check against. Accepts "India" (any case) or
            its ISO 3166-1 codes "IN"/"IND"; an empty or missing value
            falls back to India.

    Returns:
        True when ``state`` is a recognised Indian state or union territory
        and ``country`` refers to India; False otherwise.
    """
    if not isinstance(state, str):
        return False

    # Lower-case and collapse whitespace so " Tamil  Nadu " still matches.
    normalized_state = ' '.join(state.lower().split())
    if not normalized_state:
        return False

    # ``country`` used to be ignored, so any country "contained" every
    # Indian state. Only the Indian list exists, so other countries fail.
    if isinstance(country, str) and country.strip():
        normalized_country = country.strip().lower()
    else:
        normalized_country = 'india'
    if normalized_country not in ('india', 'in', 'ind'):
        return False

    valid_states = frozenset([
        'andhra pradesh', 'arunachal pradesh', 'assam', 'bihar', 'chhattisgarh',
        'goa', 'gujarat', 'haryana', 'himachal pradesh', 'jharkhand', 'karnataka',
        'kerala', 'madhya pradesh', 'maharashtra', 'manipur', 'meghalaya', 'mizoram',
        'nagaland', 'odisha', 'punjab', 'rajasthan', 'sikkim', 'tamil nadu',
        'telangana', 'tripura', 'uttar pradesh', 'uttarakhand', 'west bengal',
        'andaman and nicobar islands', 'chandigarh', 'dadra and nagar haveli and daman and diu',
        'delhi', 'jammu and kashmir', 'ladakh', 'lakshadweep', 'puducherry'
    ])

    return normalized_state in valid_states
|
|
|
def verify_postal_code_in_city(postal_code: str, city: str) -> bool:
    """Verify if the postal code belongs to the given city.

    Geocodes up to three textual combinations of postal code and city and
    accepts the first result whose address text mentions the city (also
    trying space-less and hyphenated spellings). Only the city name is
    checked in the result; the postal code itself is not compared.

    Args:
        postal_code: Postal code as a string or number.
        city: City name.

    Returns:
        True when a geocoded result mentions the city; False otherwise,
        including on empty input or any geocoding error.
    """
    if not postal_code or not city:
        return False

    try:
        postal_code = str(postal_code).strip()
        city = city.strip()
        if not postal_code or not city:
            return False

        city_lower = city.lower()
        city_variations = [
            city_lower,
            city_lower.replace(' ', ''),
            city_lower.replace(' ', '-')
        ]

        formats = [
            f"{postal_code}, {city}, India",
            f"{city}, {postal_code}, India",
            f"{postal_code}, {city}"
        ]

        for fmt in formats:
            try:
                location = geocoder.geocode(fmt, timeout=10)
                if location:
                    location_address = location.address.lower()
                    if any(var in location_address for var in city_variations):
                        return True
            except Exception as e:
                # Was a bare ``except:``, which also swallowed
                # KeyboardInterrupt/SystemExit and hid the failure reason.
                logger.debug(f"Error in postal code verification: {str(e)}")
                continue
            # Pause between consecutive requests (presumably to stay within
            # the geocoding service's rate limit).
            time.sleep(1)

        return False
    except Exception as e:
        logger.error(f"Error in postal code verification: {str(e)}")
        return False
|
|
|
def verify_coordinates_in_city(latitude: str, longitude: str, city: str) -> bool:
    """Verify if the coordinates are within the given city.

    The city is geocoded and the geodesic distance from that point to the
    given coordinates is compared with a city-dependent radius: 50 km for
    the listed metro cities, 30 km for the listed tier-2 cities, and 20 km
    otherwise.

    Args:
        latitude: Latitude as a number or numeric string.
        longitude: Longitude as a number or numeric string.
        city: City name.

    Returns:
        True when the coordinates lie within the radius; False otherwise,
        including on missing or unparseable input, an unknown city, or any
        geocoding error.
    """
    if not all([latitude, longitude, city]):
        return False

    # Parse first so malformed numbers are rejected without a network call.
    try:
        lat = float(str(latitude).strip())
        lng = float(str(longitude).strip())
    except (ValueError, TypeError):
        return False

    try:
        city_location = geocoder.geocode(f"{city}, India", timeout=10)
        if not city_location:
            return False

        city_coords = (city_location.latitude, city_location.longitude)
        property_coords = (lat, lng)
        distance = geodesic(city_coords, property_coords).kilometers

        city_lower = city.lower()
        metro_cities = ["mumbai", "delhi", "bangalore", "hyderabad", "chennai", "kolkata", "pune"]
        tier2_cities = ["ahmedabad", "jaipur", "surat", "lucknow", "kanpur", "nagpur", "indore",
                        "thane", "bhopal", "visakhapatnam", "patna", "vadodara", "ghaziabad",
                        "ludhiana", "agra", "nashik", "faridabad", "meerut", "rajkot", "varanasi"]

        if any(name in city_lower for name in metro_cities):
            max_distance = 50
        elif any(name in city_lower for name in tier2_cities):
            max_distance = 30
        else:
            max_distance = 20

        return distance <= max_distance
    except Exception as e:
        # Was a bare ``except:``, which also swallowed
        # KeyboardInterrupt/SystemExit and hid the failure reason.
        logger.debug(f"Error in coordinate verification: {str(e)}")
        return False
|
|
|
def analyze_location(data: Dict[str, Any]) -> Dict[str, Any]:
    """Analyze location data with detailed verification.

    Runs every format and geocoding check on the submitted fields, combines
    them into a weighted completeness score, classifies nearby landmarks by
    keyword and assigns a city tier.

    Args:
        data: Mapping read through the keys ``address``, ``city``,
            ``state``, ``zip``, ``latitude``, ``longitude`` and
            ``nearby_landmarks`` (a comma-separated string, or a list/tuple
            of names). Missing keys are treated as empty.

    Returns:
        A dict with one boolean per check plus ``assessment``
        ("complete"/"partial"/"minimal"), ``completeness_score`` (0-100),
        ``location_quality`` and ``verification_status``
        ("verified"/"unverified"), ``city_tier``, ``landmarks_analysis`` and
        ``formatted_address``. If anything raises, the same keys are
        returned with 'error' markers and all checks set to False.
    """
    try:
        address = data.get('address', '')
        city = data.get('city', '')
        state = data.get('state', '')
        postal_code = data.get('zip', '')
        latitude = data.get('latitude', '')
        longitude = data.get('longitude', '')

        verification_results = {
            'address_format_valid': validate_address_format(address),
            'address_in_city': verify_location_in_city(address, city),
            'city_in_state': verify_city_in_state(city, state),
            'state_in_country': verify_state_in_country(state),
            'postal_code_valid': validate_postal_code(postal_code),
            'postal_code_in_city': verify_postal_code_in_city(postal_code, city),
            'coordinates_valid': validate_coordinates(latitude, longitude),
            'coordinates_in_city': verify_coordinates_in_city(latitude, longitude, city)
        }

        # Relative importance of each check; the weights add up to 1.0.
        weights = {
            'address_format_valid': 0.15,
            'address_in_city': 0.20,
            'city_in_state': 0.10,
            'state_in_country': 0.10,
            'postal_code_valid': 0.10,
            'postal_code_in_city': 0.10,
            'coordinates_valid': 0.10,
            'coordinates_in_city': 0.15
        }

        completeness_score = sum(
            weights[key] * 100 if result else 0
            for key, result in verification_results.items()
        )

        # "verified" needs every critical check plus any two secondary ones.
        critical_checks = ['address_format_valid', 'city_in_state', 'state_in_country', 'postal_code_valid']
        secondary_checks = ['address_in_city', 'postal_code_in_city', 'coordinates_valid', 'coordinates_in_city']

        critical_passed = all(verification_results[check] for check in critical_checks)
        secondary_passed = sum(1 for check in secondary_checks if verification_results[check])
        location_quality = "verified" if critical_passed and secondary_passed >= 2 else "unverified"

        # Normalise landmarks into a list of non-empty, lower-cased names.
        # Empty segments (e.g. from a trailing comma) used to be counted as
        # landmarks, and a list/tuple value used to abort the whole analysis.
        raw_landmarks = data.get('nearby_landmarks')
        if isinstance(raw_landmarks, (list, tuple)):
            landmark_items = [str(item) for item in raw_landmarks]
        elif raw_landmarks:
            landmark_items = str(raw_landmarks).split(',')
        else:
            landmark_items = []
        landmarks = [item.strip().lower() for item in landmark_items if item.strip()]

        landmarks_analysis = {
            'provided': bool(landmarks),
            'count': len(landmarks),
            'types': []
        }

        if landmarks:
            landmark_types = {
                'transport': ['station', 'metro', 'bus', 'railway', 'airport', 'terminal', 'depot', 'stand', 'stop'],
                'education': ['school', 'college', 'university', 'institute', 'academy', 'campus', 'library'],
                'healthcare': ['hospital', 'clinic', 'medical', 'health', 'diagnostic', 'pharmacy', 'dispensary'],
                'shopping': ['mall', 'market', 'shop', 'store', 'bazaar', 'complex', 'plaza', 'retail', 'outlet'],
                'entertainment': ['park', 'garden', 'theater', 'cinema', 'stadium', 'auditorium', 'playground'],
                'business': ['office', 'business', 'corporate', 'commercial', 'industrial', 'tech park', 'hub']
            }

            # Record each category once, in order of first appearance.
            for landmark in landmarks:
                for type_name, keywords in landmark_types.items():
                    if type_name in landmarks_analysis['types']:
                        continue
                    if any(keyword in landmark for keyword in keywords):
                        landmarks_analysis['types'].append(type_name)

        city_tier = "unknown"
        if city:
            city_lower = city.lower()
            metro_cities = ["mumbai", "delhi", "bangalore", "hyderabad", "chennai", "kolkata", "pune"]
            tier2_cities = ["ahmedabad", "jaipur", "surat", "lucknow", "kanpur", "nagpur", "indore",
                            "thane", "bhopal", "visakhapatnam", "patna", "vadodara", "ghaziabad",
                            "ludhiana", "agra", "nashik", "faridabad", "meerut", "rajkot", "varanasi"]

            if any(name in city_lower for name in metro_cities):
                city_tier = "metro"
            elif any(name in city_lower for name in tier2_cities):
                city_tier = "tier2"
            else:
                city_tier = "tier3"

        if completeness_score >= 80:
            assessment = "complete"
        elif completeness_score >= 50:
            assessment = "partial"
        else:
            assessment = "minimal"

        return {
            **verification_results,
            'assessment': assessment,
            'completeness_score': completeness_score,
            'location_quality': location_quality,
            'city_tier': city_tier,
            'landmarks_analysis': landmarks_analysis,
            # location_quality is already "verified" or "unverified".
            'verification_status': location_quality,
            'formatted_address': f"{address}, {city}, {state}, India - {postal_code}"
        }

    except Exception as e:
        logger.error(f"Error analyzing location: {str(e)}")
        return {
            'assessment': 'error',
            'completeness_score': 0,
            'location_quality': 'error',
            'city_tier': 'unknown',
            'landmarks_analysis': {'provided': False, 'count': 0, 'types': []},
            'verification_status': 'error',
            'formatted_address': '',
            'address_format_valid': False,
            'address_in_city': False,
            'city_in_state': False,
            'state_in_country': False,
            'postal_code_valid': False,
            'postal_code_in_city': False,
            'coordinates_valid': False,
            'coordinates_in_city': False
        }
|
|
|
def calculate_location_completeness(data):
    """Return a 0-100 integer score for how many location fields are filled in.

    Each field contributes a fixed number of percentage points when its
    value is truthy; missing keys count as not filled in.

    Args:
        data: Mapping that may contain ``address``, ``city``, ``state``,
            ``country``, ``zip``, ``latitude``, ``longitude`` and
            ``nearby_landmarks``. ``None`` or an empty mapping scores 0.

    Returns:
        The summed weight of the populated fields, as an int from 0 to 100.
    """
    if not data:
        return 0

    # Weights are whole percentage points (summing to 100). Integer
    # arithmetic avoids the float accumulation and int() truncation that
    # could under-report a score.
    weights = {
        'address': 25,
        'city': 20,
        'state': 15,
        'country': 5,
        'zip': 10,
        'latitude': 10,
        'longitude': 10,
        'nearby_landmarks': 5
    }

    # ``.get`` treats an absent field as empty instead of raising KeyError.
    return sum(weight for field, weight in weights.items() if data.get(field))
|
|