property_verification_bot / models /location_analysis.py
sksameermujahid's picture
Upload 45 files
14cb7ae verified
# models/location_analysis.py
from .model_loader import load_model
from geopy.geocoders import Nominatim
from .logging_config import logger
import re
import time
from typing import Dict, Any
from geopy.distance import geodesic
# Shared Nominatim geocoding client used by the verification functions in this
# module; it is constructed with a 10-second timeout.
geocoder = Nominatim(user_agent="indian_property_verifier", timeout=10)
def validate_address_format(address: str) -> bool:
    """Heuristically check that *address* looks like a postal address.

    An address is accepted when, after trimming, it is at least 10
    characters long, contains at least two comma-separated components and
    mentions at least one common address term (street, road, sector,
    building, ...). Term matching is a case-insensitive substring match.

    Args:
        address: Free-form address text.

    Returns:
        True if all three checks pass; False otherwise, including for empty
        or non-string input.
    """
    if not address or not isinstance(address, str):
        return False

    # Minimum reasonable length for an address.
    if len(address.strip()) < 10:
        return False

    # At least two comma-separated components (e.g. area and city).
    if ',' not in address:
        return False

    # One shared vocabulary for the keyword check. Previously the regex list
    # and the final keyword lists disagreed, so addresses that only mentioned
    # 'building', 'phase', 'floor' or 'wing' could never validate.
    address_terms = (
        'street', 'road', 'avenue', 'lane', 'colony', 'society', 'apartment',
        'flat', 'house', 'building', 'plot', 'block', 'sector', 'phase',
        'floor', 'wing', 'area', 'locality', 'main', 'cross', 'circle',
        'square', 'market', 'ward', 'zone', 'mandal', 'municipal',
        'corporation', 'greater',
    )
    address_lower = address.lower()
    return any(term in address_lower for term in address_terms)
def validate_postal_code(postal_code: str) -> bool:
    """Validate the format of an Indian postal (PIN) code.

    Surrounding whitespace and embedded spaces are ignored, so both
    ``"560034"`` and ``"560 034"`` are accepted. The code must consist of
    exactly six ASCII digits and start with a digit from 1 to 8.

    Args:
        postal_code: The code to check; non-string values are converted
            with ``str()`` first.

    Returns:
        True if the code is well formed, False otherwise.
    """
    if not postal_code:
        return False

    normalized = str(postal_code).strip().replace(' ', '')

    # Explicit ASCII classes: ``\d`` would also accept other Unicode digits
    # (e.g. Devanagari), which ``int()`` happily converts. The leading-digit
    # rule (1-8) is folded into the same pattern.
    return re.fullmatch(r'[1-8][0-9]{5}', normalized) is not None
def validate_coordinates(latitude: str, longitude: str) -> bool:
    """Check that a latitude/longitude pair parses and lies within India.

    The bounding box is approximate and slightly expanded: latitude
    6.0 to 38.0 and longitude 67.0 to 98.0 (inclusive).

    Args:
        latitude: Latitude as a number or numeric string.
        longitude: Longitude as a number or numeric string.

    Returns:
        True if both values parse as floats and fall inside the bounding
        box; False otherwise (including NaN and unparsable input).
    """
    try:
        lat = float(str(latitude).strip())
        lng = float(str(longitude).strip())
    except (ValueError, TypeError):
        return False

    # NOTE: the former "at most 6 decimal places" check compared a value with
    # itself rounded to 6 decimals against a 1e-6 tolerance. Rounding error is
    # at most 5e-7, so that branch could never reject anything and has been
    # removed; accepted inputs are unchanged.
    # NaN fails both range comparisons and is therefore rejected.
    return 6.0 <= lat <= 38.0 and 67.0 <= lng <= 98.0
def verify_location_in_city(address: str, city: str) -> bool:
    """Check via geocoding whether *address* appears to lie in *city*.

    Two strategies are tried in order:

    1. Direct match: several query strings built from the address
       components are geocoded. A result counts when it mentions the city
       (in one of a few spellings) and at least two "key" address components
       (those containing ward/zone/mandal/municipal/corporation/greater)
       occur in the geocoded address.
    2. Distance fallback: the city and the full address are geocoded
       separately and the address must lie within a tier-based radius of
       the city (50 km metro, 30 km tier 2, 20 km otherwise).

    Args:
        address: Comma-separated address text.
        city: City name.

    Returns:
        True if either strategy confirms the address; False otherwise,
        including on empty input or geocoding errors.
    """
    if not address or not city:
        return False
    try:
        address = address.strip()
        city = city.strip()
        if not address or not city:
            return False
        city_lower = city.lower()
        address_components = [comp.strip() for comp in address.split(',')]

        def first_component_with(keyword: str) -> str:
            """Return the first address component containing *keyword*, or ''."""
            return next(
                (comp for comp in address_components if keyword in comp.lower()),
                '',
            )

        # Key components are loop-invariant, so compute them once.
        key_components = [
            comp.lower() for comp in address_components
            if any(keyword in comp.lower() for keyword in [
                'ward', 'zone', 'mandal', 'municipal', 'corporation', 'greater'
            ])
        ]

        # The direct match needs two matching key components; with fewer than
        # two available it can never succeed, so skip those network calls.
        if len(key_components) >= 2:
            first_part = address_components[0]
            # A single-component address has no second part; indexing [1]
            # unconditionally used to raise IndexError and abort the check.
            second_part = address_components[1] if len(address_components) > 1 else ''

            queries = [f"{address}, India"]
            queries.extend(
                f"{city}, {part}, India"
                for part in (
                    first_part,
                    second_part,
                    first_component_with('municipal corporation'),
                    first_component_with('mandal'),
                )
                if part  # never send blank queries such as "City, , India"
            )
            if first_part:
                queries.append(f"{first_part}, {city}, India")
            zone_part = first_component_with('zone')
            if zone_part:
                queries.append(f"{zone_part}, {city}, India")
            # Drop duplicate queries while keeping their order.
            queries = list(dict.fromkeys(queries))

            city_variations = [
                city_lower,
                city_lower.replace(' ', ''),
                city_lower.replace(' ', '-'),
                f"{city_lower} city",
                f"{city_lower} district",
                f"{city_lower} municipal corporation",
                f"greater {city_lower}",
                f"greater {city_lower} municipal corporation"
            ]

            for query in queries:
                try:
                    location = geocoder.geocode(query, timeout=10)
                    if location:
                        location_address = location.address.lower()
                        if any(var in location_address for var in city_variations):
                            location_components = [
                                comp.strip() for comp in location_address.split(',')
                            ]
                            matching_components = sum(
                                1 for comp in key_components
                                if any(comp in loc_comp for loc_comp in location_components)
                            )
                            if matching_components >= 2:
                                return True
                except Exception as e:
                    logger.debug(f"Error in address verification: {str(e)}")
                # Rate limiting applies after failed requests as well.
                time.sleep(1)

        # Fallback: compare the geocoded address position with the city's.
        try:
            city_location = geocoder.geocode(f"{city}, India", timeout=10)
            if city_location:
                time.sleep(1)  # Rate limiting between consecutive requests
                address_location = geocoder.geocode(f"{address}, {city}, India", timeout=10)
                if address_location:
                    city_coords = (city_location.latitude, city_location.longitude)
                    address_coords = (address_location.latitude, address_location.longitude)
                    distance = geodesic(city_coords, address_coords).kilometers
                    metro_cities = ["mumbai", "delhi", "bangalore", "hyderabad", "chennai", "kolkata", "pune"]
                    tier2_cities = ["ahmedabad", "jaipur", "surat", "lucknow", "kanpur", "nagpur", "indore",
                                    "thane", "bhopal", "visakhapatnam", "patna", "vadodara", "ghaziabad",
                                    "ludhiana", "agra", "nashik", "faridabad", "meerut", "rajkot", "varanasi"]
                    if any(name in city_lower for name in metro_cities):
                        max_distance = 50  # 50km for metro cities
                    elif any(name in city_lower for name in tier2_cities):
                        max_distance = 30  # 30km for tier 2 cities
                    else:
                        max_distance = 20  # 20km for other cities
                    return distance <= max_distance
        except Exception as e:
            logger.debug(f"Error in reverse geocoding: {str(e)}")
        return False
    except Exception as e:
        logger.error(f"Error in location verification: {str(e)}")
        return False
def verify_city_in_state(city: str, state: str) -> bool:
    """Check via geocoding whether *city* is located in *state*.

    Up to three query strings combining the city and state are geocoded,
    with a one-second pause after each attempt. The check passes as soon as
    a result's address contains both the city and the state name (plain,
    without spaces, or hyphenated).

    Args:
        city: City name.
        state: State name.

    Returns:
        True if a geocoding result mentions both names; False otherwise,
        including on empty input or geocoding errors.
    """
    if not city or not state:
        return False
    try:
        city = city.strip()
        state = state.strip()
        if not city or not state:
            return False

        city_lower = city.lower()
        state_lower = state.lower()
        city_variations = [
            city_lower,
            city_lower.replace(' ', ''),
            city_lower.replace(' ', '-')
        ]
        state_variations = [
            state_lower,
            state_lower.replace(' ', ''),
            state_lower.replace(' ', '-')
        ]
        formats = [
            f"{city}, {state}, India",
            f"{state}, {city}, India",
            f"{city}, {state}"
        ]
        for fmt in formats:
            try:
                location = geocoder.geocode(fmt, timeout=10)
                if location:
                    location_address = location.address.lower()
                    if any(city_var in location_address for city_var in city_variations) and \
                            any(state_var in location_address for state_var in state_variations):
                        return True
            except Exception as e:
                # Best effort: log and try the next format instead of silently
                # swallowing everything (a bare except also traps Ctrl-C).
                logger.debug(f"Error verifying city in state: {str(e)}")
            # Rate limiting applies after failed requests as well.
            time.sleep(1)
        return False
    except Exception as e:
        logger.error(f"Error in city/state verification: {str(e)}")
        return False
def verify_state_in_country(state: str, country: str = "India") -> bool:
    """Check whether *state* is an Indian state or union territory.

    The comparison is case-insensitive, ignores surrounding and repeated
    whitespace, and treats ``&`` as ``and``. Only India is supported: any
    other non-empty *country* yields False, since the list of valid names
    is India-specific. An empty *country* is treated as India.

    Args:
        state: State or union territory name.
        country: Country name; defaults to "India".

    Returns:
        True if the normalized state name is in the built-in list and the
        country is India, False otherwise.
    """
    if not state:
        return False

    # The lookup table below only knows Indian states, so it cannot vouch for
    # any other country (the argument used to be ignored entirely).
    if country and str(country).strip().lower() != "india":
        return False

    valid_states = {
        'andhra pradesh', 'arunachal pradesh', 'assam', 'bihar', 'chhattisgarh',
        'goa', 'gujarat', 'haryana', 'himachal pradesh', 'jharkhand', 'karnataka',
        'kerala', 'madhya pradesh', 'maharashtra', 'manipur', 'meghalaya', 'mizoram',
        'nagaland', 'odisha', 'punjab', 'rajasthan', 'sikkim', 'tamil nadu',
        'telangana', 'tripura', 'uttar pradesh', 'uttarakhand', 'west bengal',
        'andaman and nicobar islands', 'chandigarh', 'dadra and nagar haveli and daman and diu',
        'delhi', 'jammu and kashmir', 'ladakh', 'lakshadweep', 'puducherry',
    }

    # Normalize case, '&' and whitespace so " Tamil  Nadu " or
    # "Jammu & Kashmir" match their canonical entries.
    normalized = " ".join(str(state).lower().replace('&', ' and ').split())
    return normalized in valid_states
def verify_postal_code_in_city(postal_code: str, city: str) -> bool:
    """Check via geocoding whether *postal_code* belongs to *city*.

    Up to three query strings combining the postal code and city are
    geocoded, with a one-second pause after each attempt. The check passes
    as soon as a result's address contains the city name (plain, without
    spaces, or hyphenated).

    Args:
        postal_code: Postal code; non-string values are converted with
            ``str()``.
        city: City name.

    Returns:
        True if a geocoding result mentions the city; False otherwise,
        including on empty input or geocoding errors.
    """
    if not postal_code or not city:
        return False
    try:
        postal_code = str(postal_code).strip()
        city = city.strip()
        if not postal_code or not city:
            return False

        city_lower = city.lower()
        city_variations = [
            city_lower,
            city_lower.replace(' ', ''),
            city_lower.replace(' ', '-')
        ]
        formats = [
            f"{postal_code}, {city}, India",
            f"{city}, {postal_code}, India",
            f"{postal_code}, {city}"
        ]
        for fmt in formats:
            try:
                location = geocoder.geocode(fmt, timeout=10)
                if location:
                    location_address = location.address.lower()
                    if any(var in location_address for var in city_variations):
                        return True
            except Exception as e:
                # Best effort: log and try the next format instead of silently
                # swallowing everything (a bare except also traps Ctrl-C).
                logger.debug(f"Error verifying postal code in city: {str(e)}")
            # Rate limiting applies after failed requests as well.
            time.sleep(1)
        return False
    except Exception as e:
        logger.error(f"Error in postal code verification: {str(e)}")
        return False
def verify_coordinates_in_city(latitude: str, longitude: str, city: str) -> bool:
    """Check whether the given coordinates lie near the geocoded *city*.

    The city is geocoded and the geodesic distance to the coordinates is
    compared against a tier-based radius: 50 km for metro cities, 30 km for
    tier 2 cities and 20 km for all others.

    Args:
        latitude: Latitude as a number or numeric string.
        longitude: Longitude as a number or numeric string.
        city: City name.

    Returns:
        True if the point is within the allowed radius; False otherwise,
        including on missing or unparsable input, an unknown city, or
        geocoding errors.
    """
    if not all([latitude, longitude, city]):
        return False

    # Parse first so malformed coordinates never trigger a network request.
    try:
        lat = float(str(latitude).strip())
        lng = float(str(longitude).strip())
    except (ValueError, TypeError):
        return False

    try:
        city_location = geocoder.geocode(f"{city}, India", timeout=10)
        if not city_location:
            return False

        city_coords = (city_location.latitude, city_location.longitude)
        distance = geodesic(city_coords, (lat, lng)).kilometers

        city_lower = city.lower()
        metro_cities = ["mumbai", "delhi", "bangalore", "hyderabad", "chennai", "kolkata", "pune"]
        tier2_cities = ["ahmedabad", "jaipur", "surat", "lucknow", "kanpur", "nagpur", "indore",
                        "thane", "bhopal", "visakhapatnam", "patna", "vadodara", "ghaziabad",
                        "ludhiana", "agra", "nashik", "faridabad", "meerut", "rajkot", "varanasi"]
        if any(name in city_lower for name in metro_cities):
            max_distance = 50  # 50km for metro cities
        elif any(name in city_lower for name in tier2_cities):
            max_distance = 30  # 30km for tier 2 cities
        else:
            max_distance = 20  # 20km for other cities
        return distance <= max_distance
    except Exception as e:
        # Best effort: a failed lookup means "not verified", but leave a trace
        # instead of silently swallowing everything with a bare except.
        logger.debug(f"Error verifying coordinates in city: {str(e)}")
        return False
def _classify_city_tier(city: Any) -> str:
    """Return "metro", "tier2", "tier3", or "unknown" when no city is given."""
    if not city:
        return "unknown"
    city_lower = str(city).lower()
    metro_cities = ["mumbai", "delhi", "bangalore", "hyderabad", "chennai", "kolkata", "pune"]
    tier2_cities = ["ahmedabad", "jaipur", "surat", "lucknow", "kanpur", "nagpur", "indore",
                    "thane", "bhopal", "visakhapatnam", "patna", "vadodara", "ghaziabad",
                    "ludhiana", "agra", "nashik", "faridabad", "meerut", "rajkot", "varanasi"]
    if any(name in city_lower for name in metro_cities):
        return "metro"
    if any(name in city_lower for name in tier2_cities):
        return "tier2"
    return "tier3"


def _summarize_landmarks(raw_landmarks: Any) -> Dict[str, Any]:
    """Count the provided landmarks and list the categories they fall into."""
    # Accept a comma-separated string or an already-split sequence; a list
    # used to raise on ``.split`` and turn the whole analysis into an error.
    if isinstance(raw_landmarks, (list, tuple, set)):
        pieces = [str(item) for item in raw_landmarks if item]
    elif raw_landmarks:
        pieces = str(raw_landmarks).split(',')
    else:
        pieces = []
    # Blank entries (e.g. from "a,,b" or a trailing comma) are not landmarks.
    landmarks = [piece.strip().lower() for piece in pieces if piece.strip()]

    landmark_types = {
        'transport': ['station', 'metro', 'bus', 'railway', 'airport', 'terminal', 'depot', 'stand', 'stop'],
        'education': ['school', 'college', 'university', 'institute', 'academy', 'campus', 'library'],
        'healthcare': ['hospital', 'clinic', 'medical', 'health', 'diagnostic', 'pharmacy', 'dispensary'],
        'shopping': ['mall', 'market', 'shop', 'store', 'bazaar', 'complex', 'plaza', 'retail', 'outlet'],
        'entertainment': ['park', 'garden', 'theater', 'cinema', 'stadium', 'auditorium', 'playground'],
        'business': ['office', 'business', 'corporate', 'commercial', 'industrial', 'tech park', 'hub']
    }
    types = []
    for landmark in landmarks:
        for type_name, keywords in landmark_types.items():
            if type_name not in types and any(keyword in landmark for keyword in keywords):
                types.append(type_name)

    return {
        'provided': bool(landmarks),
        'count': len(landmarks),
        'types': types
    }


def analyze_location(data: Dict[str, Any]) -> Dict[str, Any]:
    """Run all location checks on *data* and summarize the outcome.

    Reads the keys ``address``, ``city``, ``state``, ``zip``, ``latitude``,
    ``longitude`` and ``nearby_landmarks`` (all optional) and runs the eight
    validation/verification functions of this module on them.

    Args:
        data: Property data dictionary.

    Returns:
        A dictionary containing one boolean per check plus:

        * ``completeness_score``: weighted percentage of passed checks.
        * ``assessment``: "complete" (>= 80), "partial" (>= 50) or "minimal".
        * ``location_quality`` / ``verification_status``: "verified" when all
          critical checks and at least two secondary checks pass, otherwise
          "unverified".
        * ``city_tier``: "metro", "tier2", "tier3" or "unknown".
        * ``landmarks_analysis``: ``provided``, ``count`` and ``types``.
        * ``formatted_address``: the address parts joined into one string.

        If anything raises, the error is logged and a result with every
        check False and status fields set to "error" is returned.
    """
    try:
        address = data.get('address') or ''
        city = data.get('city') or ''
        state = data.get('state') or ''
        postal_code = data.get('zip') or ''
        latitude = data.get('latitude', '')
        longitude = data.get('longitude', '')

        verification_results = {
            'address_format_valid': validate_address_format(address),
            'address_in_city': verify_location_in_city(address, city),
            'city_in_state': verify_city_in_state(city, state),
            'state_in_country': verify_state_in_country(state),
            'postal_code_valid': validate_postal_code(postal_code),
            'postal_code_in_city': verify_postal_code_in_city(postal_code, city),
            'coordinates_valid': validate_coordinates(latitude, longitude),
            'coordinates_in_city': verify_coordinates_in_city(latitude, longitude, city)
        }

        # Weighted completeness score; the weights sum to 1.0.
        weights = {
            'address_format_valid': 0.15,
            'address_in_city': 0.20,
            'city_in_state': 0.10,
            'state_in_country': 0.10,
            'postal_code_valid': 0.10,
            'postal_code_in_city': 0.10,
            'coordinates_valid': 0.10,
            'coordinates_in_city': 0.15
        }
        completeness_score = sum(
            weights[key] * 100 if result else 0
            for key, result in verification_results.items()
        )

        # Verified = every critical check passes and at least two secondary
        # checks pass.
        critical_checks = ['address_format_valid', 'city_in_state', 'state_in_country', 'postal_code_valid']
        secondary_checks = ['address_in_city', 'postal_code_in_city', 'coordinates_valid', 'coordinates_in_city']
        critical_passed = all(verification_results[check] for check in critical_checks)
        secondary_passed = sum(1 for check in secondary_checks if verification_results[check])
        location_quality = "verified" if critical_passed and secondary_passed >= 2 else "unverified"

        if completeness_score >= 80:
            assessment = "complete"
        elif completeness_score >= 50:
            assessment = "partial"
        else:
            assessment = "minimal"

        return {
            **verification_results,
            'assessment': assessment,
            'completeness_score': completeness_score,
            'location_quality': location_quality,
            'city_tier': _classify_city_tier(city),
            'landmarks_analysis': _summarize_landmarks(data.get('nearby_landmarks')),
            'verification_status': "verified" if location_quality == "verified" else "unverified",
            # Missing/None fields render as empty text rather than "None".
            'formatted_address': f"{address}, {city}, {state}, India - {postal_code}"
        }
    except Exception as e:
        logger.error(f"Error analyzing location: {str(e)}")
        return {
            'assessment': 'error',
            'completeness_score': 0,
            'location_quality': 'error',
            'city_tier': 'unknown',
            'landmarks_analysis': {'provided': False, 'count': 0, 'types': []},
            'verification_status': 'error',
            'formatted_address': '',
            'address_format_valid': False,
            'address_in_city': False,
            'city_in_state': False,
            'state_in_country': False,
            'postal_code_valid': False,
            'postal_code_in_city': False,
            'coordinates_valid': False,
            'coordinates_in_city': False
        }
def calculate_location_completeness(data):
    """Return how complete the location fields of *data* are, as a percentage.

    Each field contributes a fixed number of points when it is present and
    truthy: address 25, city 20, state 15, country 5, zip 10, latitude 10,
    longitude 10 and nearby_landmarks 5 (100 in total).

    Args:
        data: Mapping of property fields; missing keys count as empty.

    Returns:
        An integer score between 0 and 100. Empty or ``None`` data scores 0.
    """
    if not data:
        return 0

    # Integer percentage points avoid float accumulation followed by int()
    # truncation, which could shave a point off the intended score.
    weights = {
        'address': 25,
        'city': 20,
        'state': 15,
        'country': 5,
        'zip': 10,
        'latitude': 10,
        'longitude': 10,
        'nearby_landmarks': 5,
    }
    # ``.get`` so that a missing field counts as empty instead of raising
    # KeyError.
    return sum(points for field, points in weights.items() if data.get(field))