# models/location_analysis.py
"""Validation and geocoder-backed verification of Indian property locations.

The ``validate_*`` functions are pure format/range checks.  The ``verify_*``
functions query the module-level Nominatim ``geocoder`` and are best-effort:
any lookup failure is logged and treated as "not verified" rather than raised.
``analyze_location`` combines all checks into a single report dictionary.
"""

import re
import time
from typing import Any, Dict, List, Optional, Tuple

from geopy.distance import geodesic
from geopy.geocoders import Nominatim

from .logging_config import logger
from .model_loader import load_model  # noqa: F401  (unused here; kept from the original module)

geocoder = Nominatim(user_agent="indian_property_verifier", timeout=10)

# Per-request timeout passed to ``geocoder.geocode``.
_GEOCODE_TIMEOUT_S = 10
# Minimum spacing between two geocoder requests issued through ``_geocode``.
_MIN_REQUEST_INTERVAL_S = 1.0
# ``time.monotonic()`` reading taken after the most recent request, if any.
_last_geocode_at: Optional[float] = None

_MIN_ADDRESS_LENGTH = 10  # Minimum reasonable length for an address

_ADDRESS_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        r'\d+',  # Should contain numbers
        r'[A-Za-z\s]+',  # Should contain letters
        r'(?:street|road|avenue|lane|colony|society|apartment|flat|house|building|plot|block|sector|phase|floor|wing|area|locality|main|cross|circle|square|market|ward|zone|mandal|municipal|corporation|greater)',  # Common address terms
    )
)

_LOCATION_TERMS = (
    'ward', 'zone', 'mandal', 'municipal', 'corporation', 'greater',
    'street', 'road', 'avenue', 'lane', 'colony', 'society',
)

_AREA_TERMS = (
    'colony', 'society', 'apartment', 'flat', 'house', 'plot', 'block',
    'sector', 'area', 'locality', 'main', 'cross', 'circle', 'square',
    'market',
)

# Address components containing one of these terms are compared against the
# geocoder's result in ``verify_location_in_city``.
_KEY_COMPONENT_TERMS = (
    'ward', 'zone', 'mandal', 'municipal', 'corporation', 'greater',
)

# India's approximate boundaries with some buffer.
_INDIA_LAT_RANGE = (6.0, 38.0)
_INDIA_LNG_RANGE = (67.0, 98.0)

_METRO_CITIES = (
    "mumbai", "delhi", "bangalore", "hyderabad", "chennai", "kolkata", "pune",
)

_TIER2_CITIES = (
    "ahmedabad", "jaipur", "surat", "lucknow", "kanpur", "nagpur", "indore",
    "thane", "bhopal", "visakhapatnam", "patna", "vadodara", "ghaziabad",
    "ludhiana", "agra", "nashik", "faridabad", "meerut", "rajkot", "varanasi",
)

# Maximum distance (km) from the geocoded city location, per city tier.
_MAX_DISTANCE_KM = {"metro": 50, "tier2": 30, "tier3": 20}

# Valid Indian states and union territories (lower-case).
_INDIAN_STATES_AND_UTS = frozenset({
    'andhra pradesh', 'arunachal pradesh', 'assam', 'bihar', 'chhattisgarh',
    'goa', 'gujarat', 'haryana', 'himachal pradesh', 'jharkhand', 'karnataka',
    'kerala', 'madhya pradesh', 'maharashtra', 'manipur', 'meghalaya',
    'mizoram', 'nagaland', 'odisha', 'punjab', 'rajasthan', 'sikkim',
    'tamil nadu', 'telangana', 'tripura', 'uttar pradesh', 'uttarakhand',
    'west bengal', 'andaman and nicobar islands', 'chandigarh',
    'dadra and nagar haveli and daman and diu', 'delhi', 'jammu and kashmir',
    'ladakh', 'lakshadweep', 'puducherry',
})

# Percentage points each verification check contributes to the score
# reported by ``analyze_location``.  Integers keep the total exact.
_CHECK_WEIGHT_POINTS = {
    'address_format_valid': 15,
    'address_in_city': 20,
    'city_in_state': 10,
    'state_in_country': 10,
    'postal_code_valid': 10,
    'postal_code_in_city': 10,
    'coordinates_valid': 10,
    'coordinates_in_city': 15,
}

_CRITICAL_CHECKS = (
    'address_format_valid', 'city_in_state', 'state_in_country',
    'postal_code_valid',
)

_SECONDARY_CHECKS = (
    'address_in_city', 'postal_code_in_city', 'coordinates_valid',
    'coordinates_in_city',
)

_LANDMARK_TYPES = {
    'transport': ['station', 'metro', 'bus', 'railway', 'airport', 'terminal', 'depot', 'stand', 'stop'],
    'education': ['school', 'college', 'university', 'institute', 'academy', 'campus', 'library'],
    'healthcare': ['hospital', 'clinic', 'medical', 'health', 'diagnostic', 'pharmacy', 'dispensary'],
    'shopping': ['mall', 'market', 'shop', 'store', 'bazaar', 'complex', 'plaza', 'retail', 'outlet'],
    'entertainment': ['park', 'garden', 'theater', 'cinema', 'stadium', 'auditorium', 'playground'],
    'business': ['office', 'business', 'corporate', 'commercial', 'industrial', 'tech park', 'hub'],
}

# Percentage points each populated field contributes to the score returned
# by ``calculate_location_completeness``.
_FIELD_WEIGHT_POINTS = {
    'address': 25,
    'city': 20,
    'state': 15,
    'country': 5,
    'zip': 10,
    'latitude': 10,
    'longitude': 10,
    'nearby_landmarks': 5,
}


def _geocode(query: str, context: str):
    """Geocode *query*, returning the geocoder's result or ``None``.

    Requests made through this helper are spaced at least
    ``_MIN_REQUEST_INTERVAL_S`` apart, including after a failed request.
    The throttle is a plain module-level timestamp: it is per-process and
    is not synchronised between threads.

    Args:
        query: Free-form address string passed to ``geocoder.geocode``.
        context: Short label used in the debug log message on failure.

    Returns:
        Whatever ``geocoder.geocode`` returns, or ``None`` if it raised.
    """
    global _last_geocode_at

    if _last_geocode_at is not None:
        remaining = _MIN_REQUEST_INTERVAL_S - (time.monotonic() - _last_geocode_at)
        if remaining > 0:
            time.sleep(remaining)

    try:
        return geocoder.geocode(query, timeout=_GEOCODE_TIMEOUT_S)
    except Exception as e:  # best-effort lookup: a failed request means "no result"
        logger.debug(f"Error in {context}: {str(e)}")
        return None
    finally:
        _last_geocode_at = time.monotonic()


def _mentions(location_address: str, name: str) -> bool:
    """Return True if lower-cased *location_address* contains *name*.

    *name* is tried as given (lower-cased), with spaces removed, and with
    spaces replaced by hyphens.
    """
    lowered = name.lower()
    variations = (lowered, lowered.replace(' ', ''), lowered.replace(' ', '-'))
    return any(variation in location_address for variation in variations)


def _city_tier(city: str) -> str:
    """Classify *city* as ``"metro"``, ``"tier2"`` or ``"tier3"``.

    Matching is by substring, so e.g. a city string containing "delhi"
    is classified as metro.
    """
    city_lower = city.lower()
    if any(name in city_lower for name in _METRO_CITIES):
        return "metro"
    if any(name in city_lower for name in _TIER2_CITIES):
        return "tier2"
    return "tier3"


def _is_within_city_radius(city: str, city_location, point: Tuple[float, float]) -> bool:
    """Return True if *point* lies within the tier-based radius of the city.

    Args:
        city: City name, used only to pick the radius via ``_city_tier``.
        city_location: Geocoder result for the city; its ``latitude`` and
            ``longitude`` attributes are read.
        point: ``(latitude, longitude)`` to test.
    """
    city_coords = (city_location.latitude, city_location.longitude)
    distance_km = geodesic(city_coords, point).kilometers
    return distance_km <= _MAX_DISTANCE_KM[_city_tier(city)]


def validate_address_format(address: str) -> bool:
    """Validate the format of the address.

    An address passes when it is at least ``_MIN_ADDRESS_LENGTH`` characters
    long after stripping, has at least two comma-separated components,
    matches at least two of ``_ADDRESS_PATTERNS``, and contains at least one
    term from ``_LOCATION_TERMS`` or ``_AREA_TERMS`` (substring match,
    case-insensitive).
    """
    if not address:
        return False

    # Check for minimum length
    if len(address.strip()) < _MIN_ADDRESS_LENGTH:
        return False

    # Check for minimum components: at least area and city
    components = [part.strip() for part in address.split(',')]
    if len(components) < 2:
        return False

    address_lower = address.lower()

    # Check if at least 2 patterns match
    pattern_matches = sum(
        1 for pattern in _ADDRESS_PATTERNS if pattern.search(address_lower)
    )
    if pattern_matches < 2:
        return False

    # Check for common address components
    has_location = any(term in address_lower for term in _LOCATION_TERMS)
    has_area = any(term in address_lower for term in _AREA_TERMS)
    return has_location or has_area


def validate_postal_code(postal_code: str) -> bool:
    """Validate Indian postal code format.

    The value is converted with ``str``, stripped, and has spaces removed;
    it must then be exactly six digits with a first digit between 1 and 8.
    """
    if not postal_code:
        return False

    # Remove any spaces and convert to string
    normalized = str(postal_code).strip().replace(' ', '')

    # Check format
    if not re.match(r'^\d{6}$', normalized):
        return False

    # Validate first digit (region)
    return 1 <= int(normalized[0]) <= 8


def validate_coordinates(latitude: str, longitude: str) -> bool:
    """Validate coordinate format and range for India.

    Both values must be convertible to ``float`` and fall inside
    ``_INDIA_LAT_RANGE`` / ``_INDIA_LNG_RANGE`` (inclusive).

    No decimal-precision limit is enforced.  The earlier implementation
    compared each value with itself rounded to six decimals using a 1e-6
    tolerance; rounding moves a value by at most 5e-7, so that comparison
    could never reject anything and has been removed.
    """
    try:
        # Convert to float and handle any string formatting
        lat = float(str(latitude).strip())
        lng = float(str(longitude).strip())
    except (ValueError, TypeError):
        return False

    lat_min, lat_max = _INDIA_LAT_RANGE
    lng_min, lng_max = _INDIA_LNG_RANGE
    # NaN fails both comparisons, so it is rejected here as well.
    return lat_min <= lat <= lat_max and lng_min <= lng <= lng_max


def _candidate_queries(address: str, city: str, components: List[str]) -> List[str]:
    """Build the ordered, de-duplicated geocoder queries for an address.

    A query is only produced when the address component it is built from is
    non-empty, so single-component addresses and addresses without a
    municipal-corporation / mandal / zone component do not generate blank
    queries such as ``"<city>, , India"``.
    """

    def first_containing(term: str) -> str:
        return next((part for part in components if term in part.lower()), '')

    first = components[0]
    second = components[1] if len(components) > 1 else ''
    municipal = first_containing('municipal corporation')
    mandal = first_containing('mandal')
    zone = first_containing('zone')

    # (query, component the query depends on)
    candidates = [
        (f"{address}, India", address),           # Full address
        (f"{city}, {first}, India", first),       # First component (usually area/ward)
        (f"{city}, {second}, India", second),     # Second component (usually ward details)
        (f"{city}, {municipal}, India", municipal),  # Municipal corporation format
        (f"{city}, {mandal}, India", mandal),     # Mandal format
        (f"{first}, {city}, India", first),       # Basic format
        (f"{zone}, {city}, India", zone),         # Zone format
    ]

    queries: List[str] = []
    for query, required_component in candidates:
        if required_component and query not in queries:
            queries.append(query)
    return queries


def _address_matches_geocoded_city(address: str, city: str, components: List[str]) -> bool:
    """Return True if a geocoded result names the city and two key components.

    Key components are the address components containing one of
    ``_KEY_COMPONENT_TERMS``.  A geocoder result counts as a match when its
    address mentions *city* and at least two key components each appear
    inside one of the result's comma-separated parts.
    """
    key_components = [
        part.lower()
        for part in components
        if any(term in part.lower() for term in _KEY_COMPONENT_TERMS)
    ]
    # Two matching key components are required, so with fewer than two
    # available no result can ever match; skip the network requests.
    if len(key_components) < 2:
        return False

    for query in _candidate_queries(address, city, components):
        location = _geocode(query, "address verification")
        if location is None:
            continue

        location_address = location.address.lower()
        if not _mentions(location_address, city):
            continue

        location_parts = [part.strip() for part in location_address.split(',')]
        matching = sum(
            1
            for key in key_components
            if any(key in part for part in location_parts)
        )
        if matching >= 2:
            return True

    return False


def _address_is_near_city(address: str, city: str) -> bool:
    """Return True if the geocoded address is within the city's radius.

    Geocodes ``"<city>, India"`` and ``"<address>, <city>, India"`` and
    compares their geodesic distance with the tier-based threshold.
    """
    city_location = _geocode(f"{city}, India", "reverse geocoding")
    if city_location is None:
        return False

    address_location = _geocode(f"{address}, {city}, India", "reverse geocoding")
    if address_location is None:
        return False

    address_coords = (address_location.latitude, address_location.longitude)
    return _is_within_city_radius(city, city_location, address_coords)


def verify_location_in_city(address: str, city: str) -> bool:
    """Verify if the address exists in the given city.

    First tries to match geocoded variants of the address against the city
    name and the address's key components; if that does not succeed, falls
    back to checking that the geocoded address lies within a tier-based
    distance of the geocoded city.

    Returns:
        True when either strategy succeeds.  False for empty input, when
        nothing can be geocoded, or when an unexpected error occurs (the
        error is logged, not raised).
    """
    if not address or not city:
        return False

    try:
        # Clean and normalize inputs
        address = address.strip()
        city = city.strip()
        if not address or not city:
            return False

        components = [part.strip() for part in address.split(',')]

        if _address_matches_geocoded_city(address, city, components):
            return True

        # If direct verification fails, compare geocoded positions instead.
        return _address_is_near_city(address, city)
    except Exception as e:
        logger.error(f"Error in location verification: {str(e)}")
        return False


def verify_city_in_state(city: str, state: str) -> bool:
    """Verify if the city exists in the given state.

    Geocodes a few orderings of city and state and returns True as soon as
    a result's address mentions both names.  Returns False for empty input,
    when no result matches, or on an unexpected error (which is logged).
    """
    if not city or not state:
        return False

    try:
        city = city.strip()
        state = state.strip()
        if not city or not state:
            return False

        queries = [
            f"{city}, {state}, India",
            f"{state}, {city}, India",
            f"{city}, {state}",
        ]
        for query in queries:
            location = _geocode(query, "city/state verification")
            if location is None:
                continue
            location_address = location.address.lower()
            if _mentions(location_address, city) and _mentions(location_address, state):
                return True
        return False
    except Exception as e:
        logger.error(f"Error in city/state verification: {str(e)}")
        return False


def verify_state_in_country(state: str, country: str = "India") -> bool:
    """Verify if the state exists in the given country.

    The state name is compared case-insensitively, ignoring leading,
    trailing and repeated whitespace, against the list of Indian states and
    union territories.

    NOTE: *country* is accepted for interface compatibility but is not
    consulted; the check is always against India.
    """
    if not state:
        return False

    normalized = " ".join(state.lower().split())
    return normalized in _INDIAN_STATES_AND_UTS


def verify_postal_code_in_city(postal_code: str, city: str) -> bool:
    """Verify if the postal code belongs to the given city.

    Geocodes a few orderings of postal code and city and returns True as
    soon as a result's address mentions the city.  Returns False for empty
    input, when no result matches, or on an unexpected error (which is
    logged).
    """
    if not postal_code or not city:
        return False

    try:
        postal_code = str(postal_code).strip()
        city = city.strip()
        if not postal_code or not city:
            return False

        queries = [
            f"{postal_code}, {city}, India",
            f"{city}, {postal_code}, India",
            f"{postal_code}, {city}",
        ]
        for query in queries:
            location = _geocode(query, "postal code verification")
            if location is None:
                continue
            if _mentions(location.address.lower(), city):
                return True
        return False
    except Exception as e:
        logger.error(f"Error in postal code verification: {str(e)}")
        return False


def verify_coordinates_in_city(latitude: str, longitude: str, city: str) -> bool:
    """Verify if the coordinates are within the given city.

    The coordinates are compared with the geocoded position of
    ``"<city>, India"`` using a tier-based distance threshold
    (``_MAX_DISTANCE_KM``).

    Returns:
        True when the point is within the threshold.  False when any
        argument is falsy, the coordinates cannot be parsed, the city
        cannot be geocoded, or an unexpected error occurs (logged).
    """
    if not all([latitude, longitude, city]):
        return False

    try:
        # Convert to float and handle any string formatting
        property_coords = (
            float(str(latitude).strip()),
            float(str(longitude).strip()),
        )

        city_location = _geocode(f"{city}, India", "coordinate verification")
        if city_location is None:
            return False

        return _is_within_city_radius(city, city_location, property_coords)
    except (ValueError, TypeError) as e:
        # Unparseable or out-of-range coordinates are a validation failure,
        # not an operational error.
        logger.debug(f"Error in coordinate verification: {str(e)}")
        return False
    except Exception as e:
        logger.error(f"Error in coordinate verification: {str(e)}")
        return False


def _split_landmarks(raw: Any) -> List[str]:
    """Return the non-blank, stripped, lower-cased landmark entries.

    Accepts a comma-separated string or a list/tuple/set of entries; any
    other truthy value is converted with ``str`` and split on commas.
    """
    if not raw:
        return []
    if isinstance(raw, (list, tuple, set)):
        entries = [str(item) for item in raw]
    else:
        entries = str(raw).split(',')
    cleaned = (entry.strip().lower() for entry in entries)
    return [entry for entry in cleaned if entry]


def analyze_location(data: Dict[str, Any]) -> Dict[str, Any]:
    """Analyze location data with detailed verification.

    Args:
        data: Mapping read with the keys ``address``, ``city``, ``state``,
            ``zip``, ``latitude``, ``longitude`` and ``nearby_landmarks``;
            missing keys are treated as empty.

    Returns:
        A dictionary containing one boolean per verification check plus
        ``assessment``, ``completeness_score``, ``location_quality``,
        ``city_tier``, ``landmarks_analysis``, ``verification_status`` and
        ``formatted_address``.  If anything raises, the error is logged and
        a dictionary with the same keys is returned with every check False
        and the status fields set to ``'error'``.
    """
    try:
        address = data.get('address', '')
        city = data.get('city', '')
        state = data.get('state', '')
        postal_code = data.get('zip', '')
        latitude = data.get('latitude', '')
        longitude = data.get('longitude', '')

        verification_results = {
            'address_format_valid': validate_address_format(address),
            'address_in_city': verify_location_in_city(address, city),
            'city_in_state': verify_city_in_state(city, state),
            'state_in_country': verify_state_in_country(state),
            'postal_code_valid': validate_postal_code(postal_code),
            'postal_code_in_city': verify_postal_code_in_city(postal_code, city),
            'coordinates_valid': validate_coordinates(latitude, longitude),
            'coordinates_in_city': verify_coordinates_in_city(latitude, longitude, city),
        }

        # Weighted score out of 100.  Summing integer points avoids float
        # artefacts such as 15.000000000000002; the result stays a float.
        completeness_score = float(sum(
            _CHECK_WEIGHT_POINTS[key]
            for key, passed in verification_results.items()
            if passed
        ))

        # Location is verified if all critical checks pass and at least 2
        # secondary checks pass.
        critical_passed = all(verification_results[check] for check in _CRITICAL_CHECKS)
        secondary_passed = sum(
            1 for check in _SECONDARY_CHECKS if verification_results[check]
        )
        if critical_passed and secondary_passed >= 2:
            location_quality = "verified"
        else:
            location_quality = "unverified"

        # Analyze landmarks.  Blank entries (e.g. from a trailing comma) are
        # not counted.
        raw_landmarks = data.get('nearby_landmarks')
        landmarks = _split_landmarks(raw_landmarks)
        landmark_types_found: List[str] = []
        for landmark in landmarks:
            for type_name, keywords in _LANDMARK_TYPES.items():
                if type_name in landmark_types_found:
                    continue
                if any(keyword in landmark for keyword in keywords):
                    landmark_types_found.append(type_name)
        landmarks_analysis = {
            'provided': bool(raw_landmarks),
            'count': len(landmarks),
            'types': landmark_types_found,
        }

        # Determine city tier
        city_tier = _city_tier(data['city']) if data.get('city') else "unknown"

        if completeness_score >= 80:
            assessment = "complete"
        elif completeness_score >= 50:
            assessment = "partial"
        else:
            assessment = "minimal"

        return {
            **verification_results,
            'assessment': assessment,
            'completeness_score': completeness_score,
            'location_quality': location_quality,
            'city_tier': city_tier,
            'landmarks_analysis': landmarks_analysis,
            'verification_status': "verified" if location_quality == "verified" else "unverified",
            'formatted_address': f"{data.get('address', '')}, {data.get('city', '')}, {data.get('state', '')}, India - {data.get('zip', '')}",
        }
    except Exception as e:
        logger.error(f"Error analyzing location: {str(e)}")
        return {
            'assessment': 'error',
            'completeness_score': 0,
            'location_quality': 'error',
            'city_tier': 'unknown',
            'landmarks_analysis': {'provided': False, 'count': 0, 'types': []},
            'verification_status': 'error',
            'formatted_address': '',
            'address_format_valid': False,
            'address_in_city': False,
            'city_in_state': False,
            'state_in_country': False,
            'postal_code_valid': False,
            'postal_code_in_city': False,
            'coordinates_valid': False,
            'coordinates_in_city': False,
        }


def calculate_location_completeness(data):
    """Return a 0-100 integer score for how many location fields are filled.

    Each truthy field in *data* contributes its weight from
    ``_FIELD_WEIGHT_POINTS``; missing fields count as empty.  Integer
    weights are used so the total cannot be truncated by float rounding.
    """
    return sum(
        points
        for field, points in _FIELD_WEIGHT_POINTS.items()
        if data.get(field)
    )