import re
from typing import List
from urllib.parse import urljoin, urlparse, urlunparse

import httpx

# Default cap on the number of redirect hops resolve_short_url will follow,
# so that a redirect loop or a very long chain cannot run forever.
_MAX_REDIRECTS = 10

# Status codes that resolve_short_url treats as a redirect.
_REDIRECT_STATUS_CODES = frozenset({301, 302, 303, 307, 308})


def extract_urls(text: str) -> List[str]:
    """
    Extract URL-like substrings from raw text.

    A match is an optional ``http://``, ``https://`` or ``www.`` prefix,
    a dotted host name ending in at least two letters, and an optional
    path that runs until whitespace, ``<``, ``>`` or a quote character.
    Because the prefix is optional, any dotted token such as
    ``file.txt`` is also reported.

    Parameters:
        text (str): The text to scan.

    Returns:
        List[str]: The matched substrings, in order of appearance.
    """
    url_pattern = r"""(?:(?:https?:\/\/|www\.)?[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})(?:\/[^\s<>"']*)?"""
    return re.findall(url_pattern, text)


def extract_domain_from_url(url: str) -> str:
    """
    Extracts the domain (netloc) from a given URL.

    The URL is first passed through resolve_short_url, so this function
    performs network requests and reports the domain of the redirect
    target rather than that of the short link itself. Both steps are
    echoed to stdout.

    Parameters:
        url (str): The full URL.

    Returns:
        str: The network location of the resolved URL (e.g.,
            'example.com'). This is the raw netloc, so it includes a
            port or credentials if the URL carries them.
    """
    redirect_url = resolve_short_url(url)
    print(f"redirect: {url} -> {redirect_url}")

    parsed = urlparse(redirect_url)
    domain = parsed.netloc
    print(f"domain: {redirect_url} -> {domain}")

    return domain


def normalize_url(url: str) -> str:
    """
    Ensure the URL has a scheme and is normalized.

    Parameters:
        url (str): A URL, with or without a scheme
            (e.g., 'example.com/path' or 'https://example.com').

    Returns:
        str: The URL with surrounding whitespace removed and an
            ``http`` scheme applied when none was given
            (e.g., 'example.com' -> 'http://example.com').
    """
    url = url.strip()
    parsed = urlparse(url, scheme="http")
    if not parsed.netloc:
        # A bare "example.com/path" parses as a path with no host;
        # re-parse with an explicit scheme so the host lands in netloc.
        parsed = urlparse("http://" + url)
    return urlunparse(parsed)


def resolve_short_url(url: str, max_redirects: int = _MAX_REDIRECTS) -> str:
    """
    Follow redirects manually with HEAD requests and return the final URL.

    Each hop is a HEAD request with automatic redirect handling turned
    off. The chain is followed until a non-redirect response is
    received, a redirect response lacks a Location header, a URL
    repeats (redirect loop), or ``max_redirects`` hops have been taken.

    Parameters:
        url (str): The URL to resolve; it is normalized first.
        max_redirects (int): Maximum number of redirect hops to follow.

    Returns:
        str: The last URL reached. If a request fails, the error is
            printed and the URL that was being requested is returned.
    """
    url = normalize_url(url)
    seen = {url}
    try:
        with httpx.Client(follow_redirects=False, timeout=5) as client:
            for _ in range(max_redirects):
                response = client.head(url, headers={"User-Agent": "Mozilla/5.0"})
                if response.status_code not in _REDIRECT_STATUS_CODES:
                    break  # No redirect

                location = response.headers.get("location")
                if not location:
                    # Redirect status without a target: nothing to follow.
                    break

                # Location may be relative; resolve it against the URL
                # that was just requested.
                next_url = normalize_url(urljoin(url, location))
                if next_url in seen:
                    # Redirect loop: stop instead of cycling.
                    break
                seen.add(next_url)
                url = next_url
    except (httpx.RequestError, httpx.InvalidURL) as e:
        # Best-effort resolution: report the problem and fall back to
        # the last URL we tried.
        print(f"Error: {e}")
    return url