import re
from typing import List
from urllib.parse import urljoin, urlparse, urlunparse

import httpx

# Deliberately permissive URL matcher, compiled once at import time:
#   * optional "http://", "https://" or "www." prefix,
#   * a dotted host whose last label is at least two ASCII letters,
#   * an optional path starting with "/" that runs until whitespace,
#     an angle bracket or a quote character.
# There are no capturing groups, so ``findall`` yields whole matches.
_URL_RE = re.compile(
    r"""(?:(?:https?:\/\/|www\.)?[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})(?:\/[^\s<>"']*)?"""
)

# Status codes that ``resolve_short_url`` treats as a redirect.
_REDIRECT_STATUSES = frozenset({301, 302, 303, 307, 308})


def extract_urls(text: str) -> List[str]:
    """Extract URLs from raw text.

    Args:
        text: Arbitrary text to scan.

    Returns:
        Every non-overlapping match of the URL pattern, in order of
        appearance. Because the scheme/"www." prefix is optional, any
        dotted token ending in two or more letters matches, including
        bare domains such as ``example.com``.
    """
    return _URL_RE.findall(text)


def normalize_url(url: str) -> str:
    """Ensure the URL has a scheme and is normalized.

    Args:
        url: A URL, with or without a scheme (e.g. ``example.com/a``).

    Returns:
        The URL rebuilt by ``urlunparse``. ``http`` is used when the
        input carries no scheme.
    """
    parsed = urlparse(url, scheme="http")
    if not parsed.netloc:
        # Without a leading "//" urlparse puts the host into the path (or
        # mistakes "host:port" for a scheme), so re-parse with an explicit
        # "http://" prefix to get a proper netloc.
        parsed = urlparse("http://" + url)
    return urlunparse(parsed)


def resolve_short_url(url: str) -> str:
    """Resolve one redirect hop of a (possibly shortened) URL.

    Sends a single HEAD request without following redirects.

    Args:
        url: The URL to probe; it is passed through ``normalize_url`` first.

    Returns:
        The redirect target as an absolute URL when the server answers with
        301, 302, 303, 307 or 308 and a ``Location`` header. Otherwise the
        normalized input URL: no redirect, a redirect without ``Location``,
        or a request error (which is printed).
    """
    url = normalize_url(url)
    try:
        with httpx.Client(follow_redirects=False, timeout=5) as client:
            response = client.head(url, headers={"User-Agent": "Mozilla/5.0"})
    except httpx.RequestError as e:
        print(f"Error: {e}")
        return url

    if response.status_code in _REDIRECT_STATUSES:
        location = response.headers.get("location")
        if location:
            # Location may be a relative reference; resolve it against the
            # URL that was requested so callers always get an absolute URL.
            return urljoin(url, location)
        # Redirect status without a Location header: nothing to resolve, and
        # returning None would break the ``-> str`` contract.
    return url  # No redirect