akhaliq HF Staff committed on
Commit
a842cd5
·
1 Parent(s): e2d3712
Files changed (1) hide show
  1. app.py +55 -6
app.py CHANGED
@@ -473,14 +473,40 @@ def extract_website_content(url: str) -> str:
473
  if not parsed_url.netloc:
474
  return "Error: Invalid URL provided"
475
 
476
- # Set headers to mimic a browser request
477
  headers = {
478
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
 
 
 
 
 
 
 
 
 
 
 
479
  }
480
 
481
- # Make the request
482
- response = requests.get(url, headers=headers, timeout=10)
483
- response.raise_for_status()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
484
 
485
  # Parse HTML content
486
  soup = BeautifulSoup(response.content, 'html.parser')
@@ -572,6 +598,19 @@ PAGE STRUCTURE:
572
 
573
  return website_content.strip()
574
 
 
 
 
 
 
 
 
 
 
 
 
 
 
575
  except requests.exceptions.RequestException as e:
576
  return f"Error accessing website: {str(e)}"
577
  except Exception as e:
@@ -603,7 +642,17 @@ def generation_code(query: Optional[str], image: Optional[gr.Image], file: Optio
603
  website_text = website_text[:8000] # Limit to 8000 chars for prompt size
604
  query = f"{query}\n\n[Website content to redesign below]\n{website_text}"
605
  elif website_text.startswith("Error"):
606
- query = f"{query}\n\n[Error extracting website: {website_text}]"
 
 
 
 
 
 
 
 
 
 
607
 
608
  # Enhance query with search if enabled
609
  enhanced_query = enhance_query_with_search(query, enable_search)
 
473
  if not parsed_url.netloc:
474
  return "Error: Invalid URL provided"
475
 
476
+ # Set comprehensive headers to mimic a real browser request
477
  headers = {
478
+ 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
479
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
480
+ 'Accept-Language': 'en-US,en;q=0.9',
481
+ 'Accept-Encoding': 'gzip, deflate, br',
482
+ 'DNT': '1',
483
+ 'Connection': 'keep-alive',
484
+ 'Upgrade-Insecure-Requests': '1',
485
+ 'Sec-Fetch-Dest': 'document',
486
+ 'Sec-Fetch-Mode': 'navigate',
487
+ 'Sec-Fetch-Site': 'none',
488
+ 'Sec-Fetch-User': '?1',
489
+ 'Cache-Control': 'max-age=0'
490
  }
491
 
492
+ # Create a session to maintain cookies and handle redirects
493
+ session = requests.Session()
494
+ session.headers.update(headers)
495
+
496
+ # Make the request with retry logic
497
+ max_retries = 3
498
+ for attempt in range(max_retries):
499
+ try:
500
+ response = session.get(url, timeout=15, allow_redirects=True)
501
+ response.raise_for_status()
502
+ break
503
+ except requests.exceptions.HTTPError as e:
504
+ if e.response.status_code == 403 and attempt < max_retries - 1:
505
+ # Try with different User-Agent on 403
506
+ session.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
507
+ continue
508
+ else:
509
+ raise
510
 
511
  # Parse HTML content
512
  soup = BeautifulSoup(response.content, 'html.parser')
 
598
 
599
  return website_content.strip()
600
 
601
+ except requests.exceptions.HTTPError as e:
602
+ if e.response.status_code == 403:
603
+ return f"Error: Website blocked access (403 Forbidden). This website may have anti-bot protection. Try a different website or provide a description of what you want to build instead."
604
+ elif e.response.status_code == 404:
605
+ return f"Error: Website not found (404). Please check the URL and try again."
606
+ elif e.response.status_code >= 500:
607
+ return f"Error: Website server error ({e.response.status_code}). Please try again later."
608
+ else:
609
+ return f"Error accessing website: HTTP {e.response.status_code} - {str(e)}"
610
+ except requests.exceptions.Timeout:
611
+ return "Error: Request timed out. The website may be slow or unavailable."
612
+ except requests.exceptions.ConnectionError:
613
+ return "Error: Could not connect to the website. Please check your internet connection and the URL."
614
  except requests.exceptions.RequestException as e:
615
  return f"Error accessing website: {str(e)}"
616
  except Exception as e:
 
642
  website_text = website_text[:8000] # Limit to 8000 chars for prompt size
643
  query = f"{query}\n\n[Website content to redesign below]\n{website_text}"
644
  elif website_text.startswith("Error"):
645
+ # Provide helpful guidance when website extraction fails
646
+ fallback_guidance = """
647
+ Since I couldn't extract the website content, please provide additional details about what you'd like to build:
648
+
649
+ 1. What type of website is this? (e.g., e-commerce, blog, portfolio, dashboard)
650
+ 2. What are the main features you want?
651
+ 3. What's the target audience?
652
+ 4. Any specific design preferences? (colors, style, layout)
653
+
654
+ This will help me create a better design for you."""
655
+ query = f"{query}\n\n[Error extracting website: {website_text}]{fallback_guidance}"
656
 
657
  # Enhance query with search if enabled
658
  enhanced_query = enhance_query_with_search(query, enable_search)