akhaliq HF Staff committed on
Commit
8496c04
·
1 Parent(s): a842cd5
Files changed (1) hide show
  1. app.py +230 -43
app.py CHANGED
@@ -29,11 +29,15 @@ When asked to create an application, you should:
29
  5. Ensure the code is functional and follows best practices
30
 
31
  For website redesign tasks:
32
- - Analyze the extracted website content to understand the structure and purpose
33
- - Create a modern, responsive design that improves upon the original
34
- - Maintain the core functionality and content while enhancing the user experience
 
 
35
  - Use modern CSS frameworks and design patterns
36
  - Ensure accessibility and mobile responsiveness
 
 
37
 
38
  If an image is provided, analyze it and use the visual information to better understand the user's requirements.
39
 
@@ -52,12 +56,16 @@ When asked to create an application, you should:
52
  6. Ensure the code is functional and follows best practices
53
 
54
  For website redesign tasks:
55
- - Analyze the extracted website content to understand the structure and purpose
 
 
 
56
  - Use web search to find current design trends and best practices for the specific type of website
57
- - Create a modern, responsive design that improves upon the original
58
- - Maintain the core functionality and content while enhancing the user experience
59
  - Use modern CSS frameworks and design patterns
60
  - Ensure accessibility and mobile responsiveness
 
 
61
 
62
  If an image is provided, analyze it and use the visual information to better understand the user's requirements.
63
 
@@ -462,7 +470,7 @@ def extract_text_from_file(file_path):
462
  return f"Error extracting text: {e}"
463
 
464
  def extract_website_content(url: str) -> str:
465
- """Extract content from a website URL"""
466
  try:
467
  # Validate URL
468
  parsed_url = urlparse(url)
@@ -508,12 +516,42 @@ def extract_website_content(url: str) -> str:
508
  else:
509
  raise
510
 
511
- # Parse HTML content
512
- soup = BeautifulSoup(response.content, 'html.parser')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
513
 
514
- # Remove script and style elements
515
- for script in soup(["script", "style"]):
516
- script.decompose()
 
 
517
 
518
  # Extract title
519
  title = soup.find('title')
@@ -523,10 +561,8 @@ def extract_website_content(url: str) -> str:
523
  meta_desc = soup.find('meta', attrs={'name': 'description'})
524
  description = meta_desc.get('content', '') if meta_desc else ""
525
 
526
- # Extract main content areas
527
  content_sections = []
528
-
529
- # Look for common content containers
530
  main_selectors = [
531
  'main', 'article', '.content', '.main-content', '.post-content',
532
  '#content', '#main', '.entry-content', '.post-body'
@@ -539,16 +575,7 @@ def extract_website_content(url: str) -> str:
539
  if len(text) > 100: # Only include substantial content
540
  content_sections.append(text)
541
 
542
- # If no main content found, extract from body
543
- if not content_sections:
544
- body = soup.find('body')
545
- if body:
546
- # Remove navigation, footer, and other non-content elements
547
- for element in body.find_all(['nav', 'footer', 'header', 'aside']):
548
- element.decompose()
549
- content_sections.append(body.get_text().strip())
550
-
551
- # Extract navigation links
552
  nav_links = []
553
  nav_elements = soup.find_all(['nav', 'header'])
554
  for nav in nav_elements:
@@ -559,41 +586,201 @@ def extract_website_content(url: str) -> str:
559
  if link_text and link_href:
560
  nav_links.append(f"{link_text}: {link_href}")
561
 
562
- # Extract images
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
563
  images = []
564
  img_elements = soup.find_all('img')
565
  for img in img_elements:
566
  src = img.get('src', '')
567
  alt = img.get('alt', '')
568
  if src:
569
- # Convert relative URLs to absolute
570
- if not src.startswith(('http://', 'https://')):
571
- src = urljoin(url, src)
572
- images.append(f"Image: {alt} ({src})")
573
 
574
- # Compile the extracted content
575
- website_content = f"""
576
- WEBSITE CONTENT EXTRACTION
577
- ==========================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
578
 
579
  URL: {url}
580
  Title: {title_text}
581
- Description: {description}
582
 
583
- NAVIGATION MENU:
584
- {chr(10).join(nav_links[:10]) if nav_links else "No navigation found"}
 
 
 
 
 
 
 
 
 
 
 
 
585
 
586
- MAIN CONTENT:
587
- {chr(10).join(content_sections[:3]) if content_sections else "No main content found"}
 
 
 
 
 
 
588
 
589
- IMAGES:
590
- {chr(10).join(images[:10]) if images else "No images found"}
 
591
 
592
- PAGE STRUCTURE:
593
  - This appears to be a {title_text.lower()} website
594
  - Contains {len(content_sections)} main content sections
595
  - Has {len(nav_links)} navigation links
596
  - Includes {len(images)} images
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
597
  """
598
 
599
  return website_content.strip()
@@ -729,7 +916,7 @@ with gr.Blocks(
729
 
730
  # URL input for website redesign
731
  website_url_input = gr.Textbox(
732
- label="Website URL (for redesign)",
733
  placeholder="https://example.com",
734
  lines=1,
735
  visible=True
 
29
  5. Ensure the code is functional and follows best practices
30
 
31
  For website redesign tasks:
32
+ - Use the provided original HTML code as the starting point for redesign
33
+ - Preserve all original content, structure, and functionality
34
+ - Keep the same semantic HTML structure but enhance the styling
35
+ - Reuse all original images and their URLs from the HTML code
36
+ - Create a modern, responsive design with improved typography and spacing
37
  - Use modern CSS frameworks and design patterns
38
  - Ensure accessibility and mobile responsiveness
39
+ - Maintain the same navigation and user flow
40
+ - Enhance the visual design while keeping the original layout structure
41
 
42
  If an image is provided, analyze it and use the visual information to better understand the user's requirements.
43
 
 
56
  6. Ensure the code is functional and follows best practices
57
 
58
  For website redesign tasks:
59
+ - Use the provided original HTML code as the starting point for redesign
60
+ - Preserve all original content, structure, and functionality
61
+ - Keep the same semantic HTML structure but enhance the styling
62
+ - Reuse all original images and their URLs from the HTML code
63
  - Use web search to find current design trends and best practices for the specific type of website
64
+ - Create a modern, responsive design with improved typography and spacing
 
65
  - Use modern CSS frameworks and design patterns
66
  - Ensure accessibility and mobile responsiveness
67
+ - Maintain the same navigation and user flow
68
+ - Enhance the visual design while keeping the original layout structure
69
 
70
  If an image is provided, analyze it and use the visual information to better understand the user's requirements.
71
 
 
470
  return f"Error extracting text: {e}"
471
 
472
  def extract_website_content(url: str) -> str:
473
+ """Extract HTML code and content from a website URL"""
474
  try:
475
  # Validate URL
476
  parsed_url = urlparse(url)
 
516
  else:
517
  raise
518
 
519
+ # Get the raw HTML content with proper encoding
520
+ try:
521
+ # Try to get the content with automatic encoding detection
522
+ response.encoding = response.apparent_encoding
523
+ raw_html = response.text
524
+ except:
525
+ # Fallback to UTF-8 if encoding detection fails
526
+ raw_html = response.content.decode('utf-8', errors='ignore')
527
+
528
+ # Debug: Check if we got valid HTML
529
+ if not raw_html.strip().startswith('<!DOCTYPE') and not raw_html.strip().startswith('<html'):
530
+ print(f"Warning: Response doesn't look like HTML. First 200 chars: {raw_html[:200]}")
531
+ print(f"Response headers: {dict(response.headers)}")
532
+ print(f"Response encoding: {response.encoding}")
533
+ print(f"Apparent encoding: {response.apparent_encoding}")
534
+
535
+ # Try alternative approaches
536
+ try:
537
+ raw_html = response.content.decode('latin-1', errors='ignore')
538
+ print("Tried latin-1 decoding")
539
+ except:
540
+ try:
541
+ raw_html = response.content.decode('utf-8', errors='ignore')
542
+ print("Tried UTF-8 decoding")
543
+ except:
544
+ raw_html = response.content.decode('cp1252', errors='ignore')
545
+ print("Tried cp1252 decoding")
546
+
547
+ # Parse HTML content for analysis
548
+ soup = BeautifulSoup(raw_html, 'html.parser')
549
 
550
+ # Check if this is a JavaScript-heavy site
551
+ script_tags = soup.find_all('script')
552
+ if len(script_tags) > 10:
553
+ print(f"Warning: This site has {len(script_tags)} script tags - it may be a JavaScript-heavy site")
554
+ print("The content might be loaded dynamically and not available in the initial HTML")
555
 
556
  # Extract title
557
  title = soup.find('title')
 
561
  meta_desc = soup.find('meta', attrs={'name': 'description'})
562
  description = meta_desc.get('content', '') if meta_desc else ""
563
 
564
+ # Extract main content areas for analysis
565
  content_sections = []
 
 
566
  main_selectors = [
567
  'main', 'article', '.content', '.main-content', '.post-content',
568
  '#content', '#main', '.entry-content', '.post-body'
 
575
  if len(text) > 100: # Only include substantial content
576
  content_sections.append(text)
577
 
578
+ # Extract navigation links for analysis
 
 
 
 
 
 
 
 
 
579
  nav_links = []
580
  nav_elements = soup.find_all(['nav', 'header'])
581
  for nav in nav_elements:
 
586
  if link_text and link_href:
587
  nav_links.append(f"{link_text}: {link_href}")
588
 
589
+ # Extract and fix image URLs in the HTML
590
+ img_elements = soup.find_all('img')
591
+ for img in img_elements:
592
+ src = img.get('src', '')
593
+ if src:
594
+ # Handle different URL formats
595
+ if src.startswith('//'):
596
+ # Protocol-relative URL
597
+ absolute_src = 'https:' + src
598
+ img['src'] = absolute_src
599
+ elif src.startswith('/'):
600
+ # Root-relative URL
601
+ absolute_src = urljoin(url, src)
602
+ img['src'] = absolute_src
603
+ elif not src.startswith(('http://', 'https://')):
604
+ # Relative URL
605
+ absolute_src = urljoin(url, src)
606
+ img['src'] = absolute_src
607
+ # If it's already absolute, keep it as is
608
+
609
+ # Also check for data-src (lazy loading) and other common attributes
610
+ data_src = img.get('data-src', '')
611
+ if data_src and not src:
612
+ # Use data-src if src is empty
613
+ if data_src.startswith('//'):
614
+ absolute_data_src = 'https:' + data_src
615
+ img['src'] = absolute_data_src
616
+ elif data_src.startswith('/'):
617
+ absolute_data_src = urljoin(url, data_src)
618
+ img['src'] = absolute_data_src
619
+ elif not data_src.startswith(('http://', 'https://')):
620
+ absolute_data_src = urljoin(url, data_src)
621
+ img['src'] = absolute_data_src
622
+ else:
623
+ img['src'] = data_src
624
+
625
+ # Also fix background image URLs in style attributes
626
+ elements_with_style = soup.find_all(attrs={'style': True})
627
+ for element in elements_with_style:
628
+ style_attr = element.get('style', '')
629
+ # Find and replace relative URLs in background-image
630
+ import re
631
+ bg_pattern = r'background-image:\s*url\(["\']?([^"\']+)["\']?\)'
632
+ matches = re.findall(bg_pattern, style_attr, re.IGNORECASE)
633
+ for match in matches:
634
+ if match:
635
+ if match.startswith('//'):
636
+ absolute_bg = 'https:' + match
637
+ style_attr = style_attr.replace(match, absolute_bg)
638
+ elif match.startswith('/'):
639
+ absolute_bg = urljoin(url, match)
640
+ style_attr = style_attr.replace(match, absolute_bg)
641
+ elif not match.startswith(('http://', 'https://')):
642
+ absolute_bg = urljoin(url, match)
643
+ style_attr = style_attr.replace(match, absolute_bg)
644
+ element['style'] = style_attr
645
+
646
+ # Fix background images in <style> tags
647
+ style_elements = soup.find_all('style')
648
+ for style in style_elements:
649
+ if style.string:
650
+ style_content = style.string
651
+ # Find and replace relative URLs in background-image
652
+ bg_pattern = r'background-image:\s*url\(["\']?([^"\']+)["\']?\)'
653
+ matches = re.findall(bg_pattern, style_content, re.IGNORECASE)
654
+ for match in matches:
655
+ if match:
656
+ if match.startswith('//'):
657
+ absolute_bg = 'https:' + match
658
+ style_content = style_content.replace(match, absolute_bg)
659
+ elif match.startswith('/'):
660
+ absolute_bg = urljoin(url, match)
661
+ style_content = style_content.replace(match, absolute_bg)
662
+ elif not match.startswith(('http://', 'https://')):
663
+ absolute_bg = urljoin(url, match)
664
+ style_content = style_content.replace(match, absolute_bg)
665
+ style.string = style_content
666
+
667
+ # Extract images for analysis (after fixing URLs)
668
  images = []
669
  img_elements = soup.find_all('img')
670
  for img in img_elements:
671
  src = img.get('src', '')
672
  alt = img.get('alt', '')
673
  if src:
674
+ images.append({'src': src, 'alt': alt})
 
 
 
675
 
676
+ # Debug: Print some image URLs to see what we're getting
677
+ print(f"Found {len(images)} images:")
678
+ for i, img in enumerate(images[:5]): # Show first 5 images
679
+ print(f" {i+1}. {img['alt'] or 'No alt'} - {img['src']}")
680
+
681
+ # Test a few image URLs to see if they're accessible
682
+ def test_image_url(img_url):
683
+ try:
684
+ test_response = requests.head(img_url, timeout=5, allow_redirects=True)
685
+ return test_response.status_code == 200
686
+ except:
687
+ return False
688
+
689
+ # Test first few images
690
+ working_images = []
691
+ for img in images[:10]: # Test first 10 images
692
+ if test_image_url(img['src']):
693
+ working_images.append(img)
694
+ else:
695
+ print(f" ❌ Broken image: {img['src']}")
696
+
697
+ print(f"Working images: {len(working_images)} out of {len(images)}")
698
+
699
+ # Get the modified HTML with absolute URLs
700
+ modified_html = str(soup)
701
+
702
+ # Clean and format the HTML for better readability
703
+ # Remove unnecessary whitespace and comments
704
+ import re
705
+ cleaned_html = re.sub(r'<!--.*?-->', '', modified_html, flags=re.DOTALL) # Remove HTML comments
706
+ cleaned_html = re.sub(r'\s+', ' ', cleaned_html) # Normalize whitespace
707
+ cleaned_html = re.sub(r'>\s+<', '><', cleaned_html) # Remove whitespace between tags
708
+
709
+ # Limit HTML size to avoid token limits (keep first 15000 chars)
710
+ if len(cleaned_html) > 15000:
711
+ cleaned_html = cleaned_html[:15000] + "\n<!-- ... HTML truncated for length ... -->"
712
+
713
+ # Check if we got any meaningful content
714
+ if not title_text or title_text == "No title found":
715
+ title_text = url.split('/')[-1] or url.split('/')[-2] or "Website"
716
+
717
+ # If we couldn't extract any meaningful content, provide a fallback
718
+ if len(cleaned_html.strip()) < 100:
719
+ website_content = f"""
720
+ WEBSITE REDESIGN - EXTRACTION FAILED
721
+ ====================================
722
 
723
  URL: {url}
724
  Title: {title_text}
 
725
 
726
+ ERROR: Could not extract meaningful HTML content from this website. This could be due to:
727
+ 1. The website uses heavy JavaScript to load content dynamically
728
+ 2. The website has anti-bot protection
729
+ 3. The website requires authentication
730
+ 4. The website is using advanced compression or encoding
731
+
732
+ FALLBACK APPROACH:
733
+ Please create a modern, responsive website design for a {title_text.lower()} website. Since I couldn't extract the original content, you can:
734
+
735
+ 1. Create a typical layout for this type of website
736
+ 2. Use placeholder content that would be appropriate
737
+ 3. Include modern design elements and responsive features
738
+ 4. Use a clean, professional design with good typography
739
+ 5. Make it mobile-friendly and accessible
740
 
741
+ The website appears to be: {title_text}
742
+ """
743
+ return website_content.strip()
744
+
745
+ # Compile the extracted content with the actual HTML code
746
+ website_content = f"""
747
+ WEBSITE REDESIGN - ORIGINAL HTML CODE
748
+ =====================================
749
 
750
+ URL: {url}
751
+ Title: {title_text}
752
+ Description: {description}
753
 
754
+ PAGE ANALYSIS:
755
  - This appears to be a {title_text.lower()} website
756
  - Contains {len(content_sections)} main content sections
757
  - Has {len(nav_links)} navigation links
758
  - Includes {len(images)} images
759
+
760
+ IMAGES FOUND (use these exact URLs in your redesign):
761
+ {chr(10).join([f"• {img['alt'] or 'Image'} - {img['src']}" for img in working_images[:20]]) if working_images else "No working images found"}
762
+
763
+ ALL IMAGES (including potentially broken ones):
764
+ {chr(10).join([f"• {img['alt'] or 'Image'} - {img['src']}" for img in images[:20]]) if images else "No images found"}
765
+
766
+ ORIGINAL HTML CODE (use this as the base for redesign):
767
+ ```html
768
+ {cleaned_html}
769
+ ```
770
+
771
+ REDESIGN INSTRUCTIONS:
772
+ Please redesign this website with a modern, responsive layout while:
773
+ 1. Preserving all the original content and structure
774
+ 2. Maintaining the same navigation and functionality
775
+ 3. Using the original images and their URLs (listed above)
776
+ 4. Creating a modern, clean design with improved typography and spacing
777
+ 5. Making it fully responsive for mobile devices
778
+ 6. Using modern CSS frameworks and best practices
779
+ 7. Keeping the same semantic structure but with enhanced styling
780
+
781
+ IMPORTANT: All image URLs in the HTML code above have been converted to absolute URLs and are ready to use. Make sure to preserve these exact image URLs in your redesigned version.
782
+
783
+ The HTML code above contains the complete original website structure with all images properly linked. Use it as your starting point and create a modernized version.
784
  """
785
 
786
  return website_content.strip()
 
916
 
917
  # URL input for website redesign
918
  website_url_input = gr.Textbox(
919
+ label="Website URL (extracts HTML for redesign)",
920
  placeholder="https://example.com",
921
  lines=1,
922
  visible=True