Spaces:
Running
Running
update
Browse files
app.py
CHANGED
@@ -29,11 +29,15 @@ When asked to create an application, you should:
|
|
29 |
5. Ensure the code is functional and follows best practices
|
30 |
|
31 |
For website redesign tasks:
|
32 |
-
-
|
33 |
-
-
|
34 |
-
-
|
|
|
|
|
35 |
- Use modern CSS frameworks and design patterns
|
36 |
- Ensure accessibility and mobile responsiveness
|
|
|
|
|
37 |
|
38 |
If an image is provided, analyze it and use the visual information to better understand the user's requirements.
|
39 |
|
@@ -52,12 +56,16 @@ When asked to create an application, you should:
|
|
52 |
6. Ensure the code is functional and follows best practices
|
53 |
|
54 |
For website redesign tasks:
|
55 |
-
-
|
|
|
|
|
|
|
56 |
- Use web search to find current design trends and best practices for the specific type of website
|
57 |
-
- Create a modern, responsive design
|
58 |
-
- Maintain the core functionality and content while enhancing the user experience
|
59 |
- Use modern CSS frameworks and design patterns
|
60 |
- Ensure accessibility and mobile responsiveness
|
|
|
|
|
61 |
|
62 |
If an image is provided, analyze it and use the visual information to better understand the user's requirements.
|
63 |
|
@@ -462,7 +470,7 @@ def extract_text_from_file(file_path):
|
|
462 |
return f"Error extracting text: {e}"
|
463 |
|
464 |
def extract_website_content(url: str) -> str:
|
465 |
-
"""Extract content from a website URL"""
|
466 |
try:
|
467 |
# Validate URL
|
468 |
parsed_url = urlparse(url)
|
@@ -508,12 +516,42 @@ def extract_website_content(url: str) -> str:
|
|
508 |
else:
|
509 |
raise
|
510 |
|
511 |
-
#
|
512 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
513 |
|
514 |
-
#
|
515 |
-
|
516 |
-
|
|
|
|
|
517 |
|
518 |
# Extract title
|
519 |
title = soup.find('title')
|
@@ -523,10 +561,8 @@ def extract_website_content(url: str) -> str:
|
|
523 |
meta_desc = soup.find('meta', attrs={'name': 'description'})
|
524 |
description = meta_desc.get('content', '') if meta_desc else ""
|
525 |
|
526 |
-
# Extract main content areas
|
527 |
content_sections = []
|
528 |
-
|
529 |
-
# Look for common content containers
|
530 |
main_selectors = [
|
531 |
'main', 'article', '.content', '.main-content', '.post-content',
|
532 |
'#content', '#main', '.entry-content', '.post-body'
|
@@ -539,16 +575,7 @@ def extract_website_content(url: str) -> str:
|
|
539 |
if len(text) > 100: # Only include substantial content
|
540 |
content_sections.append(text)
|
541 |
|
542 |
-
#
|
543 |
-
if not content_sections:
|
544 |
-
body = soup.find('body')
|
545 |
-
if body:
|
546 |
-
# Remove navigation, footer, and other non-content elements
|
547 |
-
for element in body.find_all(['nav', 'footer', 'header', 'aside']):
|
548 |
-
element.decompose()
|
549 |
-
content_sections.append(body.get_text().strip())
|
550 |
-
|
551 |
-
# Extract navigation links
|
552 |
nav_links = []
|
553 |
nav_elements = soup.find_all(['nav', 'header'])
|
554 |
for nav in nav_elements:
|
@@ -559,41 +586,201 @@ def extract_website_content(url: str) -> str:
|
|
559 |
if link_text and link_href:
|
560 |
nav_links.append(f"{link_text}: {link_href}")
|
561 |
|
562 |
-
# Extract
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
563 |
images = []
|
564 |
img_elements = soup.find_all('img')
|
565 |
for img in img_elements:
|
566 |
src = img.get('src', '')
|
567 |
alt = img.get('alt', '')
|
568 |
if src:
|
569 |
-
|
570 |
-
if not src.startswith(('http://', 'https://')):
|
571 |
-
src = urljoin(url, src)
|
572 |
-
images.append(f"Image: {alt} ({src})")
|
573 |
|
574 |
-
#
|
575 |
-
|
576 |
-
|
577 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
578 |
|
579 |
URL: {url}
|
580 |
Title: {title_text}
|
581 |
-
Description: {description}
|
582 |
|
583 |
-
|
584 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
585 |
|
586 |
-
|
587 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
588 |
|
589 |
-
|
590 |
-
|
|
|
591 |
|
592 |
-
PAGE
|
593 |
- This appears to be a {title_text.lower()} website
|
594 |
- Contains {len(content_sections)} main content sections
|
595 |
- Has {len(nav_links)} navigation links
|
596 |
- Includes {len(images)} images
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
597 |
"""
|
598 |
|
599 |
return website_content.strip()
|
@@ -729,7 +916,7 @@ with gr.Blocks(
|
|
729 |
|
730 |
# URL input for website redesign
|
731 |
website_url_input = gr.Textbox(
|
732 |
-
label="Website URL (for redesign)",
|
733 |
placeholder="https://example.com",
|
734 |
lines=1,
|
735 |
visible=True
|
|
|
29 |
5. Ensure the code is functional and follows best practices
|
30 |
|
31 |
For website redesign tasks:
|
32 |
+
- Use the provided original HTML code as the starting point for redesign
|
33 |
+
- Preserve all original content, structure, and functionality
|
34 |
+
- Keep the same semantic HTML structure but enhance the styling
|
35 |
+
- Reuse all original images and their URLs from the HTML code
|
36 |
+
- Create a modern, responsive design with improved typography and spacing
|
37 |
- Use modern CSS frameworks and design patterns
|
38 |
- Ensure accessibility and mobile responsiveness
|
39 |
+
- Maintain the same navigation and user flow
|
40 |
+
- Enhance the visual design while keeping the original layout structure
|
41 |
|
42 |
If an image is provided, analyze it and use the visual information to better understand the user's requirements.
|
43 |
|
|
|
56 |
6. Ensure the code is functional and follows best practices
|
57 |
|
58 |
For website redesign tasks:
|
59 |
+
- Use the provided original HTML code as the starting point for redesign
|
60 |
+
- Preserve all original content, structure, and functionality
|
61 |
+
- Keep the same semantic HTML structure but enhance the styling
|
62 |
+
- Reuse all original images and their URLs from the HTML code
|
63 |
- Use web search to find current design trends and best practices for the specific type of website
|
64 |
+
- Create a modern, responsive design with improved typography and spacing
|
|
|
65 |
- Use modern CSS frameworks and design patterns
|
66 |
- Ensure accessibility and mobile responsiveness
|
67 |
+
- Maintain the same navigation and user flow
|
68 |
+
- Enhance the visual design while keeping the original layout structure
|
69 |
|
70 |
If an image is provided, analyze it and use the visual information to better understand the user's requirements.
|
71 |
|
|
|
470 |
return f"Error extracting text: {e}"
|
471 |
|
472 |
def extract_website_content(url: str) -> str:
|
473 |
+
"""Extract HTML code and content from a website URL"""
|
474 |
try:
|
475 |
# Validate URL
|
476 |
parsed_url = urlparse(url)
|
|
|
516 |
else:
|
517 |
raise
|
518 |
|
519 |
+
# Get the raw HTML content with proper encoding
|
520 |
+
try:
|
521 |
+
# Try to get the content with automatic encoding detection
|
522 |
+
response.encoding = response.apparent_encoding
|
523 |
+
raw_html = response.text
|
524 |
+
except:
|
525 |
+
# Fallback to UTF-8 if encoding detection fails
|
526 |
+
raw_html = response.content.decode('utf-8', errors='ignore')
|
527 |
+
|
528 |
+
# Debug: Check if we got valid HTML
|
529 |
+
if not raw_html.strip().startswith('<!DOCTYPE') and not raw_html.strip().startswith('<html'):
|
530 |
+
print(f"Warning: Response doesn't look like HTML. First 200 chars: {raw_html[:200]}")
|
531 |
+
print(f"Response headers: {dict(response.headers)}")
|
532 |
+
print(f"Response encoding: {response.encoding}")
|
533 |
+
print(f"Apparent encoding: {response.apparent_encoding}")
|
534 |
+
|
535 |
+
# Try alternative approaches
|
536 |
+
try:
|
537 |
+
raw_html = response.content.decode('latin-1', errors='ignore')
|
538 |
+
print("Tried latin-1 decoding")
|
539 |
+
except:
|
540 |
+
try:
|
541 |
+
raw_html = response.content.decode('utf-8', errors='ignore')
|
542 |
+
print("Tried UTF-8 decoding")
|
543 |
+
except:
|
544 |
+
raw_html = response.content.decode('cp1252', errors='ignore')
|
545 |
+
print("Tried cp1252 decoding")
|
546 |
+
|
547 |
+
# Parse HTML content for analysis
|
548 |
+
soup = BeautifulSoup(raw_html, 'html.parser')
|
549 |
|
550 |
+
# Check if this is a JavaScript-heavy site
|
551 |
+
script_tags = soup.find_all('script')
|
552 |
+
if len(script_tags) > 10:
|
553 |
+
print(f"Warning: This site has {len(script_tags)} script tags - it may be a JavaScript-heavy site")
|
554 |
+
print("The content might be loaded dynamically and not available in the initial HTML")
|
555 |
|
556 |
# Extract title
|
557 |
title = soup.find('title')
|
|
|
561 |
meta_desc = soup.find('meta', attrs={'name': 'description'})
|
562 |
description = meta_desc.get('content', '') if meta_desc else ""
|
563 |
|
564 |
+
# Extract main content areas for analysis
|
565 |
content_sections = []
|
|
|
|
|
566 |
main_selectors = [
|
567 |
'main', 'article', '.content', '.main-content', '.post-content',
|
568 |
'#content', '#main', '.entry-content', '.post-body'
|
|
|
575 |
if len(text) > 100: # Only include substantial content
|
576 |
content_sections.append(text)
|
577 |
|
578 |
+
# Extract navigation links for analysis
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
579 |
nav_links = []
|
580 |
nav_elements = soup.find_all(['nav', 'header'])
|
581 |
for nav in nav_elements:
|
|
|
586 |
if link_text and link_href:
|
587 |
nav_links.append(f"{link_text}: {link_href}")
|
588 |
|
589 |
+
# Extract and fix image URLs in the HTML
|
590 |
+
img_elements = soup.find_all('img')
|
591 |
+
for img in img_elements:
|
592 |
+
src = img.get('src', '')
|
593 |
+
if src:
|
594 |
+
# Handle different URL formats
|
595 |
+
if src.startswith('//'):
|
596 |
+
# Protocol-relative URL
|
597 |
+
absolute_src = 'https:' + src
|
598 |
+
img['src'] = absolute_src
|
599 |
+
elif src.startswith('/'):
|
600 |
+
# Root-relative URL
|
601 |
+
absolute_src = urljoin(url, src)
|
602 |
+
img['src'] = absolute_src
|
603 |
+
elif not src.startswith(('http://', 'https://')):
|
604 |
+
# Relative URL
|
605 |
+
absolute_src = urljoin(url, src)
|
606 |
+
img['src'] = absolute_src
|
607 |
+
# If it's already absolute, keep it as is
|
608 |
+
|
609 |
+
# Also check for data-src (lazy loading) and other common attributes
|
610 |
+
data_src = img.get('data-src', '')
|
611 |
+
if data_src and not src:
|
612 |
+
# Use data-src if src is empty
|
613 |
+
if data_src.startswith('//'):
|
614 |
+
absolute_data_src = 'https:' + data_src
|
615 |
+
img['src'] = absolute_data_src
|
616 |
+
elif data_src.startswith('/'):
|
617 |
+
absolute_data_src = urljoin(url, data_src)
|
618 |
+
img['src'] = absolute_data_src
|
619 |
+
elif not data_src.startswith(('http://', 'https://')):
|
620 |
+
absolute_data_src = urljoin(url, data_src)
|
621 |
+
img['src'] = absolute_data_src
|
622 |
+
else:
|
623 |
+
img['src'] = data_src
|
624 |
+
|
625 |
+
# Also fix background image URLs in style attributes
|
626 |
+
elements_with_style = soup.find_all(attrs={'style': True})
|
627 |
+
for element in elements_with_style:
|
628 |
+
style_attr = element.get('style', '')
|
629 |
+
# Find and replace relative URLs in background-image
|
630 |
+
import re
|
631 |
+
bg_pattern = r'background-image:\s*url\(["\']?([^"\']+)["\']?\)'
|
632 |
+
matches = re.findall(bg_pattern, style_attr, re.IGNORECASE)
|
633 |
+
for match in matches:
|
634 |
+
if match:
|
635 |
+
if match.startswith('//'):
|
636 |
+
absolute_bg = 'https:' + match
|
637 |
+
style_attr = style_attr.replace(match, absolute_bg)
|
638 |
+
elif match.startswith('/'):
|
639 |
+
absolute_bg = urljoin(url, match)
|
640 |
+
style_attr = style_attr.replace(match, absolute_bg)
|
641 |
+
elif not match.startswith(('http://', 'https://')):
|
642 |
+
absolute_bg = urljoin(url, match)
|
643 |
+
style_attr = style_attr.replace(match, absolute_bg)
|
644 |
+
element['style'] = style_attr
|
645 |
+
|
646 |
+
# Fix background images in <style> tags
|
647 |
+
style_elements = soup.find_all('style')
|
648 |
+
for style in style_elements:
|
649 |
+
if style.string:
|
650 |
+
style_content = style.string
|
651 |
+
# Find and replace relative URLs in background-image
|
652 |
+
bg_pattern = r'background-image:\s*url\(["\']?([^"\']+)["\']?\)'
|
653 |
+
matches = re.findall(bg_pattern, style_content, re.IGNORECASE)
|
654 |
+
for match in matches:
|
655 |
+
if match:
|
656 |
+
if match.startswith('//'):
|
657 |
+
absolute_bg = 'https:' + match
|
658 |
+
style_content = style_content.replace(match, absolute_bg)
|
659 |
+
elif match.startswith('/'):
|
660 |
+
absolute_bg = urljoin(url, match)
|
661 |
+
style_content = style_content.replace(match, absolute_bg)
|
662 |
+
elif not match.startswith(('http://', 'https://')):
|
663 |
+
absolute_bg = urljoin(url, match)
|
664 |
+
style_content = style_content.replace(match, absolute_bg)
|
665 |
+
style.string = style_content
|
666 |
+
|
667 |
+
# Extract images for analysis (after fixing URLs)
|
668 |
images = []
|
669 |
img_elements = soup.find_all('img')
|
670 |
for img in img_elements:
|
671 |
src = img.get('src', '')
|
672 |
alt = img.get('alt', '')
|
673 |
if src:
|
674 |
+
images.append({'src': src, 'alt': alt})
|
|
|
|
|
|
|
675 |
|
676 |
+
# Debug: Print some image URLs to see what we're getting
|
677 |
+
print(f"Found {len(images)} images:")
|
678 |
+
for i, img in enumerate(images[:5]): # Show first 5 images
|
679 |
+
print(f" {i+1}. {img['alt'] or 'No alt'} - {img['src']}")
|
680 |
+
|
681 |
+
# Test a few image URLs to see if they're accessible
|
682 |
+
def test_image_url(img_url):
    """Return True when a HEAD request to *img_url* answers with HTTP 200.

    Redirects are followed and the request is given a 5-second timeout.
    A request that fails (for example a malformed URL, a connection error
    or a timeout) is reported as False rather than raised.
    """
    try:
        test_response = requests.head(img_url, timeout=5, allow_redirects=True)
    except (requests.RequestException, ValueError):
        # Best-effort probe: an unreachable or malformed URL just counts as
        # broken. Catching only request/value errors (instead of a bare
        # ``except``) lets KeyboardInterrupt and SystemExit propagate.
        return False
    return test_response.status_code == 200
|
688 |
+
|
689 |
+
# Test first few images
|
690 |
+
working_images = []
|
691 |
+
for img in images[:10]: # Test first 10 images
|
692 |
+
if test_image_url(img['src']):
|
693 |
+
working_images.append(img)
|
694 |
+
else:
|
695 |
+
print(f" ❌ Broken image: {img['src']}")
|
696 |
+
|
697 |
+
print(f"Working images: {len(working_images)} out of {len(images)}")
|
698 |
+
|
699 |
+
# Get the modified HTML with absolute URLs
|
700 |
+
modified_html = str(soup)
|
701 |
+
|
702 |
+
# Clean and format the HTML for better readability
|
703 |
+
# Remove unnecessary whitespace and comments
|
704 |
+
import re
|
705 |
+
cleaned_html = re.sub(r'<!--.*?-->', '', modified_html, flags=re.DOTALL) # Remove HTML comments
|
706 |
+
cleaned_html = re.sub(r'\s+', ' ', cleaned_html) # Normalize whitespace
|
707 |
+
cleaned_html = re.sub(r'>\s+<', '><', cleaned_html) # Remove whitespace between tags
|
708 |
+
|
709 |
+
# Limit HTML size to avoid token limits (keep first 15000 chars)
|
710 |
+
if len(cleaned_html) > 15000:
|
711 |
+
cleaned_html = cleaned_html[:15000] + "\n<!-- ... HTML truncated for length ... -->"
|
712 |
+
|
713 |
+
# Check if we got any meaningful content
|
714 |
+
if not title_text or title_text == "No title found":
|
715 |
+
title_text = url.split('/')[-1] or url.split('/')[-2] or "Website"
|
716 |
+
|
717 |
+
# If we couldn't extract any meaningful content, provide a fallback
|
718 |
+
if len(cleaned_html.strip()) < 100:
|
719 |
+
website_content = f"""
|
720 |
+
WEBSITE REDESIGN - EXTRACTION FAILED
|
721 |
+
====================================
|
722 |
|
723 |
URL: {url}
|
724 |
Title: {title_text}
|
|
|
725 |
|
726 |
+
ERROR: Could not extract meaningful HTML content from this website. This could be due to:
|
727 |
+
1. The website uses heavy JavaScript to load content dynamically
|
728 |
+
2. The website has anti-bot protection
|
729 |
+
3. The website requires authentication
|
730 |
+
4. The website is using advanced compression or encoding
|
731 |
+
|
732 |
+
FALLBACK APPROACH:
|
733 |
+
Please create a modern, responsive website design for a {title_text.lower()} website. Since I couldn't extract the original content, you can:
|
734 |
+
|
735 |
+
1. Create a typical layout for this type of website
|
736 |
+
2. Use placeholder content that would be appropriate
|
737 |
+
3. Include modern design elements and responsive features
|
738 |
+
4. Use a clean, professional design with good typography
|
739 |
+
5. Make it mobile-friendly and accessible
|
740 |
|
741 |
+
The website appears to be: {title_text}
|
742 |
+
"""
|
743 |
+
return website_content.strip()
|
744 |
+
|
745 |
+
# Compile the extracted content with the actual HTML code
|
746 |
+
website_content = f"""
|
747 |
+
WEBSITE REDESIGN - ORIGINAL HTML CODE
|
748 |
+
=====================================
|
749 |
|
750 |
+
URL: {url}
|
751 |
+
Title: {title_text}
|
752 |
+
Description: {description}
|
753 |
|
754 |
+
PAGE ANALYSIS:
|
755 |
- This appears to be a {title_text.lower()} website
|
756 |
- Contains {len(content_sections)} main content sections
|
757 |
- Has {len(nav_links)} navigation links
|
758 |
- Includes {len(images)} images
|
759 |
+
|
760 |
+
IMAGES FOUND (use these exact URLs in your redesign):
|
761 |
+
{chr(10).join([f"• {img['alt'] or 'Image'} - {img['src']}" for img in working_images[:20]]) if working_images else "No working images found"}
|
762 |
+
|
763 |
+
ALL IMAGES (including potentially broken ones):
|
764 |
+
{chr(10).join([f"• {img['alt'] or 'Image'} - {img['src']}" for img in images[:20]]) if images else "No images found"}
|
765 |
+
|
766 |
+
ORIGINAL HTML CODE (use this as the base for redesign):
|
767 |
+
```html
|
768 |
+
{cleaned_html}
|
769 |
+
```
|
770 |
+
|
771 |
+
REDESIGN INSTRUCTIONS:
|
772 |
+
Please redesign this website with a modern, responsive layout while:
|
773 |
+
1. Preserving all the original content and structure
|
774 |
+
2. Maintaining the same navigation and functionality
|
775 |
+
3. Using the original images and their URLs (listed above)
|
776 |
+
4. Creating a modern, clean design with improved typography and spacing
|
777 |
+
5. Making it fully responsive for mobile devices
|
778 |
+
6. Using modern CSS frameworks and best practices
|
779 |
+
7. Keeping the same semantic structure but with enhanced styling
|
780 |
+
|
781 |
+
IMPORTANT: All image URLs in the HTML code above have been converted to absolute URLs and are ready to use. Make sure to preserve these exact image URLs in your redesigned version.
|
782 |
+
|
783 |
+
The HTML code above contains the complete original website structure with all images properly linked. Use it as your starting point and create a modernized version.
|
784 |
"""
|
785 |
|
786 |
return website_content.strip()
|
|
|
916 |
|
917 |
# URL input for website redesign
|
918 |
website_url_input = gr.Textbox(
|
919 |
+
label="Website URL (extracts HTML for redesign)",
|
920 |
placeholder="https://example.com",
|
921 |
lines=1,
|
922 |
visible=True
|