Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -4,27 +4,27 @@ from urllib.parse import urlparse, urljoin
|
|
4 |
from bs4 import BeautifulSoup
|
5 |
|
6 |
def is_valid_url(url):
|
7 |
-
"""
|
8 |
try:
|
9 |
result = urlparse(url)
|
10 |
-
return all([result.scheme, result.netloc]) #
|
11 |
except:
|
12 |
return False
|
13 |
|
14 |
def extract_additional_resources(url):
|
15 |
-
"""
|
16 |
try:
|
17 |
response = requests.get(url)
|
18 |
response.raise_for_status()
|
19 |
soup = BeautifulSoup(response.text, "html.parser")
|
20 |
|
21 |
-
#
|
22 |
css_links = [urljoin(url, link["href"]) for link in soup.find_all("link", rel="stylesheet") if "href" in link.attrs]
|
23 |
|
24 |
-
#
|
25 |
js_links = [urljoin(url, script["src"]) for script in soup.find_all("script") if "src" in script.attrs]
|
26 |
|
27 |
-
#
|
28 |
img_links = [urljoin(url, img["src"]) for img in soup.find_all("img") if "src" in img.attrs]
|
29 |
|
30 |
return css_links, js_links, img_links
|
@@ -33,84 +33,84 @@ def extract_additional_resources(url):
|
|
33 |
|
34 |
def convert_to_text(url):
|
35 |
if not is_valid_url(url):
|
36 |
-
return "
|
37 |
|
38 |
try:
|
39 |
-
#
|
40 |
headers = {
|
41 |
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
|
42 |
}
|
43 |
response = requests.get(url, headers=headers)
|
44 |
-
response.raise_for_status() #
|
45 |
|
46 |
-
#
|
47 |
-
status = f"
|
48 |
-
content_length = f"
|
49 |
results = f"{status}\n{content_length}"
|
50 |
|
51 |
-
#
|
52 |
file_path = "downloaded_content.txt"
|
53 |
with open(file_path, "w", encoding="utf-8") as file:
|
54 |
file.write(response.text)
|
55 |
|
56 |
-
#
|
57 |
css_links, js_links, img_links = extract_additional_resources(url)
|
58 |
|
59 |
return results, response.text, file_path, css_links, js_links, img_links
|
60 |
except requests.exceptions.RequestException as e:
|
61 |
-
return f"
|
62 |
|
63 |
-
# HTML
|
64 |
copy_button_html = """
|
65 |
<script>
|
66 |
function copyCode() {
|
67 |
const text = document.querySelector("#output-text textarea").value;
|
68 |
navigator.clipboard.writeText(text).then(() => {
|
69 |
-
alert("
|
70 |
}).catch(() => {
|
71 |
-
alert("
|
72 |
});
|
73 |
}
|
74 |
</script>
|
75 |
-
<button onclick="copyCode()"
|
76 |
"""
|
77 |
|
78 |
-
#
|
79 |
css = "app.css"
|
80 |
|
81 |
-
#
|
82 |
with gr.Blocks(css=css) as demo:
|
83 |
-
gr.Markdown("##
|
84 |
-
gr.Markdown("
|
85 |
|
86 |
with gr.Row():
|
87 |
-
url_input = gr.Textbox(label="
|
88 |
|
89 |
with gr.Row():
|
90 |
-
results_output = gr.Textbox(label="
|
91 |
-
text_output = gr.Textbox(label="
|
92 |
|
93 |
with gr.Row():
|
94 |
-
gr.HTML(copy_button_html) #
|
95 |
-
file_output = gr.File(label="
|
96 |
|
97 |
-
submit_button = gr.Button("
|
98 |
submit_button.click(
|
99 |
fn=convert_to_text,
|
100 |
inputs=url_input,
|
101 |
-
outputs=[results_output, text_output, file_output, gr.Textbox(label="CSS
|
102 |
)
|
103 |
|
104 |
-
#
|
105 |
-
with gr.Accordion("
|
106 |
-
gr.Markdown("### CSS
|
107 |
-
css_output = gr.Textbox(label="CSS
|
108 |
|
109 |
-
gr.Markdown("### JS
|
110 |
-
js_output = gr.Textbox(label="JS
|
111 |
|
112 |
-
gr.Markdown("###
|
113 |
-
img_output = gr.Textbox(label="
|
114 |
|
115 |
-
#
|
116 |
demo.launch()
|
|
|
4 |
from bs4 import BeautifulSoup
|
5 |
|
6 |
def is_valid_url(url):
|
7 |
+
"""Checks if the string is a valid URL."""
|
8 |
try:
|
9 |
result = urlparse(url)
|
10 |
+
return all([result.scheme, result.netloc]) # Check for scheme and domain
|
11 |
except:
|
12 |
return False
|
13 |
|
14 |
def extract_additional_resources(url):
|
15 |
+
"""Extracts links to CSS, JS, and images from HTML code."""
|
16 |
try:
|
17 |
response = requests.get(url)
|
18 |
response.raise_for_status()
|
19 |
soup = BeautifulSoup(response.text, "html.parser")
|
20 |
|
21 |
+
# Extract CSS links
|
22 |
css_links = [urljoin(url, link["href"]) for link in soup.find_all("link", rel="stylesheet") if "href" in link.attrs]
|
23 |
|
24 |
+
# Extract JS links
|
25 |
js_links = [urljoin(url, script["src"]) for script in soup.find_all("script") if "src" in script.attrs]
|
26 |
|
27 |
+
# Extract image links
|
28 |
img_links = [urljoin(url, img["src"]) for img in soup.find_all("img") if "src" in img.attrs]
|
29 |
|
30 |
return css_links, js_links, img_links
|
|
|
33 |
|
34 |
def convert_to_text(url):
    """Fetch *url* and return its text, a saved copy, and linked resources.

    Returns a 6-tuple ``(results, text, file_path, css_links, js_links,
    img_links)``: a status summary string, the response body, the path of the
    file the body was written to, and the three lists produced by
    ``extract_additional_resources``.

    On an invalid URL, a failed request, or a failed file write, the tuple is
    ``("Error: ...", "", None, [], [], [])`` so the UI always receives six
    values.
    """
    if not is_valid_url(url):
        return "Error: Please enter a valid URL.", "", None, [], [], []  # Return error message and empty data

    # Seconds to wait for the server. Without a timeout, requests can block
    # indefinitely on a stalled connection and hang the UI callback.
    request_timeout = 10

    try:
        # Set headers to mimic a browser request
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }
        response = requests.get(url, headers=headers, timeout=request_timeout)
        response.raise_for_status()  # Check for HTTP errors (e.g., 404, 500)

        # Build the status summary
        status = f"Request status: {response.status_code}"
        content_length = f"Content size: {len(response.text)} characters"
        results = f"{status}\n{content_length}"

        # Save text content to a file
        file_path = "downloaded_content.txt"
        with open(file_path, "w", encoding="utf-8") as file:
            file.write(response.text)

        # Extract additional resources
        css_links, js_links, img_links = extract_additional_resources(url)

        return results, response.text, file_path, css_links, js_links, img_links
    except requests.exceptions.RequestException as e:
        # Covers connection errors, HTTP error statuses and timeouts.
        return f"Error: {e}", "", None, [], [], []  # Return error message and empty data
    except OSError as e:
        # Writing the download file failed (e.g. read-only working directory).
        return f"Error: {e}", "", None, [], [], []
|
62 |
|
63 |
+
# HTML and JavaScript for the "Copy Code" button.
# The script reads the textarea inside the component whose elem_id is
# "output-text" (the "Text Content" textbox below) and copies its value.
# NOTE(review): some Gradio versions do not execute <script> tags injected
# through gr.HTML -- verify the button works on the deployed version.
copy_button_html = """
<script>
function copyCode() {
    const text = document.querySelector("#output-text textarea").value;
    navigator.clipboard.writeText(text).then(() => {
        alert("Text copied to clipboard!");
    }).catch(() => {
        alert("Failed to copy text.");
    });
}
</script>
<button onclick="copyCode()">Copy Code</button>
"""

# Link to the CSS file
# NOTE(review): whether `css=` accepts a file path (rather than raw CSS text)
# depends on the Gradio version -- confirm against the installed release.
css = "app.css"

# Create the Gradio interface
with gr.Blocks(css=css) as demo:
    gr.Markdown("## URL to Text Converter")
    gr.Markdown("Enter a URL to fetch its text content and download it as a .txt file.")

    with gr.Row():
        url_input = gr.Textbox(label="Enter URL", placeholder="https://example.com")

    with gr.Row():
        results_output = gr.Textbox(label="Request Results", interactive=False)
        text_output = gr.Textbox(label="Text Content", interactive=True, elem_id="output-text")

    with gr.Row():
        gr.HTML(copy_button_html)  # Add the "Copy Code" button
        file_output = gr.File(label="Download File", visible=False)  # Hidden file download component

    submit_button = gr.Button("Fetch Content")

    # Accordion to show/hide additional resources. It is defined before the
    # click handler so its textboxes can be used as the handler's outputs.
    with gr.Accordion("Show/Hide Additional Resources", open=False):
        gr.Markdown("### CSS Files")
        css_output = gr.Textbox(label="CSS Files", interactive=False)

        gr.Markdown("### JS Files")
        js_output = gr.Textbox(label="JS Files", interactive=False)

        gr.Markdown("### Images")
        img_output = gr.Textbox(label="Images", interactive=False)

    # convert_to_text returns six values, one per output component, in order.
    # The resource lists go to the accordion textboxes rather than to newly
    # created anonymous textboxes, which would leave the accordion empty.
    submit_button.click(
        fn=convert_to_text,
        inputs=url_input,
        outputs=[results_output, text_output, file_output, css_output, js_output, img_output],
    )

# Launch the interface
demo.launch()
|