Spaces:

Felguk
/

Felguk-url-to-text

Running

Felguk committed on Jan 17

Commit

4e43700

verified ·

1 Parent(s): 4f25906

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -26,22 +26,28 @@ async def extract_additional_resources(url):
     try:
         response = await asyncio.to_thread(requests.get, url, timeout=5)
         response.raise_for_status()
-        soup = BeautifulSoup(response.text, "html.parser")
-        # Extract CSS links (limit to 5)
-        css_links = [urljoin(url, link["href"]) for link in soup.find_all("link", rel="stylesheet") if "href" in link.attrs][:5]
-        # Extract JS links (limit to 5)
-        js_links = [urljoin(url, script["src"]) for script in soup.find_all("script") if "src" in script.attrs][:5]
-        # Extract image links (limit to 5)
-        img_links = [urljoin(url, img["src"]) for img in soup.find_all("img") if "src" in img.attrs][:5]
-        # Fetch CSS and JS content asynchronously
-        css_content = await asyncio.gather(*[fetch_file_content(link) for link in css_links])
-        js_content = await asyncio.gather(*[fetch_file_content(link) for link in js_links])
-        return css_links, js_links, img_links, css_content, js_content
     except Exception as e:
         return [], [], [], [], []

     try:
         response = await asyncio.to_thread(requests.get, url, timeout=5)
         response.raise_for_status()
+        # Check if the content is HTML
+        if 'text/html' in response.headers.get('Content-Type', ''):
+            soup = BeautifulSoup(response.text, "html.parser")
+            # Extract CSS links (limit to 5)
+            css_links = [urljoin(url, link["href"]) for link in soup.find_all("link", rel="stylesheet") if "href" in link.attrs][:5]
+            # Extract JS links (limit to 5)
+            js_links = [urljoin(url, script["src"]) for script in soup.find_all("script") if "src" in script.attrs][:5]
+            # Extract image links (limit to 5)
+            img_links = [urljoin(url, img["src"]) for img in soup.find_all("img") if "src" in img.attrs][:5]
+            # Fetch CSS and JS content asynchronously
+            css_content = await asyncio.gather(*[fetch_file_content(link) for link in css_links])
+            js_content = await asyncio.gather(*[fetch_file_content(link) for link in js_links])
+            return css_links, js_links, img_links, css_content, js_content
+        else:
+            # If it's not HTML, treat it as a file
+            return [], [], [], [response.text], []
     except Exception as e:
         return [], [], [], [], []