Spaces:

dang-w
/

ai-content-summariser-api

Sleeping

File size: 1,360 Bytes

9cf5fee

import httpx
from bs4 import BeautifulSoup
import re

class URLExtractorService:
    """Fetch a web page and reduce it to plain, readable text.

    The service owns an ``httpx.AsyncClient`` (``self.client``). The client is
    closed at the end of every ``extract_content`` call and transparently
    recreated on the next call, so one service instance can be reused for
    several URLs without leaking connections.
    """

    # Request timeout, in seconds, applied to every client this service creates.
    _TIMEOUT_SECONDS = 30.0

    # Tags whose contents are dropped before the page text is collected.
    _STRIPPED_TAGS = ["script", "style", "header", "footer", "nav"]

    def __init__(self):
        self.client = httpx.AsyncClient(timeout=self._TIMEOUT_SECONDS)

    async def extract_content(self, url):
        """
        Extract the main content from a URL.

        Args:
            url (str): The URL to extract content from

        Returns:
            str: The extracted text content, one non-empty chunk per line

        Raises:
            Exception: If the request fails, the server answers with an error
                status, or the page cannot be parsed. The original error is
                attached as ``__cause__``.
        """
        # A previous call closed the client in its ``finally`` block; sending
        # on a closed client raises, so build a fresh one first. The strict
        # ``is True`` leaves substituted clients that lack a real boolean
        # ``is_closed`` flag untouched.
        if self.client.is_closed is True:
            self.client = httpx.AsyncClient(timeout=self._TIMEOUT_SECONDS)

        try:
            response = await self.client.get(url)
            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')

            # Remove the listed elements so their text is not collected
            for element in soup(self._STRIPPED_TAGS):
                element.extract()

            return self._clean_text(soup.get_text())
        except Exception as e:
            # Keep the generic exception type and message callers already
            # handle, but chain the cause so the real failure stays visible.
            raise Exception(f"Failed to extract content from URL: {str(e)}") from e
        finally:
            # Release pooled connections; the next call recreates the client.
            await self.client.aclose()

    @staticmethod
    def _clean_text(text):
        """Collapse raw page text into newline-separated, non-empty chunks.

        Each line is stripped, split wherever two consecutive spaces occur,
        and every resulting piece is stripped again; empty pieces are dropped.

        Args:
            text (str): Raw text as returned by ``soup.get_text()``

        Returns:
            str: The cleaned text, chunks joined with ``"\\n"``
        """
        # Break into lines and remove leading/trailing space
        lines = (line.strip() for line in text.splitlines())
        # Break double-space separated phrases into a chunk each
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        # Remove blank chunks
        return '\n'.join(chunk for chunk in chunks if chunk)