File size: 1,360 Bytes
9cf5fee
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import httpx
from bs4 import BeautifulSoup
import re

class URLExtractorService:
    """Fetch a web page and reduce it to plain text.

    The service owns one ``httpx.AsyncClient`` that is reused across calls to
    :meth:`extract_content`. Release it explicitly when the service is no
    longer needed, either with ``await service.aclose()`` or by using the
    service as an async context manager::

        async with URLExtractorService() as service:
            text = await service.extract_content(url)
    """

    def __init__(self, timeout=30.0):
        """
        Create the service and its HTTP client.

        Args:
            timeout (float): Timeout in seconds handed to the HTTP client.
        """
        self.client = httpx.AsyncClient(timeout=timeout)

    async def __aenter__(self):
        """Return the service itself for use in ``async with``."""
        return self

    async def __aexit__(self, exc_type, exc, tb):
        """Close the HTTP client when the ``async with`` block exits."""
        await self.aclose()

    async def aclose(self):
        """Close the underlying HTTP client."""
        await self.client.aclose()

    async def extract_content(self, url):
        """
        Extract the text content from a URL.

        Args:
            url (str): The URL to extract content from

        Returns:
            str: The extracted text, one non-empty chunk per line

        Raises:
            Exception: If the request, the status check, or the parsing
                fails. The original error is attached as ``__cause__``.
        """
        try:
            response = await self.client.get(url)
            response.raise_for_status()
            return self._html_to_text(response.text)
        except Exception as e:
            raise Exception(f"Failed to extract content from URL: {e}") from e
        # The client is intentionally NOT closed here: it is shared by every
        # call on this instance, and closing it after the first request made
        # all later requests fail. Cleanup happens in aclose().

    @classmethod
    def _html_to_text(cls, html):
        """Parse *html*, drop non-content tags and return the cleaned text."""
        soup = BeautifulSoup(html, 'html.parser')

        # Remove script, style, header, footer and nav elements
        for tag in soup(["script", "style", "header", "footer", "nav"]):
            tag.extract()

        return cls._clean_text(soup.get_text())

    @staticmethod
    def _clean_text(text):
        """Strip each line, split on double spaces and drop empty chunks."""
        # Break into lines and remove leading/trailing space
        lines = (line.strip() for line in text.splitlines())
        # Treat a run of two spaces as a separator and put each piece on its own line
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        # Remove blank lines
        return '\n'.join(chunk for chunk in chunks if chunk)