File size: 3,454 Bytes
6f0ac93
 
 
 
 
 
 
9cf5fee
 
6f0ac93
 
 
9cf5fee
 
6f0ac93
 
 
 
 
 
 
 
 
 
9cf5fee
6f0ac93
 
 
 
 
 
9cf5fee
6f0ac93
 
9cf5fee
6f0ac93
 
 
 
 
9cf5fee
6f0ac93
 
9cf5fee
6f0ac93
 
 
9cf5fee
6f0ac93
 
 
9cf5fee
6f0ac93
 
9cf5fee
6f0ac93
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
# Optional dependency guard: use aiohttp when it is installed, otherwise
# fall back to the synchronous requests library.
try:
    import aiohttp
    # Flag read by URLExtractorService.extract_content to pick the HTTP backend.
    AIOHTTP_AVAILABLE = True
except ImportError:
    AIOHTTP_AVAILABLE = False
    # requests is imported only on this fallback path.
    import requests

from bs4 import BeautifulSoup
import re
import logging

logger = logging.getLogger(__name__)

class URLExtractorService:
    """Download a web page and reduce it to its main readable text.

    The page is fetched with aiohttp when ``AIOHTTP_AVAILABLE`` is true and
    with requests otherwise. The HTML is then stripped of navigation,
    comment and other boilerplate elements before the text of the most
    likely content container is returned.
    """

    # Seconds to wait for the remote server before giving up. Without an
    # explicit timeout, requests would wait forever on a stalled connection.
    REQUEST_TIMEOUT = 30

    # Elements that typically hold comments or other non-article content.
    _NOISE_SELECTOR = (
        'footer, .comments, #comments, .comment, .respond, .reply, .sidebar, '
        'nav, header, script, style, [id*=comment], [class*=comment]'
    )

    # CSS class names commonly used for the article container.
    _CONTENT_CLASS = re.compile(
        r'(content|post|article|entry)(-body|-content|-text)?$', re.I
    )

    # Ordered (pattern, replacement) clean-up steps applied to extracted text.
    # Order matters: whitespace is collapsed before paragraph breaks are
    # inserted, and the boilerplate patterns run on the result.
    _CLEANUP_STEPS = (
        # Collapse every whitespace run into a single space.
        (re.compile(r'\s+'), ' '),
        # Start a new paragraph after each sentence terminator.
        (re.compile(r'(\.|\?|!)\s+'), r'\1\n\n'),
        # Common web page boilerplate.
        (re.compile(r'Skip to (content|main).*?»'), ''),
        (re.compile(r'Search for:.*?Search'), ''),
        (re.compile(r'Menu.*?Resources', re.DOTALL), ''),
        # Comment sections (often start with "X responses to"): drop to end.
        (re.compile(r'\d+ responses to.*?$', re.DOTALL), ''),
        # Comment form fields and subscription prompts: drop to end.
        (
            re.compile(
                r'(Your email address will not be published'
                r'|Required fields are marked).*?$',
                re.DOTALL,
            ),
            '',
        ),
    )

    async def extract_content(self, url: str) -> str:
        """Extract the main content from a URL.

        Args:
            url: Address of the page to download.

        Returns:
            The cleaned article text, or an empty string when the page could
            not be fetched or parsed (the error is logged, not raised).
        """
        try:
            if AIOHTTP_AVAILABLE:
                return await self._extract_with_aiohttp(url)

            # requests is blocking; run it in a worker thread so the event
            # loop stays responsive while the download is in progress.
            import asyncio

            loop = asyncio.get_running_loop()
            return await loop.run_in_executor(
                None, self._extract_with_requests, url
            )
        except Exception as e:
            # Best-effort API: callers get "" instead of an exception.
            logger.error("Error extracting content from URL %s: %s", url, e)
            return ""

    async def _extract_with_aiohttp(self, url: str) -> str:
        """Extract content using aiohttp.

        Returns an empty string for any non-200 response.
        """
        timeout = aiohttp.ClientTimeout(total=self.REQUEST_TIMEOUT)
        async with aiohttp.ClientSession(timeout=timeout) as session:
            async with session.get(url) as response:
                if response.status != 200:
                    return ""
                html = await response.text()

        # Parsing needs no network resources, so do it after the session closes.
        return self._parse_html(html)

    def _extract_with_requests(self, url: str) -> str:
        """Extract content using requests as fallback.

        Returns an empty string for any non-200 response.
        """
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        if response.status_code != 200:
            return ""
        return self._parse_html(response.text)

    def _parse_html(self, html: str) -> str:
        """Parse HTML and extract main content.

        Args:
            html: Raw HTML markup of the page.

        Returns:
            Cleaned text of the first matching container, tried in this
            order: ``<article>``, an element whose class matches a common
            content name, ``<main>``, ``<body>``, and finally the whole
            document.
        """
        soup = BeautifulSoup(html, 'html.parser')

        # Remove elements that typically contain comments or irrelevant content
        for element in soup.select(self._NOISE_SELECTOR):
            element.decompose()

        # Each lookup runs once; the first hit wins.
        container = soup.find('article')
        if container is None:
            container = soup.find(class_=self._CONTENT_CLASS)
        if container is None:
            container = soup.find('main')
        if container is None:
            # <body> is missing for empty/fragment documents, and can also
            # have been removed above when its own class or id contains
            # "comment"; fall back to the whole document instead of crashing.
            container = soup.body if soup.body is not None else soup

        text = container.get_text(separator=' ', strip=True)
        return self._clean_text(text)

    @staticmethod
    def _clean_text(text: str) -> str:
        """Normalise whitespace and strip known boilerplate from page text."""
        for pattern, replacement in URLExtractorService._CLEANUP_STEPS:
            text = pattern.sub(replacement, text)
        return text