|
try: |
|
import aiohttp |
|
AIOHTTP_AVAILABLE = True |
|
except ImportError: |
|
AIOHTTP_AVAILABLE = False |
|
import requests |
|
|
|
from bs4 import BeautifulSoup |
|
import re |
|
import logging |
|
|
|
logger = logging.getLogger(__name__) |
|
|
|
class URLExtractorService:
    """Fetch a URL and extract its main textual content.

    Strips navigation, sidebars, scripts and comment sections, then applies
    regex-based cleanup of common blog-theme boilerplate.
    """

    # Seconds before a fetch is abandoned; prevents an unresponsive server
    # from hanging the caller forever.
    REQUEST_TIMEOUT = 30

    # CSS selector for boilerplate elements removed before text extraction.
    _NOISE_SELECTOR = 'footer, .comments, #comments, .comment, .respond, .reply, .sidebar, nav, header, script, style, [id*=comment], [class*=comment]'

    # Class-name pattern that typically marks the main article container.
    # Compiled once at class-definition time instead of on every parse.
    _CONTENT_CLASS_RE = re.compile(r'(content|post|article|entry)(-body|-content|-text)?$', re.I)

    async def extract_content(self, url: str) -> str:
        """Extract the main content from a URL.

        Args:
            url: Absolute URL of the page to fetch.

        Returns:
            The cleaned article text, or "" on any fetch/parse failure
            (errors are logged, never raised to the caller).
        """
        try:
            if AIOHTTP_AVAILABLE:
                return await self._extract_with_aiohttp(url)
            return self._extract_with_requests(url)
        except Exception as e:
            # Best-effort contract: callers always get a string, never an
            # exception.  Lazy %-formatting avoids building the message
            # unless the record is actually emitted.
            logger.error("Error extracting content from URL %s: %s", url, e)
            return ""

    async def _extract_with_aiohttp(self, url: str) -> str:
        """Extract content using aiohttp. Returns "" on non-200 responses."""
        timeout = aiohttp.ClientTimeout(total=self.REQUEST_TIMEOUT)
        async with aiohttp.ClientSession(timeout=timeout) as session:
            async with session.get(url) as response:
                if response.status != 200:
                    return ""
                html = await response.text()
                return self._parse_html(html)

    def _extract_with_requests(self, url: str) -> str:
        """Extract content using requests as fallback. Returns "" on non-200."""
        # requests has no default timeout; without one a dead server blocks
        # this call indefinitely.
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        if response.status_code != 200:
            return ""
        return self._parse_html(response.text)

    def _parse_html(self, html: str) -> str:
        """Parse HTML and extract main content as cleaned plain text."""
        soup = BeautifulSoup(html, 'html.parser')

        # Drop navigation, scripts, styles, sidebars and comment sections
        # before extracting text.
        for element in soup.select(self._NOISE_SELECTOR):
            element.decompose()

        # Prefer semantic containers, in order of specificity: <article>,
        # then a content-like class name, then <main>.  Each candidate is
        # searched exactly once (the original re-ran each find() twice).
        main_content = (
            soup.find('article')
            or soup.find(class_=self._CONTENT_CLASS_RE)
            or soup.find('main')
        )

        if main_content is not None:
            text = main_content.get_text(separator=' ', strip=True)
        else:
            # soup.body is None for documents without a <body> (fragments,
            # malformed HTML); fall back to the whole tree instead of
            # raising AttributeError.
            root = soup.body if soup.body is not None else soup
            text = root.get_text(separator=' ', strip=True)

        # Collapse runs of whitespace, then restore paragraph breaks after
        # sentence-ending punctuation.
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'(\.|\?|!)\s+', r'\1\n\n', text)

        # Strip common theme boilerplate that survives element removal.
        text = re.sub(r'Skip to (content|main).*?»', '', text)
        text = re.sub(r'Search for:.*?Search', '', text)
        text = re.sub(r'Menu.*?Resources', '', text, flags=re.DOTALL)

        # Remove trailing "N responses to ..." comment counts.
        text = re.sub(r'\d+ responses to.*?$', '', text, flags=re.DOTALL)

        # Remove comment-form prompts and everything after them.
        text = re.sub(r'(Your email address will not be published|Required fields are marked).*?$', '', text, flags=re.DOTALL)

        return text