# Dan Walsh
# Initial deployment of AI Content Summariser API
# 9cf5fee
# raw
# history blame
# 1.36 kB
import httpx
from bs4 import BeautifulSoup
import re
class URLExtractorService:
    """Download a web page and reduce it to its visible text.

    The service owns one ``httpx.AsyncClient``. The client is closed at the
    end of every ``extract_content`` call and transparently reopened at the
    start of the next one, so a single instance can serve many sequential
    calls without leaking connections.
    """

    # Request timeout, in seconds, for every client this service creates.
    _TIMEOUT_SECONDS = 30.0

    def __init__(self):
        self.client = httpx.AsyncClient(timeout=self._TIMEOUT_SECONDS)

    async def extract_content(self, url):
        """
        Extract the main content from a URL.

        Args:
            url (str): The URL to extract content from

        Returns:
            str: The extracted text content, one non-empty phrase per line

        Raises:
            Exception: If the request fails, the server answers with an
                error status, or the page cannot be processed. The
                underlying error is attached as ``__cause__``.
        """
        # The previous call closed the client in its ``finally`` block; a
        # closed httpx client refuses to send requests, so open a fresh one.
        if self.client.is_closed:
            self.client = httpx.AsyncClient(timeout=self._TIMEOUT_SECONDS)

        try:
            response = await self.client.get(url)
            # Turn 4xx/5xx answers into exceptions instead of parsing an
            # error page as if it were content.
            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')

            # Drop non-content elements (code, styling and page chrome).
            for element in soup(["script", "style", "header", "footer", "nav"]):
                element.extract()

            text = soup.get_text()

            # Trim every line, then split on double spaces so that several
            # headlines sharing one line each get their own line. Splitting
            # on a single space would put every word on a separate line.
            lines = (line.strip() for line in text.splitlines())
            chunks = (
                phrase.strip()
                for line in lines
                for phrase in line.split("  ")
            )

            # Keep only the non-blank pieces.
            return '\n'.join(chunk for chunk in chunks if chunk)
        except Exception as e:
            # Same exception type and message as before; ``from e`` keeps
            # the original error reachable for debugging.
            raise Exception(f"Failed to extract content from URL: {e}") from e
        finally:
            await self.client.aclose()