Spaces:

dang-w
/

ai-content-summariser-api

Sleeping

Dan Walsh

Initial deployment of AI Content Summariser API

9cf5fee 5 months ago

1.36 kB

	import httpx
	from bs4 import BeautifulSoup
	import re

	class URLExtractorService:
	    """Fetch a web page and reduce it to its readable text content.

	    The service owns an ``httpx.AsyncClient`` (exposed as ``self.client``).
	    The client is closed after every ``extract_content`` call and is
	    transparently recreated on the next call, so one instance can be used
	    for any number of sequential extractions.
	    """

	    # Elements whose text is page chrome or code rather than main content.
	    _NON_CONTENT_TAGS = ["script", "style", "header", "footer", "nav"]

	    # Request timeout, in seconds, applied to every client this service builds.
	    _TIMEOUT_SECONDS = 30.0

	    # A run of two spaces separates side-by-side headlines in extracted
	    # text. Built with ``* 2`` so the width survives whitespace-collapsing
	    # tools; splitting on a single space would put every word on its own line.
	    _PHRASE_SEPARATOR = " " * 2

	    def __init__(self) -> None:
	        self.client = httpx.AsyncClient(timeout=self._TIMEOUT_SECONDS)
	        # Set once ``extract_content`` has closed ``self.client``.
	        self._client_closed = False

	    @staticmethod
	    def _clean_text(text: str) -> str:
	        """Normalise raw page text into newline-separated, non-empty phrases.

	        Each line is stripped, lines holding several phrases separated by
	        two-space runs are broken into one phrase per line, and blank
	        entries are dropped.
	        """
	        separator = URLExtractorService._PHRASE_SEPARATOR
	        stripped_lines = (line.strip() for line in text.splitlines())
	        phrases = (
	            phrase.strip()
	            for line in stripped_lines
	            for phrase in line.split(separator)
	        )
	        return "\n".join(phrase for phrase in phrases if phrase)

	    async def extract_content(self, url: str) -> str:
	        """
	        Extract the main content from a URL.

	        Args:
	            url (str): The URL to extract content from

	        Returns:
	            str: The extracted text content

	        Raises:
	            Exception: If the request, the HTTP status check, or the parsing
	                fails. The original error is attached as ``__cause__``.
	        """
	        # A previous call closed the client in its ``finally`` block; build a
	        # fresh one so the instance stays reusable instead of failing on a
	        # closed client.
	        if getattr(self, "_client_closed", False):
	            self.client = httpx.AsyncClient(timeout=self._TIMEOUT_SECONDS)
	            self._client_closed = False

	        try:
	            response = await self.client.get(url)
	            response.raise_for_status()

	            soup = BeautifulSoup(response.text, 'html.parser')

	            # Drop elements that do not carry main content before reading text.
	            for element in soup(self._NON_CONTENT_TAGS):
	                element.extract()

	            return self._clean_text(soup.get_text())
	        except Exception as e:
	            # Same exception type and message as before; ``from e`` keeps the
	            # underlying error available for debugging.
	            raise Exception(f"Failed to extract content from URL: {str(e)}") from e
	        finally:
	            await self.client.aclose()
	            self._client_closed = True