import re

import markdownify
import requests
from bs4 import BeautifulSoup, Tag
from smolagents import tool

@tool
def get_wikipedia_article_sections(url: str) -> str:
""" | |
Visit a specific wikipedia article, and return a structured list of sections (table of contents) | |
describing what information can be found on this page. | |
Args: | |
url (str): The url of the wikipedia article to visit. | |
Returns: | |
str: Table of contents, or error message. | |
""" | |
if "wikipedia." not in url: | |
return f"the provided url does not appear to be a valid Wikipedia page." | |
try: | |
resp = requests.get(url) | |
except Exception as e: | |
return f"got an error: {str(e)}" | |
if resp.status_code >= 400: | |
return f"got an error (http status {resp.status_code}): {resp.text})" | |
soup = BeautifulSoup(resp.text, "html.parser") | |
    def recurse_toc(ul, level=0):
        lines = []
        for li in ul.find_all('li', recursive=False):
            a = li.find("a", class_="vector-toc-link")
            if a:
                href = a.get("href", "")
                if href.startswith("#") and href != "#":
                    indent = " " * level
                    lines.append(f"{indent}- {href[1:]}")
            # Recurse into child ULs
            child_ul = li.find("ul", recursive=False)
            if child_ul:
                lines.extend(recurse_toc(child_ul, level + 1))
        return lines
toc_ul = soup.find(id="mw-panel-toc-list") | |
if toc_ul is None: | |
return "could not extract table of contents; ensure that this is a valid link to a wikipedia article." | |
links = recurse_toc(toc_ul) | |
if not links: | |
return "table of contents is empty or could not be parsed." | |
toc_lines = ["Sections within article:"] | |
for link in links: | |
toc_lines.append(link) | |
return "\n".join(toc_lines) | |

@tool
def extract_wikipedia_section(url: str, section_id: str) -> str:
""" | |
Visit a specific wikipedia article, and return the specified section. You can get a list of available sections using get_wikipedia_article_sections | |
Args: | |
url (str): The url of the wikipedia article to visit. | |
section_id (str): The id of the section to retrieve. | |
Returns: | |
str: the contents of the section. | |
""" | |
if "wikipedia." not in url: | |
return f"the provided url does not appear to be a valid Wikipedia page." | |
try: | |
resp = requests.get(url) | |
except Exception as e: | |
return f"got an error: {str(e)}" | |
if resp.status_code >= 400: | |
return f"got an error (http status {resp.status_code}): {resp.text})" | |
    def get_classes(tag):
        if "class" not in tag.attrs:
            return []
        return [str(c) for c in tag.get("class")]

    def get_heading_hierarchy(tag) -> int | None:
        # Wikipedia wraps each heading in a <div class="mw-heading mw-headingN"> container,
        # where N encodes the section depth.
        classes = get_classes(tag)
        for c in classes:
            m = re.search(r"mw-heading(\d+)", c)
            if not m:
                continue
            return int(m.group(1))
        return None
    soup = BeautifulSoup(resp.text, "html.parser")
    section_id = section_id.strip("-").strip()
    # Find the heading element (usually an <h2>, <h3>, etc.) that carries this id
    heading = soup.find(lambda tag: tag.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6'] and tag.get("id") == section_id)
    if not heading:
        return "the specified section could not be found on this page. try using get_wikipedia_article_sections to get a list of available sections."
    parent = heading.parent
    start_hierarchy = get_heading_hierarchy(parent)
    if start_hierarchy is None:
        return "failed to get section - this is likely a bug with the tool - the wikipedia page appears to have an unexpected format"
    section_content = []
    # Gather all siblings after the heading, stopping at the next heading of equal or higher rank
    for sibling in parent.next_siblings:
        if not isinstance(sibling, Tag):
            # this element is not an html tag - probably just some text content
            section_content.append(str(sibling))
            continue
        hierarchy = get_heading_hierarchy(sibling)
        if hierarchy is None:
            # this element is not a section header - add it
            section_content.append(str(sibling))
            continue
        if hierarchy > start_hierarchy:
            # this heading is lower in the hierarchy than the requested section, so keep it
            section_content.append(str(sibling))
            continue
        break
    content_html = '\n'.join(section_content).strip()
    res = markdownify.markdownify(content_html)
    res = re.sub(r"\n{3,}", "\n\n", res)
    return res
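
# Minimal usage sketch (assumes network access; the URL and section id below are
# illustrative examples, not values taken from the original Space):
if __name__ == "__main__":
    article_url = "https://en.wikipedia.org/wiki/Web_scraping"
    # Print the article's table of contents first...
    print(get_wikipedia_article_sections(article_url))
    # ...then fetch one section by the id shown in that listing ("History" exists on many
    # articles; if it does not, the tool returns its "section could not be found" message).
    print(extract_wikipedia_section(article_url, "History"))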