# Final_Assignment/tools/wikipedia.py
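"""smolagents tools for browsing Wikipedia: list an article's table of
contents, then extract a single section as Markdown."""
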
import re

import markdownify
import requests
from bs4 import BeautifulSoup, Tag
from smolagents import tool

@tool
def get_wikipedia_article_sections(url: str) -> str:
"""
Visit a specific wikipedia article, and return a structured list of sections (table of contents)
describing what information can be found on this page.
Args:
url (str): The url of the wikipedia article to visit.
Returns:
str: Table of contents, or error message.
"""
if "wikipedia." not in url:
return f"the provided url does not appear to be a valid Wikipedia page."
try:
resp = requests.get(url)
except Exception as e:
return f"got an error: {str(e)}"
if resp.status_code >= 400:
return f"got an error (http status {resp.status_code}): {resp.text})"
soup = BeautifulSoup(resp.text, "html.parser")
def recurse_toc(ul, level=0):
lines = []
for li in ul.find_all('li', recursive=False):
a = li.find("a", class_="vector-toc-link")
if a:
href = a.get("href", "")
if href.startswith("#") and href != "#":
indent = " " * level
lines.append(f"{indent}- {href[1:]}")
# Recurse into child ULs
child_ul = li.find("ul", recursive=False)
if child_ul:
lines.extend(recurse_toc(child_ul, level + 1))
return lines
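    # The TOC container id below matches Wikipedia's Vector 2022 skin;
    # pages rendered with other skins may not expose this element.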
toc_ul = soup.find(id="mw-panel-toc-list")
if toc_ul is None:
return "could not extract table of contents; ensure that this is a valid link to a wikipedia article."
links = recurse_toc(toc_ul)
if not links:
return "table of contents is empty or could not be parsed."
toc_lines = ["Sections within article:"]
for link in links:
toc_lines.append(link)
return "\n".join(toc_lines)
@tool
def extract_wikipedia_section(url: str, section_id: str) -> str:
"""
Visit a specific wikipedia article, and return the specified section. You can get a list of available sections using get_wikipedia_article_sections
Args:
url (str): The url of the wikipedia article to visit.
section_id (str): The id of the section to retrieve.
Returns:
str: the contents of the section.
"""
if "wikipedia." not in url:
return f"the provided url does not appear to be a valid Wikipedia page."
try:
resp = requests.get(url)
except Exception as e:
return f"got an error: {str(e)}"
if resp.status_code >= 400:
return f"got an error (http status {resp.status_code}): {resp.text})"
def get_classes(tag):
if "class" not in tag.attrs:
return []
return [str(c) for c in tag.get("class")]
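    # Wikipedia wraps each heading in a <div class="mw-heading mw-headingN">
    # container, where N encodes the heading level (2 for h2, 3 for h3, ...).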
    def get_heading_hierarchy(tag) -> int | None:
        for c in get_classes(tag):
            m = re.search(r"mw-heading(\d+)", c)
            if m:
                return int(m.group(1))
        return None
soup = BeautifulSoup(resp.text, "html.parser")
section_id = section_id.strip("-").strip()
    # Find the heading element (usually an <h2>, <h3>, etc.) carrying this id
    heading = soup.find(
        lambda tag: tag.name in ["h1", "h2", "h3", "h4", "h5", "h6"]
        and tag.get("id") == section_id
    )
if not heading:
return "the specified section could not be found on this page. try using get_wikipedia_article_sections to get a list of available sections."
parent = heading.parent
start_hierarchy = get_heading_hierarchy(parent)
if start_hierarchy is None:
return "failed to get section - this is likely a bug with the tool - the wikipedia page appears to have unexpected format"
section_content = []
# Gather all siblings after the heading, stopping at the next heading of equal/higher rank
for sibling in parent.next_siblings:
        if not isinstance(sibling, Tag):
            # Not an HTML tag, probably bare text content; keep it.
            section_content.append(str(sibling))
            continue
        hierarchy = get_heading_hierarchy(sibling)
        if hierarchy is None:
            # Not a section heading, so it belongs to this section's body.
            section_content.append(str(sibling))
            continue
        if hierarchy > start_hierarchy:
            # A deeper heading, i.e. a subsection of the requested section; keep it.
            section_content.append(str(sibling))
            continue
        # A heading of equal or higher rank starts the next section.
        break
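    # Convert the gathered HTML to Markdown and collapse runs of blank lines.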
content_html = '\n'.join(section_content).strip()
res = markdownify.markdownify(content_html)
res = re.sub(r"\n{3,}", "\n\n", res)
return res
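

# Minimal usage sketch (an assumption, not part of the original tool API):
# chains the two tools on an example article. smolagents @tool objects are
# plain callables, so they can also be exercised outside an agent run.
if __name__ == "__main__":
    article = "https://en.wikipedia.org/wiki/Python_(programming_language)"
    print(get_wikipedia_article_sections(article))
    # "History" is an assumed section id; pick one from the output above.
    print(extract_wikipedia_section(article, "History"))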