import re

import markdownify
import requests
from bs4 import BeautifulSoup, Tag
from smolagents import tool

@tool
def get_wikipedia_article_sections(url: str) -> str:
""" | |
Visit a specific wikipedia article, and return a structured list of sections (table of contents) | |
describing what information can be found on this page. | |
Args: | |
url (str): The url of the wikipedia article to visit. | |
Returns: | |
str: Table of contents, or error message. | |
""" | |
if "wikipedia." not in url: | |
return f"the provided url does not appear to be a valid Wikipedia page." | |
try: | |
resp = requests.get(url) | |
except Exception as e: | |
return f"got an error: {str(e)}" | |
if resp.status_code >= 400: | |
return f"got an error (http status {resp.status_code}): {resp.text})" | |
soup = BeautifulSoup(resp.text, "html.parser") | |
    def recurse_toc(ul, level=0):
        lines = []
        for li in ul.find_all('li', recursive=False):
            a = li.find("a", class_="vector-toc-link")
            if a:
                href = a.get("href", "")
                if href.startswith("#") and href != "#":
                    indent = " " * level
                    lines.append(f"{indent}- {href[1:]}")
            # Recurse into child ULs
            child_ul = li.find("ul", recursive=False)
            if child_ul:
                lines.extend(recurse_toc(child_ul, level + 1))
        return lines
toc_ul = soup.find(id="mw-panel-toc-list") | |
if toc_ul is None: | |
return "could not extract table of contents; ensure that this is a valid link to a wikipedia article." | |
links = recurse_toc(toc_ul) | |
if not links: | |
return "table of contents is empty or could not be parsed." | |
toc_lines = ["Sections within article:"] | |
for link in links: | |
toc_lines.append(link) | |
return "\n".join(toc_lines) | |

@tool
def extract_wikipedia_section(url: str, section_id: str) -> str:
""" | |
Visit a specific wikipedia article, and return the specified section. You can get a list of available sections using get_wikipedia_article_sections | |
Args: | |
url (str): The url of the wikipedia article to visit. | |
section_id (str): The id of the section to retrieve. | |
Returns: | |
str: the contents of the section. | |
""" | |
if "wikipedia." not in url: | |
return f"the provided url does not appear to be a valid Wikipedia page." | |
try: | |
resp = requests.get(url) | |
except Exception as e: | |
return f"got an error: {str(e)}" | |
if resp.status_code >= 400: | |
return f"got an error (http status {resp.status_code}): {resp.text})" | |
    def get_classes(tag):
        if "class" not in tag.attrs:
            return []
        return [str(c) for c in tag.get("class")]

    def get_heading_hierarchy(tag) -> int | None:
        # Wikipedia wraps each heading in a <div class="mw-heading mw-headingN"> container,
        # where N encodes the section depth.
        classes = get_classes(tag)
        for c in classes:
            m = re.search(r"mw-heading(\d+)", c)
            if not m:
                continue
            return int(m.group(1))
        return None
    soup = BeautifulSoup(resp.text, "html.parser")
    section_id = section_id.strip("-").strip()
    # Find the heading element (usually an <h2>, <h3>, etc.) that carries this id
    heading = soup.find(lambda tag: tag.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6'] and tag.get("id") == section_id)
    if not heading:
        return "the specified section could not be found on this page. try using get_wikipedia_article_sections to get a list of available sections."
    parent = heading.parent
    start_hierarchy = get_heading_hierarchy(parent)
    if start_hierarchy is None:
        return "failed to get section - this is likely a bug with the tool - the wikipedia page appears to have an unexpected format"
    section_content = []
    # Gather all siblings after the heading, stopping at the next heading of equal or higher rank
    for sibling in parent.next_siblings:
        if not isinstance(sibling, Tag):
            # this element is not an html tag - probably just some text content
            section_content.append(str(sibling))
            continue
        hierarchy = get_heading_hierarchy(sibling)
        if hierarchy is None:
            # this element is not a section header - add it
            section_content.append(str(sibling))
            continue
        if hierarchy > start_hierarchy:
            # this heading is lower in the hierarchy than the requested section, so keep it
            section_content.append(str(sibling))
            continue
        break
    content_html = '\n'.join(section_content).strip()
    res = markdownify.markdownify(content_html)
    res = re.sub(r"\n{3,}", "\n\n", res)
    return res
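
# Minimal usage sketch (assumes network access; the URL and section id below are
# illustrative examples, not values taken from the original Space):
if __name__ == "__main__":
    article_url = "https://en.wikipedia.org/wiki/Web_scraping"
    # Print the article's table of contents first...
    print(get_wikipedia_article_sections(article_url))
    # ...then fetch one section by the id shown in that listing ("History" exists on many
    # articles; if it does not, the tool returns its "section could not be found" message).
    print(extract_wikipedia_section(article_url, "History"))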