import re

import markdownify
import requests
from bs4 import BeautifulSoup, Tag
from smolagents import tool


@tool
def get_wikipedia_article_sections(url: str) -> str:
    """
    Visit a specific Wikipedia article and return a structured list of its sections
    (a table of contents) describing what information can be found on the page.

    Args:
        url (str): The url of the wikipedia article to visit.

    Returns:
        str: Table of contents, or an error message.
    """
    if "wikipedia." not in url:
        return "the provided url does not appear to be a valid Wikipedia page."
    try:
        resp = requests.get(url, timeout=30)
    except Exception as e:
        return f"got an error: {str(e)}"
    if resp.status_code >= 400:
        return f"got an error (http status {resp.status_code}): {resp.text}"
    soup = BeautifulSoup(resp.text, "html.parser")
    def recurse_toc(ul, level=0):
        """Walk the nested <ul> table-of-contents tree, one indented line per section."""
        lines = []
        for li in ul.find_all("li", recursive=False):
            a = li.find("a", class_="vector-toc-link")
            if a:
                href = a.get("href", "")
                if href.startswith("#") and href != "#":
                    indent = " " * level
                    lines.append(f"{indent}- {href[1:]}")
            # Recurse into child <ul>s to pick up subsections
            child_ul = li.find("ul", recursive=False)
            if child_ul:
                lines.extend(recurse_toc(child_ul, level + 1))
        return lines
    toc_ul = soup.find(id="mw-panel-toc-list")
    if toc_ul is None:
        return "could not extract table of contents; ensure that this is a valid link to a wikipedia article."
    links = recurse_toc(toc_ul)
    if not links:
        return "table of contents is empty or could not be parsed."
    return "\n".join(["Sections within article:"] + links)
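

# A quick sanity check for the tool above could look like this (the article
# URL is just an illustrative example; a runnable demo of both tools sits at
# the bottom of the file):
#
#     print(get_wikipedia_article_sections(
#         url="https://en.wikipedia.org/wiki/Python_(programming_language)"))
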
@tool
def extract_wikipedia_section(url: str, section_id: str) -> str:
    """
    Visit a specific Wikipedia article and return the specified section. You can
    get a list of available section ids using get_wikipedia_article_sections.

    Args:
        url (str): The url of the wikipedia article to visit.
        section_id (str): The id of the section to retrieve.

    Returns:
        str: The contents of the section.
    """
    if "wikipedia." not in url:
        return "the provided url does not appear to be a valid Wikipedia page."
    try:
        resp = requests.get(url, timeout=30)
    except Exception as e:
        return f"got an error: {str(e)}"
    if resp.status_code >= 400:
        return f"got an error (http status {resp.status_code}): {resp.text}"

    def get_classes(tag):
        if "class" not in tag.attrs:
            return []
        return [str(c) for c in tag.get("class")]

    def get_heading_hierarchy(tag):
        # Wikipedia wraps each heading in a <div class="mw-heading mw-headingN">,
        # where N encodes the heading level (2 for <h2>, 3 for <h3>, ...).
        for c in get_classes(tag):
            m = re.search(r"mw-heading(\d+)", c)
            if m:
                return int(m.group(1))
        return None

    soup = BeautifulSoup(resp.text, "html.parser")
    section_id = section_id.strip("-").strip()
    # Find the heading element (usually an <h2>, <h3>, etc.) carrying this id
    heading = soup.find(
        lambda tag: tag.name in ["h1", "h2", "h3", "h4", "h5", "h6"]
        and tag.get("id") == section_id
    )
    if not heading:
        return "the specified section could not be found on this page. try using get_wikipedia_article_sections to get a list of available sections."
    parent = heading.parent
    start_hierarchy = get_heading_hierarchy(parent)
    if start_hierarchy is None:
        return "failed to get section - this is likely a bug with the tool - the wikipedia page appears to have an unexpected format"
    section_content = []
    # Gather all siblings after the heading, stopping at the next heading of equal or higher rank
    for sibling in parent.next_siblings:
        if not isinstance(sibling, Tag):
            # not an html tag - probably bare text content
            section_content.append(str(sibling))
            continue
        hierarchy = get_heading_hierarchy(sibling)
        if hierarchy is None:
            # not a section heading - keep it
            section_content.append(str(sibling))
            continue
        if hierarchy > start_hierarchy:
            # a deeper heading, i.e. a subsection of the requested section - keep it
            section_content.append(str(sibling))
            continue
        break
    content_html = "\n".join(section_content).strip()
    res = markdownify.markdownify(content_html)
    # Collapse runs of blank lines left over from the HTML-to-markdown conversion
    res = re.sub(r"\n{3,}", "\n\n", res)
    return res
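

if __name__ == "__main__":
    # Minimal sketch of how the two tools compose (assumes network access; the
    # article URL and section id below are illustrative examples, not values
    # the tools depend on). smolagents @tool objects are callable, so this
    # should run without an agent.
    demo_url = "https://en.wikipedia.org/wiki/Python_(programming_language)"
    print(get_wikipedia_article_sections(url=demo_url))
    print(extract_wikipedia_section(url=demo_url, section_id="History"))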