File size: 4,719 Bytes
31af2b2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import re
from bs4 import BeautifulSoup, Tag
from  smolagents import tool
import requests
import markdownify

@tool
def get_wikipedia_article_sections(url: str) -> str:
    """
    Visit a specific wikipedia article, and return a structured list of sections (table of contents)
    describing what information can be found on this page.

    Args:
        url (str): The url of the wikipedia article to visit.

    Returns:
        str: Table of contents, or error message.
    """

    if "wikipedia." not in url:
        return "the provided url does not appear to be a valid Wikipedia page."

    try:
        # Timeout so a stalled connection cannot hang the agent indefinitely.
        resp = requests.get(url, timeout=30)
    except Exception as e:
        return f"got an error: {str(e)}"
    if resp.status_code >= 400:
        return f"got an error (http status {resp.status_code}): {resp.text}"

    soup = BeautifulSoup(resp.text, "html.parser")

    def recurse_toc(ul, level=0):
        # Depth-first walk over the nested <ul>/<li> TOC markup, indenting
        # two spaces per nesting level.
        lines = []
        for li in ul.find_all('li', recursive=False):
            a = li.find("a", class_="vector-toc-link")
            if a:
                href = a.get("href", "")
                # Section links are fragment anchors like "#History"; skip the
                # bare "#" entry that just points back to the top of the page.
                if href.startswith("#") and href != "#":
                    indent = "  " * level
                    lines.append(f"{indent}- {href[1:]}")
            # Recurse into child ULs
            child_ul = li.find("ul", recursive=False)
            if child_ul:
                lines.extend(recurse_toc(child_ul, level + 1))
        return lines

    # "mw-panel-toc-list" is the Vector-skin sidebar TOC container.
    toc_ul = soup.find(id="mw-panel-toc-list")
    if toc_ul is None:
        return "could not extract table of contents; ensure that this is a valid link to a wikipedia article."

    links = recurse_toc(toc_ul)
    if not links:
        return "table of contents is empty or could not be parsed."

    return "\n".join(["Sections within article:"] + links)

@tool
def extract_wikipedia_section(url: str, section_id: str) -> str:
    """
    Visit a specific wikipedia article, and return the specified section. You can get a list of available sections using get_wikipedia_article_sections

    Args:
        url (str): The url of the wikipedia article to visit.
        section_id (str): The id of the section to retrieve.

    Returns:
        str: the contents of the section.
    """

    if "wikipedia." not in url:
        return "the provided url does not appear to be a valid Wikipedia page."

    try:
        # Timeout so a stalled connection cannot hang the agent indefinitely.
        # Fetch once here and reuse `resp` below (the page was previously
        # downloaded twice).
        resp = requests.get(url, timeout=30)
    except Exception as e:
        return f"got an error: {str(e)}"
    if resp.status_code >= 400:
        return f"got an error (http status {resp.status_code}): {resp.text}"

    def get_classes(tag):
        # Return the tag's CSS classes as plain strings ([] when absent).
        if "class" not in tag.attrs:
            return []
        return [str(c) for c in tag.get("class")]

    def get_heading_hierarchy(tag):
        # Wikipedia wraps headings in <div class="mw-heading mw-headingN">,
        # where N is the heading rank (2 for h2, 3 for h3, ...). Returns the
        # rank as an int, or None when the tag is not such a wrapper.
        for c in get_classes(tag):
            m = re.search(r"mw-heading(\d+)", c)
            if m:
                return int(m.group(1))
        return None

    soup = BeautifulSoup(resp.text, "html.parser")

    # Normalize ids the model may pass with a leading "#" artifact or padding.
    section_id = section_id.strip("-").strip()

    # Find the heading element (<h1>..<h6>) carrying the requested id.
    heading = soup.find(lambda tag: tag.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6'] and tag.get("id") == section_id)
    if not heading:
        return "the specified section could not be found on this page. try using get_wikipedia_article_sections to get a list of available sections."

    # The heading's parent is the mw-heading wrapper div carrying the rank.
    parent = heading.parent
    start_hierarchy = get_heading_hierarchy(parent)
    if start_hierarchy is None:
        return "failed to get section - this is likely a bug with the tool - the wikipedia page appears to have unexpected format"

    section_content = []
    # Gather all siblings after the heading, stopping at the next heading of equal/higher rank
    for sibling in parent.next_siblings:
        if not isinstance(sibling, Tag):
            # this element is not an html tag, - probably just some text content
            section_content.append(str(sibling))
            continue
        hierarchy = get_heading_hierarchy(sibling)
        if hierarchy is None:
            # this element is not a section header - add it.
            section_content.append(str(sibling))
            continue
        if hierarchy > start_hierarchy:
            # this is lower in the hierarchy than the requested section, add it
            section_content.append(str(sibling))
            continue
        break

    content_html = '\n'.join(section_content).strip()
    res = markdownify.markdownify(content_html)
    # Collapse runs of 3+ blank lines left over from the HTML->markdown conversion.
    res = re.sub(r"\n{3,}", "\n\n", res)
    return res