File size: 4,719 Bytes
31af2b2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import re
from bs4 import BeautifulSoup, Tag
from  smolagents import tool
import requests
import markdownify

@tool
def get_wikipedia_article_sections(url: str) -> str:
    """
    Visit a specific wikipedia article, and return a structured list of sections (table of contents)
    describing what information can be found on this page.

    Args:
        url (str): The url of the wikipedia article to visit.

    Returns:
        str: Table of contents, or error message.
    """

    if "wikipedia." not in url:
        return "the provided url does not appear to be a valid Wikipedia page."

    try:
        # Timeout so a stalled connection cannot hang the agent indefinitely.
        resp = requests.get(url, timeout=30)
    except Exception as e:
        return f"got an error: {str(e)}"
    if resp.status_code >= 400:
        return f"got an error (http status {resp.status_code}): {resp.text}"

    soup = BeautifulSoup(resp.text, "html.parser")

    def recurse_toc(ul, level=0):
        # Depth-first walk over the nested <ul>/<li> TOC markup, indenting
        # two spaces per nesting level.
        lines = []
        for li in ul.find_all('li', recursive=False):
            a = li.find("a", class_="vector-toc-link")
            if a:
                href = a.get("href", "")
                # Section links are fragment anchors like "#History"; skip the
                # bare "#" entry that just points back to the top of the page.
                if href.startswith("#") and href != "#":
                    indent = "  " * level
                    lines.append(f"{indent}- {href[1:]}")
            # Recurse into child ULs
            child_ul = li.find("ul", recursive=False)
            if child_ul:
                lines.extend(recurse_toc(child_ul, level + 1))
        return lines

    # "mw-panel-toc-list" is the Vector-skin sidebar TOC container.
    toc_ul = soup.find(id="mw-panel-toc-list")
    if toc_ul is None:
        return "could not extract table of contents; ensure that this is a valid link to a wikipedia article."

    links = recurse_toc(toc_ul)
    if not links:
        return "table of contents is empty or could not be parsed."

    return "\n".join(["Sections within article:"] + links)

@tool
def extract_wikipedia_section(url: str, section_id: str) -> str:
    """
    Visit a specific wikipedia article, and return the specified section. You can get a list of available sections using get_wikipedia_article_sections

    Args:
        url (str): The url of the wikipedia article to visit.
        section_id (str): The id of the section to retrieve.

    Returns:
        str: the contents of the section.
    """

    if "wikipedia." not in url:
        return "the provided url does not appear to be a valid Wikipedia page."

    try:
        # Timeout so a stalled connection cannot hang the agent indefinitely.
        # Fetch once here and reuse `resp` below (the page was previously
        # downloaded twice).
        resp = requests.get(url, timeout=30)
    except Exception as e:
        return f"got an error: {str(e)}"
    if resp.status_code >= 400:
        return f"got an error (http status {resp.status_code}): {resp.text}"

    def get_classes(tag):
        # Return the tag's CSS classes as plain strings ([] when absent).
        if "class" not in tag.attrs:
            return []
        return [str(c) for c in tag.get("class")]

    def get_heading_hierarchy(tag):
        # Wikipedia wraps headings in <div class="mw-heading mw-headingN">,
        # where N is the heading rank (2 for h2, 3 for h3, ...). Returns the
        # rank as an int, or None when the tag is not such a wrapper.
        for c in get_classes(tag):
            m = re.search(r"mw-heading(\d+)", c)
            if m:
                return int(m.group(1))
        return None

    soup = BeautifulSoup(resp.text, "html.parser")

    # Normalize ids the model may pass with a leading "#" artifact or padding.
    section_id = section_id.strip("-").strip()

    # Find the heading element (<h1>..<h6>) carrying the requested id.
    heading = soup.find(lambda tag: tag.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6'] and tag.get("id") == section_id)
    if not heading:
        return "the specified section could not be found on this page. try using get_wikipedia_article_sections to get a list of available sections."

    # The heading's parent is the mw-heading wrapper div carrying the rank.
    parent = heading.parent
    start_hierarchy = get_heading_hierarchy(parent)
    if start_hierarchy is None:
        return "failed to get section - this is likely a bug with the tool - the wikipedia page appears to have unexpected format"

    section_content = []
    # Gather all siblings after the heading, stopping at the next heading of equal/higher rank
    for sibling in parent.next_siblings:
        if not isinstance(sibling, Tag):
            # this element is not an html tag, - probably just some text content
            section_content.append(str(sibling))
            continue
        hierarchy = get_heading_hierarchy(sibling)
        if hierarchy is None:
            # this element is not a section header - add it.
            section_content.append(str(sibling))
            continue
        if hierarchy > start_hierarchy:
            # this is lower in the hierarchy than the requested section, add it
            section_content.append(str(sibling))
            continue
        break

    content_html = '\n'.join(section_content).strip()
    res = markdownify.markdownify(content_html)
    # Collapse runs of 3+ blank lines left over from the HTML->markdown conversion.
    res = re.sub(r"\n{3,}", "\n\n", res)
    return res