File size: 2,525 Bytes
3a09141
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import aiohttp
import io
from markitdown import MarkItDown


class PDFFetcher:
    """Convert PDF documents, local or remote, to Markdown text via MarkItDown."""

    def __init__(self):
        self.md = MarkItDown(enable_plugins=True)

    def postprocess(self, markdown: str) -> str:
        """Replace form-feed characters with newlines and trim the result.

        Args:
            markdown: Text produced by the converter.

        Returns:
            The text with every ``\\f`` turned into ``\\n`` and leading and
            trailing whitespace removed.
        """
        return markdown.replace("\f", "\n").strip()

    def read_local(self, pdf_path: str) -> str:
        """Convert the file at ``pdf_path`` and return post-processed Markdown."""
        converted = self.md.convert(pdf_path)
        return self.postprocess(converted.text_content)

    async def fetch(self, pdf_url: str) -> str:
        """Download ``pdf_url`` and return its post-processed Markdown.

        Args:
            pdf_url: Address the PDF is downloaded from.

        Returns:
            The converted text after ``postprocess``.

        Raises:
            Exception: If the response status is anything other than 200.
        """
        async with aiohttp.ClientSession() as session, session.get(pdf_url) as res:
            if res.status != 200:
                raise Exception(f"Failed to download PDF: {res.status}")

            payload = await res.read()

        # The body is fully buffered, so conversion happens after the
        # session has been closed.
        converted = self.md.convert_stream(io.BytesIO(payload))
        return self.postprocess(converted.text_content)


class HTMLFetcher:
    """Download an HTML page and convert it to Markdown text via MarkItDown."""

    def __init__(self):
        self.md = MarkItDown(enable_plugins=True)

    async def fetch(self, html_url: str) -> str:
        """Download ``html_url`` and return the converted text.

        Args:
            html_url: Address the page is downloaded from.

        Returns:
            The ``text_content`` of the MarkItDown conversion result.

        Raises:
            Exception: If the response status is anything other than 200.
        """
        async with aiohttp.ClientSession() as session:
            async with session.get(html_url) as res:
                if res.status != 200:
                    raise Exception(f"Failed to download HTML: {res.status}")

                data = await res.read()

        # The body is fully buffered at this point, so the synchronous
        # conversion runs only after the session has been released instead
        # of keeping it open for the duration of the conversion.
        return self.md.convert_stream(io.BytesIO(data)).text_content


class AutoFetcher:
    """Download a URL and convert it to Markdown, post-processing PDF responses."""

    def __init__(self):
        self.pdf_fetcher = PDFFetcher()
        self.html_fetcher = HTMLFetcher()

        self.md = MarkItDown(enable_plugins=True)

    @staticmethod
    def _is_pdf(content_type: str) -> bool:
        """Return True when a Content-Type header value denotes a PDF.

        The comparison ignores case, so a value such as
        ``Application/PDF; charset=binary`` is recognised as well.
        """
        return "application/pdf" in content_type.lower()

    async def fetch(self, url: str) -> str:
        """Download ``url`` and return the converted text.

        The response body is converted with MarkItDown whatever its declared
        type; responses declared as PDF are additionally passed through
        ``PDFFetcher.postprocess``.

        Args:
            url: Address the document is downloaded from.

        Returns:
            The converted text.

        Raises:
            Exception: If the response status is anything other than 200.
        """
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as res:
                if res.status != 200:
                    # NOTE: message text is kept unchanged so that callers or
                    # log filters matching on it are unaffected.
                    raise Exception(f"Failed to download HTML: {res.status}")

                data = await res.read()
                # aiohttp response headers are case-insensitive, so a single
                # lookup covers every spelling of the header name.
                content_type = res.headers.get("Content-Type", "text/plain")

        # Every content type goes through the same conversion; only the
        # PDF case needs an extra post-processing step afterwards.
        markdown = self.md.convert_stream(io.BytesIO(data)).text_content

        if self._is_pdf(content_type):
            return self.pdf_fetcher.postprocess(markdown)
        return markdown