Spaces:
Running
Running
File size: 2,525 Bytes
3a09141 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 |
import aiohttp
import io
from markitdown import MarkItDown
class PDFFetcher:
    """Convert PDF documents, local or remote, into Markdown text.

    Conversion is delegated to a ``MarkItDown`` instance created with
    plugins enabled; the raw output is then cleaned by :meth:`postprocess`.
    """

    def __init__(self):
        self.md = MarkItDown(enable_plugins=True)

    def read_local(self, pdf_path: str) -> str:
        """Convert the PDF at *pdf_path* and return the cleaned Markdown."""
        converted = self.md.convert(pdf_path)
        return self.postprocess(converted.text_content)

    def postprocess(self, markdown: str) -> str:
        """Replace form feeds with newlines and trim surrounding whitespace.

        Args:
            markdown: Raw text produced by the converter.

        Returns:
            The text with every ``"\\f"`` turned into ``"\\n"`` and leading
            and trailing whitespace removed.
        """
        # presumably "\f" marks page boundaries in the converter output --
        # TODO confirm against MarkItDown's PDF converter.
        return markdown.replace("\f", "\n").strip()

    async def fetch(self, pdf_url: str) -> str:
        """Download *pdf_url* and return its content as cleaned Markdown.

        Args:
            pdf_url: Address of the PDF to download.

        Returns:
            The converted document after :meth:`postprocess`.

        Raises:
            Exception: If the server answers with a status other than 200.
        """
        async with aiohttp.ClientSession() as session, session.get(pdf_url) as resp:
            if resp.status != 200:
                raise Exception(f"Failed to download PDF: {resp.status}")
            payload = await resp.read()
            # The converter reads from a file-like object, so wrap the bytes.
            converted = self.md.convert_stream(io.BytesIO(payload))
            return self.postprocess(converted.text_content)
class HTMLFetcher:
    """Download a web page and convert it to Markdown via ``MarkItDown``."""

    def __init__(self):
        self.md = MarkItDown(enable_plugins=True)

    async def fetch(self, html_url: str) -> str:
        """Download *html_url* and return the converted text content.

        Args:
            html_url: Address of the page to download.

        Returns:
            The ``text_content`` of the conversion result.

        Raises:
            Exception: If the server answers with a status other than 200.
        """
        async with aiohttp.ClientSession() as session, session.get(html_url) as resp:
            if resp.status != 200:
                raise Exception(f"Failed to download HTML: {resp.status}")
            body = await resp.read()
            # The converter reads from a file-like object, so wrap the bytes.
            return self.md.convert_stream(io.BytesIO(body)).text_content
class AutoFetcher:
    """Download a URL and convert it to Markdown based on its Content-Type.

    PDF responses are converted and then passed through
    ``PDFFetcher.postprocess``; every other response is converted and
    returned as-is.
    """

    def __init__(self):
        self.pdf_fetcher = PDFFetcher()
        # Not used by fetch(); kept so the attribute stays available to
        # existing callers.
        self.html_fetcher = HTMLFetcher()
        self.md = MarkItDown(enable_plugins=True)

    @staticmethod
    def _is_pdf(content_type: str) -> bool:
        """Return True when a Content-Type header value denotes a PDF.

        Media types are case-insensitive, so the value is lower-cased before
        matching; parameters such as ``; charset=...`` are tolerated because
        the check is a substring match.
        """
        return "application/pdf" in content_type.lower()

    async def fetch(self, url: str) -> str:
        """Download *url* and return its content converted to Markdown.

        Args:
            url: Address of the resource to download.

        Returns:
            The converted text. For PDF responses the text is additionally
            cleaned with ``PDFFetcher.postprocess``.

        Raises:
            Exception: If the server answers with a status other than 200.
        """
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as res:
                if res.status != 200:
                    # The resource may be of any type, so the message no
                    # longer claims it was HTML.
                    raise Exception(f"Failed to download URL: {res.status}")
                data = await res.read()
                # aiohttp response headers are case-insensitive, so a single
                # lookup covers every spelling of the header name.
                content_type = res.headers.get("Content-Type", "text/plain")

        # Body and header are fully captured above; convert after the
        # connection has been released rather than while holding it open.
        markdown = self.md.convert_stream(io.BytesIO(data)).text_content
        if self._is_pdf(content_type):
            return self.pdf_fetcher.postprocess(markdown)
        # HTML and any other type share the same plain conversion path.
        return markdown
|