"""Fetch PDF/HTML documents and convert them to Markdown with MarkItDown."""

import io

import aiohttp
from markitdown import MarkItDown


class FetchError(Exception):
    """Raised when a download answers with a non-200 HTTP status.

    Subclasses ``Exception`` so callers that catch ``Exception`` keep working.
    """


async def _download(url: str, label: str) -> tuple[bytes, str]:
    """Download *url* and return ``(body, content_type)``.

    Args:
        url: Address to GET.
        label: Short noun used in the error message (e.g. ``"PDF"``).

    Returns:
        The raw response body and the lower-cased ``Content-Type`` header
        value (``"text/plain"`` when the header is absent).

    Raises:
        FetchError: If the response status is not 200.
    """
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as res:
            if res.status != 200:
                raise FetchError(f"Failed to download {label}: {res.status}")
            body = await res.read()
            # aiohttp response headers are a case-insensitive multidict, so a
            # single lookup covers every spelling of the header name.
            content_type = res.headers.get("Content-Type", "text/plain")
    # Media types are case-insensitive; normalise so substring checks such as
    # "application/pdf" also match "Application/PDF".
    return body, content_type.lower()


class PDFFetcher:
    """Convert local or remote PDF files to Markdown."""

    def __init__(self):
        self.md = MarkItDown(enable_plugins=True)

    def read_local(self, pdf_path: str) -> str:
        """Convert the PDF at *pdf_path* and return post-processed Markdown."""
        result = self.md.convert(pdf_path)
        return self.postprocess(result.text_content)

    def postprocess(self, markdown: str) -> str:
        """Replace form-feed page separators with newlines and strip the ends."""
        return markdown.replace("\f", "\n").strip()

    async def fetch(self, pdf_url: str) -> str:
        """Download the PDF at *pdf_url* and return post-processed Markdown.

        Raises:
            FetchError: If the server does not answer with HTTP 200.
        """
        pdf_content, _ = await _download(pdf_url, "PDF")
        # The conversion is a plain synchronous call, so it runs only after
        # the HTTP session has been closed instead of holding it open.
        converted = self.md.convert_stream(io.BytesIO(pdf_content))
        return self.postprocess(converted.text_content)


class HTMLFetcher:
    """Convert remote HTML pages to Markdown."""

    def __init__(self):
        self.md = MarkItDown(enable_plugins=True)

    async def fetch(self, html_url: str) -> str:
        """Download the page at *html_url* and return it as Markdown.

        Raises:
            FetchError: If the server does not answer with HTTP 200.
        """
        data, _ = await _download(html_url, "HTML")
        return self.md.convert_stream(io.BytesIO(data)).text_content


class AutoFetcher:
    """Fetch a URL and convert it to Markdown based on its Content-Type."""

    def __init__(self):
        self.pdf_fetcher = PDFFetcher()
        self.html_fetcher = HTMLFetcher()
        self.md = MarkItDown(enable_plugins=True)

    async def fetch(self, url: str) -> str:
        """Download *url* and return its content as Markdown.

        PDF responses additionally go through ``PDFFetcher.postprocess``;
        every other content type is returned as converted.

        Raises:
            FetchError: If the server does not answer with HTTP 200.
        """
        # The label is generic because the content type is unknown until the
        # response arrives; the resource is not necessarily HTML.
        data, content_type = await _download(url, "URL")
        text = self.md.convert_stream(io.BytesIO(data)).text_content
        if "application/pdf" in content_type:
            return self.pdf_fetcher.postprocess(text)
        # HTML, plain text and anything else share the same conversion path.
        return text