PodcastVox / src /fetcher.py
Plat
init
3a09141
import aiohttp
import io
from markitdown import MarkItDown
class PDFFetcher:
def __init__(self):
self.md = MarkItDown(enable_plugins=True)
def read_local(self, pdf_path: str) -> str:
result = self.md.convert(pdf_path)
markdown = self.postprocess(result.text_content)
return markdown
def postprocess(self, markdown: str) -> str:
pages = markdown.split("\f")
markdown = "\n".join(pages)
return markdown.strip()
async def fetch(self, pdf_url: str) -> str:
async with aiohttp.ClientSession() as session:
async with session.get(pdf_url) as res:
if res.status != 200:
raise Exception(f"Failed to download PDF: {res.status}")
pdf_content = await res.read()
markdown = self.md.convert_stream(io.BytesIO(pdf_content)).text_content
markdown = self.postprocess(markdown)
return markdown
class HTMLFetcher:
def __init__(self):
self.md = MarkItDown(enable_plugins=True)
async def fetch(self, html_url: str) -> str:
async with aiohttp.ClientSession() as session:
async with session.get(html_url) as res:
if res.status != 200:
raise Exception(f"Failed to download HTML: {res.status}")
data = await res.read()
markdown = self.md.convert_stream(io.BytesIO(data))
return markdown.text_content
class AutoFetcher:
def __init__(self):
self.pdf_fetcher = PDFFetcher()
self.html_fetcher = HTMLFetcher()
self.md = MarkItDown(enable_plugins=True)
async def fetch(self, url: str) -> str:
async with aiohttp.ClientSession() as session:
async with session.get(url) as res:
if res.status != 200:
raise Exception(f"Failed to download HTML: {res.status}")
data = await res.read()
content_type = res.headers.get(
"Content-Type",
res.headers.get("content-type", "text/plain"),
)
if "application/pdf" in content_type:
return self.pdf_fetcher.postprocess(
self.md.convert_stream(io.BytesIO(data)).text_content
)
elif "text/html" in content_type:
return self.md.convert_stream(io.BytesIO(data)).text_content
else:
# plain?
return self.md.convert_stream(io.BytesIO(data)).text_content