Spaces:
Running
Running
import fitz # PyMuPDF | |
import docx | |
from io import BytesIO | |
import logging | |
from fastapi import HTTPException | |
def parse_docx(file: BytesIO):
    """Extract the paragraph text of a DOCX document held in memory.

    Args:
        file: Binary stream containing the .docx content; it is handed
            directly to ``docx.Document``.

    Returns:
        The text of every entry in ``Document.paragraphs``, in order, with a
        newline appended after each paragraph. An empty string is returned
        when there are no paragraphs.
    """
    document = docx.Document(file)
    # Every paragraph contributes its text followed by a single newline.
    return "".join(f"{paragraph.text}\n" for paragraph in document.paragraphs)
def parse_pdf(file: BytesIO):
    """Extract the text of every page of a PDF held in memory.

    Args:
        file: Binary stream containing the PDF content; it is passed to
            ``fitz.open`` as ``stream`` with ``filetype="pdf"``.

    Returns:
        The text of all pages concatenated in page order.

    Raises:
        HTTPException: With status code 500 when opening or reading the PDF
            fails. The underlying error is logged and chained as the cause.
    """
    try:
        # Open the document as a context manager so it is closed on every
        # path, including when extracting a page raises.
        with fitz.open(stream=file, filetype="pdf") as doc:
            return "".join(
                doc.load_page(page_num).get_text()
                for page_num in range(doc.page_count)
            )
    except Exception as e:
        logging.error(f"Error while processing PDF: {str(e)}")
        # Chain the original error so the root cause stays in the traceback.
        raise HTTPException(
            status_code=500, detail="Error processing PDF file") from e
def parse_txt(file: BytesIO):
    """Read an in-memory text file and decode it as UTF-8.

    Args:
        file: Binary stream containing the text file content.

    Returns:
        The decoded text. A leading UTF-8 byte-order mark, if present, is
        dropped so it does not leak into the text as ``\\ufeff``.

    Raises:
        UnicodeDecodeError: If the content is not valid UTF-8.
    """
    # "utf-8-sig" decodes exactly like "utf-8" but skips an optional
    # leading BOM, which some editors write at the start of text files.
    return file.read().decode("utf-8-sig")
def end_symbol_for_NP_text(text: str) -> str:
    """Return *text* stripped of surrounding whitespace and ending in "।".

    Args:
        text: The input string.

    Returns:
        The stripped text, with a single "।" appended when the stripped text
        does not already end with that character. Empty or whitespace-only
        input therefore yields "।".
    """
    terminator = "।"
    trimmed = text.strip()
    return trimmed if trimmed.endswith(terminator) else trimmed + terminator