# Spaces: Sleeping  (hosting-page status text captured with this file; not part of the program)
import os
import traceback
import uuid

from flask import Flask, request, jsonify, send_file, send_from_directory
from flask_cors import CORS
from werkzeug.exceptions import NotFound
from werkzeug.utils import secure_filename

from pdf_excel import PDFToExcelConverter
from pdf_html import PDFToHTMLConverter
from pdf_json import PDFToJSONConverter
from pdf_word import PDFToWordConverter
# Flask application object; its static folder is the local "static" directory.
app = Flask(__name__, static_folder='static')
# NOTE(review): CORS is enabled with the extension's defaults — confirm that
# cross-origin access from any site is intended for this API.
CORS(app)

# Configure file size limits and folders
app.config['MAX_CONTENT_LENGTH'] = 100 * 1024 * 1024  # 100 MB limit
app.config['UPLOAD_FOLDER'] = 'uploads'
app.config['OUTPUT_FOLDER'] = 'outputs'
# Secrets come from the environment when provided; the literal fallbacks keep
# the previous behaviour for local runs and MUST be overridden in production.
app.config['SECRET_KEY'] = os.environ.get('SECRET_KEY', 'your-secret-key-here')

# Create necessary directories if they don't exist
os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
os.makedirs(app.config['OUTPUT_FOLDER'], exist_ok=True)

# Hugging Face API token handed to the converter classes. Set the HF_TOKEN
# environment variable instead of committing a real token to the source.
HF_TOKEN = os.environ.get('HF_TOKEN', 'Api_token')

# Define allowed file extensions for uploads (lower-case, without the dot)
ALLOWED_EXTENSIONS = {'pdf'}
def allowed_file(filename):
    """Return True when *filename* carries an extension in ALLOWED_EXTENSIONS.

    The extension is whatever follows the last dot, compared
    case-insensitively. A name without any dot is rejected.
    """
    _, dot, extension = filename.rpartition('.')
    if not dot:
        # No dot at all, so there is no extension to check.
        return False
    return extension.lower() in ALLOWED_EXTENSIONS
def serve_index():
    """Send index.html from the 'static' directory as the response."""
    static_dir = 'static'
    page_name = 'index.html'
    return send_from_directory(static_dir, page_name)
def serve_static(filename):
    """Send the requested asset (CSS, JS, etc.) from the 'static' directory.

    Args:
        filename: Path of the asset relative to the 'static' directory.
    """
    static_dir = 'static'
    return send_from_directory(static_dir, filename)
def convert_pdf():
    """Convert an uploaded PDF to HTML, Word, JSON or Excel.

    Reads three multipart form fields from the current request:
      * 'file'        - the PDF to convert (required).
      * 'format'      - one of 'html', 'word', 'json', 'excel' (required).
      * 'output_name' - base name for the result; defaults to 'converted_file'.

    Returns:
        A ``(json_response, status)`` pair:
          * 200 with 'success', 'message' and 'download_url' on success;
          * 400 with 'success' False and 'error' when validation fails;
          * 500 with 'success' False and 'error' when conversion fails.

    HTTP errors that Werkzeug raises while parsing the request (such as 413
    for a body over MAX_CONTENT_LENGTH) are not caught here, so they are
    handled by Flask's regular error handling instead of becoming a 500.
    """
    # Output file extension for every supported format; this is also the
    # single whitelist used to validate the 'format' field.
    extensions = {
        'html': '.html',
        'word': '.docx',
        'json': '.json',
        'excel': '.xlsx',
    }

    # --- Validation (outside the broad try so HTTP errors propagate) ---
    if 'file' not in request.files:
        return jsonify({'success': False, 'error': 'No file uploaded.'}), 400

    file = request.files['file']
    format_type = request.form.get('format')
    output_name = request.form.get('output_name', 'converted_file')

    if file.filename == '':
        return jsonify({'success': False, 'error': 'No file selected.'}), 400
    if format_type not in extensions:
        return jsonify({'success': False, 'error': 'Invalid format specified. Must be html, word, json, or excel.'}), 400
    if not allowed_file(file.filename):
        return jsonify({'success': False, 'error': 'Only PDF files are allowed.'}), 400

    # A random prefix keeps simultaneous uploads that share a client filename
    # from overwriting or deleting each other's input file.
    upload_name = f"{uuid.uuid4().hex}_{secure_filename(file.filename)}"
    input_path = os.path.join(app.config['UPLOAD_FOLDER'], upload_name)

    # 'output_name' is untrusted: sanitise it so that path separators cannot
    # steer the write outside OUTPUT_FOLDER. Dots are still stripped, as
    # before, so the only extension is the one appended here.
    output_stem = secure_filename(output_name).replace('.', '') or 'converted_file'
    output_filename = f"{output_stem}{extensions[format_type]}"
    output_path = os.path.join(app.config['OUTPUT_FOLDER'], output_filename)

    try:
        file.save(input_path)
        try:
            success_message = _run_conversion(format_type, input_path, output_path)
        except Exception:
            # Do not leave a partial result behind for the download endpoint.
            _discard_file(output_path, 'output')
            raise

        return jsonify({
            'success': True,
            'message': success_message,
            'download_url': f'/download/{output_filename}'
        }), 200
    except Exception as e:
        traceback.print_exc()
        error_msg = str(e)
        if '_group_overlapping_text' in error_msg:
            error_msg = "HTML conversion failed due to incompatible converter version. Please try another format."
        return jsonify({
            'success': False,
            'error': f'Conversion failed: {error_msg}'
        }), 500
    finally:
        # The uploaded PDF is never needed after this request, success or not.
        _discard_file(input_path, 'input')


def _run_conversion(format_type, input_path, output_path):
    """Run the converter for *format_type* and return its success message.

    Raises:
        ValueError: if *format_type* is not a supported format.
        Exception: whatever the selected converter raises.
    """
    if format_type == 'html':
        converter = PDFToHTMLConverter(huggingface_token=HF_TOKEN)
        try:
            # First try with HF models
            converter.process_pdf(pdf_path=input_path, output_path=output_path, use_hf_models=True)
        except AttributeError as ae:
            if '_group_overlapping_text' not in str(ae):
                raise
            # Fall back to non-HF mode if the method is missing
            converter.process_pdf(pdf_path=input_path, output_path=output_path, use_hf_models=False)
        return "Successfully converted to HTML!"

    if format_type == 'word':
        converter = PDFToWordConverter(huggingface_token=HF_TOKEN)
        converter.process_pdf_to_word(pdf_path=input_path, output_path=output_path, use_hf_models=False)
        return "Successfully converted to Word!"

    if format_type == 'json':
        converter = PDFToJSONConverter(huggingface_token=HF_TOKEN)
        converter.process_pdf_to_json(pdf_path=input_path, output_path=output_path, use_hf_models=False)
        return "Successfully converted to JSON!"

    if format_type == 'excel':
        converter = PDFToExcelConverter(huggingface_token=HF_TOKEN)
        converter.process_pdf_to_excel(pdf_path=input_path, output_path=output_path, use_hf_models=False)
        return "Successfully converted to Excel!"

    raise ValueError(f'Unsupported format: {format_type}')


def _discard_file(path, label):
    """Best-effort removal of *path*; a failure is only reported, never raised."""
    if not os.path.exists(path):
        return
    try:
        os.remove(path)
    except OSError as e:
        print(f"Warning: Could not remove {label} file {path}: {e}")
def download_file(filename):
    """Send a converted file from OUTPUT_FOLDER as an attachment.

    Args:
        filename: Name of the file inside OUTPUT_FOLDER, as handed out in the
            'download_url' of a successful conversion response.

    Returns:
        The file response on success, otherwise a ``(json_response, status)``
        pair: 404 when the file is missing or the name is rejected, 500 on
        any other failure.
    """
    # An absolute path makes send_from_directory look in the same place the
    # converters wrote to (relative to the working directory), rather than
    # resolving the folder against the application root.
    output_dir = os.path.abspath(app.config['OUTPUT_FOLDER'])
    try:
        # send_from_directory performs the safe join itself and raises
        # NotFound for missing files and for names that escape the folder,
        # so no existence check on the raw, unsanitised path is made.
        return send_from_directory(output_dir, filename, as_attachment=True)
    except NotFound:
        return jsonify({'error': 'File not found.'}), 404
    except Exception as e:
        traceback.print_exc()
        return jsonify({'error': str(e)}), 500
def health_check():
    """Return a fixed JSON payload with status 200 to show the API is up."""
    payload = {'status': 'healthy', 'message': 'PDF Converter API is running.'}
    return jsonify(payload), 200
def too_large(e):
    """Build the JSON 413 response for an upload over the size limit.

    Args:
        e: The error that triggered this handler (unused).
    """
    # NOTE(review): presumably registered as the 413 error handler — the
    # registration is not visible in this excerpt.
    body = jsonify({'success': False, 'error': 'File too large. Maximum size is 100MB.'})
    return body, 413
def internal_error(e):
    """Print the current traceback and build a generic JSON 500 response.

    Args:
        e: The error that triggered this handler (unused; details are not
            sent to the client).
    """
    traceback.print_exc()
    body = jsonify({'success': False, 'error': 'Internal server error occurred.'})
    return body, 500
if __name__ == '__main__':
    # The server listens on every interface, so the interactive debugger
    # (which lets a client execute arbitrary code) must be opt-in: set
    # FLASK_DEBUG=1 for local development only.
    debug_enabled = os.environ.get('FLASK_DEBUG', '').strip().lower() in ('1', 'true', 'yes', 'on')
    app.run(debug=debug_enabled, host='0.0.0.0', port=5000)