Spaces:
Sleeping
Sleeping
File size: 7,416 Bytes
96c003e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 |
from flask import Flask, request, jsonify, send_file, send_from_directory
from flask_cors import CORS
from werkzeug.utils import secure_filename
import os
import traceback
from pdf_html import PDFToHTMLConverter
from pdf_word import PDFToWordConverter
from pdf_json import PDFToJSONConverter
from pdf_excel import PDFToExcelConverter
# Application object; static assets live in the local 'static' folder.
app = Flask(__name__, static_folder='static')
CORS(app)

# Configure file size limits and folders
app.config['MAX_CONTENT_LENGTH'] = 100 * 1024 * 1024  # 100 MB limit
app.config['UPLOAD_FOLDER'] = 'uploads'
app.config['OUTPUT_FOLDER'] = 'outputs'
# Read the secret from the environment so a real key never has to be committed.
# The literal is only a fallback placeholder and must not be relied on in production.
app.config['SECRET_KEY'] = os.environ.get('SECRET_KEY') or 'your-secret-key-here'

# Create necessary directories if they don't exist
os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
os.makedirs(app.config['OUTPUT_FOLDER'], exist_ok=True)

# Hugging Face API token: taken from the HF_TOKEN environment variable when set,
# otherwise the original placeholder value is kept.
HF_TOKEN = os.environ.get('HF_TOKEN') or "Api_token"

# Define allowed file extensions for uploads (compared lower-cased, without the dot)
ALLOWED_EXTENSIONS = {'pdf'}
def allowed_file(filename):
    """Return True when *filename* ends in an extension listed in ALLOWED_EXTENSIONS.

    The extension is whatever follows the last dot and is compared
    case-insensitively. A name without any dot is rejected.
    """
    if '.' not in filename:
        return False
    extension = filename.rsplit('.', 1)[1]
    return extension.lower() in ALLOWED_EXTENSIONS
@app.route('/')
def serve_index():
    """Respond to the root URL with index.html from the 'static' directory."""
    static_dir = 'static'
    landing_page = 'index.html'
    return send_from_directory(static_dir, landing_page)
@app.route('/<path:filename>')
def serve_static(filename):
    """Respond with the file at *filename* (may contain slashes) from the 'static' directory."""
    static_dir = 'static'
    response = send_from_directory(static_dir, filename)
    return response
# Maps each accepted 'format' value to the extension of the file it produces.
_OUTPUT_EXTENSIONS = {
    'html': '.html',
    'word': '.docx',
    'json': '.json',
    'excel': '.xlsx',
}

# Names used when the client-supplied name is missing or sanitizes to nothing.
_DEFAULT_OUTPUT_NAME = 'converted_file'
_DEFAULT_UPLOAD_NAME = 'upload.pdf'


def _remove_file(path, label):
    """Delete *path* if it exists; print a warning instead of raising on failure."""
    try:
        os.remove(path)
    except FileNotFoundError:
        pass  # Nothing to clean up.
    except OSError as e:
        print(f"Warning: Could not remove {label} file {path}: {e}")


def _safe_output_stem(raw_name):
    """Reduce a client-supplied output name to a bare, dot-free file stem.

    secure_filename() drops path separators so the result cannot point outside
    the output folder. Dots are then removed as before, so the only extension
    is the one appended by the caller.
    """
    stem = secure_filename(raw_name or '').replace('.', '')
    return stem or _DEFAULT_OUTPUT_NAME


def _run_conversion(format_type, input_path, output_path):
    """Run the converter matching *format_type* and return its success message."""
    if format_type == 'html':
        converter = PDFToHTMLConverter(huggingface_token=HF_TOKEN)
        try:
            # First try with HF models
            converter.process_pdf(pdf_path=input_path, output_path=output_path, use_hf_models=True)
        except AttributeError as ae:
            if '_group_overlapping_text' not in str(ae):
                raise
            # Fall back to non-HF mode if the method is missing
            converter.process_pdf(pdf_path=input_path, output_path=output_path, use_hf_models=False)
        return "Successfully converted to HTML!"
    if format_type == 'word':
        converter = PDFToWordConverter(huggingface_token=HF_TOKEN)
        converter.process_pdf_to_word(pdf_path=input_path, output_path=output_path, use_hf_models=False)
        return "Successfully converted to Word!"
    if format_type == 'json':
        converter = PDFToJSONConverter(huggingface_token=HF_TOKEN)
        converter.process_pdf_to_json(pdf_path=input_path, output_path=output_path, use_hf_models=False)
        return "Successfully converted to JSON!"
    if format_type == 'excel':
        converter = PDFToExcelConverter(huggingface_token=HF_TOKEN)
        converter.process_pdf_to_excel(pdf_path=input_path, output_path=output_path, use_hf_models=False)
        return "Successfully converted to Excel!"
    raise ValueError(f"Unsupported format: {format_type}")


@app.route('/convert', methods=['POST'])
def convert_pdf():
    """
    Handles PDF conversion requests.

    Expects multipart form data with a 'file' (PDF), a 'format' (html, word,
    json, excel) and an optional 'output_name' (defaults to 'converted_file').

    Returns a JSON body and status code:
      * 200 with 'success', 'message' and 'download_url' on success;
      * 400 with 'success' False and 'error' when validation fails;
      * 500 with 'success' False and 'error' when saving or converting fails.

    The uploaded file is removed from the upload folder when the request
    finishes, whether or not conversion succeeded.
    """
    input_path = None  # Set once the upload has a destination; drives cleanup below.
    try:
        # Check if a file was included in the request
        if 'file' not in request.files:
            return jsonify({'success': False, 'error': 'No file uploaded.'}), 400

        uploaded = request.files['file']
        format_type = request.form.get('format')

        # Validate file and format
        if not uploaded.filename:
            return jsonify({'success': False, 'error': 'No file selected.'}), 400
        if not format_type or format_type not in _OUTPUT_EXTENSIONS:
            return jsonify({'success': False, 'error': 'Invalid format specified. Must be html, word, json, or excel.'}), 400
        if not allowed_file(uploaded.filename):
            return jsonify({'success': False, 'error': 'Only PDF files are allowed.'}), 400

        # Securely save the uploaded file. secure_filename() can return an empty
        # string, which would make the target the upload directory itself.
        filename_secured = secure_filename(uploaded.filename) or _DEFAULT_UPLOAD_NAME
        input_path = os.path.join(app.config['UPLOAD_FOLDER'], filename_secured)
        uploaded.save(input_path)

        # The output name is untrusted input: sanitize it before building a path,
        # otherwise a value such as '../../x' becomes an absolute path in os.path.join.
        output_stem = _safe_output_stem(request.form.get('output_name'))
        output_filename = f"{output_stem}{_OUTPUT_EXTENSIONS[format_type]}"
        output_path = os.path.join(app.config['OUTPUT_FOLDER'], output_filename)

        try:
            success_message = _run_conversion(format_type, input_path, output_path)
        except Exception:
            # Clean up the (possibly partial) output file if conversion failed
            _remove_file(output_path, 'output')
            raise

        # Return success response with download URL
        return jsonify({
            'success': True,
            'message': success_message,
            'download_url': f'/download/{output_filename}'
        }), 200
    except Exception as e:
        traceback.print_exc()
        error_msg = str(e)
        if '_group_overlapping_text' in error_msg:
            error_msg = "HTML conversion failed due to incompatible converter version. Please try another format."
        return jsonify({
            'success': False,
            'error': f'Conversion failed: {error_msg}'
        }), 500
    finally:
        # Clean up the uploaded input file on both the success and error paths
        if input_path is not None:
            _remove_file(input_path, 'input')
@app.route('/download/<filename>')
def download_file(filename):
    """Allows downloading of converted files.

    Sends the file *filename* from the output folder as an attachment.
    Returns a JSON 404 when it does not exist and a JSON 500 with the
    error text on any unexpected failure.
    """
    # Local import keeps this block self-contained; werkzeug is already a
    # dependency of this file.
    from werkzeug.exceptions import HTTPException, NotFound

    try:
        file_path = os.path.join(app.config['OUTPUT_FOLDER'], filename)
        # isfile() rather than exists(): a directory is not downloadable.
        if os.path.isfile(file_path):
            return send_from_directory(app.config['OUTPUT_FOLDER'], filename, as_attachment=True)
        return jsonify({'error': 'File not found.'}), 404
    except NotFound:
        # send_from_directory signals a missing or unsafe path with NotFound;
        # report it as a 404 instead of letting the generic handler make it a 500.
        return jsonify({'error': 'File not found.'}), 404
    except HTTPException:
        # Other HTTP errors already carry the right status code; let Flask render them.
        raise
    except Exception as e:
        traceback.print_exc()
        return jsonify({'error': str(e)}), 500
@app.route('/health')
def health_check():
    """Report that the service is up, as a JSON body with HTTP 200."""
    payload = {'status': 'healthy', 'message': 'PDF Converter API is running.'}
    return jsonify(payload), 200
@app.errorhandler(413)
def too_large(e):
    """Answer HTTP 413 errors with a JSON error body; *e* is not inspected."""
    status = 413
    body = {'success': False, 'error': 'File too large. Maximum size is 100MB.'}
    return jsonify(body), status
@app.errorhandler(500)
def internal_error(e):
    """Answer HTTP 500 errors: print the current traceback, then return a generic JSON error."""
    traceback.print_exc()
    status = 500
    body = {'success': False, 'error': 'Internal server error occurred.'}
    return jsonify(body), status
if __name__ == '__main__':
    # The server listens on all interfaces, so the interactive debugger must be
    # opt-in: with debug enabled, anyone who can reach the port can run code.
    # Set FLASK_DEBUG=1 (or true/yes/on) for local development only.
    debug_enabled = os.environ.get('FLASK_DEBUG', '').strip().lower() in {'1', 'true', 'yes', 'on'}
    app.run(debug=debug_enabled, host='0.0.0.0', port=5000)