new-project / app.py
amit01Xindus's picture
Upload 8 files
96c003e verified
from flask import Flask, request, jsonify, send_file, send_from_directory
from flask_cors import CORS
from werkzeug.utils import secure_filename
import os
import traceback
from pdf_html import PDFToHTMLConverter
from pdf_word import PDFToWordConverter
from pdf_json import PDFToJSONConverter
from pdf_excel import PDFToExcelConverter
# Application object; static assets live in the local "static" folder.
app = Flask(__name__, static_folder='static')
# Enable cross-origin requests for every route of the app.
CORS(app)

# Configure file size limits and folders
app.config['MAX_CONTENT_LENGTH'] = 100 * 1024 * 1024  # 100 MB limit
app.config['UPLOAD_FOLDER'] = 'uploads'
app.config['OUTPUT_FOLDER'] = 'outputs'
# Read the secret from the environment so a real key never has to be committed.
# The placeholder fallback keeps local runs working; set SECRET_KEY in production.
app.config['SECRET_KEY'] = os.environ.get('SECRET_KEY', 'your-secret-key-here')

# Create necessary directories if they don't exist
os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
os.makedirs(app.config['OUTPUT_FOLDER'], exist_ok=True)

# Hugging Face API token handed to the converters. Taken from the HF_TOKEN
# environment variable when set; otherwise the original placeholder is used.
HF_TOKEN = os.environ.get('HF_TOKEN', 'Api_token')

# Define allowed file extensions for uploads
ALLOWED_EXTENSIONS = {'pdf'}
def allowed_file(filename):
    """Return True when filename has an extension listed in ALLOWED_EXTENSIONS.

    The extension is the text after the last dot and is compared
    case-insensitively. A name without any dot is rejected.
    """
    _, dot, extension = filename.rpartition('.')
    # An empty separator means the name contained no dot at all.
    return bool(dot) and extension.lower() in ALLOWED_EXTENSIONS
@app.route('/')
def serve_index():
    """Return the front-end entry page, index.html, from the static folder."""
    static_dir = 'static'
    landing_page = 'index.html'
    return send_from_directory(static_dir, landing_page)
@app.route('/<path:filename>')
def serve_static(filename):
    """Return the requested asset (CSS, JS, etc.) from the static folder.

    filename is the URL path after the leading slash and may contain
    sub-directories.
    """
    static_dir = 'static'
    return send_from_directory(static_dir, filename)
def _run_conversion(format_type, input_path, output_path):
    """Convert the PDF at input_path into format_type, writing to output_path.

    Returns the success message for the chosen format, or an empty string
    when format_type matches none of the known formats. Exceptions raised by
    the converters propagate to the caller.
    """
    if format_type == 'html':
        converter = PDFToHTMLConverter(huggingface_token=HF_TOKEN)
        try:
            # First try with HF models
            converter.process_pdf(pdf_path=input_path, output_path=output_path, use_hf_models=True)
        except AttributeError as ae:
            # Only the known missing-method failure triggers the fallback;
            # any other AttributeError is a genuine error.
            if '_group_overlapping_text' not in str(ae):
                raise
            converter.process_pdf(pdf_path=input_path, output_path=output_path, use_hf_models=False)
        return "Successfully converted to HTML!"
    if format_type == 'word':
        converter = PDFToWordConverter(huggingface_token=HF_TOKEN)
        converter.process_pdf_to_word(pdf_path=input_path, output_path=output_path, use_hf_models=False)
        return "Successfully converted to Word!"
    if format_type == 'json':
        converter = PDFToJSONConverter(huggingface_token=HF_TOKEN)
        converter.process_pdf_to_json(pdf_path=input_path, output_path=output_path, use_hf_models=False)
        return "Successfully converted to JSON!"
    if format_type == 'excel':
        converter = PDFToExcelConverter(huggingface_token=HF_TOKEN)
        converter.process_pdf_to_excel(pdf_path=input_path, output_path=output_path, use_hf_models=False)
        return "Successfully converted to Excel!"
    return ""


@app.route('/convert', methods=['POST'])
def convert_pdf():
    """
    Handles PDF conversion requests.
    Expects a 'file' (PDF), 'format' (html, word, json, excel), and 'output_name'.

    Responds with JSON:
      * 200 and {'success': True, 'message', 'download_url'} on success;
      * 400 and {'success': False, 'error'} when the upload or format is invalid;
      * 500 and {'success': False, 'error'} when saving or converting fails.

    The uploaded PDF is removed from the upload folder whether or not the
    conversion succeeds; a partially written output file is removed on failure.
    """
    # Output file extensions for each supported format
    extensions = {
        'html': '.html',
        'word': '.docx',
        'json': '.json',
        'excel': '.xlsx'
    }
    # Stays None until an upload path is chosen, so the error handler below
    # knows whether there is anything to clean up.
    input_path = None
    try:
        # Check if a file was included in the request
        if 'file' not in request.files:
            return jsonify({'success': False, 'error': 'No file uploaded.'}), 400
        file = request.files['file']
        format_type = request.form.get('format')
        output_name = request.form.get('output_name', 'converted_file')

        # Validate file and format
        if file.filename == '':
            return jsonify({'success': False, 'error': 'No file selected.'}), 400
        if not format_type or format_type not in extensions:
            return jsonify({'success': False, 'error': 'Invalid format specified. Must be html, word, json, or excel.'}), 400
        if not allowed_file(file.filename):
            return jsonify({'success': False, 'error': 'Only PDF files are allowed.'}), 400

        # Securely save the uploaded file. secure_filename may strip every
        # character, so fall back to a fixed name rather than the bare folder.
        filename_secured = secure_filename(file.filename) or 'upload.pdf'
        input_path = os.path.join(app.config['UPLOAD_FOLDER'], filename_secured)
        file.save(input_path)

        # output_name is client-controlled: dots are dropped as before, and
        # secure_filename removes path separators so the result cannot escape
        # OUTPUT_FOLDER (a leading '/' would make os.path.join discard it).
        output_stem = secure_filename(output_name.replace('.', '')) or 'converted_file'
        output_filename = f"{output_stem}{extensions[format_type]}"
        output_path = os.path.join(app.config['OUTPUT_FOLDER'], output_filename)

        try:
            # Perform conversion based on the requested format
            success_message = _run_conversion(format_type, input_path, output_path)
        except Exception:
            # Clean up the output file if conversion failed
            if os.path.exists(output_path):
                try:
                    os.remove(output_path)
                except OSError as e:
                    print(f"Warning: Could not remove output file {output_path}: {e}")
            # Bare raise keeps the original traceback for the outer handler.
            raise

        # Clean up the uploaded input file
        try:
            os.remove(input_path)
        except OSError as e:
            print(f"Warning: Could not remove input file {input_path}: {e}")

        # Return success response with download URL
        return jsonify({
            'success': True,
            'message': success_message,
            'download_url': f'/download/{output_filename}'
        }), 200
    except Exception as e:
        # Clean up input file in case of error
        if input_path is not None and os.path.exists(input_path):
            try:
                os.remove(input_path)
            except OSError as cleanup_e:
                print(f"Error during error cleanup for {input_path}: {cleanup_e}")
        traceback.print_exc()
        error_msg = str(e)
        if '_group_overlapping_text' in error_msg:
            error_msg = "HTML conversion failed due to incompatible converter version. Please try another format."
        return jsonify({
            'success': False,
            'error': f'Conversion failed: {error_msg}'
        }), 500
@app.route('/download/<filename>')
def download_file(filename):
    """Send a converted file from the output folder as an attachment.

    Responds with the file on success, 404 and {'error': 'File not found.'}
    when the file is missing or the name is rejected as unsafe, and 500 and
    {'error': <message>} on any other failure.
    """
    # Local import: werkzeug is already a dependency of this module.
    from werkzeug.exceptions import NotFound

    # send_from_directory resolves relative directories against the app root,
    # while the output folder is created relative to the working directory.
    # Passing an absolute path makes both refer to the same location.
    output_dir = os.path.abspath(app.config['OUTPUT_FOLDER'])
    try:
        return send_from_directory(output_dir, filename, as_attachment=True)
    except NotFound:
        # Raised for missing files and for unsafe paths; report it as 404
        # instead of letting the generic handler below turn it into a 500.
        return jsonify({'error': 'File not found.'}), 404
    except Exception as e:
        traceback.print_exc()
        return jsonify({'error': str(e)}), 500
@app.route('/health')
def health_check():
    """Report that the service is up with a fixed JSON payload and HTTP 200."""
    payload = {
        'status': 'healthy',
        'message': 'PDF Converter API is running.',
    }
    return jsonify(payload), 200
@app.errorhandler(413)
def too_large(e):
    """Answer HTTP 413 errors with a JSON body naming the upload size limit."""
    payload = {
        'success': False,
        'error': 'File too large. Maximum size is 100MB.',
    }
    return jsonify(payload), 413
@app.errorhandler(500)
def internal_error(e):
    """Print the current traceback and answer HTTP 500 with a generic JSON error."""
    traceback.print_exc()
    payload = {
        'success': False,
        'error': 'Internal server error occurred.',
    }
    return jsonify(payload), 500
if __name__ == '__main__':
    # The server listens on all interfaces, and debug mode enables an
    # interactive debugger that can run arbitrary code. Keep it off unless
    # explicitly requested with FLASK_DEBUG=1 (or "true" / "yes").
    debug_enabled = os.environ.get('FLASK_DEBUG', '').strip().lower() in ('1', 'true', 'yes')
    app.run(debug=debug_enabled, host='0.0.0.0', port=5000)