"""Flask API that converts uploaded PDF files to HTML, Word, JSON or Excel.

Endpoints:
    GET  /                     -> static/index.html
    GET  /<path:filename>      -> other static assets
    POST /convert              -> convert an uploaded PDF
    GET  /download/<filename>  -> download a converted file
    GET  /health               -> liveness probe
"""

import os
import traceback

from flask import Flask, request, jsonify, send_file, send_from_directory
from flask_cors import CORS
from werkzeug.utils import secure_filename

from pdf_html import PDFToHTMLConverter
from pdf_word import PDFToWordConverter
from pdf_json import PDFToJSONConverter
from pdf_excel import PDFToExcelConverter

app = Flask(__name__, static_folder='static')
CORS(app)

# Configure file size limits and folders
app.config['MAX_CONTENT_LENGTH'] = 100 * 1024 * 1024  # 100 MB limit
app.config['UPLOAD_FOLDER'] = 'uploads'
app.config['OUTPUT_FOLDER'] = 'outputs'
# Prefer a secret supplied by the environment; the placeholder is only a
# development fallback and must not be used in production.
app.config['SECRET_KEY'] = os.environ.get('SECRET_KEY', 'your-secret-key-here')

# Create necessary directories if they don't exist
os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
os.makedirs(app.config['OUTPUT_FOLDER'], exist_ok=True)

# Hugging Face API token: taken from the HF_TOKEN environment variable when
# set, otherwise the original placeholder value is used.
HF_TOKEN = os.environ.get('HF_TOKEN', 'Api_token')

# Define allowed file extensions for uploads
ALLOWED_EXTENSIONS = {'pdf'}

# Output file extension for every supported conversion format
OUTPUT_EXTENSIONS = {
    'html': '.html',
    'word': '.docx',
    'json': '.json',
    'excel': '.xlsx',
}

DEFAULT_OUTPUT_NAME = 'converted_file'


def allowed_file(filename):
    """Return True if *filename* has an extension listed in ALLOWED_EXTENSIONS."""
    return '.' in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS


def _remove_file(path, label):
    """Best-effort delete of *path*; a failure is reported but never raised."""
    try:
        os.remove(path)
    except FileNotFoundError:
        pass  # Nothing to clean up.
    except OSError as e:
        print(f"Warning: Could not remove {label} file {path}: {e}")


def _run_conversion(format_type, input_path, output_path):
    """Convert *input_path* into *output_path* and return the success message.

    *format_type* must be one of the keys of OUTPUT_EXTENSIONS. Any exception
    raised by the underlying converter propagates to the caller.
    """
    if format_type == 'html':
        converter = PDFToHTMLConverter(huggingface_token=HF_TOKEN)
        try:
            # First try with HF models
            converter.process_pdf(pdf_path=input_path, output_path=output_path,
                                  use_hf_models=True)
        except AttributeError as ae:
            if '_group_overlapping_text' not in str(ae):
                raise
            # Fall back to non-HF mode if the method is missing
            converter.process_pdf(pdf_path=input_path, output_path=output_path,
                                  use_hf_models=False)
        return "Successfully converted to HTML!"

    if format_type == 'word':
        converter = PDFToWordConverter(huggingface_token=HF_TOKEN)
        converter.process_pdf_to_word(pdf_path=input_path, output_path=output_path,
                                      use_hf_models=False)
        return "Successfully converted to Word!"

    if format_type == 'json':
        converter = PDFToJSONConverter(huggingface_token=HF_TOKEN)
        converter.process_pdf_to_json(pdf_path=input_path, output_path=output_path,
                                      use_hf_models=False)
        return "Successfully converted to JSON!"

    if format_type == 'excel':
        converter = PDFToExcelConverter(huggingface_token=HF_TOKEN)
        converter.process_pdf_to_excel(pdf_path=input_path, output_path=output_path,
                                       use_hf_models=False)
        return "Successfully converted to Excel!"

    raise ValueError(f'Unsupported format: {format_type}')


@app.route('/')
def serve_index():
    """Serves the main index.html file."""
    return send_from_directory('static', 'index.html')


# The URL variable is required: without it this rule collides with
# serve_index on '/' and the view can never receive ``filename``.
@app.route('/<path:filename>')
def serve_static(filename):
    """Serves other static files (CSS, JS, etc.)."""
    return send_from_directory('static', filename)


@app.route('/convert', methods=['POST'])
def convert_pdf():
    """Handle a PDF conversion request.

    Expects multipart form data with:
        file:        the PDF to convert.
        format:      one of 'html', 'word', 'json', 'excel'.
        output_name: optional base name of the result
                     (default 'converted_file').

    Returns a JSON body with 'success' plus either 'message' and
    'download_url' (HTTP 200) or 'error' (HTTP 400 for invalid requests,
    HTTP 500 when saving or converting fails).
    """
    input_path = None  # Set once the upload has a destination; drives cleanup.
    try:
        # Check if a file was included in the request
        if 'file' not in request.files:
            return jsonify({'success': False, 'error': 'No file uploaded.'}), 400

        file = request.files['file']
        format_type = request.form.get('format')
        output_name = request.form.get('output_name', DEFAULT_OUTPUT_NAME)

        # Validate file and format
        if file.filename == '':
            return jsonify({'success': False, 'error': 'No file selected.'}), 400
        if not format_type or format_type not in OUTPUT_EXTENSIONS:
            return jsonify({'success': False, 'error': 'Invalid format specified. Must be html, word, json, or excel.'}), 400
        if not allowed_file(file.filename):
            return jsonify({'success': False, 'error': 'Only PDF files are allowed.'}), 400

        # Securely save the uploaded file. secure_filename can return an
        # empty string for names made only of unsafe characters.
        filename_secured = secure_filename(file.filename) or 'upload.pdf'
        input_path = os.path.join(app.config['UPLOAD_FOLDER'], filename_secured)
        file.save(input_path)

        # output_name is client-controlled: sanitise it so separators or a
        # leading '/' cannot make os.path.join escape OUTPUT_FOLDER.
        output_stem = secure_filename(output_name.replace('.', '')) or DEFAULT_OUTPUT_NAME
        output_filename = f"{output_stem}{OUTPUT_EXTENSIONS[format_type]}"
        output_path = os.path.join(app.config['OUTPUT_FOLDER'], output_filename)

        try:
            success_message = _run_conversion(format_type, input_path, output_path)
        except Exception:
            # Clean up the (possibly partial) output file if conversion failed
            _remove_file(output_path, 'output')
            raise

        # Return success response with download URL
        return jsonify({
            'success': True,
            'message': success_message,
            'download_url': f'/download/{output_filename}'
        }), 200

    except Exception as e:
        traceback.print_exc()
        error_msg = str(e)
        if '_group_overlapping_text' in error_msg:
            error_msg = "HTML conversion failed due to incompatible converter version. Please try another format."
        return jsonify({
            'success': False,
            'error': f'Conversion failed: {error_msg}'
        }), 500

    finally:
        # The uploaded input is temporary on both the success and error paths.
        if input_path is not None:
            _remove_file(input_path, 'input')


# The URL variable is required so the '/download/<name>' links produced by
# /convert resolve to this view.
@app.route('/download/<filename>')
def download_file(filename):
    """Allows downloading of converted files."""
    try:
        file_path = os.path.join(app.config['OUTPUT_FOLDER'], filename)
        # isfile (not exists) so directory names such as '..' yield a 404.
        if os.path.isfile(file_path):
            return send_from_directory(app.config['OUTPUT_FOLDER'], filename, as_attachment=True)
        return jsonify({'error': 'File not found.'}), 404
    except Exception as e:
        traceback.print_exc()
        return jsonify({'error': str(e)}), 500


@app.route('/health')
def health_check():
    """Simple health check endpoint."""
    return jsonify({'status': 'healthy', 'message': 'PDF Converter API is running.'}), 200


@app.errorhandler(413)
def too_large(e):
    """Handles file too large errors."""
    return jsonify({'success': False, 'error': 'File too large. Maximum size is 100MB.'}), 413


@app.errorhandler(500)
def internal_error(e):
    """Handles general internal server errors."""
    traceback.print_exc()
    return jsonify({'success': False, 'error': 'Internal server error occurred.'}), 500


if __name__ == '__main__':
    # The interactive debugger allows arbitrary code execution, so it must be
    # opt-in when the server listens on all interfaces.
    debug_enabled = os.environ.get('FLASK_DEBUG', '').lower() in {'1', 'true', 'yes'}
    app.run(debug=debug_enabled, host='0.0.0.0', port=5000)