File size: 7,416 Bytes
96c003e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
from flask import Flask, request, jsonify, send_file, send_from_directory
from flask_cors import CORS
from werkzeug.utils import secure_filename
import os
import traceback
from pdf_html import PDFToHTMLConverter
from pdf_word import PDFToWordConverter
from pdf_json import PDFToJSONConverter
from pdf_excel import PDFToExcelConverter

app = Flask(__name__, static_folder='static')
CORS(app)

# Configure file size limits and folders
app.config['MAX_CONTENT_LENGTH'] = 100 * 1024 * 1024  # 100 MB limit
app.config['UPLOAD_FOLDER'] = 'uploads'
app.config['OUTPUT_FOLDER'] = 'outputs'
# Prefer a secret supplied through the SECRET_KEY environment variable so the
# placeholder below is only ever used for local development.
app.config['SECRET_KEY'] = os.environ.get('SECRET_KEY', 'your-secret-key-here')

# Create necessary directories if they don't exist
os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
os.makedirs(app.config['OUTPUT_FOLDER'], exist_ok=True)

# Hugging Face API token: taken from the HF_TOKEN environment variable when
# set, otherwise the original placeholder value is kept.
HF_TOKEN = os.environ.get('HF_TOKEN', 'Api_token')

# Define allowed file extensions for uploads
ALLOWED_EXTENSIONS = {'pdf'}

def allowed_file(filename):
    """Return True when the text after the last '.' in filename is an allowed extension.

    The comparison is case-insensitive. Names without any '.' are rejected.
    """
    if '.' not in filename:
        return False
    _, _, extension = filename.rpartition('.')
    return extension.lower() in ALLOWED_EXTENSIONS

@app.route('/')
def serve_index():
    """Respond to the root URL with index.html from the 'static' directory."""
    static_dir, entry_page = 'static', 'index.html'
    return send_from_directory(static_dir, entry_page)

@app.route('/<path:filename>')
def serve_static(filename):
    """Respond with the file at the requested path inside the 'static' directory."""
    asset_response = send_from_directory('static', filename)
    return asset_response

# Output file extension for each supported conversion format.
_OUTPUT_EXTENSIONS = {
    'html': '.html',
    'word': '.docx',
    'json': '.json',
    'excel': '.xlsx',
}


def _remove_file_quietly(path, label):
    """Delete path if it exists; print a warning instead of raising on failure."""
    try:
        os.remove(path)
    except FileNotFoundError:
        pass  # Nothing to clean up.
    except OSError as e:
        print(f"Warning: Could not remove {label} file {path}: {e}")


def _run_conversion(format_type, input_path, output_path):
    """Run the converter selected by format_type and return the success message.

    Any exception raised by a converter propagates to the caller.
    """
    if format_type == 'html':
        converter = PDFToHTMLConverter(huggingface_token=HF_TOKEN)
        try:
            # First try with HF models
            converter.process_pdf(pdf_path=input_path, output_path=output_path, use_hf_models=True)
        except AttributeError as ae:
            if '_group_overlapping_text' not in str(ae):
                raise
            # Fall back to non-HF mode if the method is missing
            converter.process_pdf(pdf_path=input_path, output_path=output_path, use_hf_models=False)
        return "Successfully converted to HTML!"
    if format_type == 'word':
        converter = PDFToWordConverter(huggingface_token=HF_TOKEN)
        converter.process_pdf_to_word(pdf_path=input_path, output_path=output_path, use_hf_models=False)
        return "Successfully converted to Word!"
    if format_type == 'json':
        converter = PDFToJSONConverter(huggingface_token=HF_TOKEN)
        converter.process_pdf_to_json(pdf_path=input_path, output_path=output_path, use_hf_models=False)
        return "Successfully converted to JSON!"
    if format_type == 'excel':
        converter = PDFToExcelConverter(huggingface_token=HF_TOKEN)
        converter.process_pdf_to_excel(pdf_path=input_path, output_path=output_path, use_hf_models=False)
        return "Successfully converted to Excel!"
    return ""


@app.route('/convert', methods=['POST'])
def convert_pdf():
    """
    Handles PDF conversion requests.

    Expects multipart form data with a 'file' (PDF), a 'format'
    (html, word, json, excel) and an optional 'output_name'
    (defaults to 'converted_file').

    Returns a JSON body and status code:
      * 200 with 'success', 'message' and 'download_url' on success;
      * 400 with 'success' and 'error' when the request is invalid;
      * 500 with 'success' and 'error' when the conversion fails.
    HTTP errors raised by the framework while reading the request (for
    example 413 for an oversized upload) are re-raised so the registered
    error handlers produce the response.
    """
    # Local import keeps this block self-contained; werkzeug is already a
    # dependency of this file.
    from werkzeug.exceptions import HTTPException

    input_path = None
    try:
        # Check if a file was included in the request
        if 'file' not in request.files:
            return jsonify({'success': False, 'error': 'No file uploaded.'}), 400

        file = request.files['file']
        format_type = request.form.get('format')
        output_name = request.form.get('output_name', 'converted_file')

        # Validate file and format
        if not file.filename:
            return jsonify({'success': False, 'error': 'No file selected.'}), 400

        if format_type not in _OUTPUT_EXTENSIONS:
            return jsonify({'success': False, 'error': 'Invalid format specified. Must be html, word, json, or excel.'}), 400

        if not allowed_file(file.filename):
            return jsonify({'success': False, 'error': 'Only PDF files are allowed.'}), 400

        # Securely save the uploaded file
        filename_secured = secure_filename(file.filename)
        input_path = os.path.join(app.config['UPLOAD_FOLDER'], filename_secured)
        file.save(input_path)

        # output_name comes from the client: strip dots as before, then
        # sanitize it so path separators or an absolute path cannot place the
        # output outside OUTPUT_FOLDER. Fall back to the default if nothing
        # usable is left.
        output_stem = secure_filename(output_name.replace('.', '')) or 'converted_file'
        output_filename = f"{output_stem}{_OUTPUT_EXTENSIONS[format_type]}"
        output_path = os.path.join(app.config['OUTPUT_FOLDER'], output_filename)

        try:
            success_message = _run_conversion(format_type, input_path, output_path)
        except Exception:
            # Clean up the output file if conversion failed
            _remove_file_quietly(output_path, 'output')
            raise

        # Return success response with download URL
        return jsonify({
            'success': True,
            'message': success_message,
            'download_url': f'/download/{output_filename}'
        }), 200

    except HTTPException:
        # Let the framework's error handlers (e.g. 413) build the response.
        raise
    except Exception as e:
        traceback.print_exc()
        error_msg = str(e)
        if '_group_overlapping_text' in error_msg:
            error_msg = "HTML conversion failed due to incompatible converter version. Please try another format."
        return jsonify({
            'success': False,
            'error': f'Conversion failed: {error_msg}'
        }), 500
    finally:
        # The uploaded input file is removed on every path once it may exist.
        if input_path is not None:
            _remove_file_quietly(input_path, 'input')

@app.route('/download/<filename>')
def download_file(filename):
    """Send a converted file from the output folder as an attachment.

    Returns the file response on success, a JSON 404 when the file does not
    exist (or the name is rejected by send_from_directory), and a JSON 500
    with the error text for any other failure.
    """
    # Local import keeps this block self-contained; werkzeug is already a
    # dependency of this file.
    from werkzeug.exceptions import NotFound

    try:
        # send_from_directory performs the existence and path-safety checks
        # itself, so no separate os.path.exists probe is needed.
        return send_from_directory(app.config['OUTPUT_FOLDER'], filename, as_attachment=True)
    except NotFound:
        return jsonify({'error': 'File not found.'}), 404
    except Exception as e:
        traceback.print_exc()
        return jsonify({'error': str(e)}), 500

@app.route('/health')
def health_check():
    """Report that the service is up with a fixed JSON body and status 200."""
    payload = {'status': 'healthy', 'message': 'PDF Converter API is running.'}
    return jsonify(payload), 200

@app.errorhandler(413)
def too_large(e):
    """Answer HTTP 413 errors with a JSON body describing the size limit."""
    body = jsonify({'success': False, 'error': 'File too large. Maximum size is 100MB.'})
    return body, 413

@app.errorhandler(500)
def internal_error(e):
    """Answer HTTP 500 errors with a generic JSON body after printing the traceback."""
    traceback.print_exc()
    body = jsonify({'success': False, 'error': 'Internal server error occurred.'})
    return body, 500

if __name__ == '__main__':
    # Debug mode enables the interactive Werkzeug debugger, which lets anyone
    # who can reach the server execute code. Because the server listens on all
    # interfaces, debug is opt-in via FLASK_DEBUG instead of always on.
    debug_enabled = os.environ.get('FLASK_DEBUG', '').lower() in ('1', 'true', 'yes', 'on')
    app.run(debug=debug_enabled, host='0.0.0.0', port=5000)