Spaces:

SlouchyBuffalo
/

pages-converter-pro

Running on Zero

File size: 13,681 Bytes

# app.py - Corrected CloudConvert API Integration
import gradio as gr
import os
import spaces
import tempfile
import requests
import time
from huggingface_hub import InferenceClient
from pathlib import Path

# API credentials are read once from the environment at import time.
hf_token = os.getenv("HF_TOKEN")
# Strip stray whitespace (e.g. a trailing newline pasted into the secret) and
# collapse a missing or blank value to None, so the debug line below agrees
# with the truthiness checks used elsewhere in this file.
cloudconvert_token = (os.getenv("CLOUDCONVERT_API_KEY") or "").strip() or None
# Only report presence; never print the secret values themselves.
print(f"Debug: HF Token exists = {hf_token is not None}")
print(f"Debug: CloudConvert Token exists = {cloudconvert_token is not None}")

# Shared inference client: Llama 3.3 70B Instruct routed through the
# "cerebras" provider, authenticated with the HF token read above.
client = InferenceClient(
    model="meta-llama/Llama-3.3-70B-Instruct",
    provider="cerebras",
    token=hf_token,
)

def convert_pages_to_text(file_path, api_key, *, request_timeout=30,
                          poll_interval=2, max_attempts=30):
    """Convert a .pages file to plain text using the CloudConvert v2 jobs API.

    The function creates one job with three chained tasks (import/upload ->
    convert pages to txt -> export/url), uploads the file to the form returned
    for the upload task, polls the job until it reports "finished" or "error",
    then downloads the first exported file and returns its text.

    Args:
        file_path: Path of the local .pages file to upload.
        api_key: CloudConvert API key, sent as a Bearer token.
        request_timeout: Timeout in seconds applied to every HTTP request so a
            stalled connection cannot block the caller forever.
        poll_interval: Seconds to sleep between job status checks.
        max_attempts: Maximum number of status checks before giving up
            (the defaults wait roughly one minute in total).

    Returns:
        The converted document as a string.

    Raises:
        Exception: "CloudConvert HTTP error: ..." when a request fails, or
            "CloudConvert error: ..." for any other failure (missing task,
            job error, polling timeout). The original exception is chained
            as ``__cause__``.
    """
    base_url = "https://api.cloudconvert.com/v2"
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }

    try:
        # Step 1: Create a job; each task names the task it takes input from.
        job_data = {
            "tasks": {
                "import-file": {
                    "operation": "import/upload"
                },
                "convert-file": {
                    "operation": "convert",
                    "input": "import-file",
                    "input_format": "pages",
                    "output_format": "txt"
                },
                "export-file": {
                    "operation": "export/url",
                    "input": "convert-file"
                }
            }
        }

        print("Creating CloudConvert job...")
        response = requests.post(
            f"{base_url}/jobs", headers=headers, json=job_data,
            timeout=request_timeout,
        )
        print(f"Job creation response: {response.status_code}")

        if not response.ok:
            print(f"Job creation failed: {response.text}")
            response.raise_for_status()

        job = response.json()
        job_id = job["data"]["id"]
        print(f"Job created successfully: {job_id}")

        # Step 2: Upload the file to the form attached to the upload task.
        upload_task = next(
            (t for t in job["data"]["tasks"] if t["operation"] == "import/upload"),
            None,
        )
        if not upload_task:
            raise Exception("Upload task not found in job")

        # Fail with a readable message instead of a bare KeyError/TypeError
        # when the task carries no upload form.
        upload_form = (upload_task.get("result") or {}).get("form")
        if not upload_form:
            raise Exception("Upload task has no upload form")
        upload_url = upload_form["url"]
        form_data = upload_form["parameters"]

        print("Uploading file to CloudConvert...")
        with open(file_path, 'rb') as f:
            # No API headers here: the upload goes to the form URL with the
            # form's own parameters.
            upload_response = requests.post(
                upload_url, data=form_data, files={"file": f},
                timeout=request_timeout,
            )

        if not upload_response.ok:
            print(f"Upload failed: {upload_response.text}")
            upload_response.raise_for_status()

        print("File uploaded successfully")

        # Step 3: Poll until the job finishes, errors out, or we run out of attempts.
        print(f"Waiting for job {job_id} to complete...")

        for _ in range(max_attempts):
            status_response = requests.get(
                f"{base_url}/jobs/{job_id}", headers=headers,
                timeout=request_timeout,
            )
            status_response.raise_for_status()
            job_status = status_response.json()
            status = job_status["data"]["status"]

            print(f"Job status: {status}")

            if status == "finished":
                print("Conversion completed successfully")
                break
            if status == "error":
                error_msg = job_status['data'].get('message', 'Unknown error')
                print(f"Conversion failed: {error_msg}")

                # Log task-level errors too; they usually carry the real cause.
                for task in job_status.get('data', {}).get('tasks', []):
                    if task.get('status') == 'error':
                        task_error = task.get('message', 'Unknown task error')
                        print(f"Task {task.get('operation')} error: {task_error}")

                raise Exception(f"Conversion failed: {error_msg}")

            time.sleep(poll_interval)
        else:
            # for/else: reached only when the loop never hit `break`.
            raise Exception("Conversion timeout - job took too long")

        # Step 4: Download the converted text from the finished export task.
        for task in job_status["data"]["tasks"]:
            if task["operation"] == "export/url" and task["status"] == "finished":
                download_url = task["result"]["files"][0]["url"]
                print(f"Downloading result from: {download_url}")

                download_response = requests.get(download_url, timeout=request_timeout)
                download_response.raise_for_status()

                # requests falls back to ISO-8859-1 for text/* responses that
                # declare no charset, which garbles UTF-8 documents. Honour a
                # declared charset; otherwise detect it from the body.
                content_type = download_response.headers.get("Content-Type", "")
                if "charset" not in content_type.lower():
                    download_response.encoding = (
                        download_response.apparent_encoding or "utf-8"
                    )

                text_content = download_response.text
                print(f"Downloaded {len(text_content)} characters")
                return text_content

        raise Exception("No converted file found in completed job")

    except requests.exceptions.RequestException as e:
        print(f"HTTP error: {e}")
        raise Exception(f"CloudConvert HTTP error: {str(e)}") from e
    except Exception as e:
        print(f"General error: {e}")
        raise Exception(f"CloudConvert error: {str(e)}") from e

@spaces.GPU
def convert_pages_document(file, output_format, progress=gr.Progress()):
    """Convert an uploaded Pages document using CloudConvert + Cerebras.

    Pipeline: extract plain text with ``convert_pages_to_text``, ask the
    module-level ``client`` to reformat it for ``output_format``, then write
    the result with ``create_output_file``.

    Args:
        file: The uploaded file from the Gradio file input. Either an object
            exposing the local path as ``.name`` or a plain path string.
        output_format: Target format label, e.g. "PDF", "DOCX", "TXT",
            "HTML" or "Markdown".
        progress: Gradio progress tracker used to report pipeline stages.

    Returns:
        A ``(output_path, status_message)`` tuple. ``output_path`` is None on
        failure. ``status_message`` is shown in an HTML component, so any
        exception text embedded in it is HTML-escaped.
    """
    # Local import keeps this function self-contained; only used for escaping.
    import html

    if not file:
        return None, "❌ Please upload a .pages file"

    if not cloudconvert_token:
        return None, "❌ CloudConvert API key not configured. Please add CLOUDCONVERT_API_KEY to secrets."

    try:
        progress(0.1, desc="📤 Converting with CloudConvert...")

        # Accept both a file-like object carrying `.name` and a bare path string.
        input_path = getattr(file, "name", file)

        # Use CloudConvert to extract text from .pages file
        print(f"Converting file: {input_path}")
        text_content = convert_pages_to_text(input_path, cloudconvert_token)

        # Treat near-empty output as a failed extraction.
        if not text_content or len(text_content.strip()) < 10:
            return None, "❌ Could not extract content from .pages file"

        print(f"Extracted text preview: {text_content[:200]}...")

        progress(0.5, desc="🤖 Converting format with Cerebras AI...")

        # Create format-specific prompt
        prompt = create_conversion_prompt(text_content, output_format)

        progress(0.7, desc="⚡ Processing with ZeroGPU...")

        # Convert using Cerebras; handled separately so the user can tell an
        # AI failure apart from a CloudConvert or file-writing failure.
        try:
            messages = [{"role": "user", "content": prompt}]
            response = client.chat_completion(
                messages=messages,
                max_tokens=4096,
                temperature=0.1
            )
            converted_text = response.choices[0].message.content
        except Exception as e:
            print(f"Cerebras error: {e}")
            # Escape: exception text often contains "<...>" fragments that an
            # HTML component would otherwise swallow as tags.
            return None, f"❌ AI conversion error: {html.escape(str(e))}"

        progress(0.9, desc="💾 Creating output file...")

        # Create output file
        output_path = create_output_file(converted_text, output_format)

        progress(1.0, desc="✅ Conversion complete!")

        return output_path, f"✅ Successfully converted to {output_format}!"

    except Exception as e:
        # Top-level boundary for the UI callback: report instead of crashing.
        print(f"Conversion error: {e}")
        return None, f"❌ Error: {html.escape(str(e))}"

def create_conversion_prompt(content, output_format):
    """Build the LLM prompt asking for *content* to be reformatted as *output_format*.

    The prompt states the formatter role, lists three rules that forbid
    dropping or summarizing content, embeds the original text verbatim, and
    ends with a cue line for the formatted output.
    """
    prompt_lines = [
        f"You are a document formatter. Convert the following text to {output_format} format.",
        "",
        "IMPORTANT:",
        "1. Keep ALL original content - do not summarize or remove text",
        f"2. Only adjust formatting for {output_format}",
        "3. Preserve all important information, names, and details",
        "",
        "Original text:",
        f"{content}",
        "",
        f"Formatted {output_format} output:",
    ]
    return "\n".join(prompt_lines)

def _reserve_temp_path(suffix):
    """Create an empty temp file ending in *suffix*, close it, and return its path."""
    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as handle:
        return handle.name


def create_output_file(content, output_format):
    """Write *content* to a new temporary file in the requested format.

    Args:
        content: Text to write; leading and trailing whitespace is stripped.
        output_format: "PDF" and "DOCX" are rendered with reportlab and
            python-docx respectively. "TXT", "HTML" and "Markdown" are written
            as UTF-8 text with the matching extension. Any other value falls
            back to a ".txt" file.

    Returns:
        Path of the created file. The file is not deleted automatically.
    """
    content = content.strip()

    if output_format == "PDF":
        from reportlab.pdfgen import canvas
        from reportlab.lib.pagesizes import letter
        import textwrap

        # Reserve the path and close our handle first: reportlab reopens the
        # file by name, which is not portable while a handle is still open.
        output_path = _reserve_temp_path('.pdf')
        pdf = canvas.Canvas(output_path, pagesize=letter)
        width, height = letter
        y = height - 50  # start 50pt below the top edge

        # Blank-line-separated chunks become paragraphs, wrapped at 90 chars.
        for paragraph in content.split('\n\n'):
            paragraph = paragraph.strip()
            if not paragraph:
                continue
            for line in textwrap.wrap(paragraph, width=90):
                if y < 50:
                    # Bottom margin reached: start a new page at the top.
                    pdf.showPage()
                    y = height - 50
                pdf.drawString(50, y, line)
                y -= 20
            y -= 10  # Space between paragraphs

        pdf.save()
        return output_path

    if output_format == "DOCX":
        from docx import Document

        # Same reasoning as for PDF: python-docx opens the path itself.
        output_path = _reserve_temp_path('.docx')
        doc = Document()

        # One document paragraph per blank-line-separated chunk.
        for paragraph in content.split('\n\n'):
            paragraph = paragraph.strip()
            if paragraph:
                doc.add_paragraph(paragraph)

        doc.save(output_path)
        return output_path

    # For TXT, HTML, Markdown (and any unknown format, which falls back to .txt)
    ext_map = {"TXT": ".txt", "HTML": ".html", "Markdown": ".md"}
    ext = ext_map.get(output_format, ".txt")

    with tempfile.NamedTemporaryFile(mode='w', suffix=ext, delete=False, encoding='utf-8') as f:
        f.write(content)
    return f.name

# Create the Gradio interface
# NOTE: components are declared in the order they should appear in the layout,
# so the statement order inside this block is significant.
with gr.Blocks(title="Pages Converter Pro - CloudConvert", theme=gr.themes.Soft()) as app:
    # Header banner (static HTML)
    gr.HTML("""
    <div style="text-align: center; padding: 2rem; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; border-radius: 1rem; margin-bottom: 2rem;">
        <h1>📄 Pages Converter Pro</h1>
        <p>Convert Apple Pages documents using CloudConvert + Cerebras AI</p>
        <p style="font-size: 0.9em; opacity: 0.9;">✨ Professional .pages parsing + AI-powered format conversion</p>
    </div>
    """)
    
    # Status indicator: green when cloudconvert_token is truthy, red otherwise.
    # This only reflects whether a key was configured at import time; no
    # request is made here to verify the key.
    with gr.Row():
        gr.HTML(f"""
        <div style="background: {'#d4edda' if cloudconvert_token else '#f8d7da'}; color: {'#155724' if cloudconvert_token else '#721c24'}; padding: 1rem; border-radius: 0.5rem; text-align: center;">
            <strong>CloudConvert API:</strong> {'✅ Connected and Ready' if cloudconvert_token else '❌ API Key Missing'}
        </div>
        """)
    
    # Main interface: a wider column with the inputs (scale=2) next to a
    # narrower informational column (scale=1).
    with gr.Row():
        with gr.Column(scale=2):
            gr.HTML("<h3>📎 Upload & Convert</h3>")
            
            # Upload widget, configured with file_types=[".pages"]
            file_input = gr.File(
                label="Select .pages file",
                file_types=[".pages"]
            )
            
            # Target format; these labels are the values passed to
            # convert_pages_document as output_format.
            output_format = gr.Radio(
                choices=["PDF", "DOCX", "TXT", "HTML", "Markdown"],
                value="PDF",
                label="🎯 Output Format"
            )
            
            convert_btn = gr.Button(
                "🚀 Convert Document",
                variant="primary",
                size="lg"
            )
        
        with gr.Column(scale=1):
            # Static feature list and "how it works" panel
            gr.HTML("""
            <div style="background: white; padding: 1.5rem; border-radius: 1rem; box-shadow: 0 5px 15px rgba(0,0,0,0.1);">
                <h3>✨ Features</h3>
                <ul style="color: #666;">
                    <li>✅ <strong>100% reliable</strong> .pages parsing</li>
                    <li>⚡ ZeroGPU acceleration</li>
                    <li>🤖 AI-powered formatting</li>
                    <li>🎨 Professional output quality</li>
                    <li>🔒 Secure processing</li>
                </ul>
                
                <div style="background: #f5f5f5; padding: 1rem; border-radius: 0.5rem; margin-top: 1rem;">
                    <h4 style="margin-top: 0;">💡 How it works:</h4>
                    <ol style="font-size: 0.9em; color: #555; margin-bottom: 0;">
                        <li>CloudConvert extracts text from .pages</li>
                        <li>Cerebras AI formats for your chosen output</li>
                        <li>Download your professionally converted file</li>
                    </ol>
                </div>
            </div>
            """)
    
    # Output section: receives the file path returned by convert_pages_document
    with gr.Row():
        output_file = gr.File(
            label="📁 Download Your Converted File"
        )
    
    # Status area: receives the status message returned by
    # convert_pages_document and renders it as HTML.
    with gr.Row():
        status_html = gr.HTML(
            value="<div style='text-align: center; padding: 1rem; color: #666; background: #f8f9fa; border-radius: 0.5rem;'>Upload a .pages file to get started</div>"
        )
    
    # Connect the interface: the button click passes (file, format) to
    # convert_pages_document and maps its (path, message) return tuple onto
    # the download widget and the status area, in that order.
    convert_btn.click(
        fn=convert_pages_document,
        inputs=[file_input, output_format],
        outputs=[output_file, status_html],
        show_progress=True
    )
    
    # Footer (static HTML)
    gr.HTML("""
    <div style="text-align: center; margin-top: 2rem; padding: 1rem; background: #f8f9fa; border-radius: 0.5rem;">
        <p style="margin-bottom: 0.5rem;">🔧 <strong>Technical Stack:</strong></p>
        <p style="font-size: 0.9em; color: #666; margin-bottom: 0;">
            CloudConvert API for reliable .pages parsing • HuggingFace ZeroGPU for AI processing • Cerebras for lightning-fast inference
        </p>
    </div>
    """)

# Launch the app only when this file is run as a script, not when imported.
if __name__ == "__main__":
    app.launch()