Spaces:

SlouchyBuffalo
/

pages-converter-pro

Sleeping

App Files Files Community

SlouchyBuffalo committed on May 12

Commit

393e4f2

verified ·

1 Parent(s): b60ce77

Create app.py

Browse files

Files changed (1) hide show

app.py +357 -0

app.py ADDED Viewed

	@@ -0,0 +1,357 @@

+# app.py - Corrected CloudConvert API Integration
+import gradio as gr
+import os
+import spaces
+import tempfile
+import requests
+import time
+from huggingface_hub import InferenceClient
+from pathlib import Path
+# Debug tokens
+hf_token = os.getenv("HF_TOKEN")
+cloudconvert_token = os.getenv("CLOUDCONVERT_API_KEY").strip() if os.getenv("CLOUDCONVERT_API_KEY") else None
+print(f"Debug: HF Token exists = {hf_token is not None}")
+print(f"Debug: CloudConvert Token exists = {cloudconvert_token is not None}")
+# Initialize the client with Cerebras
+client = InferenceClient(
+    "meta-llama/Llama-3.3-70B-Instruct",
+    provider="cerebras",
+    token=hf_token
+)
+def convert_pages_to_text(file_path, api_key):
+    """Convert .pages file to text using CloudConvert API - Correct Format"""
+    base_url = "https://api.cloudconvert.com/v2"
+    headers = {
+        "Authorization": f"Bearer {api_key}",
+        "Content-Type": "application/json"
+    }
+    try:
+        # Step 1: Create a job with correct task structure
+        job_data = {
+            "tasks": {
+                "import-file": {
+                    "operation": "import/upload"
+                },
+                "convert-file": {
+                    "operation": "convert",
+                    "input": "import-file",
+                    "input_format": "pages",
+                    "output_format": "txt"
+                },
+                "export-file": {
+                    "operation": "export/url",
+                    "input": "convert-file"
+                }
+            }
+        }
+        print("Creating CloudConvert job...")
+        response = requests.post(f"{base_url}/jobs", headers=headers, json=job_data)
+        print(f"Job creation response: {response.status_code}")
+        if not response.ok:
+            print(f"Job creation failed: {response.text}")
+            response.raise_for_status()
+        job = response.json()
+        print(f"Job created successfully: {job['data']['id']}")
+        # Step 2: Upload the file
+        upload_task = None
+        for task in job["data"]["tasks"]:
+            if task["operation"] == "import/upload":
+                upload_task = task
+                break
+        if not upload_task:
+            raise Exception("Upload task not found in job")
+        upload_url = upload_task["result"]["form"]["url"]
+        form_data = upload_task["result"]["form"]["parameters"]
+        print("Uploading file to CloudConvert...")
+        with open(file_path, 'rb') as f:
+            files = {"file": f}
+            upload_response = requests.post(upload_url, data=form_data, files=files)
+        if not upload_response.ok:
+            print(f"Upload failed: {upload_response.text}")
+            upload_response.raise_for_status()
+        print("File uploaded successfully")
+        # Step 3: Wait for conversion to complete
+        job_id = job["data"]["id"]
+        print(f"Waiting for job {job_id} to complete...")
+        max_attempts = 30  # Wait up to 1 minute
+        for attempt in range(max_attempts):
+            status_response = requests.get(f"{base_url}/jobs/{job_id}", headers=headers)
+            status_response.raise_for_status()
+            job_status = status_response.json()
+            print(f"Job status: {job_status['data']['status']}")
+            if job_status["data"]["status"] == "finished":
+                print("Conversion completed successfully")
+                break
+            elif job_status["data"]["status"] == "error":
+                error_msg = job_status['data'].get('message', 'Unknown error')
+                print(f"Conversion failed: {error_msg}")
+                # Check task-level errors
+                for task in job_status.get('data', {}).get('tasks', []):
+                    if task.get('status') == 'error':
+                        task_error = task.get('message', 'Unknown task error')
+                        print(f"Task {task.get('operation')} error: {task_error}")
+                raise Exception(f"Conversion failed: {error_msg}")
+            time.sleep(2)  # Wait 2 seconds before checking again
+        else:
+            raise Exception("Conversion timeout - job took too long")
+        # Step 4: Download the converted text
+        for task in job_status["data"]["tasks"]:
+            if task["operation"] == "export/url" and task["status"] == "finished":
+                download_url = task["result"]["files"][0]["url"]
+                print(f"Downloading result from: {download_url}")
+                download_response = requests.get(download_url)
+                download_response.raise_for_status()
+                text_content = download_response.text
+                print(f"Downloaded {len(text_content)} characters")
+                return text_content
+        raise Exception("No converted file found in completed job")
+    except requests.exceptions.RequestException as e:
+        print(f"HTTP error: {e}")
+        raise Exception(f"CloudConvert HTTP error: {str(e)}")
+    except Exception as e:
+        print(f"General error: {e}")
+        raise Exception(f"CloudConvert error: {str(e)}")
+@spaces.GPU
+def convert_pages_document(file, output_format, progress=gr.Progress()):
+    """Convert Pages document using CloudConvert + Cerebras"""
+    if not file:
+        return None, "❌ Please upload a .pages file"
+    if not cloudconvert_token:
+        return None, "❌ CloudConvert API key not configured. Please add CLOUDCONVERT_API_KEY to secrets."
+    try:
+        progress(0.1, desc="📤 Converting with CloudConvert...")
+        # Use CloudConvert to extract text from .pages file
+        print(f"Converting file: {file.name}")
+        text_content = convert_pages_to_text(file.name, cloudconvert_token)
+        if not text_content or len(text_content.strip()) < 10:
+            return None, "❌ Could not extract content from .pages file"
+        print(f"Extracted text preview: {text_content[:200]}...")
+        progress(0.5, desc="🤖 Converting format with Cerebras AI...")
+        # Create format-specific prompt
+        prompt = create_conversion_prompt(text_content, output_format)
+        progress(0.7, desc="⚡ Processing with ZeroGPU...")
+        # Convert using Cerebras
+        try:
+            messages = [{"role": "user", "content": prompt}]
+            response = client.chat_completion(
+                messages=messages,
+                max_tokens=4096,
+                temperature=0.1
+            )
+            converted_text = response.choices[0].message.content
+        except Exception as e:
+            print(f"Cerebras error: {e}")
+            return None, f"❌ AI conversion error: {str(e)}"
+        progress(0.9, desc="💾 Creating output file...")
+        # Create output file
+        output_path = create_output_file(converted_text, output_format)
+        progress(1.0, desc="✅ Conversion complete!")
+        return output_path, f"✅ Successfully converted to {output_format}!"
+    except Exception as e:
+        print(f"Conversion error: {e}")
+        return None, f"❌ Error: {str(e)}"
+def create_conversion_prompt(content, output_format):
+    """Create optimized prompt for format conversion"""
+    return f"""You are a document formatter. Convert the following text to {output_format} format.
+IMPORTANT:
+1. Keep ALL original content - do not summarize or remove text
+2. Only adjust formatting for {output_format}
+3. Preserve all important information, names, and details
+Original text:
+{content}
+Formatted {output_format} output:"""
+def create_output_file(content, output_format):
+    """Create output file in specified format"""
+    content = content.strip()
+    if output_format == "PDF":
+        from reportlab.pdfgen import canvas
+        from reportlab.lib.pagesizes import letter
+        import textwrap
+        with tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) as f:
+            pdf = canvas.Canvas(f.name, pagesize=letter)
+            width, height = letter
+            y = height - 50
+            # Better paragraph handling
+            paragraphs = content.split('\n\n')
+            for paragraph in paragraphs:
+                if paragraph.strip():
+                    lines = textwrap.wrap(paragraph.strip(), width=90)
+                    for line in lines:
+                        if y < 50:
+                            pdf.showPage()
+                            y = height - 50
+                        pdf.drawString(50, y, line)
+                        y -= 20
+                    y -= 10  # Space between paragraphs
+            pdf.save()
+            return f.name
+    elif output_format == "DOCX":
+        from docx import Document
+        with tempfile.NamedTemporaryFile(suffix='.docx', delete=False) as f:
+            doc = Document()
+            # Add paragraphs
+            paragraphs = content.split('\n\n')
+            for paragraph in paragraphs:
+                if paragraph.strip():
+                    doc.add_paragraph(paragraph.strip())
+            doc.save(f.name)
+            return f.name
+    else:
+        # For TXT, HTML, Markdown
+        ext_map = {"TXT": ".txt", "HTML": ".html", "Markdown": ".md"}
+        ext = ext_map.get(output_format, ".txt")
+        with tempfile.NamedTemporaryFile(mode='w', suffix=ext, delete=False, encoding='utf-8') as f:
+            f.write(content)
+            return f.name
+# Create the Gradio interface.
+# Components appear in the order they are created inside the Blocks context,
+# so the statement order below is the page layout: header, API status banner,
+# upload/convert column next to a features panel, outputs, footer.
+with gr.Blocks(title="Pages Converter Pro - CloudConvert", theme=gr.themes.Soft()) as app:
+    # Header
+    gr.HTML("""
+    <div style="text-align: center; padding: 2rem; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; border-radius: 1rem; margin-bottom: 2rem;">
+        <h1>📄 Pages Converter Pro</h1>
+        <p>Convert Apple Pages documents using CloudConvert + Cerebras AI</p>
+        <p style="font-size: 0.9em; opacity: 0.9;">✨ Professional .pages parsing + AI-powered format conversion</p>
+    </div>
+    """)
+    # Status indicator: colors and text depend on whether cloudconvert_token is
+    # set. The f-string is evaluated once, when the UI is built.
+    with gr.Row():
+        gr.HTML(f"""
+        <div style="background: {'#d4edda' if cloudconvert_token else '#f8d7da'}; color: {'#155724' if cloudconvert_token else '#721c24'}; padding: 1rem; border-radius: 0.5rem; text-align: center;">
+            <strong>CloudConvert API:</strong> {'✅ Connected and Ready' if cloudconvert_token else '❌ API Key Missing'}
+        </div>
+        """)
+    # Main interface: wider input column (scale=2) beside an info panel (scale=1)
+    with gr.Row():
+        with gr.Column(scale=2):
+            gr.HTML("<h3>📎 Upload & Convert</h3>")
+            # Upload widget configured for .pages files
+            file_input = gr.File(
+                label="Select .pages file",
+                file_types=[".pages"]
+            )
+            # These labels are the output_format values handed to
+            # convert_pages_document and on to create_output_file
+            output_format = gr.Radio(
+                choices=["PDF", "DOCX", "TXT", "HTML", "Markdown"],
+                value="PDF",
+                label="🎯 Output Format"
+            )
+            convert_btn = gr.Button(
+                "🚀 Convert Document",
+                variant="primary",
+                size="lg"
+            )
+        with gr.Column(scale=1):
+            # Static marketing/help panel; no runtime values are interpolated
+            gr.HTML("""
+            <div style="background: white; padding: 1.5rem; border-radius: 1rem; box-shadow: 0 5px 15px rgba(0,0,0,0.1);">
+                <h3>✨ Features</h3>
+                <ul style="color: #666;">
+                    <li>✅ <strong>100% reliable</strong> .pages parsing</li>
+                    <li>⚡ ZeroGPU acceleration</li>
+                    <li>🤖 AI-powered formatting</li>
+                    <li>🎨 Professional output quality</li>
+                    <li>🔒 Secure processing</li>
+                </ul>
+                <div style="background: #f5f5f5; padding: 1rem; border-radius: 0.5rem; margin-top: 1rem;">
+                    <h4 style="margin-top: 0;">💡 How it works:</h4>
+                    <ol style="font-size: 0.9em; color: #555; margin-bottom: 0;">
+                        <li>CloudConvert extracts text from .pages</li>
+                        <li>Cerebras AI formats for your chosen output</li>
+                        <li>Download your professionally converted file</li>
+                    </ol>
+                </div>
+            </div>
+            """)
+    # Output section: the converted file, then a status line
+    with gr.Row():
+        output_file = gr.File(
+            label="📁 Download Your Converted File"
+        )
+    with gr.Row():
+        status_html = gr.HTML(
+            value="<div style='text-align: center; padding: 1rem; color: #666; background: #f8f9fa; border-radius: 0.5rem;'>Upload a .pages file to get started</div>"
+        )
+    # Connect the interface: the two outputs line up with the
+    # (output_path, status_message) tuple returned by convert_pages_document
+    convert_btn.click(
+        fn=convert_pages_document,
+        inputs=[file_input, output_format],
+        outputs=[output_file, status_html],
+        show_progress=True
+    )
+    # Footer
+    gr.HTML("""
+    <div style="text-align: center; margin-top: 2rem; padding: 1rem; background: #f8f9fa; border-radius: 0.5rem;">
+        <p style="margin-bottom: 0.5rem;">🔧 <strong>Technical Stack:</strong></p>
+        <p style="font-size: 0.9em; color: #666; margin-bottom: 0;">
+            CloudConvert API for reliable .pages parsing • HuggingFace ZeroGPU for AI processing • Cerebras for lightning-fast inference
+        </p>
+    </div>
+    """)
+# Launch the app only when this file is run as a script, not when imported
+if __name__ == "__main__":
+    app.launch()