File size: 13,681 Bytes
393e4f2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3b6bffb
393e4f2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38374bc
393e4f2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
# app.py - Corrected CloudConvert API Integration
import gradio as gr
import os
import spaces
import tempfile
import requests
import time
from huggingface_hub import InferenceClient
from pathlib import Path

# Read API credentials from the environment and report (without printing the
# secrets themselves) whether each one is available.
hf_token = os.getenv("HF_TOKEN")
# A missing, empty, or whitespace-only key is normalised to None so that the
# debug line below agrees with the truthiness checks used later in this file
# (previously a blank key was reported as "exists = True").
cloudconvert_token = (os.getenv("CLOUDCONVERT_API_KEY") or "").strip() or None
print(f"Debug: HF Token exists = {hf_token is not None}")
print(f"Debug: CloudConvert Token exists = {cloudconvert_token is not None}")

# Shared chat-completion client used by convert_pages_document: the
# Llama 3.3 70B Instruct model, routed through the "cerebras" provider and
# authenticated with the Hugging Face token read above.
client = InferenceClient(
    model="meta-llama/Llama-3.3-70B-Instruct",
    token=hf_token,
    provider="cerebras",
)

def convert_pages_to_text(file_path, api_key, request_timeout=60, max_attempts=30, poll_interval=2):
    """Convert a .pages file to plain text using the CloudConvert v2 API.

    The function creates a job with three tasks (import/upload -> convert to
    txt -> export/url), uploads the local file to the form returned for the
    upload task, polls the job until it reports "finished" or "error", and
    finally downloads the first exported file.

    Args:
        file_path: Path of the local .pages file to upload.
        api_key: CloudConvert API key, sent as a Bearer token.
        request_timeout: Seconds passed as ``timeout`` to every HTTP request
            so a stalled connection cannot block the caller indefinitely.
        max_attempts: Maximum number of job-status polls before giving up.
        poll_interval: Seconds to sleep between two status polls.

    Returns:
        The converted document text as a ``str``.

    Raises:
        Exception: On any failure. HTTP-level problems are reported as
            "CloudConvert HTTP error: ..."; everything else (job error,
            timeout, missing tasks) as "CloudConvert error: ...". The
            original exception is chained as ``__cause__``.
    """
    base_url = "https://api.cloudconvert.com/v2"
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }

    try:
        # Step 1: Create a job with the import -> convert -> export pipeline
        job_data = {
            "tasks": {
                "import-file": {
                    "operation": "import/upload"
                },
                "convert-file": {
                    "operation": "convert",
                    "input": "import-file",
                    "input_format": "pages",
                    "output_format": "txt"
                },
                "export-file": {
                    "operation": "export/url",
                    "input": "convert-file"
                }
            }
        }

        print("Creating CloudConvert job...")
        response = requests.post(
            f"{base_url}/jobs",
            headers=headers,
            json=job_data,
            timeout=request_timeout,
        )
        print(f"Job creation response: {response.status_code}")

        if not response.ok:
            print(f"Job creation failed: {response.text}")
            response.raise_for_status()

        job = response.json()
        job_id = job["data"]["id"]
        print(f"Job created successfully: {job_id}")

        # Step 2: Upload the file to the form attached to the upload task
        upload_task = next(
            (task for task in job["data"]["tasks"] if task["operation"] == "import/upload"),
            None,
        )
        if not upload_task:
            raise Exception("Upload task not found in job")

        # Fail with a readable message instead of a TypeError/KeyError when
        # the task carries no upload form.
        upload_form = (upload_task.get("result") or {}).get("form")
        if not upload_form:
            raise Exception("Upload form not provided for import task")

        upload_url = upload_form["url"]
        form_data = upload_form["parameters"]

        print("Uploading file to CloudConvert...")
        with open(file_path, 'rb') as f:
            # The upload goes to the URL from the form, without the API
            # Authorization header (same as before).
            upload_response = requests.post(
                upload_url,
                data=form_data,
                files={"file": f},
                timeout=request_timeout,
            )

        if not upload_response.ok:
            print(f"Upload failed: {upload_response.text}")
            upload_response.raise_for_status()

        print("File uploaded successfully")

        # Step 3: Poll the job until it finishes, fails, or we run out of attempts
        print(f"Waiting for job {job_id} to complete...")

        for _ in range(max_attempts):
            status_response = requests.get(
                f"{base_url}/jobs/{job_id}",
                headers=headers,
                timeout=request_timeout,
            )
            status_response.raise_for_status()
            job_status = status_response.json()
            status = job_status["data"]["status"]

            print(f"Job status: {status}")

            if status == "finished":
                print("Conversion completed successfully")
                break
            if status == "error":
                error_msg = job_status['data'].get('message', 'Unknown error')
                print(f"Conversion failed: {error_msg}")

                # Log task-level errors to make the failing stage visible
                for task in job_status.get('data', {}).get('tasks', []):
                    if task.get('status') == 'error':
                        task_error = task.get('message', 'Unknown task error')
                        print(f"Task {task.get('operation')} error: {task_error}")

                raise Exception(f"Conversion failed: {error_msg}")

            time.sleep(poll_interval)
        else:
            # The loop ended without a break: the job never reached "finished"
            raise Exception("Conversion timeout - job took too long")

        # Step 4: Download the converted text from the finished export task
        for task in job_status["data"]["tasks"]:
            if task["operation"] == "export/url" and task["status"] == "finished":
                download_url = task["result"]["files"][0]["url"]
                print(f"Downloading result from: {download_url}")

                download_response = requests.get(download_url, timeout=request_timeout)
                download_response.raise_for_status()

                # Requests falls back to ISO-8859-1 for text/* responses that
                # declare no charset, which garbles UTF-8 documents. Without a
                # declared charset, try UTF-8 first (also dropping a BOM) and
                # only then fall back to Requests' own guess.
                content_type = download_response.headers.get("Content-Type", "")
                if "charset" in content_type.lower():
                    text_content = download_response.text
                else:
                    try:
                        text_content = download_response.content.decode("utf-8-sig")
                    except UnicodeDecodeError:
                        text_content = download_response.text

                print(f"Downloaded {len(text_content)} characters")
                return text_content

        raise Exception("No converted file found in completed job")

    except requests.exceptions.RequestException as e:
        print(f"HTTP error: {e}")
        raise Exception(f"CloudConvert HTTP error: {str(e)}") from e
    except Exception as e:
        print(f"General error: {e}")
        raise Exception(f"CloudConvert error: {str(e)}") from e

# NOTE(review): the spaces.GPU decorator presumably requests ZeroGPU hardware
# for the call; no local GPU work is visible in this function - confirm it is
# still wanted.
@spaces.GPU
def convert_pages_document(file, output_format, progress=gr.Progress()):
    """Convert an uploaded Pages document using CloudConvert + Cerebras.

    Text is first extracted from the .pages file with
    ``convert_pages_to_text``, then reformatted by the module-level chat
    ``client`` using the prompt from ``create_conversion_prompt``, and finally
    written to disk by ``create_output_file``.

    Args:
        file: The uploaded file from the Gradio file input. Either an object
            exposing the temporary path as ``.name`` or the path itself.
        output_format: Target format label, e.g. "PDF", "DOCX", "TXT",
            "HTML" or "Markdown".
        progress: Gradio progress tracker used to report the current stage.

    Returns:
        A ``(output_path, status_message)`` tuple. ``output_path`` is ``None``
        whenever the conversion could not be completed; errors are reported
        in the message rather than raised.
    """
    if not file:
        return None, "❌ Please upload a .pages file"

    if not cloudconvert_token:
        return None, "❌ CloudConvert API key not configured. Please add CLOUDCONVERT_API_KEY to secrets."

    try:
        progress(0.1, desc="πŸ“€ Converting with CloudConvert...")

        # Accept both file-like upload objects (path in .name) and plain
        # path strings.
        input_path = getattr(file, "name", file)

        # Use CloudConvert to extract text from .pages file
        print(f"Converting file: {input_path}")
        text_content = convert_pages_to_text(input_path, cloudconvert_token)

        # Anything shorter than 10 non-blank characters is treated as a
        # failed extraction.
        if not text_content or len(text_content.strip()) < 10:
            return None, "❌ Could not extract content from .pages file"

        print(f"Extracted text preview: {text_content[:200]}...")

        progress(0.5, desc="πŸ€– Converting format with Cerebras AI...")

        # Create format-specific prompt
        prompt = create_conversion_prompt(text_content, output_format)

        progress(0.7, desc="⚑ Processing with ZeroGPU...")

        # Convert using Cerebras
        try:
            messages = [{"role": "user", "content": prompt}]
            response = client.chat_completion(
                messages=messages,
                max_tokens=4096,
                temperature=0.1
            )
            converted_text = response.choices[0].message.content
        except Exception as e:
            print(f"Cerebras error: {e}")
            return None, f"❌ AI conversion error: {str(e)}"

        # Report an empty/None reply explicitly instead of letting it surface
        # later as an AttributeError from the file writer.
        if not converted_text or not converted_text.strip():
            print("Cerebras error: empty response")
            return None, "❌ AI conversion error: empty response from model"

        progress(0.9, desc="πŸ’Ύ Creating output file...")

        # Create output file
        output_path = create_output_file(converted_text, output_format)

        progress(1.0, desc="βœ… Conversion complete!")

        return output_path, f"βœ… Successfully converted to {output_format}!"

    except Exception as e:
        # Top-level UI boundary: turn any failure into a status message.
        print(f"Conversion error: {e}")
        return None, f"❌ Error: {str(e)}"

def create_conversion_prompt(content, output_format):
    """Build the instruction prompt asking the model to reformat *content*.

    The prompt names the target ``output_format``, lists the rules the model
    must follow (keep all content, change formatting only), embeds the
    original text, and ends with a cue line for the formatted output.
    """
    prompt_lines = [
        f"You are a document formatter. Convert the following text to {output_format} format.",
        "",
        "IMPORTANT:",
        "1. Keep ALL original content - do not summarize or remove text",
        f"2. Only adjust formatting for {output_format}",
        "3. Preserve all important information, names, and details",
        "",
        "Original text:",
        f"{content}",
        "",
        f"Formatted {output_format} output:",
    ]
    return "\n".join(prompt_lines)

def _reserve_temp_path(suffix):
    """Create an empty temporary file ending in *suffix* and return its path.

    The handle is closed before returning so that another library can open
    the path for writing; keeping a NamedTemporaryFile open while reopening
    it by name is not portable across platforms.
    """
    fd, path = tempfile.mkstemp(suffix=suffix)
    os.close(fd)
    return path


def create_output_file(content, output_format):
    """Write *content* to a temporary file in the requested format.

    Args:
        content: The text to store. ``None`` is treated as empty text;
            surrounding whitespace is stripped.
        output_format: "PDF" (rendered with reportlab), "DOCX" (written with
            python-docx), or a text format: "TXT", "HTML", "Markdown".
            Unknown labels are written as plain ``.txt``.

    Returns:
        Path of the created temporary file. The file is not deleted
        automatically; the caller owns it.
    """
    content = (content or "").strip()

    if output_format == "PDF":
        from reportlab.pdfgen import canvas
        from reportlab.lib.pagesizes import letter
        import textwrap

        output_path = _reserve_temp_path('.pdf')
        pdf = canvas.Canvas(output_path, pagesize=letter)
        _, height = letter
        y = height - 50  # start 50 points below the top edge

        # Blank lines separate paragraphs. Each source line is wrapped on
        # its own so single line breaks (list items, headings) survive
        # instead of being merged into one reflowed run of text.
        for paragraph in content.split('\n\n'):
            paragraph = paragraph.strip()
            if not paragraph:
                continue
            for source_line in paragraph.splitlines():
                # wrap() returns [] for a blank line; keep its vertical space
                for line in textwrap.wrap(source_line, width=90) or [""]:
                    if y < 50:
                        # Bottom margin reached: continue on a new page
                        pdf.showPage()
                        y = height - 50
                    pdf.drawString(50, y, line)
                    y -= 20
            y -= 10  # Space between paragraphs

        pdf.save()
        return output_path

    elif output_format == "DOCX":
        from docx import Document

        output_path = _reserve_temp_path('.docx')
        doc = Document()

        # One Word paragraph per blank-line-separated block
        for paragraph in content.split('\n\n'):
            if paragraph.strip():
                doc.add_paragraph(paragraph.strip())

        doc.save(output_path)
        return output_path

    else:
        # For TXT, HTML, Markdown the text is written as-is in UTF-8
        ext_map = {"TXT": ".txt", "HTML": ".html", "Markdown": ".md"}
        ext = ext_map.get(output_format, ".txt")

        with tempfile.NamedTemporaryFile(mode='w', suffix=ext, delete=False, encoding='utf-8') as f:
            f.write(content)
        return f.name

# Create the Gradio interface.
# Components are declared top to bottom inside nested Blocks/Row/Column
# contexts, so the statement order below is also the on-page order.
with gr.Blocks(title="Pages Converter Pro - CloudConvert", theme=gr.themes.Soft()) as app:
    # Header: static banner with the app title and tagline
    gr.HTML("""
    <div style="text-align: center; padding: 2rem; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; border-radius: 1rem; margin-bottom: 2rem;">
        <h1>πŸ“„ Pages Converter Pro</h1>
        <p>Convert Apple Pages documents using CloudConvert + Cerebras AI</p>
        <p style="font-size: 0.9em; opacity: 0.9;">✨ Professional .pages parsing + AI-powered format conversion</p>
    </div>
    """)

    # Status indicator: rendered once at build time from cloudconvert_token
    # (green when the key is set, red when it is missing); it only reflects
    # whether a key is configured, not whether the API accepted it.
    with gr.Row():
        gr.HTML(f"""
        <div style="background: {'#d4edda' if cloudconvert_token else '#f8d7da'}; color: {'#155724' if cloudconvert_token else '#721c24'}; padding: 1rem; border-radius: 0.5rem; text-align: center;">
            <strong>CloudConvert API:</strong> {'βœ… Connected and Ready' if cloudconvert_token else '❌ API Key Missing'}
        </div>
        """)

    # Main interface: upload controls on the left (scale=2), static feature
    # description on the right (scale=1)
    with gr.Row():
        with gr.Column(scale=2):
            gr.HTML("<h3>πŸ“Ž Upload & Convert</h3>")

            # Source document; file_types is set to .pages only
            file_input = gr.File(
                label="Select .pages file",
                file_types=[".pages"]
            )

            # Target format; the chosen label is passed unchanged to
            # convert_pages_document as output_format
            output_format = gr.Radio(
                choices=["PDF", "DOCX", "TXT", "HTML", "Markdown"],
                value="PDF",
                label="🎯 Output Format"
            )

            convert_btn = gr.Button(
                "πŸš€ Convert Document",
                variant="primary",
                size="lg"
            )

        with gr.Column(scale=1):
            # Static marketing / explanation panel (no interactivity)
            gr.HTML("""
            <div style="background: white; padding: 1.5rem; border-radius: 1rem; box-shadow: 0 5px 15px rgba(0,0,0,0.1);">
                <h3>✨ Features</h3>
                <ul style="color: #666;">
                    <li>βœ… <strong>100% reliable</strong> .pages parsing</li>
                    <li>⚑ ZeroGPU acceleration</li>
                    <li>πŸ€– AI-powered formatting</li>
                    <li>🎨 Professional output quality</li>
                    <li>πŸ”’ Secure processing</li>
                </ul>
                
                <div style="background: #f5f5f5; padding: 1rem; border-radius: 0.5rem; margin-top: 1rem;">
                    <h4 style="margin-top: 0;">πŸ’‘ How it works:</h4>
                    <ol style="font-size: 0.9em; color: #555; margin-bottom: 0;">
                        <li>CloudConvert extracts text from .pages</li>
                        <li>Cerebras AI formats for your chosen output</li>
                        <li>Download your professionally converted file</li>
                    </ol>
                </div>
            </div>
            """)

    # Output section: receives the first element of the handler's return value
    with gr.Row():
        output_file = gr.File(
            label="πŸ“ Download Your Converted File"
        )

    # Status line: receives the second element (the status message string)
    with gr.Row():
        status_html = gr.HTML(
            value="<div style='text-align: center; padding: 1rem; color: #666; background: #f8f9fa; border-radius: 0.5rem;'>Upload a .pages file to get started</div>"
        )

    # Connect the interface: a click runs convert_pages_document with the
    # uploaded file and selected format, and routes its (path, message)
    # result to the download widget and the status line respectively
    convert_btn.click(
        fn=convert_pages_document,
        inputs=[file_input, output_format],
        outputs=[output_file, status_html],
        show_progress=True
    )

    # Footer: static description of the technical stack
    gr.HTML("""
    <div style="text-align: center; margin-top: 2rem; padding: 1rem; background: #f8f9fa; border-radius: 0.5rem;">
        <p style="margin-bottom: 0.5rem;">πŸ”§ <strong>Technical Stack:</strong></p>
        <p style="font-size: 0.9em; color: #666; margin-bottom: 0;">
            CloudConvert API for reliable .pages parsing β€’ HuggingFace ZeroGPU for AI processing β€’ Cerebras for lightning-fast inference
        </p>
    </div>
    """)

# Launch the app with Gradio's default launch settings, but only when this
# file is executed as a script (importing the module does not start it).
if __name__ == "__main__":
    app.launch()