SlouchyBuffalo committed on
Commit
393e4f2
·
verified ·
1 Parent(s): b60ce77

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +357 -0
app.py ADDED
@@ -0,0 +1,357 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py - Corrected CloudConvert API Integration
2
+ import gradio as gr
3
+ import os
4
+ import spaces
5
+ import tempfile
6
+ import requests
7
+ import time
8
+ from huggingface_hub import InferenceClient
9
+ from pathlib import Path
10
+
11
# Read API credentials from the environment.
hf_token = os.getenv("HF_TOKEN")
# Normalise the CloudConvert key with a single lookup: surrounding whitespace
# is stripped, and an unset, empty or whitespace-only value all become None so
# the debug line below agrees with the truthiness checks made on this value
# elsewhere in the file.
cloudconvert_token = (os.getenv("CLOUDCONVERT_API_KEY") or "").strip() or None
print(f"Debug: HF Token exists = {hf_token is not None}")
print(f"Debug: CloudConvert Token exists = {cloudconvert_token is not None}")

# Initialize the client with Cerebras
client = InferenceClient(
    "meta-llama/Llama-3.3-70B-Instruct",
    provider="cerebras",
    token=hf_token
)
23
+
24
def convert_pages_to_text(file_path, api_key, *, request_timeout=30,
                          poll_interval=2, max_attempts=30):
    """Convert a .pages file to plain text through the CloudConvert v2 API.

    Creates a three-task job (import/upload -> convert to txt -> export/url),
    posts the file to the upload form returned for the import task, polls the
    job until it reports "finished" or "error", and downloads the first
    exported file.

    Args:
        file_path: Path of the local .pages file to upload.
        api_key: CloudConvert API key, sent as a Bearer token.
        request_timeout: Seconds passed as ``timeout`` to every HTTP request.
        poll_interval: Seconds to sleep between job status checks.
        max_attempts: Maximum number of status checks before giving up.

    Returns:
        The downloaded text of the converted document.

    Raises:
        Exception: On any failure. ``requests`` errors are reported as
            "CloudConvert HTTP error: ..." and everything else as
            "CloudConvert error: ...", with the original exception chained.
    """
    base_url = "https://api.cloudconvert.com/v2"
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }

    try:
        # Step 1: Create a job with correct task structure
        job_data = {
            "tasks": {
                "import-file": {
                    "operation": "import/upload"
                },
                "convert-file": {
                    "operation": "convert",
                    "input": "import-file",
                    "input_format": "pages",
                    "output_format": "txt"
                },
                "export-file": {
                    "operation": "export/url",
                    "input": "convert-file"
                }
            }
        }

        print("Creating CloudConvert job...")
        # Every request carries a timeout: without one, requests waits
        # indefinitely on an unresponsive server.
        response = requests.post(
            f"{base_url}/jobs",
            headers=headers,
            json=job_data,
            timeout=request_timeout,
        )
        print(f"Job creation response: {response.status_code}")

        if not response.ok:
            print(f"Job creation failed: {response.text}")
            response.raise_for_status()

        job = response.json()
        job_id = job["data"]["id"]
        print(f"Job created successfully: {job_id}")

        # Step 2: Upload the file to the form attached to the import task
        upload_task = next(
            (task for task in job["data"]["tasks"]
             if task["operation"] == "import/upload"),
            None,
        )
        if not upload_task:
            raise Exception("Upload task not found in job")

        upload_form = upload_task["result"]["form"]

        print("Uploading file to CloudConvert...")
        with open(file_path, 'rb') as f:
            upload_response = requests.post(
                upload_form["url"],
                data=upload_form["parameters"],
                files={"file": f},
                timeout=request_timeout,
            )

        if not upload_response.ok:
            print(f"Upload failed: {upload_response.text}")
            upload_response.raise_for_status()

        print("File uploaded successfully")

        # Step 3: Wait for conversion to complete
        print(f"Waiting for job {job_id} to complete...")

        # With the defaults (30 checks, 2 s apart) this waits about a minute.
        for _ in range(max_attempts):
            status_response = requests.get(
                f"{base_url}/jobs/{job_id}",
                headers=headers,
                timeout=request_timeout,
            )
            status_response.raise_for_status()
            job_status = status_response.json()
            status = job_status["data"]["status"]

            print(f"Job status: {status}")

            if status == "finished":
                print("Conversion completed successfully")
                break
            if status == "error":
                error_msg = job_status['data'].get('message', 'Unknown error')
                print(f"Conversion failed: {error_msg}")

                # Log task-level errors as well; they carry their own message.
                for task in job_status.get('data', {}).get('tasks', []):
                    if task.get('status') == 'error':
                        task_error = task.get('message', 'Unknown task error')
                        print(f"Task {task.get('operation')} error: {task_error}")

                raise Exception(f"Conversion failed: {error_msg}")

            time.sleep(poll_interval)
        else:
            # The loop ran out of attempts without seeing "finished".
            raise Exception("Conversion timeout - job took too long")

        # Step 4: Download the converted text
        for task in job_status["data"]["tasks"]:
            if task["operation"] == "export/url" and task["status"] == "finished":
                download_url = task["result"]["files"][0]["url"]
                print(f"Downloading result from: {download_url}")

                download_response = requests.get(download_url, timeout=request_timeout)
                download_response.raise_for_status()

                # requests assumes ISO-8859-1 for text/* responses that declare
                # no charset, which garbles non-Latin-1 text. In that case let
                # it detect the encoding from the body instead.
                content_type = download_response.headers.get("Content-Type", "")
                if "charset" not in content_type.lower():
                    download_response.encoding = download_response.apparent_encoding

                text_content = download_response.text
                print(f"Downloaded {len(text_content)} characters")
                return text_content

        raise Exception("No converted file found in completed job")

    except requests.exceptions.RequestException as e:
        print(f"HTTP error: {e}")
        raise Exception(f"CloudConvert HTTP error: {str(e)}") from e
    except Exception as e:
        print(f"General error: {e}")
        raise Exception(f"CloudConvert error: {str(e)}") from e
139
+
140
@spaces.GPU
def convert_pages_document(file, output_format, progress=gr.Progress()):
    """Run the whole pipeline for one uploaded .pages file.

    The text is extracted with CloudConvert, reformatted by the chat model
    for ``output_format``, and written to a temporary output file.

    Returns:
        A ``(output_path, status_message)`` pair. ``output_path`` is None
        whenever the conversion could not be completed.
    """
    # Guard clauses: nothing can be done without an upload or an API key.
    if not file:
        return None, "❌ Please upload a .pages file"
    if not cloudconvert_token:
        return None, "❌ CloudConvert API key not configured. Please add CLOUDCONVERT_API_KEY to secrets."

    try:
        progress(0.1, desc="πŸ“€ Converting with CloudConvert...")

        # Extract the raw text from the uploaded .pages file.
        print(f"Converting file: {file.name}")
        extracted = convert_pages_to_text(file.name, cloudconvert_token)

        # Anything shorter than 10 non-blank characters counts as a failed extraction.
        usable = extracted and len(extracted.strip()) >= 10
        if not usable:
            return None, "❌ Could not extract content from .pages file"

        print(f"Extracted text preview: {extracted[:200]}...")

        progress(0.5, desc="πŸ€– Converting format with Cerebras AI...")
        llm_prompt = create_conversion_prompt(extracted, output_format)

        progress(0.7, desc="⚑ Processing with ZeroGPU...")

        # The model call has its own handler so its failures get a distinct message.
        try:
            completion = client.chat_completion(
                messages=[{"role": "user", "content": llm_prompt}],
                max_tokens=4096,
                temperature=0.1
            )
            formatted = completion.choices[0].message.content
        except Exception as ai_error:
            print(f"Cerebras error: {ai_error}")
            return None, f"❌ AI conversion error: {str(ai_error)}"

        progress(0.9, desc="πŸ’Ύ Creating output file...")
        result_path = create_output_file(formatted, output_format)

        progress(1.0, desc="βœ… Conversion complete!")
        return result_path, f"βœ… Successfully converted to {output_format}!"

    except Exception as failure:
        print(f"Conversion error: {failure}")
        return None, f"❌ Error: {str(failure)}"
193
+
194
def create_conversion_prompt(content, output_format):
    """Build the instruction asking the model to re-emit *content* as *output_format*."""
    prompt_lines = [
        f"You are a document formatter. Convert the following text to {output_format} format.",
        "",
        "IMPORTANT:",
        "1. Keep ALL original content - do not summarize or remove text",
        f"2. Only adjust formatting for {output_format}",
        "3. Preserve all important information, names, and details",
        "",
        "Original text:",
        f"{content}",
        "",
        f"Formatted {output_format} output:",
    ]
    return "\n".join(prompt_lines)
207
+
208
def _reserve_temp_path(suffix):
    """Create an empty temporary file with *suffix*, close it, and return its path."""
    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as handle:
        return handle.name


def create_output_file(content, output_format):
    """Write *content* to a new temporary file in the requested format.

    Args:
        content: Text to write; leading and trailing whitespace is stripped.
        output_format: "PDF", "DOCX", "TXT", "HTML" or "Markdown". Any other
            value is written as plain text with a ".txt" suffix.

    Returns:
        Path of the created file. The file is not deleted automatically.
    """
    content = content.strip()

    if output_format == "PDF":
        from reportlab.pdfgen import canvas
        from reportlab.lib.pagesizes import letter
        import textwrap

        # Reserve the path and close our handle first: reopening a named
        # temporary file by name while it is still open is platform-dependent.
        output_path = _reserve_temp_path('.pdf')
        pdf = canvas.Canvas(output_path, pagesize=letter)
        width, height = letter
        y = height - 50

        # Paragraphs are separated by blank lines; each is wrapped to 90 columns.
        for paragraph in content.split('\n\n'):
            if paragraph.strip():
                for line in textwrap.wrap(paragraph.strip(), width=90):
                    if y < 50:
                        # Ran past the bottom margin: start a new page.
                        pdf.showPage()
                        y = height - 50
                    pdf.drawString(50, y, line)
                    y -= 20
                y -= 10  # Space between paragraphs

        pdf.save()
        return output_path

    elif output_format == "DOCX":
        from docx import Document

        # Same reasoning as the PDF branch: the library writes by path.
        output_path = _reserve_temp_path('.docx')
        doc = Document()

        for paragraph in content.split('\n\n'):
            if paragraph.strip():
                doc.add_paragraph(paragraph.strip())

        doc.save(output_path)
        return output_path

    else:
        # For TXT, HTML, Markdown (and any unknown format, as plain text)
        ext_map = {"TXT": ".txt", "HTML": ".html", "Markdown": ".md"}
        ext = ext_map.get(output_format, ".txt")

        with tempfile.NamedTemporaryFile(mode='w', suffix=ext, delete=False, encoding='utf-8') as f:
            f.write(content)
            return f.name
261
+
262
# Static HTML fragments used by the interface below.
_HEADER_HTML = """
<div style="text-align: center; padding: 2rem; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; border-radius: 1rem; margin-bottom: 2rem;">
<h1>πŸ“„ Pages Converter Pro</h1>
<p>Convert Apple Pages documents using CloudConvert + Cerebras AI</p>
<p style="font-size: 0.9em; opacity: 0.9;">✨ Professional .pages parsing + AI-powered format conversion</p>
</div>
"""

_FEATURES_HTML = """
<div style="background: white; padding: 1.5rem; border-radius: 1rem; box-shadow: 0 5px 15px rgba(0,0,0,0.1);">
<h3>✨ Features</h3>
<ul style="color: #666;">
<li>βœ… <strong>100% reliable</strong> .pages parsing</li>
<li>⚑ ZeroGPU acceleration</li>
<li>πŸ€– AI-powered formatting</li>
<li>🎨 Professional output quality</li>
<li>πŸ”’ Secure processing</li>
</ul>

<div style="background: #f5f5f5; padding: 1rem; border-radius: 0.5rem; margin-top: 1rem;">
<h4 style="margin-top: 0;">πŸ’‘ How it works:</h4>
<ol style="font-size: 0.9em; color: #555; margin-bottom: 0;">
<li>CloudConvert extracts text from .pages</li>
<li>Cerebras AI formats for your chosen output</li>
<li>Download your professionally converted file</li>
</ol>
</div>
</div>
"""

_FOOTER_HTML = """
<div style="text-align: center; margin-top: 2rem; padding: 1rem; background: #f8f9fa; border-radius: 0.5rem;">
<p style="margin-bottom: 0.5rem;">πŸ”§ <strong>Technical Stack:</strong></p>
<p style="font-size: 0.9em; color: #666; margin-bottom: 0;">
CloudConvert API for reliable .pages parsing β€’ HuggingFace ZeroGPU for AI processing β€’ Cerebras for lightning-fast inference
</p>
</div>
"""

_INITIAL_STATUS_HTML = "<div style='text-align: center; padding: 1rem; color: #666; background: #f8f9fa; border-radius: 0.5rem;'>Upload a .pages file to get started</div>"

# The API status banner is green when a CloudConvert key is set, red otherwise.
if cloudconvert_token:
    _banner_bg, _banner_fg = '#d4edda', '#155724'
    _banner_text = 'βœ… Connected and Ready'
else:
    _banner_bg, _banner_fg = '#f8d7da', '#721c24'
    _banner_text = '❌ API Key Missing'

_API_STATUS_HTML = f"""
<div style="background: {_banner_bg}; color: {_banner_fg}; padding: 1rem; border-radius: 0.5rem; text-align: center;">
<strong>CloudConvert API:</strong> {_banner_text}
</div>
"""

# Build the Gradio interface; components are created in display order.
with gr.Blocks(title="Pages Converter Pro - CloudConvert", theme=gr.themes.Soft()) as app:
    # Page header
    gr.HTML(_HEADER_HTML)

    # API status banner
    with gr.Row():
        gr.HTML(_API_STATUS_HTML)

    # Upload controls on the left, feature summary on the right
    with gr.Row():
        with gr.Column(scale=2):
            gr.HTML("<h3>πŸ“Ž Upload & Convert</h3>")

            file_input = gr.File(label="Select .pages file", file_types=[".pages"])

            output_format = gr.Radio(
                choices=["PDF", "DOCX", "TXT", "HTML", "Markdown"],
                value="PDF",
                label="🎯 Output Format"
            )

            convert_btn = gr.Button("πŸš€ Convert Document", variant="primary", size="lg")

        with gr.Column(scale=1):
            gr.HTML(_FEATURES_HTML)

    # Download slot for the converted file
    with gr.Row():
        output_file = gr.File(label="πŸ“ Download Your Converted File")

    # Status message area
    with gr.Row():
        status_html = gr.HTML(value=_INITIAL_STATUS_HTML)

    # Wire the button to the conversion pipeline
    convert_btn.click(
        fn=convert_pages_document,
        inputs=[file_input, output_format],
        outputs=[output_file, status_html],
        show_progress=True
    )

    # Page footer
    gr.HTML(_FOOTER_HTML)

# Start the server only when this file is executed as a script.
if __name__ == "__main__":
    app.launch()