Spaces:
Running
on
Zero
Running
on
Zero
# app.py - Corrected CloudConvert API Integration
import html
import os
import tempfile
import time
from pathlib import Path

import gradio as gr
import requests
import spaces
from huggingface_hub import InferenceClient
# Read API credentials from the environment. Surrounding whitespace (e.g. a
# newline pasted together with the key) is stripped because it would end up in
# the Authorization header, and a blank value is treated like a missing one so
# the debug output below reports it accurately.
hf_token = (os.getenv("HF_TOKEN") or "").strip() or None
cloudconvert_token = (os.getenv("CLOUDCONVERT_API_KEY") or "").strip() or None
# Only log whether each token is present, never its value.
print(f"Debug: HF Token exists = {hf_token is not None}")
print(f"Debug: CloudConvert Token exists = {cloudconvert_token is not None}")
# Chat client for Llama 3.3 70B Instruct, served through the Cerebras provider
# and authenticated with the Hugging Face token read above.
client = InferenceClient(
    model="meta-llama/Llama-3.3-70B-Instruct",
    provider="cerebras",
    token=hf_token,
)
def _describe_job_failure(job_info):
    """Log a failed job's errors and return the most specific message found."""
    job_message = job_info.get("message")
    print(f"Conversion failed: {job_message or 'Unknown error'}")
    task_messages = []
    for task in job_info.get("tasks", []):
        if task.get("status") == "error":
            task_error = task.get("message", "Unknown task error")
            print(f"Task {task.get('operation')} error: {task_error}")
            task_messages.append(f"{task.get('operation')}: {task_error}")
    if job_message:
        return job_message
    # Fall back to the per-task messages so the caller sees the actual cause
    # instead of a generic "Unknown error".
    return "; ".join(task_messages) or "Unknown error"


def _decode_text_response(response):
    """Return the body of a downloaded text file as ``str``.

    ``requests`` assumes ISO-8859-1 for ``text/*`` responses that declare no
    charset, which garbles UTF-8 documents. When no charset is declared the
    body is decoded as UTF-8 first (presumably what CloudConvert emits for
    ``txt`` output - TODO confirm), falling back to charset detection.
    """
    content_type = response.headers.get("Content-Type", "")
    if "charset" in content_type.lower():
        return response.text
    try:
        return response.content.decode("utf-8-sig")
    except UnicodeDecodeError:
        response.encoding = response.apparent_encoding
        return response.text


def convert_pages_to_text(file_path, api_key, *, timeout=60, poll_interval=2, max_attempts=30):
    """Convert a .pages file to text using the CloudConvert v2 jobs API.

    Creates a job with three chained tasks (import/upload -> convert ->
    export/url), uploads the file to the form URL returned for the import
    task, polls the job until it finishes and downloads the exported text.

    Args:
        file_path: Path of the local .pages file to upload.
        api_key: CloudConvert API key, sent as a Bearer token.
        timeout: Seconds passed as ``timeout`` to every HTTP request so a
            stalled connection cannot hang the conversion forever.
        poll_interval: Seconds to sleep between job status checks.
        max_attempts: Number of status checks before giving up (the defaults
            wait roughly one minute, not counting request time).

    Returns:
        The converted document as a string.

    Raises:
        Exception: ``"CloudConvert HTTP error: ..."`` for transport or HTTP
            status failures, ``"CloudConvert error: ..."`` for anything else
            (failed job, polling timeout, missing upload or export task). The
            original exception is attached as ``__cause__``.
    """
    base_url = "https://api.cloudconvert.com/v2"
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }
    job_data = {
        "tasks": {
            "import-file": {
                "operation": "import/upload",
            },
            "convert-file": {
                "operation": "convert",
                "input": "import-file",
                "input_format": "pages",
                "output_format": "txt",
            },
            "export-file": {
                "operation": "export/url",
                "input": "convert-file",
            },
        }
    }
    try:
        # Step 1: Create the job
        print("Creating CloudConvert job...")
        response = requests.post(
            f"{base_url}/jobs", headers=headers, json=job_data, timeout=timeout
        )
        print(f"Job creation response: {response.status_code}")
        if not response.ok:
            print(f"Job creation failed: {response.text}")
        response.raise_for_status()
        job = response.json()
        job_id = job["data"]["id"]
        print(f"Job created successfully: {job_id}")

        # Step 2: Upload the file to the form described by the import task
        upload_task = next(
            (task for task in job["data"]["tasks"] if task["operation"] == "import/upload"),
            None,
        )
        if not upload_task:
            raise Exception("Upload task not found in job")
        form = upload_task["result"]["form"]
        print("Uploading file to CloudConvert...")
        with open(file_path, "rb") as f:
            # The API headers are deliberately not sent here: the upload URL
            # is a separate endpoint driven by the form parameters.
            upload_response = requests.post(
                form["url"], data=form["parameters"], files={"file": f}, timeout=timeout
            )
        if not upload_response.ok:
            print(f"Upload failed: {upload_response.text}")
        upload_response.raise_for_status()
        print("File uploaded successfully")

        # Step 3: Poll until the job finishes, fails, or the attempts run out
        print(f"Waiting for job {job_id} to complete...")
        for _ in range(max_attempts):
            status_response = requests.get(
                f"{base_url}/jobs/{job_id}", headers=headers, timeout=timeout
            )
            status_response.raise_for_status()
            job_info = status_response.json()["data"]
            status = job_info["status"]
            print(f"Job status: {status}")
            if status == "finished":
                print("Conversion completed successfully")
                break
            if status == "error":
                raise Exception(f"Conversion failed: {_describe_job_failure(job_info)}")
            time.sleep(poll_interval)
        else:
            raise Exception("Conversion timeout - job took too long")

        # Step 4: Download the converted text from the finished export task
        for task in job_info["tasks"]:
            if task["operation"] == "export/url" and task["status"] == "finished":
                download_url = task["result"]["files"][0]["url"]
                print(f"Downloading result from: {download_url}")
                download_response = requests.get(download_url, timeout=timeout)
                download_response.raise_for_status()
                text_content = _decode_text_response(download_response)
                print(f"Downloaded {len(text_content)} characters")
                return text_content
        raise Exception("No converted file found in completed job")
    except requests.exceptions.RequestException as e:
        print(f"HTTP error: {e}")
        raise Exception(f"CloudConvert HTTP error: {str(e)}") from e
    except Exception as e:
        print(f"General error: {e}")
        raise Exception(f"CloudConvert error: {str(e)}") from e
def convert_pages_document(file, output_format, progress=gr.Progress()):
    """Convert an uploaded Pages document to the requested output format.

    Pipeline: ``convert_pages_to_text`` extracts the text through CloudConvert,
    the Cerebras-backed ``client`` reformats it with a chat completion, and
    ``create_output_file`` writes the result to a temporary file.

    Args:
        file: The upload from the file input; either a path string or an
            object exposing the path as ``.name``.
        output_format: Target format label, passed to the prompt builder and
            to ``create_output_file``.
        progress: Gradio progress tracker, called with a fraction and a
            description at each stage.

    Returns:
        A ``(output_path, status)`` tuple. ``output_path`` is the generated
        file path, or ``None`` on failure; ``status`` is the message shown in
        the status HTML component.
    """
    if not file:
        return None, "β Please upload a .pages file"
    if not cloudconvert_token:
        return None, "β CloudConvert API key not configured. Please add CLOUDCONVERT_API_KEY to secrets."

    # Accept both a plain path and a wrapper object carrying the path in .name.
    file_path = getattr(file, "name", file)

    try:
        progress(0.1, desc="π€ Converting with CloudConvert...")
        # Use CloudConvert to extract text from the .pages file
        print(f"Converting file: {file_path}")
        text_content = convert_pages_to_text(file_path, cloudconvert_token)
        if not text_content or len(text_content.strip()) < 10:
            return None, "β Could not extract content from .pages file"
        print(f"Extracted text preview: {text_content[:200]}...")

        progress(0.5, desc="π€ Converting format with Cerebras AI...")
        prompt = create_conversion_prompt(text_content, output_format)

        progress(0.7, desc="β‘ Processing with ZeroGPU...")
        try:
            response = client.chat_completion(
                messages=[{"role": "user", "content": prompt}],
                max_tokens=4096,
                temperature=0.1,
            )
            choice = response.choices[0]
            converted_text = choice.message.content
            # An empty reply would otherwise surface later as an obscure
            # error while writing the output file.
            if not converted_text or not converted_text.strip():
                raise ValueError("model returned an empty response")
        except Exception as e:
            print(f"Cerebras error: {e}")
            # The status is rendered as HTML, so escape externally sourced text.
            return None, f"β AI conversion error: {html.escape(str(e))}"

        # The reply is capped at max_tokens; a "length" finish reason means
        # the document was cut short, which must not be reported as a clean
        # success.
        truncated = getattr(choice, "finish_reason", None) == "length"
        if truncated:
            print("Warning: model output hit the max_tokens limit and may be truncated")

        progress(0.9, desc="πΎ Creating output file...")
        output_path = create_output_file(converted_text, output_format)

        progress(1.0, desc="β Conversion complete!")
        status = f"β Successfully converted to {output_format}!"
        if truncated:
            status += " Warning: the document was long and the output may be truncated."
        return output_path, status
    except Exception as e:
        print(f"Conversion error: {e}")
        return None, f"β Error: {html.escape(str(e))}"
def create_conversion_prompt(content, output_format):
    """Build the chat prompt that asks the model to reformat ``content``.

    For text formats (TXT, HTML, Markdown, or any other label) the model is
    asked for that format directly. PDF and DOCX are binary containers a text
    model cannot emit, and the file writer for those formats lays out plain
    paragraphs split on blank lines, so for them the model is asked for plain
    text with blank-line-separated paragraphs and no markup.

    Args:
        content: The extracted document text to embed in the prompt.
        output_format: Target format label chosen by the user.

    Returns:
        The complete prompt string.
    """
    if output_format in ("PDF", "DOCX"):
        target = "plain text"
        layout_rule = (
            "\n4. Separate paragraphs with a blank line and use no markup"
            f" - the text will be laid out as a {output_format} file"
        )
    else:
        target = output_format
        layout_rule = ""
    return (
        f"You are a document formatter. Convert the following text to {target} format.\n"
        "IMPORTANT:\n"
        "1. Keep ALL original content - do not summarize or remove text\n"
        f"2. Only adjust formatting for {target}\n"
        "3. Preserve all important information, names, and details"
        f"{layout_rule}\n"
        "Original text:\n"
        f"{content}\n"
        f"Formatted {target} output:"
    )
def _reserve_temp_path(suffix):
    """Create an empty temporary file and return its path, handle closed."""
    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as handle:
        return handle.name


def create_output_file(content, output_format):
    """Write ``content`` to a new temporary file in the requested format.

    The file is not deleted automatically; the caller owns the returned path.

    Args:
        content: Text to write. ``None`` is treated as empty text.
        output_format: ``"PDF"`` (rendered with reportlab), ``"DOCX"``
            (rendered with python-docx), or a text format. ``"TXT"``,
            ``"HTML"`` and ``"Markdown"`` select the file extension; any
            other label is written as ``.txt``.

    Returns:
        Path of the created file.
    """
    content = (content or "").strip()
    # Paragraphs are separated by blank lines; empty ones are dropped.
    paragraphs = [part.strip() for part in content.split("\n\n") if part.strip()]

    if output_format == "PDF":
        from reportlab.pdfgen import canvas
        from reportlab.lib.pagesizes import letter
        import textwrap

        # The temp handle is closed before the library opens the path itself,
        # so the file is never open twice.
        path = _reserve_temp_path(".pdf")
        pdf = canvas.Canvas(path, pagesize=letter)
        _, height = letter
        top = height - 50
        y = top
        for paragraph in paragraphs:
            # Wrap each source line on its own so single line breaks (lists,
            # headings) survive, matching the DOCX and text outputs;
            # textwrap.wrap on the whole paragraph would merge them.
            for source_line in paragraph.splitlines():
                for line in textwrap.wrap(source_line, width=90) or [""]:
                    if y < 50:
                        pdf.showPage()
                        y = top
                    pdf.drawString(50, y, line)
                    y -= 20
            y -= 10  # Space between paragraphs
        pdf.save()
        return path

    if output_format == "DOCX":
        from docx import Document

        path = _reserve_temp_path(".docx")
        doc = Document()
        for paragraph in paragraphs:
            doc.add_paragraph(paragraph)
        doc.save(path)
        return path

    # For TXT, HTML, Markdown (and any unrecognised label)
    ext_map = {"TXT": ".txt", "HTML": ".html", "Markdown": ".md"}
    path = _reserve_temp_path(ext_map.get(output_format, ".txt"))
    with open(path, "w", encoding="utf-8") as f:
        f.write(content)
    return path
# Static HTML fragments for the page, kept apart from the layout code below.
_HEADER_HTML = """
<div style="text-align: center; padding: 2rem; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; border-radius: 1rem; margin-bottom: 2rem;">
<h1>π Pages Converter Pro</h1>
<p>Convert Apple Pages documents using CloudConvert + Cerebras AI</p>
<p style="font-size: 0.9em; opacity: 0.9;">β¨ Professional .pages parsing + AI-powered format conversion</p>
</div>
"""

_FEATURES_HTML = """
<div style="background: white; padding: 1.5rem; border-radius: 1rem; box-shadow: 0 5px 15px rgba(0,0,0,0.1);">
<h3>β¨ Features</h3>
<ul style="color: #666;">
<li>β <strong>100% reliable</strong> .pages parsing</li>
<li>β‘ ZeroGPU acceleration</li>
<li>π€ AI-powered formatting</li>
<li>π¨ Professional output quality</li>
<li>π Secure processing</li>
</ul>
<div style="background: #f5f5f5; padding: 1rem; border-radius: 0.5rem; margin-top: 1rem;">
<h4 style="margin-top: 0;">π‘ How it works:</h4>
<ol style="font-size: 0.9em; color: #555; margin-bottom: 0;">
<li>CloudConvert extracts text from .pages</li>
<li>Cerebras AI formats for your chosen output</li>
<li>Download your professionally converted file</li>
</ol>
</div>
</div>
"""

_FOOTER_HTML = """
<div style="text-align: center; margin-top: 2rem; padding: 1rem; background: #f8f9fa; border-radius: 0.5rem;">
<p style="margin-bottom: 0.5rem;">π§ <strong>Technical Stack:</strong></p>
<p style="font-size: 0.9em; color: #666; margin-bottom: 0;">
CloudConvert API for reliable .pages parsing β’ HuggingFace ZeroGPU for AI processing β’ Cerebras for lightning-fast inference
</p>
</div>
"""

# The API status banner is green when a CloudConvert key is configured and
# red otherwise; it is evaluated once, when the module is loaded.
if cloudconvert_token:
    _status_bg, _status_fg, _status_text = "#d4edda", "#155724", "β Connected and Ready"
else:
    _status_bg, _status_fg, _status_text = "#f8d7da", "#721c24", "β API Key Missing"
_API_STATUS_HTML = f"""
<div style="background: {_status_bg}; color: {_status_fg}; padding: 1rem; border-radius: 0.5rem; text-align: center;">
<strong>CloudConvert API:</strong> {_status_text}
</div>
"""

# Build the Gradio interface: header, API status, upload/convert controls next
# to a feature panel, the download slot, a status line and the footer.
with gr.Blocks(title="Pages Converter Pro - CloudConvert", theme=gr.themes.Soft()) as app:
    gr.HTML(_HEADER_HTML)

    with gr.Row():
        gr.HTML(_API_STATUS_HTML)

    with gr.Row():
        with gr.Column(scale=2):
            gr.HTML("<h3>π Upload & Convert</h3>")
            file_input = gr.File(label="Select .pages file", file_types=[".pages"])
            output_format = gr.Radio(
                choices=["PDF", "DOCX", "TXT", "HTML", "Markdown"],
                value="PDF",
                label="π― Output Format",
            )
            convert_btn = gr.Button("π Convert Document", variant="primary", size="lg")
        with gr.Column(scale=1):
            gr.HTML(_FEATURES_HTML)

    with gr.Row():
        output_file = gr.File(label="π Download Your Converted File")

    with gr.Row():
        status_html = gr.HTML(
            value="<div style='text-align: center; padding: 1rem; color: #666; background: #f8f9fa; border-radius: 0.5rem;'>Upload a .pages file to get started</div>"
        )

    # Run the conversion when the button is clicked; it fills the download
    # slot and the status line.
    convert_btn.click(
        fn=convert_pages_document,
        inputs=[file_input, output_format],
        outputs=[output_file, status_html],
        show_progress=True,
    )

    gr.HTML(_FOOTER_HTML)

# Start the server only when executed as a script.
if __name__ == "__main__":
    app.launch()