# app.py - Fixed version for HF Spaces
import gradio as gr
import torch
from transformers import AutoModel, AutoTokenizer
import torchvision.transforms as T
from torchvision.transforms.functional import InterpolationMode
from PIL import Image
import base64
import io
import time
import traceback

# Global runtime state shared by the loader and the inference functions.
device = "cpu"  # free-tier HF Spaces only provide a CPU
# Filled in by load_model(); None means "not loaded yet".
model = tokenizer = transform = None

def build_transform(input_size=448):
    """Create the preprocessing pipeline applied to an image before inference.

    The pipeline, in order: convert to RGB (only for objects exposing a
    ``mode`` attribute that is not already ``'RGB'``), resize to
    ``input_size`` x ``input_size`` with bicubic interpolation, convert to a
    tensor, and normalize with the ImageNet mean / std constants below.

    Args:
        input_size: Side length, in pixels, of the square resize target.

    Returns:
        A ``torchvision.transforms.Compose`` object.
    """
    imagenet_mean = (0.485, 0.456, 0.406)
    imagenet_std = (0.229, 0.224, 0.225)

    def ensure_rgb(img):
        # Inputs without a ``mode`` attribute are passed through untouched.
        if hasattr(img, 'mode') and img.mode != 'RGB':
            return img.convert('RGB')
        return img

    steps = [
        T.Lambda(ensure_rgb),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=imagenet_mean, std=imagenet_std),
    ]
    return T.Compose(steps)

def load_model():
    """Load the Vintern tokenizer, model and image transform into module globals.

    Everything is first loaded into local variables and only published to the
    ``model`` / ``tokenizer`` / ``transform`` globals once every step has
    succeeded, so a failure part-way through never leaves a half-initialized
    state behind.

    Returns:
        bool: ``True`` when loading succeeded. ``False`` when any step raised;
        the error and its traceback are printed and the globals are left
        unchanged.
    """
    global model, tokenizer, transform
    try:
        print("🤖 Loading Vintern-1B-v3.5...")

        model_name = "5CD-AI/Vintern-1B-v3_5"

        # trust_remote_code=True is required because the model repository
        # ships its own modelling code.
        loaded_tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            trust_remote_code=True
        )

        loaded_model = AutoModel.from_pretrained(
            model_name,
            torch_dtype=torch.float32,
            trust_remote_code=True,
            low_cpu_mem_usage=True
        )
        # Keep the model on the same device the input tensors are moved to,
        # and make inference mode explicit.
        loaded_model = loaded_model.to(device).eval()

        loaded_transform = build_transform()

        # Publish all three together, only after every step succeeded.
        tokenizer, model, transform = loaded_tokenizer, loaded_model, loaded_transform

        print("✅ Model loaded successfully!")
        return True

    except Exception as e:
        # Top-level boundary: report the failure and let the app start anyway.
        print(f"❌ Error loading model: {e}")
        traceback.print_exc()
        return False

def safe_image_processing(image):
    """Normalize an image input into an RGB image without raising.

    Accepted inputs:
      * ``None`` -> rejected with an error message.
      * ``str`` starting with ``data:image`` -> decoded as a base64 data URI.
      * any other ``str`` -> opened as a file path.
      * any object exposing a ``mode`` attribute -> used as-is.

    Args:
        image: The raw input described above.

    Returns:
        tuple: ``(image, None)`` on success, with the image converted to RGB
        when its mode was different, or ``(None, error_message)`` on failure.
    """
    if image is None:
        return None, "❌ Không có ảnh đầu vào"

    try:
        candidate = image

        if isinstance(candidate, str):
            if candidate.startswith('data:image'):
                # Data URI: the base64 payload follows the first comma.
                payload = candidate.split(',')[1]
                candidate = Image.open(io.BytesIO(base64.b64decode(payload)))
            else:
                # Plain string: treat it as a path on disk.
                candidate = Image.open(candidate)

        # Duck-typed image check: anything without ``mode`` is rejected.
        if not hasattr(candidate, 'mode'):
            return None, "❌ Định dạng ảnh không hợp lệ"

        rgb_image = candidate if candidate.mode == 'RGB' else candidate.convert('RGB')
        return rgb_image, None

    except Exception as exc:
        return None, f"❌ Lỗi xử lý ảnh: {str(exc)}"

def _split_object_phrases(objects_text, limit=5):
    """Split a free-text object listing into at most ``limit`` cleaned phrases.

    Items are separated on commas, semicolons and newlines (not on spaces), so
    multi-word names stay intact. Leading list bullets and a trailing period
    are stripped, and empty items are dropped.
    """
    normalized = objects_text.replace('\n', ',').replace(';', ',')
    phrases = []
    for chunk in normalized.split(','):
        phrase = chunk.strip().lstrip('-*•').strip().rstrip('.').strip()
        if phrase:
            phrases.append(phrase)
    return phrases[:limit]


def analyze_image(image):
    """Describe an image and list its main objects using the Vintern model.

    Args:
        image: Anything accepted by ``safe_image_processing`` (an image
            object, a file path, or a base64 data URI).

    Returns:
        str: A formatted report containing the description, the detected
        objects, the processing time and model/device info. On any failure a
        message starting with ``❌`` is returned instead; this function does
        not raise.
    """
    # All three globals are needed below; any of them being None means the
    # model has not been (successfully) loaded.
    if model is None or tokenizer is None or transform is None:
        return "❌ Model chưa được tải. Vui lòng chờ..."

    try:
        start_time = time.time()

        # Validate / normalize the input before touching the model.
        processed_image, error = safe_image_processing(image)
        if error:
            return error

        if processed_image is None:
            return "❌ Không thể xử lý ảnh đầu vào"

        # Add a leading batch dimension and move to the inference device.
        image_tensor = transform(processed_image).unsqueeze(0).to(device)

        with torch.no_grad():
            # Main description
            query = "Mô tả chi tiết những gì bạn thấy trong hình ảnh này:"

            try:
                description = model.chat(
                    tokenizer,
                    image_tensor,
                    query,
                    generation_config=dict(
                        max_new_tokens=200,
                        do_sample=True,
                        temperature=0.7,
                        top_p=0.9,
                        repetition_penalty=1.1
                    )
                )
            except Exception as chat_error:
                print(f"Chat method failed: {chat_error}")
                # Fallback to plain generation.
                # NOTE(review): only the text prompt is passed here, so the
                # answer is not conditioned on the image - confirm intended.
                inputs = tokenizer(query, return_tensors="pt").to(device)
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=150,
                    do_sample=True,
                    temperature=0.7
                )
                description = tokenizer.decode(outputs[0], skip_special_tokens=True)
                description = description.replace(query, "").strip()

            # Object listing is best-effort: a failure here must not discard
            # the description, but it is logged rather than silently ignored.
            try:
                object_query = "Liệt kê các đối tượng chính trong ảnh:"
                objects_text = model.chat(
                    tokenizer,
                    image_tensor,
                    object_query,
                    generation_config=dict(max_new_tokens=100, temperature=0.5)
                )
                objects = _split_object_phrases(objects_text)
                objects_str = ", ".join(objects) if objects else "Không có"
            except Exception as objects_error:
                print(f"Object query failed: {objects_error}")
                objects_str = "Không có"

        processing_time = time.time() - start_time

        # Format output
        return f"""**📝 Mô tả từ Vintern AI:**
{description}

**🔍 Đối tượng nhận diện:**
{objects_str}

**⚡ Thời gian xử lý:** {processing_time:.2f}s
**🤖 Model:** Vintern-1B-v3.5 (Hugging Face Spaces)
**💻 Device:** {device.upper()}

---
*Để sử dụng cho video real-time, hãy sử dụng API endpoint của Space này với trangchu.html*
"""

    except Exception as e:
        # Top-level boundary for the UI: report the error as the result text.
        error_msg = f"❌ Lỗi phân tích: {str(e)}"
        print(error_msg)
        traceback.print_exc()
        return error_msg

def analyze_for_api(image_file):
    """Run ``analyze_image`` for API callers and always return plain text.

    Args:
        image_file: The image input, forwarded unchanged to ``analyze_image``.

    Returns:
        str: The analysis text, or ``"Error: ..."`` if ``analyze_image``
        itself raised.
    """
    try:
        return analyze_image(image_file)
    except Exception as exc:
        return f"Error: {str(exc)}"

# Load the model once, at import time. This runs before the UI below is
# built, and `model_loaded` records whether load_model() succeeded so the UI
# can show the matching status banner.
print("🚀 Initializing Vintern-1B-v3.5...")
model_loaded = load_model()

def _analyze_on_change(image):
    """Auto-analysis handler for the image component's ``change`` event.

    The event also fires when the image is cleared (value ``None``). In that
    case return an empty result instead of running the analysis, which would
    otherwise overwrite the just-cleared result box with a "no image" error.
    """
    if image is None:
        return ""
    return analyze_image(image)


# Create Gradio interface
with gr.Blocks(
    title="Vintern-1B-v3.5 Video Recognition",
    theme=gr.themes.Soft(),
    css="""
    .gradio-container {
        max-width: 1200px !important;
    }
    .upload-area {
        min-height: 300px;
    }
    """
) as demo:

    gr.Markdown("""
    # 🎥 Vintern-1B-v3.5 - Nhận Diện Ảnh Tiếng Việt
    
    **Powered by Hugging Face Spaces** | Model chạy hoàn toàn trên cloud
    """)

    # Status banner, decided once at startup from the load_model() result.
    if not model_loaded:
        gr.Markdown("⚠️ **Model đang được tải...** Vui lòng chờ vài phút và refresh trang.")
    else:
        gr.Markdown("✅ **Model đã sẵn sàng!** Upload ảnh để bắt đầu nhận diện.")

    with gr.Row():
        # Left column: image input and action buttons.
        with gr.Column(scale=1):
            image_input = gr.Image(
                type="pil",
                label="📤 Upload Ảnh",
                elem_classes=["upload-area"]
            )

            with gr.Row():
                analyze_btn = gr.Button("🔍 Phân Tích", variant="primary", scale=2)
                clear_btn = gr.Button("🗑️ Xóa", variant="secondary", scale=1)

        # Right column: analysis result.
        with gr.Column(scale=1):
            result_output = gr.Textbox(
                label="📋 Kết Quả Phân Tích",
                lines=15,
                max_lines=20,
                show_copy_button=True
            )

    # Event handlers
    analyze_btn.click(
        fn=analyze_image,
        inputs=image_input,
        outputs=result_output
    )

    # Reset both the image and the result box.
    clear_btn.click(
        fn=lambda: (None, ""),
        outputs=[image_input, result_output]
    )

    # Auto-analyze on image upload. Routed through _analyze_on_change so that
    # clearing the image does not produce an error message in the result box.
    image_input.change(
        fn=_analyze_on_change,
        inputs=image_input,
        outputs=result_output
    )

    gr.Markdown("""
    ---
    ## 💡 Hướng dẫn sử dụng:
    
    ### 🖼️ Phân tích ảnh đơn lẻ:
    1. **Upload ảnh** từ máy tính hoặc drag & drop
    2. **Kết quả tự động** hiển thị sau khi upload
    3. **Hoặc nhấn "Phân Tích"** để chạy lại
    
    ### 🎥 Phân tích video real-time:
    1. **Copy URL Space này:** `https://nguyentantoan-vintern-video-recognition.hf.space`
    2. **Mở trangchu.html** đã được cung cấp
    3. **Thay URL** trong code JavaScript
    4. **Sử dụng camera** để phân tích real-time
    
    ### 🔗 API Usage:
    ```javascript
    // POST to: https://nguyentantoan-vintern-video-recognition.hf.space/api/predict
    // Body: FormData with image file
    ```
    
    **⚠️ Lưu ý:** Việc phân tích có thể mất 10-30 giây do chạy trên CPU miễn phí của HF Spaces.
    """)

# Start the web server only when this file is executed as a script.
if __name__ == "__main__":
    launch_options = {
        "server_name": "0.0.0.0",  # listen on all network interfaces
        "server_port": 7860,
        "show_error": True,
        "quiet": False,
    }
    demo.launch(**launch_options)