Spaces:

nguyentantoan
/

vintern-video-recognition

Sleeping

App Files Files Community

nguyentantoan committed on May 23

Commit

8afdc67

verified ·

1 Parent(s): a7f24f5

Update app.py

Browse files

Files changed (1) hide show

app.py +66 -189

app.py CHANGED Viewed

@@ -1,23 +1,22 @@
-# app.py - Fixed version for HF Spaces
 import gradio as gr
 import torch
 from transformers import AutoModel, AutoTokenizer
 import torchvision.transforms as T
 from torchvision.transforms.functional import InterpolationMode
 from PIL import Image
-import base64
-import io
 import time
 import traceback
 # Setup
-device = "cpu"  # HF Spaces miễn phí chỉ có CPU
 model = None
 tokenizer = None
 transform = None
 def build_transform(input_size=448):
-    """Build image transform pipeline"""
     IMAGENET_MEAN = (0.485, 0.456, 0.406)
     IMAGENET_STD = (0.229, 0.224, 0.225)
@@ -29,20 +28,19 @@ def build_transform(input_size=448):
     ])
 def load_model():
-    """Load Vintern model"""
     global model, tokenizer, transform
     try:
-        print("🤖 Loading Vintern-1B-v3.5...")
-        model_name = "5CD-AI/Vintern-1B-v3_5"
-        # Load tokenizer
         tokenizer = AutoTokenizer.from_pretrained(
             model_name,
             trust_remote_code=True
         )
-        # Load model
         model = AutoModel.from_pretrained(
             model_name,
             torch_dtype=torch.float32,
@@ -50,240 +48,119 @@ def load_model():
             low_cpu_mem_usage=True
         )
-        # Build transform
         transform = build_transform()
-        print("✅ Model loaded successfully!")
         return True
     except Exception as e:
-        print(f"❌ Error loading model: {e}")
         traceback.print_exc()
         return False
-def safe_image_processing(image):
-    """Safely process image input"""
-    try:
-        # Handle different input types
-        if image is None:
-            return None, "❌ Không có ảnh đầu vào"
-        # If it's a file path (string)
-        if isinstance(image, str):
-            if image.startswith('data:image'):
-                # Base64 image
-                image_data = image.split(',')[1]
-                image_bytes = base64.b64decode(image_data)
-                image = Image.open(io.BytesIO(image_bytes))
-            else:
-                # File path
-                image = Image.open(image)
-        # Ensure it's a PIL Image
-        if not hasattr(image, 'mode'):
-            return None, "❌ Định dạng ảnh không hợp lệ"
-        # Convert to RGB if needed
-        if image.mode != 'RGB':
-            image = image.convert('RGB')
-        return image, None
-    except Exception as e:
-        return None, f"❌ Lỗi xử lý ảnh: {str(e)}"
-def analyze_image(image):
-    """Analyze image with Vintern model"""
     if model is None:
-        return "❌ Model chưa được tải. Vui lòng chờ..."
     try:
         start_time = time.time()
-        # Safe image processing
-        processed_image, error = safe_image_processing(image)
-        if error:
-            return error
-        if processed_image is None:
-            return "❌ Không thể xử lý ảnh đầu vào"
-        # Transform image
-        image_tensor = transform(processed_image).unsqueeze(0).to(device)
         with torch.no_grad():
-            # Main description
-            query = "Mô tả chi tiết những gì bạn thấy trong hình ảnh này:"
             try:
-                description = model.chat(
                     tokenizer,
                     image_tensor,
                     query,
                     generation_config=dict(
-                        max_new_tokens=200,
-                        do_sample=True,
                         temperature=0.7,
-                        top_p=0.9,
-                        repetition_penalty=1.1
                     )
                 )
-            except Exception as chat_error:
-                print(f"Chat method failed: {chat_error}")
-                # Fallback to simple generation
                 inputs = tokenizer(query, return_tensors="pt").to(device)
                 outputs = model.generate(
                     **inputs,
-                    max_new_tokens=150,
-                    do_sample=True,
-                    temperature=0.7
-                )
-                description = tokenizer.decode(outputs[0], skip_special_tokens=True)
-                description = description.replace(query, "").strip()
-            # Get objects
-            try:
-                object_query = "Liệt kê các đối tượng chính trong ảnh:"
-                objects_text = model.chat(
-                    tokenizer,
-                    image_tensor,
-                    object_query,
-                    generation_config=dict(max_new_tokens=100, temperature=0.5)
                 )
-                objects = [obj.strip() for obj in objects_text.replace(',', ' ').split() if len(obj.strip()) > 2][:5]
-                objects_str = ", ".join(objects) if objects else "Không có"
-            except:
-                objects_str = "Không có"
             processing_time = time.time() - start_time
-            # Format output
-            return f"""**📝 Mô tả từ Vintern AI:**
-{description}
-**🔍 Đối tượng nhận diện:**
-{objects_str}
-**⚡ Thời gian xử lý:** {processing_time:.2f}s
-**🤖 Model:** Vintern-1B-v3.5 (Hugging Face Spaces)
-**💻 Device:** {device.upper()}
 ---
-*Để sử dụng cho video real-time, hãy sử dụng API endpoint của Space này với trangchu.html*
 """
     except Exception as e:
-        error_msg = f"❌ Lỗi phân tích: {str(e)}"
-        print(error_msg)
-        traceback.print_exc()
-        return error_msg
-def analyze_for_api(image_file):
-    """API-friendly analysis function"""
-    try:
-        result = analyze_image(image_file)
-        # Return simple text for API consumption
-        return result
-    except Exception as e:
-        return f"Error: {str(e)}"
-# Load model when starting
-print("🚀 Initializing Vintern-1B-v3.5...")
 model_loaded = load_model()
-# Create Gradio interface
 with gr.Blocks(
-    title="Vintern-1B-v3.5 Video Recognition",
-    theme=gr.themes.Soft(),
-    css="""
-    .gradio-container {
-        max-width: 1200px !important;
-    }
-    .upload-area {
-        min-height: 300px;
-    }
-    """
 ) as demo:
-    gr.Markdown("""
-    # 🎥 Vintern-1B-v3.5 - Nhận Diện Ảnh Tiếng Việt
-    **Powered by Hugging Face Spaces** | Model chạy hoàn toàn trên cloud
-    """)
-    if not model_loaded:
-        gr.Markdown("⚠️ **Model đang được tải...** Vui lòng chờ vài phút và refresh trang.")
-    else:
-        gr.Markdown("✅ **Model đã sẵn sàng!** Upload ảnh để bắt đầu nhận diện.")
     with gr.Row():
-        with gr.Column(scale=1):
-            image_input = gr.Image(
-                type="pil",
-                label="📤 Upload Ảnh",
-                elem_classes=["upload-area"]
-            )
-            with gr.Row():
-                analyze_btn = gr.Button("🔍 Phân Tích", variant="primary", scale=2)
-                clear_btn = gr.Button("🗑️ Xóa", variant="secondary", scale=1)
-        with gr.Column(scale=1):
-            result_output = gr.Textbox(
-                label="📋 Kết Quả Phân Tích",
-                lines=15,
-                max_lines=20,
-                show_copy_button=True
-            )
-    # Event handlers
-    analyze_btn.click(
-        fn=analyze_image,
-        inputs=image_input,
-        outputs=result_output
-    )
-    clear_btn.click(
-        fn=lambda: (None, ""),
-        outputs=[image_input, result_output]
-    )
-    # Auto-analyze on image upload
     image_input.change(
-        fn=analyze_image,
         inputs=image_input,
         outputs=result_output
     )
     gr.Markdown("""
-    ---
-    ## 💡 Hướng dẫn sử dụng:
-    ### 🖼️ Phân tích ảnh đơn lẻ:
-    1. **Upload ảnh** từ máy tính hoặc drag & drop
-    2. **Kết quả tự động** hiển thị sau khi upload
-    3. **Hoặc nhấn "Phân Tích"** để chạy lại
-    ### 🎥 Phân tích video real-time:
-    1. **Copy URL Space này:** `https://nguyentantoan-vintern-video-recognition.hf.space`
-    2. **Mở trangchu.html** đã được cung cấp
-    3. **Thay URL** trong code JavaScript
-    4. **Sử dụng camera** để phân tích real-time
-    ### 🔗 API Usage:
-    ```javascript
-    // POST to: https://nguyentantoan-vintern-video-recognition.hf.space/api/predict
-    // Body: FormData with image file
-    ```
-    **⚠️ Lưu ý:** Việc phân tích có thể mất 10-30 giây do chạy trên CPU miễn phí của HF Spaces.
     """)
-# Launch the app
 if __name__ == "__main__":
-    demo.launch(
-        server_name="0.0.0.0",
-        server_port=7860,
-        show_error=True,
-        quiet=False
-    )

+# app_fast.py - Vintern-1B Fast Version
 import gradio as gr
 import torch
 from transformers import AutoModel, AutoTokenizer
 import torchvision.transforms as T
 from torchvision.transforms.functional import InterpolationMode
 from PIL import Image
 import time
+import json
 import traceback
 # Setup
+device = "cpu"
 model = None
 tokenizer = None
 transform = None
 def build_transform(input_size=448):
+    """Optimized transform"""
     IMAGENET_MEAN = (0.485, 0.456, 0.406)
     IMAGENET_STD = (0.229, 0.224, 0.225)
     ])
 def load_model():
+    """Load Vintern-1B (faster version)"""
     global model, tokenizer, transform
     try:
+        print("🚀 Loading Vintern-1B (Fast Version)...")
+        # Sử dụng model nhẹ hơn
+        model_name = "5CD-AI/Vintern-1B-v2"  # Thay vì v3.5
         tokenizer = AutoTokenizer.from_pretrained(
             model_name,
             trust_remote_code=True
         )
         model = AutoModel.from_pretrained(
             model_name,
             torch_dtype=torch.float32,
             low_cpu_mem_usage=True
         )
+        # Optimize model for inference
+        model.eval()
+        model = torch.jit.optimize_for_inference(model)
         transform = build_transform()
+        print("✅ Fast model loaded!")
         return True
     except Exception as e:
+        print(f"❌ Error: {e}")
         traceback.print_exc()
         return False
def fast_analyze(image):
    """Produce a short description of one image with the loaded model.

    Args:
        image: The image to describe (a PIL image when called from the
            Gradio input), or None when the input was cleared.

    Returns:
        str: A formatted report (description, elapsed time, model name,
        throughput) on success, otherwise a short message starting with
        "❌" that explains what went wrong.
    """
    # All three globals are set by load_model(); if any is missing the
    # pipeline cannot run, so report "not ready" instead of crashing later.
    if model is None or tokenizer is None or transform is None:
        return "❌ Model chưa sẵn sàng"
    if image is None:
        return "❌ Không có ảnh"

    try:
        start_time = time.time()

        # The normalisation pipeline expects three colour channels.
        if hasattr(image, 'mode') and image.mode != 'RGB':
            image = image.convert('RGB')
        image_tensor = transform(image).unsqueeze(0).to(device)

        query = "Mô tả ngắn gọn:"
        with torch.no_grad():
            try:
                result = model.chat(
                    tokenizer,
                    image_tensor,
                    query,
                    # Greedy decoding with a short output keeps latency low.
                    # `temperature` is omitted because it has no effect when
                    # do_sample is False.
                    generation_config=dict(
                        max_new_tokens=100,
                        do_sample=False,
                        num_beams=1,
                    ),
                )
            except Exception as chat_error:
                # Log why the primary path failed before falling back, so
                # the real problem is visible in the server log.
                print(f"Chat method failed: {chat_error}")
                traceback.print_exc()
                # Fallback: plain text generation from the prompt alone.
                inputs = tokenizer(query, return_tensors="pt").to(device)
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=80,
                    do_sample=False,
                    num_beams=1,
                )
                result = tokenizer.decode(outputs[0], skip_special_tokens=True)
                result = result.replace(query, "").strip()

        processing_time = time.time() - start_time
        # A coarse clock can report zero elapsed time; avoid dividing by it.
        if processing_time > 0:
            frames_per_second = 1 / processing_time
        else:
            frames_per_second = float("inf")

        return f"""**📝 Mô tả nhanh:**
{result}
**⚡ Thời gian:** {processing_time:.1f}s
**🤖 Model:** Vintern-1B-v2 (Optimized)
**💨 Tốc độ:** {frames_per_second:.1f} FPS
---
*Model được tối ưu cho tốc độ - phù hợp real-time*
"""
    except Exception as e:
        # Keep the full traceback in the log; the user only sees the summary.
        traceback.print_exc()
        return f"❌ Lỗi: {str(e)}"
# Load the model once at start-up; the result decides which status banner
# the page shows.
print("🚀 Starting Fast Vintern Server...")
model_loaded = load_model()

# Lightweight Gradio interface: one image input, one text output.
with gr.Blocks(
    title="Vintern-1B Fast",
    theme=gr.themes.Base(),
) as demo:
    gr.Markdown("# ⚡ Vintern-1B Fast - Tốc Độ Cao")
    if model_loaded:
        gr.Markdown("✅ **Model sẵn sàng!** Tối ưu cho tốc độ và real-time.")
    else:
        # Without this branch a failed load left the page with no hint that
        # uploads cannot be analysed.
        gr.Markdown("⚠️ **Không tải được model.** Vui lòng kiểm tra log của Space.")
    with gr.Row():
        image_input = gr.Image(type="pil", label="📤 Upload Ảnh")
        result_output = gr.Textbox(
            label="📋 Kết Quả",
            lines=8,
            show_copy_button=True
        )
    # Run the analysis automatically whenever the image input changes.
    image_input.change(
        fn=fast_analyze,
        inputs=image_input,
        outputs=result_output
    )
    gr.Markdown("""
    ### ⚡ Tối ưu cho tốc độ:
    - **Model nhẹ**: Vintern-1B-v2 (~1.5GB)
    - **Fast generation**: Greedy decode, short output
    - **Optimized**: JIT compilation, no beam search
    - **Real-time ready**: ~2-5 giây/ảnh
    """)

if __name__ == "__main__":
    # Listen on all interfaces on port 7860.
    demo.launch(server_name="0.0.0.0", server_port=7860)