VLV_Caption

Runtime error

App Files Files Community

lambertxiao committed on Jul 15

Commit

b98d7a4

verified ·

1 Parent(s): e19b803

Update app.py

Browse files

Files changed (1) hide show

app.py +20 -305

app.py CHANGED Viewed

@@ -1,311 +1,26 @@
-import spaces
 import gradio as gr
 from transformers import AutoModel, AutoProcessor
 from PIL import Image
 import torch
 import numpy as np
-import sys
-import os
-from pathlib import Path
-import shutil
-import types
-# Set environment variables for better debugging
-os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
-# Model configuration
-model_name_or_path = "lambertxiao/Vision-Language-Vision-Captioner-Qwen2.5-3B"
-def create_fake_module_structure():
-    """Create fake module structure to handle import errors during download"""
-    # Create the module hierarchy that the model expects
-    transformers_modules = types.ModuleType('transformers_modules')
-    lambertxiao = types.ModuleType('lambertxiao')
-    vision_captioner = types.ModuleType('Vision-Language-Vision-Captioner-Qwen2')
-    # Set up the module hierarchy
-    sys.modules['transformers_modules'] = transformers_modules
-    sys.modules['transformers_modules.lambertxiao'] = lambertxiao
-    sys.modules['transformers_modules.lambertxiao.Vision-Language-Vision-Captioner-Qwen2'] = vision_captioner
-    # Link the modules
-    setattr(transformers_modules, 'lambertxiao', lambertxiao)
-    setattr(lambertxiao, 'Vision-Language-Vision-Captioner-Qwen2', vision_captioner)
-    # Also handle the dot notation
-    sys.modules['transformers_modules.lambertxiao.Vision-Language-Vision-Captioner-Qwen2.5-3B'] = vision_captioner
-def fix_imports_in_file(file_path):
-    """Fix import statements in a Python file"""
-    try:
-        with open(file_path, 'r', encoding='utf-8') as f:
-            content = f.read()
-        original_content = content
-        # Fix relative imports
-        replacements = [
-            ("from .De_DiffusionV2_Image import", "from De_DiffusionV2_Image import"),
-            ("from .modeling_clip import", "from modeling_clip import"),
-            ("from .configuration_clip import", "from configuration_clip import"),
-            ("from .modeling_florence2 import", "from modeling_florence2 import"),
-            ("from .configuration_florence2 import", "from configuration_florence2 import"),
-            ("from .processing_florence2 import", "from processing_florence2 import"),
-            ("from .utils import", "from utils import"),
-            ("from .build_unfreeze import", "from build_unfreeze import"),
-            ("from .sd_config import", "from sd_config import"),
-        ]
-        for old, new in replacements:
-            content = content.replace(old, new)
-        # Remove or fix the problematic transformers_modules imports
-        content = content.replace(
-            "from transformers_modules.lambertxiao.Vision-Language-Vision-Captioner-Qwen2",
-            "# Fixed import - removed transformers_modules prefix\nfrom"
-        )
-        if content != original_content:
-            with open(file_path, 'w', encoding='utf-8') as f:
-                f.write(content)
-            return True
-    except Exception as e:
-        print(f"Error fixing {file_path}: {e}")
-    return False
-def monitor_and_fix_downloads():
-    """Monitor the cache directory and fix files as they are downloaded"""
-    cache_base = Path.home() / ".cache" / "huggingface" / "modules" / "transformers_modules"
-    # Create a set to track fixed files
-    fixed_files = set()
-    def fix_new_files():
-        # Look for Python files in the cache
-        for py_file in cache_base.rglob("*.py"):
-            if str(py_file) not in fixed_files:
-                if fix_imports_in_file(py_file):
-                    print(f"✓ Fixed imports in: {py_file.name}")
-                    fixed_files.add(str(py_file))
-    return fix_new_files
-# Create fake module structure first
-print("🔧 Setting up module structure...")
-create_fake_module_structure()
-# Setup file monitoring
-fix_files = monitor_and_fix_downloads()
-# Custom import hook to fix files on the fly
-class ImportFixer:
-    def __init__(self):
-        self.fixed_modules = set()
-    def find_spec(self, name, path, target=None):
-        # Fix files whenever an import is attempted
-        fix_files()
-        return None
-# Install the import hook
-import_fixer = ImportFixer()
-sys.meta_path.insert(0, import_fixer)
-print("📥 Downloading and loading model...")
-# First attempt - this might fail but will download files
-try:
-    from transformers import AutoConfig
-    # Add paths before attempting to load
-    cache_base = Path.home() / ".cache" / "huggingface" / "modules" / "transformers_modules"
-    possible_paths = [
-        cache_base / "lambertxiao" / "Vision-Language-Vision-Captioner-Qwen2.5-3B",
-        cache_base / "lambertxiao",
-        cache_base,
-    ]
-    for path in possible_paths:
-        if path.exists() and str(path) not in sys.path:
-            sys.path.insert(0, str(path))
-    # Try to load config - this triggers download
-    config = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=True)
-    print("✓ Config loaded successfully")
-except Exception as e:
-    print(f"⚠️ Initial load failed (expected): {e}")
-    print("🔧 Fixing downloaded files...")
-    # Fix all downloaded files
-    fix_files()
-    # Find and add all relevant directories to path
-    cache_base = Path.home() / ".cache" / "huggingface" / "modules" / "transformers_modules"
-    for subdir in cache_base.rglob("*"):
-        if subdir.is_dir() and "lambertxiao" in str(subdir):
-            if str(subdir) not in sys.path:
-                sys.path.insert(0, str(subdir))
-# Now load the model - should work after fixes
-print("\n📊 Loading model with fixed imports...")
-try:
-    # Remove the import hook to avoid interference
-    sys.meta_path.remove(import_fixer)
-    # Load the model
-    model = AutoModel.from_pretrained(
-        model_name_or_path,
-        trust_remote_code=True,
-        low_cpu_mem_usage=False,
-        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
-    )
-    # Move to GPU if available
-    if torch.cuda.is_available():
-        model = model.to("cuda")
-        print(f"✓ Model loaded on GPU: {torch.cuda.get_device_name(0)}")
-    else:
-        model = model.to("cpu")
-        print("✓ Model loaded on CPU")
-except Exception as e:
-    print(f"❌ Error loading model: {e}")
-    # Last resort - try with minimal setup
-    print("🔧 Attempting minimal setup...")
-    # Clear any problematic imports
-    modules_to_remove = [k for k in sys.modules.keys() if 'lambertxiao' in k or 'Vision-Language-Vision' in k]
-    for module in modules_to_remove:
-        del sys.modules[module]
-    # Re-create fake modules
-    create_fake_module_structure()
-    # Try one more time
-    model = AutoModel.from_pretrained(
-        model_name_or_path,
-        trust_remote_code=True,
-        device_map="auto"
-    )
-print("\n✅ Model setup complete!")
-def drop_incomplete_tail(text):
-    """Remove incomplete sentences from the end of text"""
-    if not text:
-        return ""
-    sentences = text.split('.')
-    complete_sentences = [s.strip() for s in sentences if s.strip()]
-    if not text.strip().endswith('.') and complete_sentences:
-        complete_sentences = complete_sentences[:-1]
-    result = '. '.join(complete_sentences)
-    if result and complete_sentences:
-        result += '.'
-    return result
-@spaces.GPU(duration=120)
-def caption_image(image):
-    """Generate caption for the image"""
-    try:
-        # Ensure model is on GPU when using spaces.GPU
-        if torch.cuda.is_available():
-            if hasattr(model, 'device') and model.device.type != 'cuda':
-                model.to("cuda")
-        with torch.no_grad():
-            try:
-                outputs = model([image], 77)
-            except RuntimeError as e:
-                if "CUDA error" in str(e) or "device-side assert" in str(e):
-                    print(f"⚠️ CUDA error: {e}")
-                    torch.cuda.empty_cache()
-                    torch.cuda.synchronize()
-                    # Retry with different approach
-                    outputs = model.generate(images=[image], max_length=77)
-                else:
-                    raise e
-            # Handle different output formats
-            if hasattr(outputs, 'generated_text'):
-                text = outputs.generated_text[0] if isinstance(outputs.generated_text, list) else outputs.generated_text
-            elif isinstance(outputs, list):
-                text = outputs[0]
-            elif isinstance(outputs, str):
-                text = outputs
-            else:
-                text = str(outputs)
-            return text
-    except Exception as e:
-        print(f"Error in caption_image: {e}")
-        return f"Error generating caption: {str(e)}"
-def process_image(image):
-    """Process input image and generate caption"""
-    try:
-        # Convert to PIL Image if needed
-        if isinstance(image, np.ndarray):
-            if image.dtype != np.uint8:
-                image = (np.clip(image, 0, 1) * 255).astype(np.uint8)
-            if len(image.shape) == 2:
-                image = Image.fromarray(image, mode='L').convert('RGB')
-            elif len(image.shape) == 3:
-                if image.shape[2] == 4:
-                    image = Image.fromarray(image, mode='RGBA').convert('RGB')
-                else:
-                    image = Image.fromarray(image, mode='RGB')
-        elif isinstance(image, Image.Image):
-            if image.mode != 'RGB':
-                image = image.convert('RGB')
-        # Generate caption
-        raw_text = caption_image(image)
-        # Clean up the text
-        cleaned_text = drop_incomplete_tail(raw_text)
-        return cleaned_text
-    except Exception as e:
-        print(f"Error processing image: {e}")
-        return f"Error: {str(e)}"
-# Create Gradio interface
-demo = gr.Interface(
-    fn=process_image,
-    inputs=gr.Image(type="pil", label="Upload an image"),
-    outputs=gr.Textbox(label="Generated Caption", lines=3),
-    title="Vision-Language Image Captioner",
-    description="Upload an image to generate a detailed caption using Vision-Language-Vision-Captioner-Qwen2.5-3B",
-    examples=[],
-    cache_examples=False,
-    theme=gr.themes.Soft()
-)
-# GPU optimizations
-if torch.cuda.is_available():
-    device_name = torch.cuda.get_device_name(0)
-    print(f"\n🖥️ GPU detected: {device_name}")
-    if "H100" in device_name:
-        torch.backends.cuda.matmul.allow_tf32 = True
-        torch.backends.cudnn.allow_tf32 = True
-        torch.cuda.set_per_process_memory_fraction(0.85)
-# Launch the app
-if __name__ == "__main__":
-    print("\n🌐 Launching Gradio interface...")
-    demo.launch(
-        share=False,
-        debug=True,
-        show_error=True
-    )

 import gradio as gr
 from transformers import AutoModel, AutoProcessor
 from PIL import Image
 import torch
 import numpy as np
+model_name_or_path = "lyttt/VLV_captioner"
+model = AutoModel.from_pretrained(model_name_or_path, revision="master", trust_remote_code=True,low_cpu_mem_usage=False)
+# @spaces.GPU(duration=120)
+def greet(image):
+    if image.dtype != np.uint8:
+        image = (np.clip(image, 0, 1) * 255).astype(np.uint8)
+    image = Image.fromarray(image, mode='RGB')
+    with torch.no_grad():
+        outputs = model([image], 300).generated_text[0]
+    def drop_incomplete_tail(text):
+        sentences = text.split('.')
+        complete_sentences = [s.strip() for s in sentences if s.strip()]
+        if not text.strip().endswith('.'):
+            complete_sentences = complete_sentences[:-1]
+        return '. '.join(complete_sentences) + ('.' if complete_sentences else '')
+    return drop_incomplete_tail(outputs)
+demo = gr.Interface(fn=greet, inputs="image", outputs="text")
+demo.launch()