Spaces:

mknolan
/

internvl2-llama3-demo

Build error

App Files Files Community

mknolan committed on Mar 15

Commit

1cd032a

verified ·

1 Parent(s): e87e4c3

Upload app.py with huggingface_hub

Browse files

Files changed (1) hide show

app.py +202 -0

app.py ADDED Viewed

	@@ -0,0 +1,202 @@

+import torch
+from PIL import Image
+import requests
+from io import BytesIO
+import gradio as gr
+import os
+import sys
+import time
+import warnings
+# Suppress warnings
+# The "ignore" filter is installed with no category/module restriction, so it
+# applies to every warning raised after this point.
+warnings.filterwarnings("ignore")
+# Startup diagnostics printed at import time: interpreter version, PyTorch
+# version, and whether torch reports a usable CUDA runtime.
+print("Starting InternVL2 with Llama3-76B initialization...")
+print(f"Python version: {sys.version}")
+print(f"PyTorch version: {torch.__version__}")
+print(f"CUDA available: {torch.cuda.is_available()}")
+# Set up environment for CUDA
+# PYTORCH_CUDA_ALLOC_CONF configures PyTorch's CUDA caching allocator; see the
+# PyTorch CUDA memory-management notes for the meaning of max_split_size_mb.
+# NOTE(review): this is set after torch has been imported and after
+# torch.cuda.is_available() was called above -- confirm the allocator still
+# reads the variable at this point.
+os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"
+# Check GPU availability
+def check_gpu():
+    if not torch.cuda.is_available():
+        print("CUDA is not available. This application requires GPU acceleration.")
+        return False
+    try:
+        # Test GPU with a simple operation
+        test_tensor = torch.rand(10, device="cuda")
+        _ = test_tensor + test_tensor
+        print(f"GPU is available: {torch.cuda.get_device_name(0)}")
+        return True
+    except Exception as e:
+        print(f"Error initializing GPU: {str(e)}")
+        return False
+# Global flag for GPU availability
+# Evaluated once, at import time, by running the check_gpu() probe.
+USE_GPU = check_gpu()
+# Import InternVL modules
+# The transformers import is guarded so a missing package is reported via
+# print instead of aborting the import of this module; HAS_TRANSFORMERS
+# records whether AutoModel and AutoProcessor are usable.
+try:
+    from transformers import AutoModel, AutoProcessor
+    HAS_TRANSFORMERS = True
+    print("Successfully imported transformers")
+except ImportError as e:
+    print(f"Error importing transformers: {str(e)}")
+    HAS_TRANSFORMERS = False
+# Initialize models
+# Module-level placeholders; load_models() declares these names global and
+# assigns the processor and the InternViT model. llama_model stays None in
+# this file because the Llama3 loading step is only a comment.
+internvit_model = None
+llama_model = None
+processor = None
+def load_models():
+    global internvit_model, llama_model, processor
+    if not USE_GPU:
+        print("Cannot load models without GPU")
+        return False
+    try:
+        print("Loading InternViT-6B model for visual feature extraction...")
+        # Following the GitHub repo instructions for using InternViT-6B
+        processor = AutoProcessor.from_pretrained("OpenGVLab/InternViT-6B-224px")
+        internvit_model = AutoModel.from_pretrained("OpenGVLab/InternViT-6B-224px")
+        if USE_GPU:
+            internvit_model = internvit_model.to("cuda")
+        print("InternViT-6B model loaded successfully!")
+        # For demonstration purposes, we'll just extract visual features for now
+        # In a real implementation, we would load Llama3-76B here
+        print("Note: Llama3-76B model loading is commented out for this demonstration")
+        # llama_model = ...
+        return True
+    except Exception as e:
+        print(f"Error loading models: {str(e)}")
+        return False
+# Load models on startup
+# Runs load_models() once at import time. The resulting boolean is checked by
+# process_image() before inference and by create_interface() for the status
+# line shown in the UI.
+MODELS_LOADED = load_models()
+def process_image(image_path, sample_url=None):
+    """Process an image using InternViT-6B for feature extraction"""
+    # Load image
+    if sample_url and not image_path:
+        # Load from URL if provided and no image uploaded
+        response = requests.get(sample_url)
+        image = Image.open(BytesIO(response.content))
+        print(f"Loaded sample image from URL: {sample_url}")
+    else:
+        # Use uploaded image
+        if isinstance(image_path, str):
+            image = Image.open(image_path)
+        else:
+            image = image_path
+    if not image:
+        return "No image provided"
+    if not MODELS_LOADED:
+        return "Models failed to load. Please check the logs."
+    try:
+        # Start timing
+        start_time = time.time()
+        # Process image through the visual encoder
+        print("Processing image through InternViT-6B...")
+        inputs = processor(images=image, return_tensors="pt")
+        if USE_GPU:
+            inputs = {k: v.to("cuda") for k, v in inputs.items()}
+        with torch.no_grad():
+            outputs = internvit_model(**inputs)
+        # Extract image features
+        image_features = outputs.last_hidden_state
+        pooled_output = outputs.pooler_output
+        # In a real implementation, we would pass these features to Llama3-76B
+        # For now, we'll just return info about the extracted features
+        feature_info = f"""
+        Image successfully processed through InternViT-6B:
+        - Last hidden state shape: {image_features.shape}
+        - Pooled output shape: {pooled_output.shape}
+        In a complete implementation, these visual features would be passed to Llama3-76B
+        for generating text responses about the image.
+        Note: This is a demonstration of visual feature extraction only.
+        """
+        # Calculate elapsed time
+        elapsed = time.time() - start_time
+        return f"{feature_info}\n\nProcessing completed in {elapsed:.2f} seconds."
+    except Exception as e:
+        return f"Error processing image: {str(e)}"
+# Set up Gradio interface
+def create_interface():
+    with gr.Blocks(title="InternVL2 with Llama3-76B") as demo:
+        gr.Markdown("# InternVL2 Visual Feature Extraction Demo")
+        gr.Markdown("## Using InternViT-6B for visual feature extraction")
+        # System status
+        status = "✅ Ready" if MODELS_LOADED else "❌ Models failed to load"
+        gr.Markdown(f"### System Status: {status}")
+        with gr.Row():
+            with gr.Column():
+                input_image = gr.Image(type="pil", label="Upload Image")
+                sample_btn = gr.Button("Use Sample Image")
+            with gr.Column():
+                output_text = gr.Textbox(label="Results", lines=10)
+        # Process button
+        process_btn = gr.Button("Extract Visual Features")
+        process_btn.click(
+            fn=process_image,
+            inputs=[input_image],
+            outputs=output_text
+        )
+        # Sample image button logic
+        sample_image_url = "https://huggingface.co/OpenGVLab/InternVL2/resolve/main/assets/demo.jpg"
+        def use_sample():
+            return process_image(None, sample_image_url)
+        sample_btn.click(
+            fn=use_sample,
+            inputs=[],
+            outputs=output_text
+        )
+        # Add some explanation
+        gr.Markdown("""
+        ## About This Demo
+        This demonstration shows how to use InternViT-6B for visual feature extraction,
+        following the instructions from the OpenGVLab/InternVL GitHub repository.
+        The application extracts visual features from the input image that would typically
+        be passed to a language model like Llama3-76B. In a complete implementation,
+        these features would be used to generate text responses about the image.
+        """)
+    return demo
+# Main function
+# Build and serve the UI only when this file is executed as a script.
+# server_name="0.0.0.0" makes the server listen on all network interfaces;
+# share=False is passed explicitly to launch().
+if __name__ == "__main__":
+    demo = create_interface()
+    demo.launch(share=False, server_name="0.0.0.0")