robiro committed (verified)
Commit fa3a13f · 1 Parent(s): f740c8a

Update app.py

Files changed (1)
  1. app.py +148 -352
app.py CHANGED
@@ -1,363 +1,159 @@
  import gradio as gr
- from llama_cpp import Llama
- from huggingface_hub import hf_hub_download
- import os
- import time
-
- # --- Configuration ---
- MODEL_REPO_ID = "unsloth/DeepSeek-R1-0528-Qwen3-8B-GGUF"
- MODEL_FILENAME = "DeepSeek-R1-0528-Qwen3-8B-Q4_K_M.gguf" # IMPORTANT: Verify this filename
- LOCAL_MODEL_PATH = f"./{MODEL_FILENAME}"
-
- # LLM Llama Parameters
- N_CTX = 2048
- N_THREADS = None
- N_GPU_LAYERS = 0
- VERBOSE_LLAMA = True
-
- # Generation parameters
- DEFAULT_MAX_NEW_TOKENS = 512
- DEFAULT_TEMPERATURE = 0.7
- DEFAULT_TOP_P = 0.95
- DEFAULT_TOP_K = 40
- DEFAULT_REPEAT_PENALTY = 1.1
-
- # Qwen specific chat format elements (defined globally)
- IM_START_TOKEN = "<|im_start|>"
- IM_END_TOKEN = "<|im_end|>"
-
- # --- Global variable for the model ---
- llm = None
-
- # --- Model Download ---
- def download_model_if_needed():
-     if not os.path.exists(LOCAL_MODEL_PATH):
-         print(f"Downloading {MODEL_FILENAME} from {MODEL_REPO_ID}...")
-         start_time = time.time()
-         try:
-             hf_hub_download(
-                 repo_id=MODEL_REPO_ID,
-                 filename=MODEL_FILENAME,
-                 local_dir=".",
-                 local_dir_use_symlinks=False,
-                 resume_download=True
-             )
-             end_time = time.time()
-             print(f"Download complete in {end_time - start_time:.2f} seconds.")
-             return True
-         except Exception as e:
-             print(f"Error downloading model: {e}")
-             print(f"Attempted to download: {MODEL_REPO_ID}/{MODEL_FILENAME}")
-             return False
-     else:
-         print(f"Model file {MODEL_FILENAME} already exists.")
-         return True
-     return False # Should not be reached if logic is correct, but good for completeness
-
- # --- Model Loading ---
- def load_llm_model():
-     global llm
-     if llm is None:
-         if not os.path.exists(LOCAL_MODEL_PATH):
-             print("Model file not found. Cannot load.")
-             return False
-         print("Loading Llama model...")
-         start_time = time.time()
-         try:
-             llm = Llama(
-                 model_path=LOCAL_MODEL_PATH,
-                 n_ctx=N_CTX,
-                 n_threads=N_THREADS,
-                 n_gpu_layers=N_GPU_LAYERS,
-                 verbose=VERBOSE_LLAMA,
-             )
-             end_time = time.time()
-             print(f"Model loaded successfully in {end_time - start_time:.2f} seconds.")
-             return True
-         except Exception as e:
-             print(f"Error loading Llama model: {e}")
-             print(f"If on resource-constrained environment, model ({MODEL_FILENAME}, ~{os.path.getsize(LOCAL_MODEL_PATH)/(1024**3):.2f}GB if exists) might be too large.")
-             llm = None
-             return False
-     else:
-         print("Model already loaded.")
-         return True
-
- # --- Chat Function ---
- def predict(message, history, system_prompt, max_new_tokens, temperature, top_p, top_k, repeat_penalty):
-     if llm is None:
-         return "Model not loaded. Please check the logs."
-
-     # Common stop tokens for Qwen-like models
-     # Accessing global IM_START_TOKEN and IM_END_TOKEN
-     stop_tokens = [IM_END_TOKEN, IM_START_TOKEN + "user", IM_START_TOKEN + "system", llm.token_eos_str()] # Use string representation of EOS
-
-     messages_for_api = [] # Renamed to avoid conflict with Gradio's 'messages' type
-     if system_prompt and system_prompt.strip():
-         messages_for_api.append({"role": "system", "content": system_prompt.strip()})
-
-     # History for Gradio Chatbot with type="messages" is already in the correct format
-     # history will be a list of lists, where each inner list is [user_msg, ai_msg]
-     # or if type="messages", it's a list of dicts.
-     # Let's assume for now the input `history` from chatbot (when type="tuples")
-     # needs conversion if predict is called directly with such history.
-     # If chatbot type="messages", history is already List[Dict[str, str]]
-     # The `user_chat_fn` and `bot_response_fn` handle history in `messages` format for the chatbot.
-     # So, when `predict` is called by `bot_response_fn`, `history` is actually `history_for_predict`
-     # which is `chat_history[:-1]`. `chat_history` is a list of tuples.
-     # We need to convert this tuple-style history to OpenAI dict style for create_chat_completion.
-
-     # The history passed from `bot_response_fn` (history_for_predict) is list of [user, assistant] tuples
-     for human_msg, ai_msg in history: # history here is history_for_predict from bot_response_fn
-         messages_for_api.append({"role": "user", "content": human_msg})
-         if ai_msg is not None:
-             messages_for_api.append({"role": "assistant", "content": ai_msg})
-     messages_for_api.append({"role": "user", "content": message})
-
-
-     print("\n--- Input to Model ---")
-     print(f"System Prompt: {system_prompt if system_prompt and system_prompt.strip() else 'None'}")
-     print(f"History (tuples format for predict): {history}")
-     print(f"Current Message: {message}")
-     print(f"Formatted messages for create_chat_completion: {messages_for_api}")
-     print("--- End Input to Model ---\n")
-
-     assistant_response_text = ""
-     generation_start_time = time.time()
-
      try:
-         print("Attempting generation with llm.create_chat_completion()...")
-         response = llm.create_chat_completion(
-             messages=messages_for_api,
-             temperature=temperature,
-             top_p=top_p,
-             top_k=top_k,
-             repeat_penalty=repeat_penalty,
-             max_tokens=max_new_tokens,
-             stop=stop_tokens,
-         )
-         assistant_response_text = response['choices'][0]['message']['content'].strip()
-         print(f"create_chat_completion successful. Raw response: {response['choices'][0]['message']}")
-
-     except Exception as e_chat_completion:
-         print(f"Error during create_chat_completion: {e_chat_completion}")
-         print("Falling back to manual prompt construction and llm()...")
-
-         prompt = ""
-         if system_prompt and system_prompt.strip():
-             prompt += f"{IM_START_TOKEN}system\n{system_prompt.strip()}{IM_END_TOKEN}\n"
-
-         for human_msg, ai_msg in history: # history here is history_for_predict
-             prompt += f"{IM_START_TOKEN}user\n{human_msg}{IM_END_TOKEN}\n"
-             if ai_msg is not None:
-                 prompt += f"{IM_START_TOKEN}assistant\n{ai_msg}{IM_END_TOKEN}\n"
-         prompt += f"{IM_START_TOKEN}user\n{message}{IM_END_TOKEN}\n{IM_START_TOKEN}assistant\n"
-
-         print(f"Fallback prompt: {prompt}")
-
-         try:
-             output = llm(
                  prompt,
-                 max_tokens=max_new_tokens,
-                 temperature=temperature,
-                 top_p=top_p,
-                 top_k=top_k,
-                 repeat_penalty=repeat_penalty,
-                 stop=stop_tokens,
-                 echo=False
              )
-             assistant_response_text = output['choices'][0]['text'].strip()
-             print(f"Fallback llm() successful. Raw output: {output['choices'][0]['text']}")
-         except Exception as e_fallback:
-             print(f"Error during fallback llm() generation: {e_fallback}")
-             assistant_response_text = "Sorry, I encountered an error during generation. Please check the logs."
-
-     generation_end_time = time.time()
-     print(f"Generated response: {assistant_response_text}")
-     print(f"Generation took {generation_end_time - generation_start_time:.2f} seconds.")
-     return assistant_response_text
-
- # --- Gradio Interface ---
- def create_gradio_interface():
-     with gr.Blocks(theme=gr.themes.Soft()) as iface:
-         gr.Markdown(f"""
-         # Chat with {MODEL_REPO_ID.split('/')[-1]} ({MODEL_FILENAME})
-         This Space runs a GGUF quantized version of the model using `llama-cpp-python`.
-         Model: [{MODEL_REPO_ID}](https://huggingface.co/{MODEL_REPO_ID})
-         GGUF File: `{MODEL_FILENAME}` (Quantization: Q4_K_M)
-         """)
-
-         with gr.Row():
-             with gr.Column(scale=3):
-                 chatbot = gr.Chatbot(
-                     [],
-                     elem_id="chatbot",
-                     label="Chat Window",
-                     # bubble_full_width=False, # Deprecated
-                     height=500,
-                     type="messages" # Use OpenAI-style messages format
-                 )
-                 user_input = gr.Textbox(
-                     show_label=False,
-                     placeholder="Type your message here and press Enter...",
-                     container=False,
-                     scale=7,
-                 )
-
-             with gr.Column(scale=1):
-                 gr.Markdown("### Model Parameters")
-                 system_prompt_input = gr.Textbox(
-                     label="System Prompt (Optional)",
-                     placeholder="e.g., You are a helpful AI assistant.",
-                     lines=3
-                 )
-                 max_new_tokens_slider = gr.Slider(
-                     minimum=32, maximum=N_CTX, value=DEFAULT_MAX_NEW_TOKENS, step=32,
-                     label="Max New Tokens"
-                 )
-                 temperature_slider = gr.Slider(
-                     minimum=0.0, maximum=2.0, value=DEFAULT_TEMPERATURE, step=0.05,
-                     label="Temperature"
                  )
-                 top_p_slider = gr.Slider(
-                     minimum=0.0, maximum=1.0, value=DEFAULT_TOP_P, step=0.05,
-                     label="Top-P (Nucleus Sampling)"
                  )
-                 top_k_slider = gr.Slider(
-                     minimum=0, maximum=100, value=DEFAULT_TOP_K, step=1,
-                     label="Top-K Sampling"
                  )
-                 repeat_penalty_slider = gr.Slider(
-                     minimum=1.0, maximum=2.0, value=DEFAULT_REPEAT_PENALTY, step=0.05,
-                     label="Repeat Penalty"
                  )
-                 status_display = gr.Textbox(label="Status", interactive=False, visible=False)
-
-
-         def user_chat_fn(user_message, chat_history_messages, sys_prompt, max_tok, temp, top_p_val, top_k_val, rep_pen):
-             if not user_message.strip(): # Do nothing if user message is empty
-                 return "", chat_history_messages, sys_prompt, max_tok, temp, top_p_val, top_k_val, rep_pen
-
-             if llm is None:
-                 chat_history_messages.append({"role": "user", "content": user_message})
-                 chat_history_messages.append({"role": "assistant", "content": "ERROR: Model not loaded. Check server logs."})
-                 return "", chat_history_messages, sys_prompt, max_tok, temp, top_p_val, top_k_val, rep_pen
-
-             chat_history_messages.append({"role": "user", "content": user_message})
-             # Add a placeholder for assistant message that bot_response_fn will fill
-             chat_history_messages.append({"role": "assistant", "content": None})
-             return "", chat_history_messages, sys_prompt, max_tok, temp, top_p_val, top_k_val, rep_pen
-
-         def bot_response_fn(chat_history_messages, sys_prompt, max_tok, temp, top_p_val, top_k_val, rep_pen):
-             if llm is None or chat_history_messages[-1]["content"] is not None: # If model not loaded or already processed
-                 return chat_history_messages
-
-             user_message = chat_history_messages[-2]["content"] # Get the last user message
-
-             # Convert OpenAI-style message history (List[Dict]) to tuple-style for predict's current internal logic
-             history_for_predict_tuples = []
-             # Iterate up to the second to last message (the current user's message)
-             # Each pair of (user, assistant) forms one turn for the tuple history
-             i = 0
-             temp_history = chat_history_messages[:-2] # Exclude current user and assistant placeholder
-
-             # Skip system prompt if present at the beginning for tuple conversion
-             start_index = 0
-             if temp_history and temp_history[0]["role"] == "system":
-                 start_index = 1 # System prompt handled separately in predict
-
-             for i in range(start_index, len(temp_history), 2):
-                 if i + 1 < len(temp_history) and temp_history[i]["role"] == "user" and temp_history[i+1]["role"] == "assistant":
-                     history_for_predict_tuples.append(
-                         (temp_history[i]["content"], temp_history[i+1]["content"])
-                     )
-                 elif temp_history[i]["role"] == "user": # Handle case where last turn was only a user message (shouldn't happen if paired)
-                     history_for_predict_tuples.append((temp_history[i]["content"], None))
-
-
-             bot_msg_content = predict(user_message, history_for_predict_tuples, sys_prompt, max_tok, temp, top_p_val, top_k_val, rep_pen)
-             chat_history_messages[-1]["content"] = bot_msg_content # Update the assistant's placeholder message
-             return chat_history_messages
-
-         user_input.submit(
-             user_chat_fn,
-             [user_input, chatbot, system_prompt_input, max_new_tokens_slider, temperature_slider, top_p_slider, top_k_slider, repeat_penalty_slider],
-             [user_input, chatbot, system_prompt_input, max_new_tokens_slider, temperature_slider, top_p_slider, top_k_slider, repeat_penalty_slider],
-             queue=False
-         ).then(
-             bot_response_fn,
-             [chatbot, system_prompt_input, max_new_tokens_slider, temperature_slider, top_p_slider, top_k_slider, repeat_penalty_slider],
-             [chatbot],
-             queue=True
-         )
-
-         gr.Examples(
-             examples=[
-                 ["Hello, how are you today?", "You are a friendly and helpful AI assistant specializing in concise answers."],
-                 ["What is the capital of France?", "Be very brief."],
-                 ["Write a short poem about a robot learning to dream.", ""],
-                 ["Explain the concept of black holes to a 5-year-old.", "Keep it simple and use an analogy."]
-             ],
-             inputs=[user_input, system_prompt_input],
-         )
-
-         with gr.Accordion("Advanced/Debug Info", open=False):
-             # Accessing global IM_START_TOKEN and IM_END_TOKEN
-             gr.Markdown(f"""
-             - **Model File:** `{LOCAL_MODEL_PATH}`
-             - **N_CTX:** `{N_CTX}`
-             - **N_THREADS:** `{N_THREADS if N_THREADS is not None else 'Auto'}`
-             - **N_GPU_LAYERS:** `{N_GPU_LAYERS}`
-             - **Log Verbosity (llama.cpp):** `{VERBOSE_LLAMA}`
-             - **Stop Tokens Used (Conceptual):** `{IM_START_TOKEN}system`, `{IM_START_TOKEN}user`, `{IM_END_TOKEN}`, `EOS_TOKEN`
-             """)
-             reload_button = gr.Button("Attempt to Reload Model")
-             reload_status = gr.Label(value="Model Status: Unknown")
-
-             def update_reload_status():
-                 if llm:
-                     return "Model Status: Loaded Successfully"
-                 else:
-                     return "Model Status: Not Loaded (Check logs for errors)"
-
-             def attempt_reload():
-                 global llm
-                 if llm is not None:
-                     try:
-                         # Attempt to free existing model if Llama.cpp supports it or by reassigning
-                         print("Freeing existing model instance (if any)...")
-                         del llm # Explicitly delete to trigger __del__ if possible
-                         llm = None
-                         import gc
-                         gc.collect() # Suggest garbage collection
-                     except Exception as e_del:
-                         print(f"Error during manual deletion of llm: {e_del}")
-
-                 if load_llm_model():
-                     return "Model reloaded successfully!"
-                 else:
-                     return "Model reload FAILED. Check server logs."
-
-             reload_button.click(attempt_reload, outputs=[reload_status])
-         iface.load(update_reload_status, outputs=[reload_status]) # Update status on interface load
-     return iface

- # --- Main Execution ---
  if __name__ == "__main__":
-     print("Starting application...")
-     model_available = download_model_if_needed()
-
-     if model_available:
-         if not load_llm_model():
-             print("Initial model loading failed. Gradio will start; use UI to attempt reload.")
-         else:
-             print("Model ready.")
      else:
-         print("Model download failed. Cannot load model. Gradio will start; chat will be non-functional.")
-
-     print("Creating Gradio interface...")
-     app_interface = create_gradio_interface()
-
-     print("Launching Gradio interface...")
-     app_interface.launch()
-     print("Gradio interface launched.")
  import gradio as gr
+ import torch
+ from diffusers import StableDiffusionPipeline
+ from PIL import Image
+
+ # --- Global configuration and model loading ---
+ MODEL_ID = "runwayml/stable-diffusion-v1-5"
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+ print(f"Using device: {DEVICE}")
+
+ # Load the model only once when the app starts
+ # For GPU: torch_dtype=torch.float16 saves VRAM and is faster
+ # For CPU: torch_dtype=torch.float32 (float16 is not well supported on CPU)
+ dtype = torch.float16 if DEVICE == "cuda" else torch.float32
+
+ print(f"Loading model '{MODEL_ID}'... This can take a few minutes.")
+ try:
+     pipe = StableDiffusionPipeline.from_pretrained(MODEL_ID, torch_dtype=dtype)
+     pipe = pipe.to(DEVICE)
+     print("Model loaded successfully!")
+ except Exception as e:
+     print(f"Error loading the model: {e}")
+     print("Make sure you have an internet connection and that the model name is correct.")
+     print("If you have little VRAM, try a smaller model or memory-optimization settings.")
+     pipe = None # Signals that the model could not be loaded
+
+ # --- Image generation function ---
+ def generate_image(
+     prompt: str,
+     negative_prompt: str = "",
+     num_inference_steps: int = 50,
+     guidance_scale: float = 7.5,
+     height: int = 512,
+     width: int = 512,
+     seed: int = -1 # -1 for a random seed
+ ) -> Image.Image:
+     """
+     Generates an image based on the prompt and the other parameters.
+     """
+     if pipe is None:
+         raise gr.Error("The model could not be loaded. Please check the console output.")
+
+     print(f"Generating image for prompt: '{prompt}'")
+     print(f" Negative prompt: '{negative_prompt}'")
+     print(f" Steps: {num_inference_steps}, Guidance: {guidance_scale}")
+     print(f" Dimensions: {width}x{height}, Seed: {seed}")
+
+     # Seed handling
+     generator = None
+     if seed != -1:
+         generator = torch.Generator(device=DEVICE).manual_seed(seed)
+
+     # Generate the image
+     # safety_checker=None can be used to disable the NSFW filter,
+     # but be aware of the implications. It is enabled by default.
      try:
+         with torch.inference_mode(): # Important for lower memory use during inference
+             result = pipe(
                  prompt,
+                 negative_prompt=negative_prompt if negative_prompt else None,
+                 num_inference_steps=int(num_inference_steps),
+                 guidance_scale=guidance_scale,
+                 height=int(height),
+                 width=int(width),
+                 generator=generator
              )
+         image = result.images[0]
+         print("Image generated successfully.")
+         return image
+     except Exception as e:
+         print(f"Error during image generation: {e}")
+         # Try to give a more specific message for OOM (out of memory) errors
+         if "CUDA out of memory" in str(e):
+             raise gr.Error(
+                 "CUDA out of memory. Try reducing the image size, "
+                 "using fewer inference steps, or loading a smaller model."
+             )
+         raise gr.Error(f"Error during image generation: {e}")
+
+
+ # --- Gradio interface definition ---
+ with gr.Blocks(theme=gr.themes.Soft()) as app:
+     gr.Markdown(
+         """
+         # 🖼️ Image Generator with Stable Diffusion
+         Enter a text prompt to generate an image.
+         Loading the model on the first start can take a few minutes.
+         """
+     )
+
+     with gr.Row():
+         with gr.Column(scale=2):
+             prompt_input = gr.Textbox(
+                 label="Prompt",
+                 placeholder="e.g. A photorealistic picture of a cat wearing a hat",
+                 lines=3
+             )
+             negative_prompt_input = gr.Textbox(
+                 label="Negative prompt (what should be avoided)",
+                 placeholder="e.g. badly drawn, blurry, text, watermark",
+                 lines=2
+             )
+             with gr.Row():
+                 steps_slider = gr.Slider(
+                     minimum=10, maximum=150, value=50, step=1, label="Inference steps"
                  )
+                 guidance_slider = gr.Slider(
+                     minimum=1, maximum=20, value=7.5, step=0.1, label="Guidance scale (CFG)"
                  )
+             with gr.Row():
+                 height_slider = gr.Slider(
+                     minimum=256, maximum=1024, value=512, step=64, label="Height"
                  )
+                 width_slider = gr.Slider(
+                     minimum=256, maximum=1024, value=512, step=64, label="Width"
                  )
+             seed_input = gr.Number(
+                 label="Seed (-1 for random)", value=-1, precision=0
+             )
+             generate_button = gr.Button("Generate image", variant="primary")
+
+         with gr.Column(scale=1):
+             image_output = gr.Image(label="Generated image", type="pil")
+             gr.Markdown("### Example prompts:")
+             gr.Examples(
+                 examples=[
+                     ["An astronaut riding a horse on the moon, digital art", "", 50, 7.5, 512, 512, -1],
+                     ["An impressionist painting of a sunset over a lavender field", "people, buildings", 40, 8.0, 512, 768, -1],
+                     ["A cute corgi dog as a pixel-art character", "photorealistic", 30, 7.0, 512, 512, 12345],
+                     ["A surreal landscape with floating islands and waterfalls of light", "dark, gloomy", 60, 9.0, 768, 512, -1],
+                 ],
+                 inputs=[prompt_input, negative_prompt_input, steps_slider, guidance_slider, height_slider, width_slider, seed_input],
+                 outputs=image_output,
+                 fn=generate_image, # The function that runs when an example is clicked
+                 cache_examples=False # Or True if you want to cache the results
+             )

+     generate_button.click(
+         fn=generate_image,
+         inputs=[
+             prompt_input,
+             negative_prompt_input,
+             steps_slider,
+             guidance_slider,
+             height_slider,
+             width_slider,
+             seed_input
+         ],
+         outputs=image_output,
+         api_name="generate_image" # For API access
+     )
+
+ # --- Launch the app ---
  if __name__ == "__main__":
+     if pipe is None:
+         print("The model could not be loaded. The Gradio app will not be started.")
+         print("Please fix the errors and try again.")
      else:
+         app.launch(share=False) # Set share=True to get a public link
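
Note: the new click handler registers the endpoint api_name="generate_image". As a rough usage sketch (not part of this commit), that endpoint could be called with the gradio_client package; the URL below is a placeholder for wherever the app is running, and the argument values are just example inputs.

from gradio_client import Client

# Placeholder URL: point this at the running app (e.g. a local launch or the deployed Space).
client = Client("http://localhost:7860")

result = client.predict(
    "An astronaut riding a horse on the moon, digital art",  # prompt
    "",      # negative_prompt
    30,      # num_inference_steps
    7.5,     # guidance_scale
    512,     # height
    512,     # width
    -1,      # seed (-1 = random)
    api_name="/generate_image",
)
print(result)  # path to the generated image file returned by the endpoint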