cella110n committed on
Commit 87e05c0 · verified · 1 Parent(s): d9eba7a

Upload app.py

Files changed (1)
  1. app.py +118 -135
app.py CHANGED
@@ -1,6 +1,5 @@
  import gradio as gr
- # import spaces # Removed
- import onnxruntime as ort
+ import spaces
  import numpy as np
  from PIL import Image, ImageDraw, ImageFont
  import json
@@ -13,7 +12,10 @@ from huggingface_hub import hf_hub_download
  from dataclasses import dataclass
  from typing import List, Dict, Optional, Tuple
  import time
- import spaces
+
+ import torch
+ import timm
+ from safetensors.torch import load_file as safe_load_file

  # Set the Matplotlib backend to Agg (for headless environments)
  matplotlib.use('Agg')
@@ -289,169 +291,165 @@ def visualize_predictions(image: Image.Image, predictions, threshold=0.45):

  # Constants
  REPO_ID = "cella110n/cl_tagger"
- # MODEL_FILENAME = "cl_eva02_tagger_v1_250426/model_optimized.onnx"
- MODEL_FILENAME = "cl_eva02_tagger_v1_250426/model.onnx" # Use non-optimized if needed
- TAG_MAPPING_FILENAME = "cl_eva02_tagger_v1_250426/tag_mapping.json"
+ SAFETENSORS_FILENAME = "lora_model_0426/checkpoint_epoch_4.safetensors"
+ METADATA_FILENAME = "lora_model_0426/checkpoint_epoch_4_metadata.json"
+ TAG_MAPPING_FILENAME = "lora_model_0426/tag_mapping.json"
  CACHE_DIR = "./model_cache"

- # Global variables (cache the model and labels)
- # onnx_session = None # Removed global session
- model_path_global = None # Store model path globally
+ safetensors_path_global = None
+ metadata_path_global = None
+ tag_mapping_path_global = None
  labels_data = None
  tag_to_category_map = None

  def download_model_files():
-     """Download the model and tag mapping from the Hugging Face Hub"""
+     """Download the model, metadata, and tag mapping from the Hugging Face Hub"""
+     global safetensors_path_global, metadata_path_global, tag_mapping_path_global
+     # Check if files seem to be downloaded already
+     if safetensors_path_global and tag_mapping_path_global and os.path.exists(safetensors_path_global) and os.path.exists(tag_mapping_path_global):
+         print("Files seem already downloaded.")
+         return
+
      print("Downloading model files...")
-     # Get the HF token from the environment (for private repositories)
      hf_token = os.environ.get("HF_TOKEN")
      try:
-         model_path = hf_hub_download(repo_id=REPO_ID, filename=MODEL_FILENAME, cache_dir=CACHE_DIR, token=hf_token)
-         tag_mapping_path = hf_hub_download(repo_id=REPO_ID, filename=TAG_MAPPING_FILENAME, cache_dir=CACHE_DIR, token=hf_token)
-         print(f"Model downloaded to: {model_path}")
-         print(f"Tag mapping downloaded to: {tag_mapping_path}")
-         return model_path, tag_mapping_path
+         safetensors_path_global = hf_hub_download(repo_id=REPO_ID, filename=SAFETENSORS_FILENAME, cache_dir=CACHE_DIR, token=hf_token, force_download=True) # Force download to ensure latest
+         tag_mapping_path_global = hf_hub_download(repo_id=REPO_ID, filename=TAG_MAPPING_FILENAME, cache_dir=CACHE_DIR, token=hf_token, force_download=True)
+         print(f"Safetensors downloaded to: {safetensors_path_global}")
+         print(f"Tag mapping downloaded to: {tag_mapping_path_global}")
+         try:
+             metadata_path_global = hf_hub_download(repo_id=REPO_ID, filename=METADATA_FILENAME, cache_dir=CACHE_DIR, token=hf_token, force_download=True)
+             print(f"Metadata downloaded to: {metadata_path_global}")
+         except Exception:
+             print(f"Metadata file ({METADATA_FILENAME}) not found or download failed. Proceeding without it.")
+             metadata_path_global = None
      except Exception as e:
          print(f"Error downloading files: {e}")
-         # Improve the error message when the token is missing
          if "401 Client Error" in str(e) or "Repository not found" in str(e):
-             raise gr.Error(f"Could not download files from {REPO_ID}. "
-                            f"If this is a private repository, make sure to set the HF_TOKEN secret in your Space settings.")
+             raise gr.Error(f"Could not download files from {REPO_ID}. Check HF_TOKEN secret.")
          else:
              raise gr.Error(f"Error downloading files: {e}")

-
- def initialize_model():
-     """Prepare the model file and label data (cached)"""
-     global model_path_global, labels_data, tag_to_category_map
-     # Only initialize once
+ def initialize_labels_and_paths():
+     """Prepare the label data and file paths (cached)"""
+     global labels_data, tag_to_category_map, tag_mapping_path_global
      if labels_data is None:
-         print("Downloading model files...") # Moved print here
-         model_path, tag_mapping_path = download_model_files()
-         model_path_global = model_path # Store the path
-         print("Loading labels...")
-         labels_data, _, tag_to_category_map = load_tag_mapping(tag_mapping_path)
-         print("Labels loaded.")
-         # --- Removed ONNX Session Initialization ---
+         download_model_files() # Ensure files are downloaded
+         print("Loading labels from tag_mapping.json...")
+         if tag_mapping_path_global and os.path.exists(tag_mapping_path_global):
+             try:
+                 labels_data, _, tag_to_category_map = load_tag_mapping(tag_mapping_path_global)
+                 print(f"Labels loaded successfully. Number of labels: {len(labels_data.names)}")
+             except Exception as e:
+                 print(f"Error loading tag mapping from {tag_mapping_path_global}: {e}")
+                 raise gr.Error(f"Error loading tag mapping file: {e}")
+         else:
+             print(f"Tag mapping file not found after download attempt: {tag_mapping_path_global}")
+             raise gr.Error("Tag mapping file could not be downloaded or found.")

  @spaces.GPU()
  def predict(image_input, gen_threshold, char_threshold, output_mode):
      print("--- predict function started (GPU worker) ---")
-     """Prediction function for the Gradio interface (inside the GPU worker)"""
-     initialize_model() # Ensure files/labels are ready
-
-     # --- Create ONNX session inside the GPU function ---
-     print("Creating ONNX session for prediction...")
-     global model_path_global # Access the global model path
-     if model_path_global is None:
-         # Attempt initialization again if model path is missing (e.g., after restart)
-         initialize_model()
-         if model_path_global is None:
-             return "Error: Model path could not be initialized.", None
-
-     available_providers = ort.get_available_providers()
-     print(f"(Worker) Available ONNX Runtime providers: {available_providers}")
-     providers = []
-     if 'CUDAExecutionProvider' in available_providers:
-         providers.append('CUDAExecutionProvider')
-     providers.append('CPUExecutionProvider') # Always include CPU as fallback
-
+     initialize_labels_and_paths()
+     print("Loading PyTorch model...")
+     global safetensors_path_global, labels_data
+     if safetensors_path_global is None or labels_data is None:
+         initialize_labels_and_paths()
+         if safetensors_path_global is None or labels_data is None:
+             return "Error: Model/Labels paths could not be initialized.", None
      try:
-         # Create session with GPU preference inside the worker
-         session = ort.InferenceSession(model_path_global, providers=providers)
-         print(f"(Worker) Using ONNX Runtime provider: {session.get_providers()[0]}")
+         print(f"Creating base model: eva02_large_patch14_448.mim_m38m_ft_in1k")
+         num_classes = len(labels_data.names)
+         # Validate num_classes (should be > 0)
+         if num_classes <= 0:
+             raise ValueError(f"Invalid number of classes loaded from tag mapping: {num_classes}")
+         print(f"Setting num_classes: {num_classes}")
+         model = timm.create_model(
+             'eva02_large_patch14_448.mim_m38m_ft_in1k',
+             pretrained=True,
+             num_classes=num_classes
+         )
+         print(f"Loading state dict from: {safetensors_path_global}")
+         if not os.path.exists(safetensors_path_global):
+             raise FileNotFoundError(f"Safetensors file not found at: {safetensors_path_global}")
+         state_dict = safe_load_file(safetensors_path_global)
+         adapted_state_dict = {}
+         for k, v in state_dict.items():
+             # Adjust key names if needed based on how lora.py saved the merged model
+             # Example: If saved with 'base_model.' prefix
+             # if k.startswith('base_model.'):
+             #     adapted_state_dict[k[len('base_model.'):]] = v
+             # else:
+             adapted_state_dict[k] = v # Assuming direct key match for now
+
+         missing_keys, unexpected_keys = model.load_state_dict(adapted_state_dict, strict=False)
+         print(f"State dict loaded. Missing keys: {missing_keys}")
+         print(f"State dict loaded. Unexpected keys: {unexpected_keys}")
+         # Handle critical missing keys (like the head) if necessary
+         if any(k.startswith('head.') for k in missing_keys):
+             print("Warning: Classification head weights might be missing or mismatched!")
+         # if unexpected_keys:
+         #     print("Warning: Unexpected keys found in state_dict.")
+
+         device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+         print(f"Moving model to device: {device}")
+         model.to(device)
+         model.eval()
+         print("Model loaded and moved to device.")
      except Exception as e:
-         print(f"(Worker) Error initializing ONNX session with providers {providers}: {e}")
-         # Fallback explicitly to CPU if GPU fails inside worker
-         try:
-             print("(Worker) Falling back to CPUExecutionProvider only.")
-             session = ort.InferenceSession(model_path_global, providers=['CPUExecutionProvider'])
-         except Exception as e_cpu:
-             print(f"(Worker) Error initializing ONNX session even with CPU: {e_cpu}")
-             return f"Error initializing ONNX session: {e_cpu}", None
-     # --- Session created ---
+         print(f"(Worker) Error loading PyTorch model: {e}")
+         import traceback
+         print(traceback.format_exc())
+         return f"Error loading PyTorch model: {e}", None

      if image_input is None:
          return "Please upload an image.", None
-
      print(f"(Worker) Processing image with thresholds: gen={gen_threshold}, char={char_threshold}")
-
-     # Make sure the input is a PIL Image object
      if not isinstance(image_input, Image.Image):
          try:
-             # If it is a URL
              if isinstance(image_input, str) and image_input.startswith("http"):
-                 response = requests.get(image_input)
-                 response.raise_for_status()
+                 response = requests.get(image_input); response.raise_for_status()
                  image = Image.open(io.BytesIO(response.content))
-             # If it is a file path (does not normally happen with Gradio, but just in case)
              elif isinstance(image_input, str) and os.path.exists(image_input):
                  image = Image.open(image_input)
-             # If it is a numpy array (input from the Gradio Image component)
              elif isinstance(image_input, np.ndarray):
                  image = Image.fromarray(image_input)
-             else:
-                 raise ValueError("Unsupported image input type")
+             else: raise ValueError("Unsupported image input type")
          except Exception as e:
-             print(f"(Worker) Error loading image: {e}")
-             return f"Error loading image: {e}", None
-     else:
-         image = image_input
-
-     # Preprocessing
-     original_pil_image, input_data = preprocess_image(image)
+             print(f"(Worker) Error loading image: {e}"); return f"Error loading image: {e}", None
+     else: image = image_input

-     # Match the data type to what the model expects (normally float32)
-     input_name = session.get_inputs()[0].name
-     expected_type = session.get_inputs()[0].type
-     if expected_type == 'tensor(float16)':
-         input_data = input_data.astype(np.float16)
-     else:
-         input_data = input_data.astype(np.float32) # Default to float32
-
-     # Inference (using the created session)
-     start_time = time.time()
-     outputs = session.run(None, {input_name: input_data})[0]
-     inference_time = time.time() - start_time
-     print(f"(Worker) Inference completed in {inference_time:.3f} seconds")
-
-     # Convert to probabilities with the sigmoid function
-     probs = 1 / (1 + np.exp(-outputs[0])) # Apply sigmoid to the first batch item
+     original_pil_image, input_tensor = preprocess_image(image)
+     input_tensor = input_tensor.to(device)
+     try:
+         print("(Worker) Running inference...")
+         start_time = time.time()
+         with torch.no_grad(): outputs = model(input_tensor)
+         inference_time = time.time() - start_time
+         print(f"(Worker) Inference completed in {inference_time:.3f} seconds")
+         probs = torch.sigmoid(outputs)[0].cpu().numpy()
+     except Exception as e:
+         print(f"(Worker) Error during PyTorch inference: {e}"); import traceback; print(traceback.format_exc()); return f"Error during inference: {e}", None

-     # Get the tags
      predictions = get_tags(probs, labels_data, gen_threshold, char_threshold)
-
-     # Format the tags
      output_tags = []
-     # Add Rating and Quality first
-     if predictions["rating"]:
-         output_tags.append(predictions["rating"][0][0].replace("_", " "))
-     if predictions["quality"]:
-         output_tags.append(predictions["quality"][0][0].replace("_", " "))
-
-     # Add the remaining categories in alphabetical order (optional)
+     if predictions.get("rating"): output_tags.append(predictions["rating"][0][0].replace("_", " "))
+     if predictions.get("quality"): output_tags.append(predictions["quality"][0][0].replace("_", " "))
      for category in ["artist", "character", "copyright", "general", "meta"]:
-         tags = [tag.replace("_", " ") for tag, prob in predictions[category]
-                 if not (category == "meta" and any(p in tag.lower() for p in ['id', 'commentary','mismatch']))] # Meta tag filtering
+         tags = [tag.replace("_", " ") for tag, prob in predictions.get(category, [])
+                 if not (category == "meta" and any(p in tag.lower() for p in ['id', 'commentary','mismatch']))]
          output_tags.extend(tags)
-
      output_text = ", ".join(output_tags)

-     if output_mode == "Tags Only":
-         return output_text, None
-     else: # Visualization
-         viz_image = visualize_predictions(original_pil_image, predictions, gen_threshold)
-         return output_text, viz_image
+     if output_mode == "Tags Only": return output_text, None
+     else: viz_image = visualize_predictions(original_pil_image, predictions, gen_threshold); return output_text, viz_image

  # --- Gradio Interface Definition ---
-
- # CSS for styling
  css = """
  .gradio-container { font-family: 'IBM Plex Sans', sans-serif; }
  footer { display: none !important; }
  .gr-prose { max-width: 100% !important; }
  """
- # Custom JS for image pasting and URL handling
  js = """
  async function paste_image(blob, gen_thresh, char_thresh, out_mode) {
      const data = await fetch(blob)
@@ -530,17 +528,14 @@ document.addEventListener('paste', paste_update);
  """

  with gr.Blocks(css=css, js=js) as demo:
-     gr.Markdown("# WD EVA02 LoRA ONNX Tagger")
-     gr.Markdown("Upload an image or paste an image URL to predict tags using the fine-tuned WD EVA02 Tagger model (ONNX format).")
+     gr.Markdown("# WD EVA02 LoRA PyTorch Tagger")
+     gr.Markdown("Upload an image or paste an image URL to predict tags using the fine-tuned WD EVA02 Tagger model (PyTorch/Safetensors).")
      gr.Markdown(f"Model Repository: [{REPO_ID}](https://huggingface.co/{REPO_ID})")

      with gr.Row():
          with gr.Column(scale=1):
-             # Use elem_id for JS targeting
              image_input = gr.Image(type="pil", label="Input Image", elem_id="input-image")
-             # Container for URL paste message
              gr.HTML("<div id='url-input-container'></div>")
-
              gen_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.05, value=0.55, label="General Tag Threshold")
              char_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.05, value=0.60, label="Character/Copyright/Artist Tag Threshold")
              output_mode = gr.Radio(choices=["Tags Only", "Tags + Visualization"], value="Tags + Visualization", label="Output Mode")
@@ -550,7 +545,6 @@ with gr.Blocks(css=css, js=js) as demo:
              output_tags = gr.Textbox(label="Predicted Tags", lines=10)
              output_visualization = gr.Image(type="pil", label="Prediction Visualization")

-     # Examples
      gr.Examples(
          examples=[
              ["https://pbs.twimg.com/media/GXBXsRvbQAAg1kp.jpg", 0.55, 0.5, "Tags + Visualization"],
@@ -561,7 +555,7 @@ with gr.Blocks(css=css, js=js) as demo:
          inputs=[image_input, gen_threshold, char_threshold, output_mode],
          outputs=[output_tags, output_visualization],
          fn=predict,
-         cache_examples=False # Slows down startup if True and large examples
+         cache_examples=False
      )

      predict_button.click(
@@ -570,18 +564,7 @@ with gr.Blocks(css=css, js=js) as demo:
          outputs=[output_tags, output_visualization]
      )

-     # Add listener for image input changes (e.g., from pasting)
-     # This might trigger prediction automatically or require the button click
-     # image_input.change(
-     #     fn=predict,
-     #     inputs=[image_input, gen_threshold, char_threshold, output_mode],
-     #     outputs=[output_tags, output_visualization]
-     # )
-
-
  if __name__ == "__main__":
-     # Warn when the HF_TOKEN environment variable is not set (for private repositories)
      if not os.environ.get("HF_TOKEN"):
-         print("Warning: HF_TOKEN environment variable not set. Downloads from private repositories may fail.")
-     # initialize_model() # Removed startup initialization (model loaded in predict)
+         print("Warning: HF_TOKEN environment variable not set.")
      demo.launch(share=True)
 
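For reference, the new PyTorch inference path introduced in this diff can be exercised outside the Space with a minimal sketch along the following lines. It assumes the checkpoint keys match the timm module names directly (as the `strict=False` load above suggests) and that the classifier weights are stored under `head.weight`; the Space's own `preprocess_image` helper and the `tag_mapping.json` schema are not shown here, so timm's default eval transform and raw class indices are used instead, and `example.jpg` is a hypothetical local test image.

```python
import torch
import timm
from PIL import Image
from huggingface_hub import hf_hub_download
from safetensors.torch import load_file as safe_load_file

REPO_ID = "cella110n/cl_tagger"
SAFETENSORS_FILENAME = "lora_model_0426/checkpoint_epoch_4.safetensors"

# Download the fine-tuned checkpoint (set HF_TOKEN in the environment if the repo is private).
ckpt_path = hf_hub_download(repo_id=REPO_ID, filename=SAFETENSORS_FILENAME)
state_dict = safe_load_file(ckpt_path)

# Derive the number of classes from the classifier weights; "head.weight" is the usual
# timm key for the EVA02 classification head, but this is an assumption about the checkpoint.
num_classes = state_dict["head.weight"].shape[0]

model = timm.create_model(
    "eva02_large_patch14_448.mim_m38m_ft_in1k",
    pretrained=False,  # the checkpoint is expected to supply all weights
    num_classes=num_classes,
)
missing, unexpected = model.load_state_dict(state_dict, strict=False)
print("missing:", missing, "unexpected:", unexpected)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device).eval()

# Use timm's own eval transform for the 448x448 input; the Space's preprocess_image
# helper may differ (e.g. padding to square instead of cropping).
data_cfg = timm.data.resolve_model_data_config(model)
transform = timm.data.create_transform(**data_cfg, is_training=False)

image = Image.open("example.jpg").convert("RGB")  # hypothetical local test image
batch = transform(image).unsqueeze(0).to(device)

with torch.no_grad():
    probs = torch.sigmoid(model(batch))[0].cpu()  # multi-label head: sigmoid, not softmax

# Print the top indices; mapping indices to tag names requires tag_mapping.json.
for idx in probs.argsort(descending=True)[:15]:
    print(int(idx), round(float(probs[idx]), 3))
```

If the printed `missing`/`unexpected` lists are non-empty, the key-adaptation step in the diff (the `adapted_state_dict` loop) is the place to adjust.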