Spaces: Running on Zero

Update app.py

app.py CHANGED
Removed from the old app.py (several removed lines are cut off in the source view and survive only as fragments):

-import sys
-import subprocess
-import os
-
-# First, try to install all dependencies
-packages_to_install = [
-    "gradio",
-    "torch",
-    "transformers",
-    "accelerate",
-    "einops",
-    "timm",
-    "av",
-    "opencv-python-headless"  # Using headless version for better compatibility
-]
-
-for package in packages_to_install:
-    print(f"Installing {package}...")
-    try:
-        subprocess.check_call([sys.executable, "-m", "pip", "install", package])
-        print(f"Successfully installed {package}")
-    except Exception as e:
-        print(f"Error installing {package}: {e}")
-
-# Now proceed with the actual application
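
With the installer gone, the Space needs its dependencies declared up front; Spaces installs whatever is listed in requirements.txt at build time. A minimal requirements.txt sketch covering the same packages (the file itself is an assumption, not part of this commit):

gradio
torch
transformers
accelerate
einops
timm
av
opencv-python-headless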

-import spaces

-# ---
-MODEL_ID = "naver-hyperclovax/HyperCLOVAX-SEED-

-# ---
-print("---
-print(f"
-#

-# Set device_map according to the available hardware
-device_map = "auto" if torch.cuda.is_available() else "cpu"
-dtype = torch.float16 if torch.cuda.is_available() else torch.float32

(old model_kwargs entries, truncated:)
-"
-"device_map":
-"

(only fragments survive of the lines removed from the loading, system-prompt, and warm-up sections: truncated status prints such as -print("--- and -print(f"!!!, the old -stop_token_strings = [" list, the old system-prompt f-strings, and the old warm-up input construction under the comment "# Build the input in a format suited to the model"; their replacements appear in full in the new file below)

Removed from the inference and interface sections:

-@spaces.GPU()

-history_text = ""
-if isinstance(history, list):
-    for turn in history:
-        if isinstance(turn, tuple) and len(turn) == 2:
-            history_text += f"Human: {turn[0]}\nAssistant: {turn[1]}\n"

-"repetition_penalty": 1.1

(the old prompt-assembly and tokenization code, the entire old Gradio interface definition, and the remaining status prints are blank or truncated in the source view; their replacements appear in full in the new file below)
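
One side effect of the removals above: import spaces and the @spaces.GPU() decorator are gone even though this Space runs on ZeroGPU, where GPU work normally happens inside a function wrapped with that decorator; without the spaces package imported, torch.cuda.is_available() typically reports False in the main process, so the device probe in the new file falls back to CPU. A minimal sketch of the usual ZeroGPU pattern, for comparison (not part of this commit):

import spaces

@spaces.GPU  # ZeroGPU attaches a GPU to the process for the duration of each decorated call
def predict(message, history):
    ...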

The new app.py, as shown in the diff (unchanged lines that fall outside the diff context are marked where they are elided):

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import gc
import os
import datetime
import time

# --- Configuration ---
MODEL_ID = "naver-hyperclovax/HyperCLOVAX-SEED-Text-Instruct-0.5B"
MAX_NEW_TOKENS = 512
USE_GPU = True  # Enable GPU usage

# Hugging Face token setup: read it from an environment variable
HF_TOKEN = os.getenv("HF_TOKEN")
if not HF_TOKEN:
    print("경고: HF_TOKEN 환경 변수가 설정되지 않았습니다. 비공개 모델에 접근할 수 없을 수 있습니다.")

# --- Environment setup ---
print("--- Environment Setup ---")
device = torch.device("cuda" if torch.cuda.is_available() and USE_GPU else "cpu")
print(f"PyTorch version: {torch.__version__}")
print(f"Running on device: {device}")
print(f"Torch Threads: {torch.get_num_threads()}")
print(f"HF_TOKEN 설정 여부: {'있음' if HF_TOKEN else '없음'}")

# Custom CSS for improved UI
custom_css = """
.gradio-container {
    max-width: 850px !important;
    margin: auto;
}
.gr-chat {
    border-radius: 10px;
    box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
}
.user-message {
    background-color: #f0f7ff !important;
    border-radius: 8px;
}
.assistant-message {
    background-color: #f9f9f9 !important;
    border-radius: 8px;
}
.gr-button.primary-button {
    background-color: #1f4e79 !important;
}
.gr-form {
    padding: 20px;
    border-radius: 10px;
    box-shadow: 0 2px 6px rgba(0, 0, 0, 0.05);
}
#intro-message {
    text-align: center;
    margin-bottom: 20px;
    padding: 15px;
    background: linear-gradient(135deg, #e8f4ff 0%, #f0f7ff 100%);
    border-radius: 10px;
    border-left: 4px solid #1f4e79;
}
.footer {
    text-align: center;
    margin-top: 20px;
    font-size: 0.8em;
    color: #666;
}
"""

# --- Model and Tokenizer Loading ---
print(f"--- Loading Model: {MODEL_ID} ---")
print("This might take a few minutes, especially on the first launch...")

model = None
tokenizer = None
load_successful = False
stop_token_ids_list = []  # Initialize stop_token_ids_list

try:
    start_load_time = time.time()

    # Load the tokenizer
    tokenizer_kwargs = {
        # … (the dict entries, new lines 83-85, fall outside the diff context)
    }

    # Add the token if HF_TOKEN is set
    if HF_TOKEN:
        tokenizer_kwargs["token"] = HF_TOKEN

    tokenizer = AutoTokenizer.from_pretrained(
        MODEL_ID,
        **tokenizer_kwargs
    )

    # Load the model
    model_kwargs = {
        "trust_remote_code": True,
        "device_map": "auto" if device.type == "cuda" else "cpu",
        "torch_dtype": torch.float16 if device.type == "cuda" else torch.float32,
    }

    # Add the token if HF_TOKEN is set
    if HF_TOKEN:
        model_kwargs["token"] = HF_TOKEN

    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        **model_kwargs
    )

    model.eval()
    load_time = time.time() - start_load_time
    print(f"--- Model and Tokenizer Loaded Successfully in {load_time:.2f} seconds ---")
    load_successful = True

    # --- Stop Token Configuration ---
    stop_token_strings = ["<|endofturn|>", "<|stop|>"]
    temp_stop_ids = [tokenizer.convert_tokens_to_ids(token) for token in stop_token_strings]

    if tokenizer.eos_token_id is not None and tokenizer.eos_token_id not in temp_stop_ids:
        temp_stop_ids.append(tokenizer.eos_token_id)
    elif tokenizer.eos_token_id is None:
        print("Warning: tokenizer.eos_token_id is None. Cannot add to stop tokens.")

    stop_token_ids_list = [tid for tid in temp_stop_ids if tid is not None]

    if not stop_token_ids_list:
        print("Warning: Could not find any stop token IDs. Using default EOS if available, otherwise generation might not stop correctly.")
        if tokenizer.eos_token_id is not None:
            stop_token_ids_list = [tokenizer.eos_token_id]
        else:
            print("Error: No stop tokens found, including default EOS. Generation may run indefinitely.")

    print(f"Using Stop Token IDs: {stop_token_ids_list}")
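    # Note: generate() accepts a list for eos_token_id, so the generate() calls below
    # stop at whichever of these ids is produced first.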

except Exception as e:
    print(f"!!! Error loading model: {e}")
    if 'model' in locals() and model is not None: del model
    if 'tokenizer' in locals() and tokenizer is not None: del tokenizer
    gc.collect()
    # Raise a Gradio error to display in the Space UI if loading fails
    raise gr.Error(f"Failed to load the model {MODEL_ID}. Cannot start the application. Error: {e}")

# --- System Prompt Definition ---
def get_system_prompt():
    current_date = datetime.datetime.now().strftime("%Y-%m-%d (%A)")
    return (
        f"- AI 언어모델의 이름은 \"CLOVA X\" 이며 네이버에서 만들었다.\n"
        f"- 오늘은 {current_date}이다.\n"
        f"- 사용자의 질문에 대해 친절하고 자세하게 한국어로 답변해야 한다."
    )

# --- Warm-up Function ---
def warmup_model():
    if not load_successful or model is None or tokenizer is None:
        print("Skipping warmup: Model not loaded successfully.")
        return

    print("--- Starting Model Warm-up ---")
    try:
        start_warmup_time = time.time()
        warmup_message = "안녕하세요"
        system_prompt = get_system_prompt()
        warmup_chat = [
            {"role": "tool_list", "content": ""},
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": warmup_message}
        ]

        inputs = tokenizer.apply_chat_template(
            warmup_chat,
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt"
        ).to(device)

        # Check if stop_token_ids_list is empty and handle appropriately
        gen_kwargs = {
            "max_new_tokens": 10,
            "pad_token_id": tokenizer.eos_token_id if tokenizer.eos_token_id is not None else tokenizer.pad_token_id,
            "do_sample": False
        }
        if stop_token_ids_list:
            gen_kwargs["eos_token_id"] = stop_token_ids_list
        else:
            print("Warmup Warning: No stop tokens defined for generation.")

        with torch.no_grad():
            output_ids = model.generate(**inputs, **gen_kwargs)
        # … (new lines 190-191 fall outside the diff context)
        del output_ids
        gc.collect()
        warmup_time = time.time() - start_warmup_time
        print(f"--- Model Warm-up Completed in {warmup_time:.2f} seconds ---")

    except Exception as e:
        print(f"!!! Error during model warm-up: {e}")
    finally:
        gc.collect()

# --- Inference Function ---
def predict(message, history):
    """
    Generates a response using HyperCLOVAX.
    Assumes 'history' is in the Gradio 'messages' format: List[Dict].
    """
    if model is None or tokenizer is None:
        return "오류: 모델이 로드되지 않았습니다."

    system_prompt = get_system_prompt()

    # Start with the system prompt
    chat_history_formatted = [
        {"role": "tool_list", "content": ""},  # As required by the model card
        {"role": "system", "content": system_prompt}
    ]

    # Append the history (a list of {'role': 'user'/'assistant', 'content': '...'})
    if isinstance(history, list):  # Check that history is a list
        for turn in history:
            # Validate the turn format
            if isinstance(turn, dict) and "role" in turn and "content" in turn:
                chat_history_formatted.append(turn)
            # Handle the potential older tuple format
            elif isinstance(turn, (list, tuple)) and len(turn) == 2:
                print(f"Warning: Received history item in tuple format: {turn}. Converting to messages format.")
                chat_history_formatted.append({"role": "user", "content": turn[0]})
                if turn[1]:  # Ensure the assistant message exists
                    chat_history_formatted.append({"role": "assistant", "content": turn[1]})
            else:
                print(f"Warning: Skipping unexpected history format item: {turn}")

    # Append the latest user message
    chat_history_formatted.append({"role": "user", "content": message})
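    # At this point chat_history_formatted is a plain messages list, e.g.:
    #   [{"role": "tool_list", "content": ""},
    #    {"role": "system", "content": "- AI 언어모델의 이름은 ..."},
    #    {"role": "user", "content": "안녕하세요"}]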

    inputs = None
    output_ids = None

    try:
        inputs = tokenizer.apply_chat_template(
            chat_history_formatted,
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt"
        ).to(device)
        input_length = inputs['input_ids'].shape[1]
        print(f"\nInput tokens: {input_length}")
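        # With return_dict=True, apply_chat_template returns a dict of input_ids and
        # attention_mask, so it can be unpacked straight into model.generate(**inputs) below.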

    except Exception as e:
        print(f"!!! Error applying chat template: {e}")
        return f"오류: 입력 형식을 처리하는 중 문제가 발생했습니다. ({e})"

    try:
        print("Generating response...")
        generation_start_time = time.time()

        # Prepare generation arguments, handling empty stop_token_ids_list
        gen_kwargs = {
            "max_new_tokens": MAX_NEW_TOKENS,
            "pad_token_id": tokenizer.eos_token_id if tokenizer.eos_token_id is not None else tokenizer.pad_token_id,
            "do_sample": True,
            "temperature": 0.7,
            "top_p": 0.9,
        }
        if stop_token_ids_list:
            gen_kwargs["eos_token_id"] = stop_token_ids_list
        else:
            print("Generation Warning: No stop tokens defined.")

        with torch.no_grad():
            output_ids = model.generate(**inputs, **gen_kwargs)

        generation_time = time.time() - generation_start_time
        print(f"Generation complete in {generation_time:.2f} seconds.")

    except Exception as e:
        print(f"!!! Error during model generation: {e}")
        if inputs is not None: del inputs
        if output_ids is not None: del output_ids
        gc.collect()
        return f"오류: 응답을 생성하는 중 문제가 발생했습니다. ({e})"

    # Decode the response
    response = "오류: 응답 생성에 실패했습니다."
    if output_ids is not None:
        try:
            new_tokens = output_ids[0, input_length:]
            response = tokenizer.decode(new_tokens, skip_special_tokens=True)
            print(f"Output tokens: {len(new_tokens)}")
            del new_tokens
        except Exception as e:
            print(f"!!! Error decoding response: {e}")
            response = "오류: 응답을 디코딩하는 중 문제가 발생했습니다."

    # Clean up memory
    if inputs is not None: del inputs
    if output_ids is not None: del output_ids
    gc.collect()
    print("Memory cleaned.")

    return response

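# predict returns a plain string; gr.ChatInterface renders it as the assistant's turn.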

# --- Additional UI components ---
def create_welcome_markdown():
    return """
# 🇰🇷 네이버 HyperCLOVA X SEED

한국의 기술력으로 개발된 네이버의 초거대 AI 언어모델 'HyperCLOVA X'를 경험해보세요.
이 데모는 0.5B 파라미터 경량 모델을 사용하여 한국어 자연어 처리 능력을 보여줍니다.

**사용 방법**:
- 아래 채팅창에 질문이나 요청을 입력하세요
- 한국어로 다양한 주제에 대한 대화를 나눠보세요
- 예시 질문을 클릭하여 빠르게 시작할 수도 있습니다
"""

# --- Gradio Interface Setup ---
print("--- Setting up Gradio Interface ---")

with gr.Blocks(css=custom_css) as demo:
    gr.Markdown(create_welcome_markdown(), elem_id="intro-message")

    chatbot = gr.ChatInterface(
        fn=predict,
        title="",
        description="",
        examples=[
            ["네이버 클로바X는 무엇인가요?"],
            ["슈뢰딩거 방정식과 양자역학의 관계를 설명해주세요."],
            ["딥러닝 모델 학습 과정을 단계별로 알려줘."],
            ["제주도 여행 계획을 세우고 있는데, 3박 4일 추천 코스 좀 짜줄래?"],
            ["한국 역사에서 가장 중요한 사건 5가지는 무엇인가요?"],
            ["인공지능 윤리에 대해 설명해주세요."],
        ],
        cache_examples=False,
        submit_btn="보내기",
        retry_btn="다시 시도",
        undo_btn="취소",
        clear_btn="새로운 대화",
    )

    with gr.Accordion("모델 정보", open=False):
        gr.Markdown(f"""
        - **모델**: {MODEL_ID}
        - **환경**: ZeroGPU 공유 환경에서 실행 중
        - **토큰 제한**: 최대 생성 토큰 수는 {MAX_NEW_TOKENS}개로 제한됩니다.
        - **하드웨어**: {"GPU" if device.type == "cuda" else "CPU"} 환경에서 실행 중
        """)

    gr.Markdown(
        "© 2025 네이버 HyperCLOVA X 데모 | Powered by Hugging Face & ZeroGPU",
        elem_classes="footer"
    )

# --- Application Launch ---
if __name__ == "__main__":
    if load_successful:
        warmup_model()
    else:
        print("Skipping warm-up because model loading failed.")

    print("--- Launching Gradio App ---")
    demo.queue().launch(
        # share=True  # Uncomment for public link
        # server_name="0.0.0.0"  # Uncomment for local network access
    )
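
A note on the history format: gr.ChatInterface passes history to fn as (user, assistant) tuples unless it is constructed with type="messages" (available in recent Gradio releases), so under the default configuration the tuple fallback inside predict is what actually runs. If the installed Gradio version supports it, a one-line variant (hypothetical, not part of this commit) would deliver history in the format the docstring assumes:

chatbot = gr.ChatInterface(fn=predict, type="messages")  # plus the same kwargs as above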