Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -1,48 +1,259 @@
Previous app.py (most deleted lines were not captured in this view):

  import gradio as gr
- from transformers import AutoTokenizer, AutoModelForCausalLM
  import torch
  import spaces

- # …
- )

  @spaces.GPU()
  def predict(message, history):
-     …
      history_text = ""
-     …
      prompt = f"{history_text}Human: {message}\nAssistant:"
-     …
      return response.strip()

- # …
  demo = gr.ChatInterface(
-     predict,
-     title="MiMo-7B-RL…
-     description=…
  )

  if __name__ == "__main__":
-     …
Updated app.py:

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import gc
import os
import datetime
import time
import spaces

# --- Settings ---
MODEL_ID = "XiaomiMiMo/MiMo-7B-RL"
MAX_NEW_TOKENS = 512
CPU_THREAD_COUNT = 4  # adjust if needed

# --- Optional: CPU thread settings ---
# torch.set_num_threads(CPU_THREAD_COUNT)
# os.environ["OMP_NUM_THREADS"] = str(CPU_THREAD_COUNT)
# os.environ["MKL_NUM_THREADS"] = str(CPU_THREAD_COUNT)
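# Note: OMP_NUM_THREADS / MKL_NUM_THREADS are typically read when torch and its
# math libraries initialize their thread pools, so they may need to be set
# before `import torch` to reliably take effect; torch.set_num_threads() can
# still be adjusted at runtime.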

print("--- Environment ---")
print(f"PyTorch version: {torch.__version__}")
print(f"Device: {torch.device('cuda' if torch.cuda.is_available() else 'cpu')}")
print(f"Torch threads: {torch.get_num_threads()}")

# --- Model and tokenizer loading ---
print(f"--- Loading model: {MODEL_ID} ---")
print("The first run may take a few minutes...")

model = None
tokenizer = None
load_successful = False
stop_token_ids_list = []  # initialize stop_token_ids_list

try:
    start_load_time = time.time()
    # Choose device_map based on the available hardware
    device_map = "auto" if torch.cuda.is_available() else "cpu"
    dtype = torch.float16 if torch.cuda.is_available() else torch.float32
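    # Rough sizing: a 7B-parameter model needs about 14 GB in float16 and about
    # 28 GB in float32 (2 or 4 bytes per parameter), before activations and the
    # KV cache, which is why the CPU path can be very memory-hungry.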

    tokenizer = AutoTokenizer.from_pretrained(
        MODEL_ID,
        trust_remote_code=True
    )

    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        torch_dtype=dtype,
        device_map=device_map,
        trust_remote_code=True
    )

    model.eval()
    load_time = time.time() - start_load_time
    print(f"--- Model and tokenizer loaded in {load_time:.2f} seconds ---")
    load_successful = True

    # --- Stop token setup ---
    stop_token_strings = ["</s>", "<|endoftext|>"]
    temp_stop_ids = [tokenizer.convert_tokens_to_ids(token) for token in stop_token_strings]
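    # Note: for strings missing from the vocabulary, convert_tokens_to_ids
    # usually returns the unk token id rather than None, so the None filter
    # below mainly guards against tokenizers that define no unk token.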

    if tokenizer.eos_token_id is not None and tokenizer.eos_token_id not in temp_stop_ids:
        temp_stop_ids.append(tokenizer.eos_token_id)
    elif tokenizer.eos_token_id is None:
        print("Warning: tokenizer.eos_token_id is None; it cannot be added to the stop tokens.")

    stop_token_ids_list = [tid for tid in temp_stop_ids if tid is not None]

    if not stop_token_ids_list:
        print("Warning: no stop token IDs found. Falling back to the default EOS if available; otherwise generation may not stop correctly.")
        if tokenizer.eos_token_id is not None:
            stop_token_ids_list = [tokenizer.eos_token_id]
        else:
            print("Error: no stop tokens found, not even a default EOS. Generation may run indefinitely.")

    print(f"Stop token IDs to use: {stop_token_ids_list}")

except Exception as e:
    print(f"!!! Model loading error: {e}")
    if 'model' in locals() and model is not None: del model
    if 'tokenizer' in locals() and tokenizer is not None: del tokenizer
    gc.collect()
    raise gr.Error(f"Failed to load model {MODEL_ID}. The application cannot start. Error: {e}")

# --- System prompt definition ---
def get_system_prompt():
    current_date = datetime.datetime.now().strftime("%Y-%m-%d (%A)")
    return (
        f"- The AI language model's name is \"MiMo\" and it was built by XiaomiMiMo.\n"
        f"- Today is {current_date}.\n"
        f"- Answer the user's questions kindly, in detail, and in Korean."
    )

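# Note: predict() below builds its prompt without this system prompt; prepend
# get_system_prompt() to the prompt string there if these instructions are
# meant to take effect.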
# --- Warm-up function ---
def warmup_model():
    if not load_successful or model is None or tokenizer is None:
        print("Skipping warm-up: the model was not loaded successfully.")
        return

    print("--- Starting model warm-up ---")
    try:
        start_warmup_time = time.time()
        warmup_message = "Hello"

        # Build the input in the format the model expects
        system_prompt = get_system_prompt()

        # Adjusted to the MiMo model's prompt format
        prompt = f"Human: {warmup_message}\nAssistant:"

        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

        # Check whether the stop token list is empty and handle it accordingly
        gen_kwargs = {
            "max_new_tokens": 10,
            "pad_token_id": tokenizer.eos_token_id if tokenizer.eos_token_id is not None else tokenizer.pad_token_id,
            "do_sample": False
        }

        if stop_token_ids_list:
            gen_kwargs["eos_token_id"] = stop_token_ids_list
        else:
            print("Warm-up warning: no stop tokens defined for generation.")

        with torch.no_grad():
            output_ids = model.generate(**inputs, **gen_kwargs)

        del inputs
        del output_ids
        gc.collect()
        warmup_time = time.time() - start_warmup_time
        print(f"--- Model warm-up finished in {warmup_time:.2f} seconds ---")

    except Exception as e:
        print(f"!!! Error during model warm-up: {e}")
    finally:
        gc.collect()

# --- Inference function ---
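# On ZeroGPU Spaces, @spaces.GPU() requests a GPU only for the duration of each
# call to the decorated function and releases it afterwards; elsewhere the
# decorator is effectively a no-op.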
@spaces.GPU()
def predict(message, history):
    """
    Generate a response with the XiaomiMiMo/MiMo-7B-RL model.
    `history` is expected as a list of [user, assistant] pairs (Gradio's default tuple format).
    """
    if model is None or tokenizer is None:
        return "Error: the model is not loaded."

    # Process the conversation history
    history_text = ""
    if isinstance(history, list):
        for turn in history:
            if isinstance(turn, (list, tuple)) and len(turn) == 2:
                history_text += f"Human: {turn[0]}\nAssistant: {turn[1]}\n"

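    # Note: newer Gradio versions can pass history in "messages" format (dicts
    # with "role"/"content" keys) instead of [user, assistant] pairs. A
    # dict-aware variant of the loop above might look like:
    #     if isinstance(turn, dict):
    #         role = "Human" if turn.get("role") == "user" else "Assistant"
    #         history_text += f"{role}: {turn.get('content', '')}\n"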
    # Build the prompt in the MiMo model's input format
    prompt = f"{history_text}Human: {message}\nAssistant:"

    inputs = None
    output_ids = None

    try:
        # Prepare the inputs
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        input_length = inputs.input_ids.shape[1]
        print(f"\nInput token count: {input_length}")

    except Exception as e:
        print(f"!!! Error while processing the input: {e}")
        return f"Error: a problem occurred while processing the input. ({e})"

    try:
        print("Generating response...")
        generation_start_time = time.time()

        # Prepare generation arguments; handle an empty stop_token_ids_list
        gen_kwargs = {
            "max_new_tokens": MAX_NEW_TOKENS,
            "pad_token_id": tokenizer.eos_token_id if tokenizer.eos_token_id is not None else tokenizer.pad_token_id,
            "do_sample": True,
            "temperature": 0.7,
            "top_p": 0.9,
            "repetition_penalty": 1.1
        }

        if stop_token_ids_list:
            gen_kwargs["eos_token_id"] = stop_token_ids_list
        else:
            print("Generation warning: no stop tokens defined.")

        with torch.no_grad():
            output_ids = model.generate(**inputs, **gen_kwargs)

        generation_time = time.time() - generation_start_time
        print(f"Generation finished in {generation_time:.2f} seconds.")

    except Exception as e:
        print(f"!!! Error during model generation: {e}")
        if inputs is not None: del inputs
        if output_ids is not None: del output_ids
        gc.collect()
        return f"Error: a problem occurred while generating the response. ({e})"

    # Decode the response
    response = "Error: failed to generate a response."
    if output_ids is not None:
        try:
            new_tokens = output_ids[0, input_length:]
            response = tokenizer.decode(new_tokens, skip_special_tokens=True)
            print(f"Output token count: {len(new_tokens)}")
            del new_tokens
        except Exception as e:
            print(f"!!! Error while decoding the response: {e}")
            response = "Error: a problem occurred while decoding the response."

    # Clean up memory
    if inputs is not None: del inputs
    if output_ids is not None: del output_ids
    gc.collect()
    print("Memory cleanup complete.")

    return response.strip()

# --- Gradio interface setup ---
print("--- Setting up the Gradio interface ---")

examples = [
    ["Hello! Please introduce yourself."],
    ["What is the difference between artificial intelligence and machine learning?"],
    ["Walk me through the steps of training a deep learning model."],
    ["I'm planning a trip to Jeju Island; can you suggest a 3-night, 4-day itinerary?"],
]

# ChatInterface manages its own Chatbot component
demo = gr.ChatInterface(
    fn=predict,
    title="🤖 XiaomiMiMo/MiMo-7B-RL Korean Demo",
    description=(
        f"**Model:** {MODEL_ID}\n"
        f"**Environment:** {'GPU' if torch.cuda.is_available() else 'CPU'}\n"
        f"**Note:** {'Running on GPU.' if torch.cuda.is_available() else 'Running on CPU, so generating a response may take a while.'}\n"
        f"Generation is limited to a maximum of {MAX_NEW_TOKENS} new tokens."
    ),
    examples=examples,
    cache_examples=False,
    theme=gr.themes.Soft(),
)
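# predict() returns a single final string, so ChatInterface displays each reply
# only after generation completes; streaming would require predict() to yield
# partial strings instead of returning once.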

# --- Application launch ---
if __name__ == "__main__":
    if load_successful:
        warmup_model()
    else:
        print("Skipping warm-up because model loading failed.")

    print("--- Launching the Gradio app ---")
    demo.queue().launch(
        # share=True  # uncomment for a public link
        # server_name="0.0.0.0"  # uncomment for local-network access
    )